diff --git a/Cargo.toml b/Cargo.toml index bbf2ee5892..e594e59105 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -136,6 +136,7 @@ rand = { version = "0.8.3", features = ["small_rng"] } rangemap = { version = "1.0" } rayon = "1.10" roaring = "0.10.1" +rstest = "0.19.0" rustc_version = "0.4" serde = { version = "^1" } serde_json = { version = "1" } diff --git a/protos/encodings.proto b/protos/encodings.proto index a91efe10e1..09e81c513d 100644 --- a/protos/encodings.proto +++ b/protos/encodings.proto @@ -298,8 +298,13 @@ message MiniBlockLayout { ArrayEncoding value_compression = 3; } +message AllNullLayout { + +} + message PageLayout { oneof layout { MiniBlockLayout mini_block_layout = 1; + AllNullLayout all_null_layout = 2; } } \ No newline at end of file diff --git a/python/python/benchmarks/test_file.py b/python/python/benchmarks/test_file.py new file mode 100644 index 0000000000..483cb7e74d --- /dev/null +++ b/python/python/benchmarks/test_file.py @@ -0,0 +1,222 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright The Lance Authors +from pathlib import Path + +import pyarrow as pa +import pytest +from lance.file import LanceFileReader, LanceFileWriter +from lance.tracing import trace_to_chrome + +trace_to_chrome(level="debug", file="/tmp/trace.json") + +NUM_ROWS = 10_000_000 +ROWS_TO_SAMPLE = 10 + + +@pytest.mark.parametrize( + "version", + ["2.0", "2.1"], + ids=["2_0", "2_1"], +) +@pytest.mark.benchmark(group="scan_single_column") +def test_scan_integer(tmp_path: Path, benchmark, version): + schema = pa.schema([pa.field("values", pa.uint64(), True)]) + + def gen_data(): + remaining = NUM_ROWS + offset = 0 + while remaining > 0: + to_take = min(remaining, 10000) + values = pa.array(range(offset, offset + to_take)) + batch = pa.table({"values": values}).to_batches()[0] + yield batch + remaining -= to_take + offset += to_take + + with LanceFileWriter( + str(tmp_path / "file.lance"), schema, version=version + ) as writer: + for batch in gen_data(): + writer.write_batch(batch) + + def read_all(): + reader = LanceFileReader(str(tmp_path / "file.lance")) + return reader.read_all(batch_size=16 * 1024).to_table() + + result = benchmark.pedantic(read_all, rounds=1, iterations=1) + + assert result.num_rows == NUM_ROWS + + +@pytest.mark.parametrize( + "version", + ["2.0", "2.1"], + ids=["2_0", "2_1"], +) +@pytest.mark.benchmark(group="scan_single_column") +def test_scan_nullable_integer(tmp_path: Path, benchmark, version): + schema = pa.schema([pa.field("values", pa.uint64(), True)]) + + def gen_data(): + remaining = NUM_ROWS + offset = 0 + while remaining > 0: + to_take = min(remaining, 10000) + values = pa.array( + [None if i % 2 == 0 else i for i in range(offset, offset + to_take)] + ) + batch = pa.table({"values": values}).to_batches()[0] + yield batch + remaining -= to_take + offset += to_take + + with LanceFileWriter( + str(tmp_path / "file.lance"), schema, version=version + ) as writer: + for batch in gen_data(): + writer.write_batch(batch) + + def read_all(): + reader = LanceFileReader(str(tmp_path / "file.lance")) + return reader.read_all(batch_size=16 * 1024).to_table() + + result = benchmark.pedantic(read_all, rounds=1, iterations=1) + + assert result.num_rows == NUM_ROWS + + +@pytest.mark.benchmark(group="scan_single_column") +def test_scan_nested_integer(tmp_path: Path, benchmark): + def get_val(i: int): + if i % 4 == 0: + return None + elif i % 4 == 1: + return {"outer": None} + elif i % 4 == 2: + return {"outer": {"inner": None}} + else: + return {"outer": 
{"inner": i}} + + dtype = pa.struct( + [pa.field("outer", pa.struct([pa.field("inner", pa.uint64(), True)]), True)] + ) + schema = pa.schema( + [ + pa.field( + "values", + dtype, + True, + ) + ] + ) + + def gen_data(): + remaining = NUM_ROWS + offset = 0 + while remaining > 0: + to_take = min(remaining, 10000) + values = pa.array([get_val(i) for i in range(offset, offset + to_take)]) + batch = pa.table({"values": values}).to_batches()[0] + yield batch + remaining -= to_take + offset += to_take + + with LanceFileWriter(str(tmp_path / "file.lance"), schema, version="2.1") as writer: + for batch in gen_data(): + writer.write_batch(batch) + + def read_all(): + reader = LanceFileReader(str(tmp_path / "file.lance")) + return reader.read_all(batch_size=16 * 1024).to_table() + + result = benchmark.pedantic(read_all, rounds=1, iterations=1) + + assert result.num_rows == NUM_ROWS + + +@pytest.mark.parametrize( + "version", + ["2.0", "2.1"], + ids=["2_0", "2_1"], +) +@pytest.mark.benchmark(group="sample_single_column") +def test_sample_integer(tmp_path: Path, benchmark, version): + schema = pa.schema([pa.field("values", pa.uint64(), True)]) + + def gen_data(): + remaining = NUM_ROWS + offset = 0 + while remaining > 0: + to_take = min(remaining, 10000) + values = pa.array(range(offset, offset + to_take)) + batch = pa.table({"values": values}).to_batches()[0] + yield batch + remaining -= to_take + offset += to_take + + with LanceFileWriter( + str(tmp_path / "file.lance"), schema, version=version + ) as writer: + for batch in gen_data(): + writer.write_batch(batch) + + reader = LanceFileReader(str(tmp_path / "file.lance")) + indices = list(range(0, NUM_ROWS, NUM_ROWS // ROWS_TO_SAMPLE)) + + def sample(): + return reader.take_rows(indices).to_table() + + result = benchmark.pedantic(sample, rounds=30, iterations=1) + + assert result.num_rows == NUM_ROWS + + +@pytest.mark.benchmark(group="sample_single_column") +def test_sample_nested_integer(tmp_path: Path, benchmark): + def get_val(i: int): + if i % 4 == 0: + return None + elif i % 4 == 1: + return {"outer": None} + elif i % 4 == 2: + return {"outer": {"inner": None}} + else: + return {"outer": {"inner": i}} + + dtype = pa.struct( + [pa.field("outer", pa.struct([pa.field("inner", pa.uint64(), True)]), True)] + ) + schema = pa.schema( + [ + pa.field( + "values", + dtype, + True, + ) + ] + ) + + def gen_data(): + remaining = NUM_ROWS + offset = 0 + while remaining > 0: + to_take = min(remaining, 10000) + values = pa.array([get_val(i) for i in range(offset, offset + to_take)]) + batch = pa.table({"values": values}).to_batches()[0] + yield batch + remaining -= to_take + offset += to_take + + with LanceFileWriter(str(tmp_path / "file.lance"), schema, version="2.1") as writer: + for batch in gen_data(): + writer.write_batch(batch) + + reader = LanceFileReader(str(tmp_path / "file.lance")) + indices = list(range(0, NUM_ROWS, NUM_ROWS // ROWS_TO_SAMPLE)) + + def sample(): + return reader.take_rows(indices).to_table() + + result = benchmark.pedantic(sample, rounds=30, iterations=1) + + assert result.num_rows == NUM_ROWS diff --git a/python/python/tests/test_file.py b/python/python/tests/test_file.py index 6206d34fa2..275834d4c9 100644 --- a/python/python/tests/test_file.py +++ b/python/python/tests/test_file.py @@ -74,15 +74,13 @@ def test_version(tmp_path): assert metadata.major_version == 0 assert metadata.minor_version == 3 - # TODO: Temporarily disabled until read path for 2.1 is added - # - # path = tmp_path / "foo2.lance" - # with LanceFileWriter(str(path), 
schema, version="2.1") as writer: - # writer.write_batch(pa.table({"a": [1, 2, 3]})) - # reader = LanceFileReader(str(path)) - # metadata = reader.metadata() - # assert metadata.major_version == 2 - # assert metadata.minor_version == 1 + path = tmp_path / "foo2.lance" + with LanceFileWriter(str(path), schema, version="2.1") as writer: + writer.write_batch(pa.table({"a": [1, 2, 3]})) + reader = LanceFileReader(str(path)) + metadata = reader.metadata() + assert metadata.major_version == 2 + assert metadata.minor_version == 1 def test_take(tmp_path): diff --git a/python/src/file.rs b/python/src/file.rs index 9d14d13ff0..e6e3d237d1 100644 --- a/python/src/file.rs +++ b/python/src/file.rs @@ -20,10 +20,10 @@ use bytes::Bytes; use futures::stream::StreamExt; use lance::io::{ObjectStore, RecordBatchStream}; use lance_core::cache::FileMetadataCache; -use lance_encoding::decoder::{DecoderMiddlewareChain, FilterExpression}; +use lance_encoding::decoder::{DecoderPlugins, FilterExpression}; use lance_file::{ v2::{ - reader::{BufferDescriptor, CachedFileMetadata, FileReader}, + reader::{BufferDescriptor, CachedFileMetadata, FileReader, FileReaderOptions}, writer::{FileWriter, FileWriterOptions}, }, version::LanceFileVersion, @@ -335,8 +335,9 @@ impl LanceFileReader { let inner = FileReader::try_open( file, None, - Arc::::default(), + Arc::::default(), &FileMetadataCache::no_cache(), + FileReaderOptions::default(), ) .await .infer_error()?; diff --git a/rust/lance-core/src/datatypes.rs b/rust/lance-core/src/datatypes.rs index 6acb9ed936..9dd0da6e26 100644 --- a/rust/lance-core/src/datatypes.rs +++ b/rust/lance-core/src/datatypes.rs @@ -36,6 +36,7 @@ lazy_static::lazy_static! { ]); pub static ref BLOB_DESC_FIELD: ArrowField = ArrowField::new("description", DataType::Struct(BLOB_DESC_FIELDS.clone()), false); + pub static ref BLOB_DESC_LANCE_FIELD: Field = Field::try_from(&*BLOB_DESC_FIELD).unwrap(); } /// LogicalType is a string presentation of arrow type. diff --git a/rust/lance-encoding-datafusion/src/lib.rs b/rust/lance-encoding-datafusion/src/lib.rs index 6810721c29..807e7a4bd1 100644 --- a/rust/lance-encoding-datafusion/src/lib.rs +++ b/rust/lance-encoding-datafusion/src/lib.rs @@ -7,18 +7,12 @@ use std::{ }; use arrow_schema::DataType; -use lance_core::{ - datatypes::{Field, Schema}, - Result, +use lance_core::datatypes::{Field, Schema}; +use lance_encoding::encoder::{ + default_encoding_strategy, ColumnIndexSequence, EncodingOptions, FieldEncodingStrategy, }; -use lance_encoding::{ - decoder::{ColumnInfoIter, DecoderMiddlewareChainCursor, FieldDecoderStrategy, FieldScheduler}, - encoder::{ - ColumnIndexSequence, CoreFieldEncodingStrategy, EncodingOptions, FieldEncodingStrategy, - }, - encodings::physical::FileBuffers, -}; -use zone::{extract_zone_info, UnloadedPushdown, ZoneMapsFieldEncoder, ZoneMapsFieldScheduler}; +use lance_file::version::LanceFileVersion; +use zone::{UnloadedPushdown, ZoneMapsFieldEncoder}; pub mod format; pub mod substrait; @@ -27,9 +21,11 @@ pub mod zone; #[derive(Debug)] struct LanceDfFieldDecoderState { /// We assume that all columns have the same number of rows per map + #[allow(unused)] rows_per_map: Option, /// As we visit the decoding tree we populate this with the pushdown /// information that is available. + #[allow(unused)] zone_map_buffers: HashMap, } @@ -46,7 +42,9 @@ struct LanceDfFieldDecoderState { /// we aren't technically doing any concurrency. 
#[derive(Debug)] pub struct LanceDfFieldDecoderStrategy { + #[allow(unused)] state: Arc>>, + #[allow(unused)] schema: Arc, } @@ -58,6 +56,7 @@ impl LanceDfFieldDecoderStrategy { } } + #[allow(unused)] fn initialize(&self) -> bool { let mut state = self.state.lock().unwrap(); if state.is_none() { @@ -71,6 +70,7 @@ impl LanceDfFieldDecoderStrategy { } } + #[allow(unused)] fn add_pushdown_field( &self, field: &Field, @@ -93,69 +93,70 @@ impl LanceDfFieldDecoderStrategy { } } -impl FieldDecoderStrategy for LanceDfFieldDecoderStrategy { - fn create_field_scheduler<'a>( - &self, - field: &Field, - column_infos: &mut ColumnInfoIter, - buffers: FileBuffers, - chain: DecoderMiddlewareChainCursor<'a>, - ) -> Result<( - DecoderMiddlewareChainCursor<'a>, - Result>, - )> { - let is_root = self.initialize(); +// TODO: Reconnect...again +// impl FieldDecoderStrategy for LanceDfFieldDecoderStrategy { +// fn create_field_scheduler<'a>( +// &self, +// field: &Field, +// column_infos: &mut ColumnInfoIter, +// buffers: FileBuffers, +// chain: DecoderMiddlewareChainCursor<'a>, +// ) -> Result<( +// DecoderMiddlewareChainCursor<'a>, +// Result>, +// )> { +// let is_root = self.initialize(); - if let Some((rows_per_map, unloaded_pushdown)) = - extract_zone_info(column_infos, &field.data_type(), chain.current_path()) - { - // If there is pushdown info then record it and unwrap the - // pushdown encoding layer. - self.add_pushdown_field(field, rows_per_map, unloaded_pushdown); - } - // Delegate to the rest of the chain to create the decoder - let (chain, next) = chain.next(field, column_infos, buffers)?; +// if let Some((rows_per_map, unloaded_pushdown)) = +// extract_zone_info(column_infos, &field.data_type(), chain.current_path()) +// { +// // If there is pushdown info then record it and unwrap the +// // pushdown encoding layer. +// self.add_pushdown_field(field, rows_per_map, unloaded_pushdown); +// } +// // Delegate to the rest of the chain to create the decoder +// let (chain, next) = chain.next(field, column_infos, buffers)?; - // If this is the top level decoder then wrap it with our - // pushdown filtering scheduler. - if is_root { - let state = self.state.lock().unwrap().take().unwrap(); - let schema = self.schema.clone(); - let rows_per_map = state.rows_per_map; - let zone_map_buffers = state.zone_map_buffers; - let next = next?; - let num_rows = next.num_rows(); - if rows_per_map.is_none() { - // No columns had any pushdown info - Ok((chain, Ok(next))) - } else { - let scheduler = ZoneMapsFieldScheduler::new( - next, - schema, - zone_map_buffers, - rows_per_map.unwrap(), - num_rows, - ); - Ok((chain, Ok(Arc::new(scheduler)))) - } - } else { - Ok((chain, next)) - } - } -} +// // If this is the top level decoder then wrap it with our +// // pushdown filtering scheduler. 
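+// (The block below is where the root scheduler was wrapped with a
+// ZoneMapsFieldScheduler so pushdown filters could prune zones before decoding.)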
+// if is_root { +// let state = self.state.lock().unwrap().take().unwrap(); +// let schema = self.schema.clone(); +// let rows_per_map = state.rows_per_map; +// let zone_map_buffers = state.zone_map_buffers; +// let next = next?; +// let num_rows = next.num_rows(); +// if rows_per_map.is_none() { +// // No columns had any pushdown info +// Ok((chain, Ok(next))) +// } else { +// let scheduler = ZoneMapsFieldScheduler::new( +// next.into(), +// schema, +// zone_map_buffers, +// rows_per_map.unwrap(), +// num_rows, +// ); +// Ok((chain, Ok(Box::new(scheduler)))) +// } +// } else { +// Ok((chain, next)) +// } +// } +// } /// Wraps the core encoding strategy and adds the encoders from this /// crate #[derive(Debug)] pub struct LanceDfFieldEncodingStrategy { - core: CoreFieldEncodingStrategy, + inner: Box, rows_per_map: u32, } impl Default for LanceDfFieldEncodingStrategy { fn default() -> Self { Self { - core: CoreFieldEncodingStrategy::default(), + inner: default_encoding_strategy(LanceFileVersion::default()), rows_per_map: 10000, } } @@ -176,9 +177,9 @@ impl FieldEncodingStrategy for LanceDfFieldEncodingStrategy { DataType::Boolean | DataType::Utf8 | DataType::LargeUtf8 ) { - let inner_encoder = self.core.create_field_encoder( + let inner_encoder = self.inner.create_field_encoder( // Don't collect stats on inner string fields - &self.core, + self.inner.as_ref(), field, column_index, options, @@ -189,7 +190,7 @@ impl FieldEncodingStrategy for LanceDfFieldEncodingStrategy { self.rows_per_map, )?)) } else { - self.core + self.inner .create_field_encoder(encoding_strategy_root, field, column_index, options) } } diff --git a/rust/lance-encoding-datafusion/src/zone.rs b/rust/lance-encoding-datafusion/src/zone.rs index 325fa3fd7f..d24005acaa 100644 --- a/rust/lance-encoding-datafusion/src/zone.rs +++ b/rust/lance-encoding-datafusion/src/zone.rs @@ -26,7 +26,7 @@ use lance_datafusion::planner::Planner; use lance_encoding::{ buffer::LanceBuffer, decoder::{ - decode_batch, ColumnInfoIter, DecoderMiddlewareChain, FieldScheduler, FilterExpression, + decode_batch, ColumnInfoIter, DecoderPlugins, FieldScheduler, FilterExpression, PriorityRange, ScheduledScanLine, SchedulerContext, SchedulingJob, }, encoder::{ @@ -39,7 +39,10 @@ use lance_encoding::{ }; use lance_core::{cache::FileMetadataCache, datatypes::Schema, Error, Result}; -use lance_file::v2::{reader::EncodedBatchReaderExt, writer::EncodedBatchWriteExt}; +use lance_file::{ + v2::{reader::EncodedBatchReaderExt, writer::EncodedBatchWriteExt}, + version::LanceFileVersion, +}; use snafu::{location, Location}; use crate::substrait::FilterExpressionExt; @@ -128,6 +131,7 @@ fn path_to_expr(path: &VecDeque) -> Expr { } /// If a column has zone info in the encoding description then extract it +#[allow(unused)] pub(crate) fn extract_zone_info( column_info: &mut ColumnInfoIter, data_type: &DataType, @@ -382,11 +386,13 @@ impl ZoneMapsFieldScheduler { ArrowField::new("null_count", DataType::UInt32, false), ])) .unwrap(); - let zone_maps_batch = EncodedBatch::try_from_mini_lance(buffer, &zone_map_schema)?; + let zone_maps_batch = + EncodedBatch::try_from_mini_lance(buffer, &zone_map_schema, LanceFileVersion::V2_0)?; let zone_maps_batch = decode_batch( &zone_maps_batch, &FilterExpression::no_filter(), - Arc::::default(), + Arc::::default(), + /*should_validate= */ false, ) .await?; @@ -678,17 +684,13 @@ mod tests { use datafusion_common::ScalarValue; use datafusion_expr::{col, BinaryExpr, Expr, Operator}; use lance_datagen::{BatchCount, RowCount}; - use 
lance_encoding::decoder::{ - CoreFieldDecoderStrategy, DecoderMiddlewareChain, FilterExpression, - }; + use lance_encoding::decoder::{DecoderPlugins, FilterExpression}; use lance_file::v2::{ testing::{count_lance_file, write_lance_file, FsFixture}, writer::FileWriterOptions, }; - use crate::{ - substrait::FilterExpressionExt, LanceDfFieldDecoderStrategy, LanceDfFieldEncodingStrategy, - }; + use crate::{substrait::FilterExpressionExt, LanceDfFieldEncodingStrategy}; #[test_log::test(tokio::test)] async fn test_basic_stats() { @@ -705,13 +707,7 @@ mod tests { let written_file = write_lance_file(data, &fs, options).await; - let decoder_middleware = Arc::new( - DecoderMiddlewareChain::new() - .add_strategy(Arc::new(LanceDfFieldDecoderStrategy::new( - written_file.schema.clone(), - ))) - .add_strategy(Arc::new(CoreFieldDecoderStrategy::default())), - ); + let decoder_middleware: Arc = Arc::default(); let num_rows = written_file .data diff --git a/rust/lance-encoding/Cargo.toml b/rust/lance-encoding/Cargo.toml index 3c923e3797..4cc94ae4be 100644 --- a/rust/lance-encoding/Cargo.toml +++ b/rust/lance-encoding/Cargo.toml @@ -47,6 +47,7 @@ seq-macro = "0.3.5" lance-testing.workspace = true lance-datagen.workspace = true rand.workspace = true +rstest.workspace = true tempfile.workspace = true test-log.workspace = true criterion = { workspace = true } diff --git a/rust/lance-encoding/benches/decoder.rs b/rust/lance-encoding/benches/decoder.rs index 8b6ddac7dd..9062c9ba03 100644 --- a/rust/lance-encoding/benches/decoder.rs +++ b/rust/lance-encoding/benches/decoder.rs @@ -7,8 +7,9 @@ use arrow_schema::{DataType, Field, Schema, TimeUnit}; use arrow_select::take::take; use criterion::{criterion_group, criterion_main, Criterion}; use lance_encoding::{ - decoder::{DecoderMiddlewareChain, FilterExpression}, - encoder::{encode_batch, CoreFieldEncodingStrategy, EncodingOptions}, + decoder::{DecoderPlugins, FilterExpression}, + encoder::{default_encoding_strategy, encode_batch, EncodingOptions}, + version::LanceFileVersion, }; use rand::Rng; @@ -72,12 +73,12 @@ fn bench_decode(c: &mut Criterion) { Arc::new(lance_core::datatypes::Schema::try_from(data.schema().as_ref()).unwrap()); let input_bytes = data.get_array_memory_size(); group.throughput(criterion::Throughput::Bytes(input_bytes as u64)); - let encoding_strategy = CoreFieldEncodingStrategy::default(); + let encoding_strategy = default_encoding_strategy(LanceFileVersion::default()); let encoded = rt .block_on(encode_batch( &data, lance_schema, - &encoding_strategy, + encoding_strategy.as_ref(), &ENCODING_OPTIONS, )) .unwrap(); @@ -88,7 +89,8 @@ fn bench_decode(c: &mut Criterion) { .block_on(lance_encoding::decoder::decode_batch( &encoded, &FilterExpression::no_filter(), - Arc::::default(), + Arc::::default(), + false, )) .unwrap(); assert_eq!(data.num_rows(), batch.num_rows()); @@ -112,12 +114,12 @@ fn bench_decode_fsl(c: &mut Criterion) { Arc::new(lance_core::datatypes::Schema::try_from(data.schema().as_ref()).unwrap()); let input_bytes = data.get_array_memory_size(); group.throughput(criterion::Throughput::Bytes(input_bytes as u64)); - let encoding_strategy = CoreFieldEncodingStrategy::default(); + let encoding_strategy = default_encoding_strategy(LanceFileVersion::default()); let encoded = rt .block_on(encode_batch( &data, lance_schema, - &encoding_strategy, + encoding_strategy.as_ref(), &ENCODING_OPTIONS, )) .unwrap(); @@ -128,7 +130,8 @@ fn bench_decode_fsl(c: &mut Criterion) { .block_on(lance_encoding::decoder::decode_batch( &encoded, 
&FilterExpression::no_filter(), - Arc::::default(), + Arc::::default(), + false, )) .unwrap(); assert_eq!(data.num_rows(), batch.num_rows()); @@ -168,12 +171,12 @@ fn bench_decode_str_with_dict_encoding(c: &mut Criterion) { Arc::new(lance_core::datatypes::Schema::try_from(data.schema().as_ref()).unwrap()); let input_bytes = data.get_array_memory_size(); group.throughput(criterion::Throughput::Bytes(input_bytes as u64)); - let encoding_strategy = CoreFieldEncodingStrategy::default(); + let encoding_strategy = default_encoding_strategy(LanceFileVersion::default()); let encoded = rt .block_on(encode_batch( &data, lance_schema, - &encoding_strategy, + encoding_strategy.as_ref(), &ENCODING_OPTIONS, )) .unwrap(); @@ -184,7 +187,8 @@ fn bench_decode_str_with_dict_encoding(c: &mut Criterion) { .block_on(lance_encoding::decoder::decode_batch( &encoded, &FilterExpression::no_filter(), - Arc::::default(), + Arc::::default(), + false, )) .unwrap(); assert_eq!(data.num_rows(), batch.num_rows()); @@ -236,12 +240,12 @@ fn bench_decode_packed_struct(c: &mut Criterion) { let lance_schema = Arc::new(lance_core::datatypes::Schema::try_from(&new_schema).unwrap()); let input_bytes = data.get_array_memory_size(); group.throughput(criterion::Throughput::Bytes(input_bytes as u64)); - let encoding_strategy = CoreFieldEncodingStrategy::default(); + let encoding_strategy = default_encoding_strategy(LanceFileVersion::default()); let encoded = rt .block_on(encode_batch( &data, lance_schema, - &encoding_strategy, + encoding_strategy.as_ref(), &ENCODING_OPTIONS, )) .unwrap(); @@ -253,7 +257,8 @@ fn bench_decode_packed_struct(c: &mut Criterion) { .block_on(lance_encoding::decoder::decode_batch( &encoded, &FilterExpression::no_filter(), - Arc::::default(), + Arc::::default(), + false, )) .unwrap(); assert_eq!(data.num_rows(), batch.num_rows()); @@ -285,12 +290,12 @@ fn bench_decode_str_with_fixed_size_binary_encoding(c: &mut Criterion) { Arc::new(lance_core::datatypes::Schema::try_from(data.schema().as_ref()).unwrap()); let input_bytes = data.get_array_memory_size(); group.throughput(criterion::Throughput::Bytes(input_bytes as u64)); - let encoding_strategy = CoreFieldEncodingStrategy::default(); + let encoding_strategy = default_encoding_strategy(LanceFileVersion::default()); let encoded = rt .block_on(encode_batch( &data, lance_schema, - &encoding_strategy, + encoding_strategy.as_ref(), &ENCODING_OPTIONS, )) .unwrap(); @@ -301,7 +306,8 @@ fn bench_decode_str_with_fixed_size_binary_encoding(c: &mut Criterion) { .block_on(lance_encoding::decoder::decode_batch( &encoded, &FilterExpression::no_filter(), - Arc::::default(), + Arc::::default(), + false, )) .unwrap(); assert_eq!(data.num_rows(), batch.num_rows()); diff --git a/rust/lance-encoding/src/data.rs b/rust/lance-encoding/src/data.rs index a6baf9d6c4..c5b432d690 100644 --- a/rust/lance-encoding/src/data.rs +++ b/rust/lance-encoding/src/data.rs @@ -290,6 +290,44 @@ impl FixedWidthDataBlock { } } +pub struct FixedWidthDataBlockBuilder { + bits_per_value: u64, + bytes_per_value: u64, + values: Vec, +} + +impl FixedWidthDataBlockBuilder { + fn new(bits_per_value: u64, estimated_size_bytes: u64) -> Self { + assert!(bits_per_value % 8 == 0); + Self { + bits_per_value, + bytes_per_value: bits_per_value / 8, + values: Vec::with_capacity(estimated_size_bytes as usize), + } + } +} + +impl DataBlockBuilderImpl for FixedWidthDataBlockBuilder { + fn append(&mut self, data_block: &DataBlock, selection: Range) { + let block = data_block.as_fixed_width_ref().unwrap(); + 
assert_eq!(self.bits_per_value, block.bits_per_value); + let start = selection.start as usize * self.bytes_per_value as usize; + let end = selection.end as usize * self.bytes_per_value as usize; + self.values.extend_from_slice(&block.data[start..end]); + } + + fn finish(self: Box) -> DataBlock { + let num_values = (self.values.len() / self.bytes_per_value as usize) as u64; + DataBlock::FixedWidth(FixedWidthDataBlock { + data: LanceBuffer::Owned(self.values), + bits_per_value: self.bits_per_value, + num_values, + block_info: BlockInfo::new(), + used_encoding: UsedEncoding::new(), + }) + } +} + /// A data block to represent a fixed size list #[derive(Debug)] pub struct FixedSizeListBlock { @@ -805,6 +843,16 @@ impl DataBlock { Self::Opaque(inner) => Self::Opaque(inner), } } + + pub fn make_builder(&self, estimated_size_bytes: u64) -> Box { + match self { + Self::FixedWidth(inner) => Box::new(FixedWidthDataBlockBuilder::new( + inner.bits_per_value, + estimated_size_bytes, + )), + _ => todo!(), + } + } } macro_rules! as_type { @@ -818,6 +866,17 @@ macro_rules! as_type { }; } +macro_rules! as_type_ref { + ($fn_name:ident, $inner:tt, $inner_type:ident) => { + pub fn $fn_name(&self) -> Option<&$inner_type> { + match self { + Self::$inner(inner) => Some(inner), + _ => None, + } + } + }; +} + // Cast implementations impl DataBlock { as_type!(as_all_null, AllNull, AllNullDataBlock); @@ -827,6 +886,13 @@ impl DataBlock { as_type!(as_variable_width, VariableWidth, VariableWidthBlock); as_type!(as_struct, Struct, StructDataBlock); as_type!(as_dictionary, Dictionary, DictionaryDataBlock); + as_type_ref!(as_all_null_ref, AllNull, AllNullDataBlock); + as_type_ref!(as_nullable_ref, Nullable, NullableDataBlock); + as_type_ref!(as_fixed_width_ref, FixedWidth, FixedWidthDataBlock); + as_type_ref!(as_fixed_size_list_ref, FixedSizeList, FixedSizeListBlock); + as_type_ref!(as_variable_width_ref, VariableWidth, VariableWidthBlock); + as_type_ref!(as_struct_ref, Struct, StructDataBlock); + as_type_ref!(as_dictionary_ref, Dictionary, DictionaryDataBlock); } // Methods to convert from Arrow -> DataBlock @@ -1284,6 +1350,41 @@ impl From for DataBlock { } } +pub trait DataBlockBuilderImpl { + fn append(&mut self, data_block: &DataBlock, selection: Range); + fn finish(self: Box) -> DataBlock; +} + +pub struct DataBlockBuilder { + estimated_size_bytes: u64, + builder: Option>, +} + +impl DataBlockBuilder { + pub fn with_capacity_estimate(estimated_size_bytes: u64) -> Self { + Self { + estimated_size_bytes, + builder: None, + } + } + + fn get_builder(&mut self, block: &DataBlock) -> &mut dyn DataBlockBuilderImpl { + if self.builder.is_none() { + self.builder = Some(block.make_builder(self.estimated_size_bytes)); + } + self.builder.as_mut().unwrap().as_mut() + } + + pub fn append(&mut self, data_block: &DataBlock, selection: Range) { + self.get_builder(data_block).append(data_block, selection); + } + + pub fn finish(self) -> DataBlock { + let builder = self.builder.expect("DataBlockBuilder didn't see any data"); + builder.finish() + } +} + #[cfg(test)] mod tests { use std::sync::Arc; diff --git a/rust/lance-encoding/src/decoder.rs b/rust/lance-encoding/src/decoder.rs index cefab2cb9b..8e26da440c 100644 --- a/rust/lance-encoding/src/decoder.rs +++ b/rust/lance-encoding/src/decoder.rs @@ -225,7 +225,7 @@ use futures::stream::{self, BoxStream}; use futures::{FutureExt, StreamExt}; use lance_arrow::DataTypeExt; use lance_core::cache::{CapacityMode, FileMetadataCache}; -use lance_core::datatypes::{Field, Schema, 
BLOB_DESC_FIELD}; +use lance_core::datatypes::{Field, Schema, BLOB_DESC_LANCE_FIELD}; use log::{debug, trace, warn}; use snafu::{location, Location}; use tokio::sync::mpsc::error::SendError; @@ -234,15 +234,22 @@ use tokio::sync::mpsc::{self, unbounded_channel}; use lance_core::{Error, Result}; use tracing::instrument; +use crate::buffer::LanceBuffer; use crate::data::DataBlock; use crate::encoder::{values_column_encoding, EncodedBatch}; use crate::encodings::logical::binary::BinaryFieldScheduler; use crate::encodings::logical::blob::BlobFieldScheduler; use crate::encodings::logical::list::{ListFieldScheduler, OffsetPageInfo}; -use crate::encodings::logical::primitive::PrimitiveFieldScheduler; -use crate::encodings::logical::r#struct::{SimpleStructDecoder, SimpleStructScheduler}; +use crate::encodings::logical::primitive::{ + PrimitiveFieldScheduler, StructuralPrimitiveFieldScheduler, +}; +use crate::encodings::logical::r#struct::{ + SimpleStructDecoder, SimpleStructScheduler, StructuralStructDecoder, StructuralStructScheduler, +}; +use crate::encodings::physical::value::{ConstantDecompressor, ValueDecompressor}; use crate::encodings::physical::{ColumnBuffers, FileBuffers}; use crate::format::pb::{self, column_encoding}; +use crate::repdef::{LevelBuffer, RepDefUnraveler}; use crate::{BufferScheduler, EncodingsIo}; // If users are getting batches over 10MiB large then it's time to reduce the batch size @@ -336,6 +343,27 @@ impl ColumnInfo { } } +enum RootScheduler { + Structural(Box), + Legacy(Arc), +} + +impl RootScheduler { + fn as_legacy(&self) -> &Arc { + match self { + Self::Structural(_) => panic!("Expected a legacy scheduler"), + Self::Legacy(s) => s, + } + } + + fn as_structural(&self) -> &dyn StructuralFieldScheduler { + match self { + Self::Structural(s) => s.as_ref(), + Self::Legacy(_) => panic!("Expected a structural scheduler"), + } + } +} + /// The scheduler for decoding batches /// /// Lance decoding is done in two steps, scheduling, and decoding. The @@ -358,159 +386,11 @@ impl ColumnInfo { /// /// TODO: Implement backpressure pub struct DecodeBatchScheduler { - pub root_scheduler: Arc, + root_scheduler: RootScheduler, pub root_fields: Fields, cache: Arc, } -/// Represents a series of decoder strategies -/// -/// These strategies will be applied, in order, to determine -/// which decoder to use for a field. -#[derive(Debug, Clone)] -pub struct DecoderMiddlewareChain { - chain: Vec>, -} - -impl Default for DecoderMiddlewareChain { - fn default() -> Self { - Self { - chain: Default::default(), - } - .add_strategy(Arc::new(CoreFieldDecoderStrategy::default())) - } -} - -impl DecoderMiddlewareChain { - /// Creates an empty decoder chain - pub fn new() -> Self { - Self { chain: Vec::new() } - } - - /// Adds a decoder to the end of the chain - pub fn add_strategy(mut self, decoder: Arc) -> Self { - self.chain.push(decoder); - self - } - - /// Obtain a cursor into the chain that can be used to create - /// field schedulers - pub(crate) fn cursor(&self, io: Arc) -> DecoderMiddlewareChainCursor<'_> { - DecoderMiddlewareChainCursor { - chain: self, - io, - cur_idx: 0, - path: VecDeque::new(), - } - } -} - -/// A cursor into a decoder middleware chain -/// -/// Each field scheduler is given a cursor during the create_field_scheduler -/// call. This cursor can be used both to create child field schedulers and -/// to create a scheduler from an inner encoding. 
-pub struct DecoderMiddlewareChainCursor<'a> { - chain: &'a DecoderMiddlewareChain, - io: Arc, - path: VecDeque, - cur_idx: usize, -} - -pub type ChosenFieldScheduler<'a> = ( - DecoderMiddlewareChainCursor<'a>, - Result>, -); - -impl<'a> DecoderMiddlewareChainCursor<'a> { - /// Returns the current path into the field being decoded - pub fn current_path(&self) -> &VecDeque { - &self.path - } - - /// Returns the I/O service which can be used to grab column metadata - pub fn io(&self) -> &Arc { - &self.io - } - - /// Delegates responsibilty to the next encoder in the chain - /// - /// Field schedulers should call this method when: - /// - /// * They do not understand or handle the encoding - /// * They wrap an encoding and want a scheduler for the inner encoding - pub fn next( - mut self, - field: &Field, - column_infos: &mut ColumnInfoIter, - buffers: FileBuffers, - ) -> Result> { - if self.cur_idx >= self.chain.chain.len() { - return Err(Error::invalid_input( - format!( - "The user requested a field {:?} but no decoders were registered to handle it", - field - ), - location!(), - )); - } - let item = &self.chain.chain[self.cur_idx]; - self.cur_idx += 1; - item.create_field_scheduler(field, column_infos, buffers, self) - } - - /// Restarts the decoder chain without creating a new "child" - /// - /// This can be useful, for example, when a field scheduler has - /// an inner scheduler, and the current / parent strategies might - /// apply to the inner scheduler. - /// - /// If the current / parent strategies should not be consulted - /// then call [`Self::next`] instead. - pub fn restart_at_current( - mut self, - field: &Field, - column_infos: &mut ColumnInfoIter, - buffers: FileBuffers, - ) -> Result> { - self.cur_idx = 0; - self.next(field, column_infos, buffers) - } - - /// Restarts the decoder chain for a new "child" field. The main - /// difference between this and [`Self::restart_at_current`] is that - /// this method will modify [`Self::current_path`] - pub fn new_child( - mut self, - child_idx: u32, - field: &Field, - column_infos: &mut ColumnInfoIter, - buffers: FileBuffers, - ) -> Result> { - self.path.push_back(child_idx); - self.cur_idx = 0; - match self.next(field, column_infos, buffers) { - Ok(mut next) => { - next.0.path.pop_back(); - Ok(next) - } - Err(e) => Err(e), - } - } - - /// Starts the decoding process for a field - pub(crate) fn start( - mut self, - field: &Field, - column_infos: &mut ColumnInfoIter, - buffers: FileBuffers, - ) -> Result> { - self.path.clear(); - self.cur_idx = 0; - self.next(field, column_infos, buffers) - } -} - pub struct ColumnInfoIter<'a> { column_infos: Vec>, column_indices: &'a [u32], @@ -568,82 +448,104 @@ impl<'a> ColumnInfoIter<'a> { } } -// A trait that handles the mapping from Arrow schema to field decoders. -// -// Note that the decoders can only be figured out using both the schema AND -// the column metadata. In theory, one could infer the decoder / column type -// using only the column metadata. However, field nullability would be -// missing / incorrect and its also not as easy as it sounds since pages can -// have different encodings and those encodings often have various layers. -// Also, sometimes the inference is just impossible. For example, -// Timestamp, Float64, Int64, and UInt64 will all be encoded as 8-byte value -// encoding. The only way to know the data type is to look at the schema. -// -// We also can't just guess the encoding based on the schema. 
This is because -// there may be multiple different ways to encode a field and it may even -// change on a page-by-page basis. -// -// For example, if a field is a struct field then we expect a header -// column that could have one of a few different encodings. -// -// This could be encoded with "simple struct" and an empty header column -// followed by the shredded child columns. It could be encoded as a nullable -// struct where the nulls are in a dense bitmap. It could even be encoded -// as a packed (row-major) struct where there is only a single column containing -// all of the data! -// -// TODO: Still lots of research to do here in different ways that -// we can map schemas to buffers. -// -// Example: repetition levels - the validity bitmaps for nested -// fields are fatter (more than one bit per row) and contain -// validity information about parent fields (e.g. is this a -// struct-struct-null or struct-null-null or null-null-null?) -// -// Examples: sentinel-shredding - instead of creating a wider -// validity bitmap we assign more sentinels to each column. So -// if the values of an int32 array have a max of 1000 then we can -// use 1001 to mean null int32 and 1002 to mean null parent. -// -// Examples: Sparse structs - the struct column has a validity -// bitmap that must be read if you plan on reading the struct -// or any nested field. However, this could be a compressed -// bitmap stored in metadata. A perk for this approach is that -// the child fields can then have a smaller size than the parent -// field. E.g. if a struct is 1000 rows and 900 of them are -// null then there is one validity bitmap of length 1000 and -// 100 rows of each of the children. -pub trait FieldDecoderStrategy: Send + Sync + std::fmt::Debug { - /// Called to create a field scheduler for a field - /// - /// Stratgies can examine: - /// * The target field - /// * The column metadata (potentially consuming multiple columns) - /// - /// If a strategy does not handle an encoding it should call - /// `chain.next` to delegate to the next strategy in the chain. - /// - /// The actual scheduler creation is asynchronous. This is because - /// the scheduler may need to read column metadata from disk. 
- fn create_field_scheduler<'a>( +pub trait MiniBlockDecompressor: std::fmt::Debug + Send + Sync { + fn decompress(&self, data: LanceBuffer, num_values: u64) -> Result; +} + +pub trait FixedPerValueDecompressor: std::fmt::Debug + Send + Sync { + fn decompress(&self, data: LanceBuffer, num_values: u64) -> Result; + fn bits_per_value(&self) -> u64; +} + +pub trait BlockDecompressor: std::fmt::Debug + Send + Sync { + fn decompress(&self, data: LanceBuffer) -> Result; +} + +pub trait DecompressorStrategy: std::fmt::Debug + Send + Sync { + fn create_miniblock_decompressor( &self, - field: &Field, - column_infos: &mut ColumnInfoIter, - buffers: FileBuffers, - chain: DecoderMiddlewareChainCursor<'a>, - ) -> Result>; + description: &pb::ArrayEncoding, + ) -> Result>; + + fn create_fixed_per_value_decompressor( + &self, + description: &pb::ArrayEncoding, + ) -> Result>; + + fn create_block_decompressor( + &self, + description: &pb::ArrayEncoding, + ) -> Result>; +} + +#[derive(Debug)] +pub struct CoreDecompressorStrategy {} + +impl DecompressorStrategy for CoreDecompressorStrategy { + fn create_miniblock_decompressor( + &self, + description: &pb::ArrayEncoding, + ) -> Result> { + match description.array_encoding.as_ref().unwrap() { + pb::array_encoding::ArrayEncoding::Flat(flat) => { + Ok(Box::new(ValueDecompressor::new(flat))) + } + _ => todo!(), + } + } + + fn create_fixed_per_value_decompressor( + &self, + description: &pb::ArrayEncoding, + ) -> Result> { + match description.array_encoding.as_ref().unwrap() { + pb::array_encoding::ArrayEncoding::Flat(flat) => { + Ok(Box::new(ValueDecompressor::new(flat))) + } + _ => todo!(), + } + } + + fn create_block_decompressor( + &self, + description: &pb::ArrayEncoding, + ) -> Result> { + match description.array_encoding.as_ref().unwrap() { + pb::array_encoding::ArrayEncoding::Flat(flat) => { + Ok(Box::new(ValueDecompressor::new(flat))) + } + pb::array_encoding::ArrayEncoding::Constant(constant) => { + let scalar = LanceBuffer::Owned(constant.value.clone()); + Ok(Box::new(ConstantDecompressor::new( + scalar, + constant.num_values, + ))) + } + _ => todo!(), + } + } } /// The core decoder strategy handles all the various Arrow types -#[derive(Debug, Default)] +#[derive(Debug)] pub struct CoreFieldDecoderStrategy { pub validate_data: bool, + pub decompressor_strategy: Arc, +} + +impl Default for CoreFieldDecoderStrategy { + fn default() -> Self { + Self { + validate_data: false, + decompressor_strategy: Arc::new(CoreDecompressorStrategy {}), + } + } } impl CoreFieldDecoderStrategy { /// This is just a sanity check to ensure there is no "wrapped encodings" /// that haven't been handled. 
- fn ensure_values_encoded(column_info: &ColumnInfo, path: &VecDeque) -> Result<()> { + fn ensure_values_encoded(column_info: &ColumnInfo, field_name: &str) -> Result<()> { let column_encoding = column_info .encoding .column_encoding @@ -663,7 +565,7 @@ impl CoreFieldDecoderStrategy { ) { Ok(()) } else { - Err(Error::invalid_input(format!("the column at index {} mapping to the input field at {:?} has column encoding {:?} and no decoder is registered to handle it", column_info.index, path, column_encoding), location!())) + Err(Error::invalid_input(format!("the column at index {} mapping to the input field {} has column encoding {:?} and no decoder is registered to handle it", column_info.index, field_name, column_encoding), location!())) } } @@ -682,20 +584,19 @@ impl CoreFieldDecoderStrategy { fn create_primitive_scheduler( &self, - data_type: &DataType, - path: &VecDeque, + field: &Field, column: &ColumnInfo, buffers: FileBuffers, - ) -> Result> { - Self::ensure_values_encoded(column, path)?; + ) -> Result> { + Self::ensure_values_encoded(column, &field.name)?; // Primitive fields map to a single column let column_buffers = ColumnBuffers { file_buffers: buffers, positions_and_sizes: &column.buffer_offsets_and_sizes, }; - Ok(Arc::new(PrimitiveFieldScheduler::new( + Ok(Box::new(PrimitiveFieldScheduler::new( column.index, - data_type.clone(), + field.data_type().clone(), column.page_infos.clone(), column_buffers, self.validate_data, @@ -703,8 +604,8 @@ impl CoreFieldDecoderStrategy { } /// Helper method to verify the page encoding of a struct header column - fn check_simple_struct(column_info: &ColumnInfo, path: &VecDeque) -> Result<()> { - Self::ensure_values_encoded(column_info, path)?; + fn check_simple_struct(column_info: &ColumnInfo, field_name: &str) -> Result<()> { + Self::ensure_values_encoded(column_info, field_name)?; if column_info.page_infos.len() != 1 { return Err(Error::InvalidInput { source: format!("Due to schema we expected a struct column but we received a column with {} pages and right now we only support struct columns with 1 page", column_info.page_infos.len()).into(), location: location!() }); } @@ -723,26 +624,20 @@ impl CoreFieldDecoderStrategy { ) } - fn create_list_scheduler<'a>( + fn create_list_scheduler( &self, list_field: &Field, column_infos: &mut ColumnInfoIter, buffers: FileBuffers, offsets_column: &ColumnInfo, - chain: DecoderMiddlewareChainCursor<'a>, - ) -> Result> { - Self::ensure_values_encoded(offsets_column, chain.current_path())?; + ) -> Result> { + Self::ensure_values_encoded(offsets_column, &list_field.name)?; let offsets_column_buffers = ColumnBuffers { file_buffers: buffers, positions_and_sizes: &offsets_column.buffer_offsets_and_sizes, }; - let (chain, items_scheduler) = chain.new_child( - /*child_idx=*/ 0, - &list_field.children[0], - column_infos, - buffers, - )?; - let items_scheduler = items_scheduler?; + let items_scheduler = + self.create_legacy_field_scheduler(&list_field.children[0], column_infos, buffers)?; let (inner_infos, null_offset_adjustments): (Vec<_>, Vec<_>) = offsets_column .page_infos @@ -791,16 +686,13 @@ impl CoreFieldDecoderStrategy { } else { DataType::Int64 }; - Ok(( - chain, - Ok(Arc::new(ListFieldScheduler::new( - inner, - items_scheduler, - items_field, - offset_type, - null_offset_adjustments, - )) as Arc), - )) + Ok(Box::new(ListFieldScheduler::new( + inner, + items_scheduler.into(), + items_field, + offset_type, + null_offset_adjustments, + ))) } fn unwrap_blob(column_info: &ColumnInfo) -> Option { @@ -814,38 
+706,60 @@ impl CoreFieldDecoderStrategy { None } } -} -impl FieldDecoderStrategy for CoreFieldDecoderStrategy { - fn create_field_scheduler<'a>( + fn create_structural_field_scheduler( + &self, + field: &Field, + column_infos: &mut ColumnInfoIter, + ) -> Result> { + let data_type = field.data_type(); + if Self::is_primitive(&data_type) { + let column_info = column_infos.expect_next()?; + let scheduler = Box::new(StructuralPrimitiveFieldScheduler::try_new( + column_info.as_ref(), + self.decompressor_strategy.as_ref(), + )?); + column_infos.next_top_level(); + return Ok(scheduler); + } + match &data_type { + DataType::Struct(fields) => { + let mut child_schedulers = Vec::with_capacity(field.children.len()); + for field in field.children.iter() { + let field_scheduler = + self.create_structural_field_scheduler(field, column_infos)?; + child_schedulers.push(field_scheduler); + } + + let fields = fields.clone(); + Ok( + Box::new(StructuralStructScheduler::new(child_schedulers, fields)) + as Box, + ) + } + _ => todo!(), + } + } + + fn create_legacy_field_scheduler( &self, field: &Field, column_infos: &mut ColumnInfoIter, buffers: FileBuffers, - chain: DecoderMiddlewareChainCursor<'a>, - ) -> Result> { + ) -> Result> { let data_type = field.data_type(); if Self::is_primitive(&data_type) { - let primitive_col = column_infos.expect_next()?; - let scheduler = self.create_primitive_scheduler( - &data_type, - chain.current_path(), - primitive_col, - buffers, - )?; - return Ok((chain, Ok(scheduler))); + let column_info = column_infos.expect_next()?; + let scheduler = self.create_primitive_scheduler(field, column_info, buffers)?; + return Ok(scheduler); } else if data_type.is_binary_like() { let column_info = column_infos.next().unwrap().clone(); // Column is blob and user is asking for binary data if let Some(blob_col) = Self::unwrap_blob(column_info.as_ref()) { - let desc_scheduler = self.create_primitive_scheduler( - BLOB_DESC_FIELD.data_type(), - chain.current_path(), - &blob_col, - buffers, - )?; - let blob_scheduler = Arc::new(BlobFieldScheduler::new(desc_scheduler)); - return Ok((chain, Ok(blob_scheduler))); + let desc_scheduler = + self.create_primitive_scheduler(&BLOB_DESC_LANCE_FIELD, &blob_col, buffers)?; + let blob_scheduler = Box::new(BlobFieldScheduler::new(desc_scheduler.into())); + return Ok(blob_scheduler); } if let Some(page_info) = column_info.page_infos.first() { if matches!( @@ -869,35 +783,24 @@ impl FieldDecoderStrategy for CoreFieldDecoderStrategy { field.nullable, )) .unwrap(); - let (chain, list_scheduler) = self.create_list_scheduler( + let list_scheduler = self.create_list_scheduler( &list_field, column_infos, buffers, &column_info, - chain, )?; - let binary_scheduler = Arc::new(BinaryFieldScheduler::new( - list_scheduler?, + let binary_scheduler = Box::new(BinaryFieldScheduler::new( + list_scheduler.into(), field.data_type().clone(), )); - return Ok((chain, Ok(binary_scheduler))); + return Ok(binary_scheduler); } else { - let scheduler = self.create_primitive_scheduler( - &data_type, - chain.current_path(), - &column_info, - buffers, - )?; - return Ok((chain, Ok(scheduler))); + let scheduler = + self.create_primitive_scheduler(field, &column_info, buffers)?; + return Ok(scheduler); } } else { - let scheduler = self.create_primitive_scheduler( - &data_type, - chain.current_path(), - &column_info, - buffers, - )?; - return Ok((chain, Ok(scheduler))); + return self.create_primitive_scheduler(field, &column_info, buffers); } } match &data_type { @@ -906,13 +809,9 @@ impl 
FieldDecoderStrategy for CoreFieldDecoderStrategy { // depending on the child data type. if Self::is_primitive(inner.data_type()) { let primitive_col = column_infos.expect_next()?; - let scheduler = self.create_primitive_scheduler( - &data_type, - chain.current_path(), - primitive_col, - buffers, - )?; - Ok((chain, Ok(scheduler))) + let scheduler = + self.create_primitive_scheduler(field, primitive_col, buffers)?; + Ok(scheduler) } else { todo!() } @@ -920,13 +819,9 @@ impl FieldDecoderStrategy for CoreFieldDecoderStrategy { DataType::Dictionary(_key_type, value_type) => { if Self::is_primitive(value_type) || value_type.is_binary_like() { let primitive_col = column_infos.expect_next()?; - let scheduler = self.create_primitive_scheduler( - &data_type, - chain.current_path(), - primitive_col, - buffers, - )?; - Ok((chain, Ok(scheduler))) + let scheduler = + self.create_primitive_scheduler(field, primitive_col, buffers)?; + Ok(scheduler) } else { Err(Error::NotSupported { source: format!( @@ -941,7 +836,7 @@ impl FieldDecoderStrategy for CoreFieldDecoderStrategy { DataType::List(_) | DataType::LargeList(_) => { let offsets_column = column_infos.expect_next()?.clone(); column_infos.next_top_level(); - self.create_list_scheduler(field, column_infos, buffers, &offsets_column, chain) + self.create_list_scheduler(field, column_infos, buffers, &offsets_column) } DataType::Struct(fields) => { let column_info = column_infos.expect_next()?; @@ -949,52 +844,32 @@ impl FieldDecoderStrategy for CoreFieldDecoderStrategy { // Column is blob and user is asking for descriptions if let Some(blob_col) = Self::unwrap_blob(column_info.as_ref()) { // Can use primitive scheduler here since descriptions are always packed struct - let desc_scheduler = self.create_primitive_scheduler( - &data_type, - chain.current_path(), - &blob_col, - buffers, - )?; - return Ok((chain, Ok(desc_scheduler))); + return self.create_primitive_scheduler(field, &blob_col, buffers); } if Self::check_packed_struct(column_info) { // use packed struct encoding - let scheduler = self.create_primitive_scheduler( - &data_type, - chain.current_path(), - column_info, - buffers, - )?; - Ok((chain, Ok(scheduler))) + self.create_primitive_scheduler(field, column_info, buffers) } else { // use default struct encoding - Self::check_simple_struct(column_info, chain.current_path()).unwrap(); + Self::check_simple_struct(column_info, &field.name).unwrap(); let mut child_schedulers = Vec::with_capacity(field.children.len()); - let mut chain = chain; - for (i, field) in field.children.iter().enumerate() { + for field in &field.children { column_infos.next_top_level(); - let (next_chain, field_scheduler) = - chain.new_child(i as u32, field, column_infos, buffers)?; - child_schedulers.push(field_scheduler?); - chain = next_chain; + let field_scheduler = + self.create_legacy_field_scheduler(field, column_infos, buffers)?; + child_schedulers.push(Arc::from(field_scheduler)); } let fields = fields.clone(); - let struct_scheduler = Ok(Arc::new(SimpleStructScheduler::new( + Ok(Box::new(SimpleStructScheduler::new( child_schedulers, fields, - )) as Arc); - - // For now, we don't record nullability for structs. As a result, there is always - // only one "page" of struct data. In the future, this will change. A null-aware - // struct scheduler will need to first calculate how many rows are in the struct page - // and then find the child pages that overlap. This should be doable. 
- Ok((chain, struct_scheduler)) + ))) } } - // TODO: Still need support for dictionary / RLE - _ => chain.next(field, column_infos, buffers), + // TODO: Still need support for RLE + _ => todo!(), } } } @@ -1027,6 +902,27 @@ fn root_column(num_rows: u64) -> ColumnInfo { } } +pub enum RootDecoder { + Structural(StructuralStructDecoder), + Legacy(SimpleStructDecoder), +} + +impl RootDecoder { + pub fn into_structural(self) -> StructuralStructDecoder { + match self { + Self::Structural(decoder) => decoder, + Self::Legacy(_) => panic!("Expected a structural decoder"), + } + } + + pub fn into_legacy(self) -> SimpleStructDecoder { + match self { + Self::Legacy(decoder) => decoder, + Self::Structural(_) => panic!("Expected a legacy decoder"), + } + } +} + impl DecodeBatchScheduler { /// Creates a new decode scheduler with the expected schema and the column /// metadata of the file. @@ -1037,7 +933,7 @@ impl DecodeBatchScheduler { column_infos: &[Arc], file_buffer_positions_and_sizes: &'a Vec<(u64, u64)>, num_rows: u64, - decoder_strategy: Arc, + _decoder_plugins: Arc, io: Arc, cache: Arc, filter: &FilterExpression, @@ -1048,14 +944,6 @@ impl DecodeBatchScheduler { }; let arrow_schema = ArrowSchema::from(schema); let root_fields = arrow_schema.fields().clone(); - let mut columns = Vec::with_capacity(column_infos.len() + 1); - columns.push(Arc::new(root_column(num_rows))); - columns.extend(column_infos.iter().cloned()); - let adjusted_column_indices = [0_u32] - .into_iter() - .chain(column_indices.iter().map(|i| i.saturating_add(1))) - .collect::>(); - let mut column_iter = ColumnInfoIter::new(columns, &adjusted_column_indices); let root_type = DataType::Struct(root_fields.clone()); let mut root_field = Field::try_from(&ArrowField::new("root", root_type, false))?; // root_field.children and schema.fields should be identical at this point but the latter @@ -1065,20 +953,45 @@ impl DecodeBatchScheduler { root_field .metadata .insert("__lance_decoder_root".to_string(), "true".to_string()); - let (_, root_scheduler) = - decoder_strategy - .cursor(io.clone()) - .start(&root_field, &mut column_iter, buffers)?; - let root_scheduler = root_scheduler?; - let context = SchedulerContext::new(io, cache.clone()); - root_scheduler.initialize(filter, &context).await?; + if column_infos[0].is_structural() { + let mut column_iter = ColumnInfoIter::new(column_infos.to_vec(), column_indices); - Ok(Self { - root_scheduler, - root_fields, - cache, - }) + let mut root_scheduler = CoreFieldDecoderStrategy::default() + .create_structural_field_scheduler(&root_field, &mut column_iter)?; + + let context = SchedulerContext::new(io, cache.clone()); + root_scheduler.initialize(filter, &context).await?; + + Ok(Self { + root_scheduler: RootScheduler::Structural(root_scheduler), + root_fields, + cache, + }) + } else { + // The old encoding style expected a header column for structs and so we + // need a header column for the top-level struct + let mut columns = Vec::with_capacity(column_infos.len() + 1); + columns.push(Arc::new(root_column(num_rows))); + columns.extend(column_infos.iter().cloned()); + + let adjusted_column_indices = [0_u32] + .into_iter() + .chain(column_indices.iter().map(|i| i.saturating_add(1))) + .collect::>(); + let mut column_iter = ColumnInfoIter::new(columns, &adjusted_column_indices); + let root_scheduler = CoreFieldDecoderStrategy::default() + .create_legacy_field_scheduler(&root_field, &mut column_iter, buffers)?; + + let context = SchedulerContext::new(io, cache.clone()); + 
root_scheduler.initialize(filter, &context).await?; + + Ok(Self { + root_scheduler: RootScheduler::Legacy(root_scheduler.into()), + root_fields, + cache, + }) + } } pub fn from_scheduler( @@ -1087,13 +1000,57 @@ impl DecodeBatchScheduler { cache: Arc, ) -> Self { Self { - root_scheduler, + root_scheduler: RootScheduler::Legacy(root_scheduler), root_fields, cache, } } - fn do_schedule_ranges( + fn do_schedule_ranges_structural( + &mut self, + ranges: &[Range], + filter: &FilterExpression, + io: Arc, + mut schedule_action: impl FnMut(Result) -> bool, + ) { + let root_scheduler = self.root_scheduler.as_structural(); + let mut context = SchedulerContext::new(io, self.cache.clone()); + let maybe_root_job = root_scheduler.schedule_ranges(ranges, filter); + if let Err(schedule_ranges_err) = maybe_root_job { + schedule_action(Err(schedule_ranges_err)); + return; + } + let mut root_job = maybe_root_job.unwrap(); + let mut num_rows_scheduled = 0; + loop { + let maybe_next_scan_line = root_job.schedule_next(&mut context); + if let Err(err) = maybe_next_scan_line { + schedule_action(Err(err)); + return; + } + let next_scan_line = maybe_next_scan_line.unwrap(); + match next_scan_line { + Some(next_scan_line) => { + trace!( + "Scheduled scan line of {} rows and {} decoders", + next_scan_line.rows_scheduled, + next_scan_line.decoders.len() + ); + num_rows_scheduled += next_scan_line.rows_scheduled; + if !schedule_action(Ok(DecoderMessage { + scheduled_so_far: num_rows_scheduled, + decoders: next_scan_line.decoders, + })) { + // Decoder has disconnected + return; + } + } + None => return, + } + } + } + + fn do_schedule_ranges_legacy( &mut self, ranges: &[Range], filter: &FilterExpression, @@ -1104,6 +1061,7 @@ impl DecodeBatchScheduler { // tasks are scheduled at the same top level row. priority: Option>, ) { + let root_scheduler = self.root_scheduler.as_legacy(); let rows_requested = ranges.iter().map(|r| r.end - r.start).sum::(); trace!( "Scheduling {} ranges across {}..{} ({} rows){}", @@ -1118,7 +1076,7 @@ impl DecodeBatchScheduler { ); let mut context = SchedulerContext::new(io, self.cache.clone()); - let maybe_root_job = self.root_scheduler.schedule_ranges(ranges, filter); + let maybe_root_job = root_scheduler.schedule_ranges(ranges, filter); if let Err(schedule_ranges_err) = maybe_root_job { schedule_action(Err(schedule_ranges_err)); return; @@ -1150,9 +1108,30 @@ impl DecodeBatchScheduler { // Decoder has disconnected return; } + + trace!("Finished scheduling {} ranges", ranges.len()); } + } - trace!("Finished scheduling {} ranges", ranges.len()); + fn do_schedule_ranges( + &mut self, + ranges: &[Range], + filter: &FilterExpression, + io: Arc, + schedule_action: impl FnMut(Result) -> bool, + // If specified, this will be used as the top_level_row for all scheduling + // tasks. This is used by list scheduling to ensure all items scheduling + // tasks are scheduled at the same top level row. 
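+        // (Only the legacy path consumes this priority; the structural path ignores it.)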
+ priority: Option>, + ) { + match &self.root_scheduler { + RootScheduler::Legacy(_) => { + self.do_schedule_ranges_legacy(ranges, filter, io, schedule_action, priority) + } + RootScheduler::Structural(_) => { + self.do_schedule_ranges_structural(ranges, filter, io, schedule_action) + } + } } // This method is similar to schedule_ranges but instead of @@ -1259,18 +1238,6 @@ impl DecodeBatchScheduler { .collect::>(); self.schedule_ranges(&ranges, filter, sink, scheduler) } - - pub fn new_root_decoder_ranges(&self, ranges: &[Range]) -> SimpleStructDecoder { - let rows_to_read = ranges - .iter() - .map(|range| range.end - range.start) - .sum::(); - SimpleStructDecoder::new(self.root_fields.clone(), rows_to_read) - } - - pub fn new_root_decoder_indices(&self, indices: &[u64]) -> SimpleStructDecoder { - SimpleStructDecoder::new(self.root_fields.clone(), indices.len() as u64) - } } pub struct ReadBatchTask { @@ -1338,8 +1305,8 @@ impl BatchDecodeStream { Some(scan_line) => { let scan_line = scan_line?; self.rows_scheduled = scan_line.scheduled_so_far; - for decoder in scan_line.decoders { - self.accept_decoder(decoder)?; + for message in scan_line.decoders { + self.accept_decoder(message.into_legacy())?; } } None => { @@ -1456,6 +1423,174 @@ impl BatchDecodeStream { } } +/// A stream that takes scheduled jobs and generates decode tasks from them. +pub struct StructuralBatchDecodeStream { + context: DecoderContext, + root_decoder: StructuralStructDecoder, + rows_remaining: u64, + rows_per_batch: u32, + rows_scheduled: u64, + rows_drained: u64, + scheduler_exhuasted: bool, + emitted_batch_size_warning: Arc, +} + +impl StructuralBatchDecodeStream { + /// Create a new instance of a batch decode stream + /// + /// # Arguments + /// + /// * `scheduled` - an incoming stream of decode tasks from a + /// [`crate::decode::DecodeBatchScheduler`] + /// * `schema` - the scheam of the data to create + /// * `rows_per_batch` the number of rows to create before making a batch + /// * `num_rows` the total number of rows scheduled + /// * `num_columns` the total number of columns in the file + pub fn new( + scheduled: mpsc::UnboundedReceiver>, + rows_per_batch: u32, + num_rows: u64, + root_decoder: StructuralStructDecoder, + ) -> Self { + Self { + context: DecoderContext::new(scheduled), + root_decoder, + rows_remaining: num_rows, + rows_per_batch, + rows_scheduled: 0, + rows_drained: 0, + scheduler_exhuasted: false, + emitted_batch_size_warning: Arc::new(Once::new()), + } + } + + async fn wait_for_scheduled(&mut self, scheduled_need: u64) -> Result { + if self.scheduler_exhuasted { + return Ok(self.rows_scheduled); + } + while self.rows_scheduled < scheduled_need { + let next_message = self.context.source.recv().await; + match next_message { + Some(scan_line) => { + let scan_line = scan_line?; + self.rows_scheduled = scan_line.scheduled_so_far; + for message in scan_line.decoders { + let unloaded_page = message.into_structural(); + let loaded_page = unloaded_page.0.await?; + self.root_decoder.accept_page(loaded_page)?; + } + } + None => { + // Schedule ended before we got all the data we expected. This probably + // means some kind of pushdown filter was applied and we didn't load as + // much data as we thought we would. 
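+                    // Mark the scheduler exhausted so later waits return what we have
+                    // instead of blocking on a closed channel.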
+ self.scheduler_exhausted = true; + return Ok(self.rows_scheduled); + } + } + } + Ok(scheduled_need) + } + + #[instrument(level = "debug", skip_all)] + async fn next_batch_task(&mut self) -> Result> { + trace!( + "Draining batch task (rows_remaining={} rows_drained={} rows_scheduled={})", + self.rows_remaining, + self.rows_drained, + self.rows_scheduled, + ); + if self.rows_remaining == 0 { + return Ok(None); + } + + let mut to_take = self.rows_remaining.min(self.rows_per_batch as u64); + self.rows_remaining -= to_take; + + let scheduled_need = (self.rows_drained + to_take).saturating_sub(self.rows_scheduled); + trace!("scheduled_need = {} because rows_drained = {} and to_take = {} and rows_scheduled = {}", scheduled_need, self.rows_drained, to_take, self.rows_scheduled); + if scheduled_need > 0 { + let desired_scheduled = scheduled_need + self.rows_scheduled; + trace!( + "Draining from scheduler (desire at least {} scheduled rows)", + desired_scheduled + ); + let actually_scheduled = self.wait_for_scheduled(desired_scheduled).await?; + if actually_scheduled < desired_scheduled { + let under_scheduled = desired_scheduled - actually_scheduled; + to_take -= under_scheduled; + } + } + + if to_take == 0 { + return Ok(None); + } + + let next_task = self.root_decoder.drain(to_take)?; + let next_task = NextDecodeTask { + has_more: self.rows_remaining > 0, + num_rows: to_take, + task: Box::new(next_task), + }; + self.rows_drained += to_take; + Ok(Some(next_task)) + } + + #[instrument(level = "debug", skip_all)] + fn task_to_batch( + task: NextDecodeTask, + emitted_batch_size_warning: Arc, + ) -> Result { + let struct_arr = task.task.decode(); + match struct_arr { + Ok(struct_arr) => { + let batch = RecordBatch::from(struct_arr.as_struct()); + let size_bytes = batch.get_array_memory_size() as u64; + if size_bytes > BATCH_SIZE_BYTES_WARNING { + emitted_batch_size_warning.call_once(|| { + let size_mb = size_bytes / 1024 / 1024; + debug!("Lance read in a single batch that contained more than {}MiB of data.
You may want to consider reducing the batch size.", size_mb); + }); + } + Ok(batch) + } + Err(e) => { + let e = Error::Internal { + message: format!("Error decoding batch: {}", e), + location: location!(), + }; + Err(e) + } + } + } + + pub fn into_stream(self) -> BoxStream<'static, ReadBatchTask> { + let stream = futures::stream::unfold(self, |mut slf| async move { + let next_task = slf.next_batch_task().await; + let next_task = next_task.transpose().map(|next_task| { + let num_rows = next_task.as_ref().map(|t| t.num_rows).unwrap_or(0); + let emitted_batch_size_warning = slf.emitted_batch_size_warning.clone(); + let task = tokio::spawn(async move { + let next_task = next_task?; + Self::task_to_batch(next_task, emitted_batch_size_warning) + }); + (task, num_rows) + }); + next_task.map(|(task, num_rows)| { + let task = task.map(|join_wrapper| join_wrapper.unwrap()).boxed(); + // This should be true since batch size is u32 + debug_assert!(num_rows <= u32::MAX as u64); + let next_task = ReadBatchTask { + task, + num_rows: num_rows as u32, + }; + (next_task, slf) + }) + }); + stream.boxed() + } +} + #[derive(Debug)] pub enum RequestedRows { Ranges(Vec>), @@ -1473,10 +1608,11 @@ impl RequestedRows { #[derive(Debug, Clone)] pub struct SchedulerDecoderConfig { - pub decoder_strategy: Arc, + pub decoder_plugins: Arc, pub batch_size: u32, pub io: Arc, pub cache: Arc, + pub should_validate: bool, } fn check_scheduler_on_drop( @@ -1499,6 +1635,28 @@ fn check_scheduler_on_drop( stream.chain(check_scheduler).boxed() } +pub fn create_decode_stream( + schema: &Schema, + num_rows: u64, + batch_size: u32, + is_structural: bool, + should_validate: bool, + rx: mpsc::UnboundedReceiver>, +) -> BoxStream<'static, ReadBatchTask> { + if is_structural { + let arrow_schema = ArrowSchema::from(schema); + let structural_decoder = + StructuralStructDecoder::new(arrow_schema.fields.clone(), should_validate); + StructuralBatchDecodeStream::new(rx, batch_size, num_rows, structural_decoder).into_stream() + } else { + let arrow_schema = ArrowSchema::from(schema); + let root_fields = arrow_schema.fields.clone(); + + let simple_struct_decoder = SimpleStructDecoder::new(root_fields, num_rows); + BatchDecodeStream::new(rx, batch_size, num_rows, simple_struct_decoder).into_stream() + } +} + async fn create_scheduler_decoder( column_infos: Vec>, requested_rows: RequestedRows, @@ -1515,20 +1673,26 @@ async fn create_scheduler_decoder( &column_infos, &vec![], num_rows, - config.decoder_strategy, + config.decoder_plugins, config.io.clone(), config.cache, &filter, ) .await?; - let root_decoder = match &requested_rows { - RequestedRows::Ranges(ranges) => decode_scheduler.new_root_decoder_ranges(ranges), - RequestedRows::Indices(indices) => decode_scheduler.new_root_decoder_indices(indices), - }; + let is_structural = column_infos[0].is_structural(); let (tx, rx) = mpsc::unbounded_channel(); + let decode_stream = create_decode_stream( + &target_schema, + num_rows, + config.batch_size, + is_structural, + config.should_validate, + rx, + ); + let io = config.io; let scheduler_handle = tokio::task::spawn(async move { match requested_rows { @@ -1541,9 +1705,6 @@ async fn create_scheduler_decoder( } }); - let decode_stream = - BatchDecodeStream::new(rx, config.batch_size, num_rows, root_decoder).into_stream(); - Ok(check_scheduler_on_drop(decode_stream, scheduler_handle)) } @@ -1829,6 +1990,10 @@ impl SchedulerContext { } } + pub fn current_path(&self) -> VecDeque { + VecDeque::from_iter(self.path.iter().copied()) + } + pub fn 
locate_decoder(&mut self, decoder: Box) -> DecoderReady { trace!( "Scheduling decoder of type {:?} for {:?}", @@ -1837,15 +2002,23 @@ impl SchedulerContext { ); DecoderReady { decoder, - path: VecDeque::from_iter(self.path.iter().copied()), + path: self.current_path(), } } } +pub struct UnloadedPage(pub BoxFuture<'static, Result>); + +impl std::fmt::Debug for UnloadedPage { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + f.debug_struct("UnloadedPage").finish() + } +} + #[derive(Debug)] pub struct ScheduledScanLine { pub rows_scheduled: u64, - pub decoders: Vec, + pub decoders: Vec, } pub trait SchedulingJob: std::fmt::Debug { @@ -1858,6 +2031,13 @@ pub trait SchedulingJob: std::fmt::Debug { fn num_rows(&self) -> u64; } +pub trait StructuralSchedulingJob: std::fmt::Debug { + fn schedule_next( + &mut self, + context: &mut SchedulerContext, + ) -> Result>; +} + /// A filter expression to apply to the data /// /// The core decoders do not currently take advantage of filtering in @@ -1926,12 +2106,31 @@ pub trait FieldScheduler: Send + Sync + std::fmt::Debug { fn num_rows(&self) -> u64; } +pub trait StructuralFieldScheduler: Send + std::fmt::Debug { + fn initialize<'a>( + &'a mut self, + filter: &'a FilterExpression, + context: &'a SchedulerContext, + ) -> BoxFuture<'a, Result<()>>; + fn schedule_ranges<'a>( + &'a self, + ranges: &[Range], + filter: &FilterExpression, + ) -> Result>; +} + /// A trait for tasks that decode data into an Arrow array pub trait DecodeArrayTask: Send { /// Decodes the data into an Arrow array fn decode(self: Box) -> Result; } +impl DecodeArrayTask for Box { + fn decode(self: Box) -> Result { + StructuralDecodeArrayTask::decode(*self).map(|decoded_array| decoded_array.array) + } +} + /// A task to decode data into an Arrow array pub struct NextDecodeTask { /// The decode task itself @@ -1967,9 +2166,45 @@ pub struct DecoderReady { pub path: VecDeque, } +// An envelope to wrap both 2.0 style messages and 2.1 style messages so we can +// share some code paths between the two. Decoders can safely unwrap into whatever +// style they expect since a file will be either all-2.0 or all-2.1 +#[derive(Debug)] +pub enum MessageType { + // The older v2.0 scheduler/decoder used a scheme where the message was the + // decoder itself. The messages were not sent in priority order and the decoder + // had to wait for I/O, figuring out the correct priority. This was a lot of + // complexity. + DecoderReady(DecoderReady), + // Starting in 2.1 we use a simpler scheme where the scheduling happens in priority + // order and the message is an unloaded decoder. These can be awaited, in order, and + // the decoder does not have to worry about waiting for I/O. 
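
A minimal sketch, not part of this diff, of what the 2.1 consumption loop looks like under this scheme: the consumer simply awaits each UnloadedPage future in the order the messages arrive. It assumes the DecoderMessage, MessageType, StructuralStructDecoder, and Result types from this crate are in scope, and mirrors what StructuralBatchDecodeStream::wait_for_scheduled does above.

use tokio::sync::mpsc;

async fn drain_structural(
    rx: &mut mpsc::UnboundedReceiver<Result<DecoderMessage>>,
    root_decoder: &mut StructuralStructDecoder,
) -> Result<u64> {
    let mut rows_scheduled = 0;
    while let Some(message) = rx.recv().await {
        let message = message?;
        rows_scheduled = message.scheduled_so_far;
        for decoder in message.decoders {
            // A 2.1 file only ever carries UnloadedPage messages, so unwrapping
            // the envelope is safe; awaiting the future performs the I/O wait
            // that 2.0 decoders had to manage internally.
            let loaded = decoder.into_structural().0.await?;
            root_decoder.accept_page(loaded)?;
        }
    }
    Ok(rows_scheduled)
}
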
+ UnloadedPage(UnloadedPage), +} + +impl MessageType { + pub fn into_legacy(self) -> DecoderReady { + match self { + Self::DecoderReady(decoder) => decoder, + Self::UnloadedPage(_) => { + panic!("Expected DecoderReady but got UnloadedPage") + } + } + } + + pub fn into_structural(self) -> UnloadedPage { + match self { + Self::UnloadedPage(unloaded) => unloaded, + Self::DecoderReady(_) => { + panic!("Expected UnloadedPage but got DecoderReady") + } + } + } +} + pub struct DecoderMessage { pub scheduled_so_far: u64, - pub decoders: Vec, + pub decoders: Vec, } pub struct DecoderContext { @@ -2026,11 +2261,78 @@ pub trait LogicalPageDecoder: std::fmt::Debug + Send { fn data_type(&self) -> &DataType; } +pub struct DecodedPage { + pub data: DataBlock, + pub repetition: Option, + pub definition: Option, +} + +pub trait DecodePageTask: Send + std::fmt::Debug { + /// Decodes the data into an Arrow array + fn decode(self: Box) -> Result; +} + +pub trait StructuralPageDecoder: std::fmt::Debug + Send { + fn drain(&mut self, num_rows: u64) -> Result>; + fn num_rows(&self) -> u64; +} + +#[derive(Debug)] +pub struct LoadedPage { + // The decoder that is ready to be decoded + pub decoder: Box, + // The path to the decoder, the first value is the column index + // following values, if present, are nested child indices + // + // For example, a path of [1, 1, 0] would mean to grab the second + // column, then the second child, and then the first child. + // + // It could represent x in the following schema: + // + // score: float64 + // points: struct + // color: string + // location: struct + // x: float64 + // + // Currently, only struct decoders have "children" although other + // decoders may at some point as well. List children are only + // handled through indirect I/O at the moment and so they don't + // need to be represented (yet) + pub path: VecDeque, + pub page_index: usize, +} + +pub struct DecodedArray { + pub array: ArrayRef, + pub repdef: RepDefUnraveler, +} + +pub trait StructuralDecodeArrayTask: std::fmt::Debug + Send { + fn decode(self: Box) -> Result; +} + +pub trait StructuralFieldDecoder: std::fmt::Debug + Send { + /// Add a newly scheduled child decoder + /// + /// The default implementation does not expect children and returns + /// an error. 
+ fn accept_page(&mut self, _child: LoadedPage) -> Result<()>; + /// Creates a task to decode `num_rows` of data into an array + fn drain(&mut self, num_rows: u64) -> Result>; + /// The data type of the decoded data + fn data_type(&self) -> &DataType; +} + +#[derive(Debug, Default)] +pub struct DecoderPlugins {} + /// Decodes a batch of data from an in-memory structure created by [`crate::encoder::encode_batch`] pub async fn decode_batch( batch: &EncodedBatch, filter: &FilterExpression, - field_decoder_strategy: Arc, + decoder_plugins: Arc, + should_validate: bool, ) -> Result { // The io is synchronous so it shouldn't be possible for any async stuff to still be in progress // Still, if we just use now_or_never we hit misfires because some futures (channels) need to be @@ -2047,7 +2349,7 @@ pub async fn decode_batch( &batch.page_table, &vec![], batch.num_rows, - field_decoder_strategy, + decoder_plugins, io_scheduler.clone(), cache, filter, @@ -2055,8 +2357,14 @@ .await?; let (tx, rx) = unbounded_channel(); decode_scheduler.schedule_range(0..batch.num_rows, filter, tx, io_scheduler); - #[allow(clippy::single_range_in_vec_init)] - let root_decoder = decode_scheduler.new_root_decoder_ranges(&[0..batch.num_rows]); - let stream = BatchDecodeStream::new(rx, batch.num_rows as u32, batch.num_rows, root_decoder); - stream.into_stream().next().await.unwrap().task.await + let is_structural = false; + let mut decode_stream = create_decode_stream( + &batch.schema, + batch.num_rows, + batch.num_rows as u32, + is_structural, + should_validate, + rx, + ); + decode_stream.next().await.unwrap().task.await } diff --git a/rust/lance-encoding/src/encoder.rs b/rust/lance-encoding/src/encoder.rs index 4351c47a98..5343f353dd 100644 --- a/rust/lance-encoding/src/encoder.rs +++ b/rust/lance-encoding/src/encoder.rs @@ -130,7 +130,7 @@ pub trait ArrayEncoder: std::fmt::Debug + Send + Sync { ) -> Result; } -pub const MAX_MINIBLOCK_BYTES: u64 = 8 * 1024 - 4; +pub const MAX_MINIBLOCK_BYTES: u64 = 8 * 1024 - 6; pub const MAX_MINIBLOCK_VALUES: u64 = 4096; /// Page data that has been compressed into a series of chunks put into @@ -156,7 +156,7 @@ pub struct MiniBlockCompressed { pub struct MiniBlockChunk { // The number of bytes that make up the chunk // - // This value must be less than or equal to 8Ki - 4 (8188) + // This value must be less than or equal to 8Ki - 6 (8186) pub num_bytes: u16, // The log (base 2) of the number of values in the chunk.
If this is the final chunk // then this should be 0 (the number of values will be calculated by subtracting the diff --git a/rust/lance-encoding/src/encodings/logical/binary.rs b/rust/lance-encoding/src/encodings/logical/binary.rs index 1f6cc187b2..1791f31b15 100644 --- a/rust/lance-encoding/src/encodings/logical/binary.rs +++ b/rust/lance-encoding/src/encodings/logical/binary.rs @@ -16,7 +16,7 @@ use log::trace; use crate::decoder::{ DecodeArrayTask, DecoderReady, FieldScheduler, FilterExpression, LogicalPageDecoder, - NextDecodeTask, PriorityRange, ScheduledScanLine, SchedulerContext, SchedulingJob, + MessageType, NextDecodeTask, PriorityRange, ScheduledScanLine, SchedulerContext, SchedulingJob, }; /// Wraps a varbin scheduler and uses a BinaryPageDecoder to cast @@ -37,12 +37,15 @@ impl<'a> SchedulingJob for BinarySchedulingJob<'a> { let wrapped_decoders = inner_scan .decoders .into_iter() - .map(|decoder| DecoderReady { - path: decoder.path, - decoder: Box::new(BinaryPageDecoder { - inner: decoder.decoder, - data_type: self.scheduler.data_type.clone(), - }), + .map(|message| { + let decoder = message.into_legacy(); + MessageType::DecoderReady(DecoderReady { + path: decoder.path, + decoder: Box::new(BinaryPageDecoder { + inner: decoder.decoder, + data_type: self.scheduler.data_type.clone(), + }), + }) }) .collect::>(); Ok(ScheduledScanLine { diff --git a/rust/lance-encoding/src/encodings/logical/blob.rs b/rust/lance-encoding/src/encodings/logical/blob.rs index 36b8b6881d..91f161590b 100644 --- a/rust/lance-encoding/src/encodings/logical/blob.rs +++ b/rust/lance-encoding/src/encodings/logical/blob.rs @@ -19,7 +19,8 @@ use crate::{ buffer::LanceBuffer, decoder::{ DecodeArrayTask, DecoderReady, FieldScheduler, FilterExpression, LogicalPageDecoder, - NextDecodeTask, PriorityRange, ScheduledScanLine, SchedulerContext, SchedulingJob, + MessageType, NextDecodeTask, PriorityRange, ScheduledScanLine, SchedulerContext, + SchedulingJob, }, encoder::{EncodeTask, FieldEncoder, OutOfLineBuffers}, format::pb::{column_encoding, Blob, ColumnEncoding}, @@ -65,6 +66,7 @@ impl<'a> SchedulingJob for BlobFieldSchedulingJob<'a> { let next_descriptions = self.descriptions_job.schedule_next(context, priority)?; let mut priority = priority.current_priority(); let decoders = next_descriptions.decoders.into_iter().map(|decoder| { + let decoder = decoder.into_legacy(); let path = decoder.path; let mut decoder = decoder.decoder; let num_rows = decoder.num_rows(); @@ -90,7 +92,7 @@ impl<'a> SchedulingJob for BlobFieldSchedulingJob<'a> { base_priority: priority, }); priority += num_rows; - DecoderReady { decoder, path } + MessageType::DecoderReady(DecoderReady { decoder, path }) }); Ok(ScheduledScanLine { decoders: decoders.collect(), @@ -416,6 +418,7 @@ pub mod tests { use crate::{ format::pb::column_encoding, testing::{check_round_trip_encoding_of_data, check_round_trip_encoding_random, TestCases}, + version::LanceFileVersion, }; lazy_static::lazy_static! 
{ @@ -429,7 +432,7 @@ pub mod tests { #[test_log::test(tokio::test)] async fn test_blob() { let field = Field::new("", DataType::LargeBinary, false).with_metadata(BLOB_META.clone()); - check_round_trip_encoding_random(field).await; + check_round_trip_encoding_random(field, LanceFileVersion::V2_0).await; } #[test_log::test(tokio::test)] diff --git a/rust/lance-encoding/src/encodings/logical/list.rs b/rust/lance-encoding/src/encodings/logical/list.rs index 54f94693a6..a5c32aeff4 100644 --- a/rust/lance-encoding/src/encodings/logical/list.rs +++ b/rust/lance-encoding/src/encodings/logical/list.rs @@ -23,8 +23,8 @@ use crate::{ data::{BlockInfo, DataBlock, FixedWidthDataBlock, UsedEncoding}, decoder::{ DecodeArrayTask, DecodeBatchScheduler, FieldScheduler, FilterExpression, ListPriorityRange, - LogicalPageDecoder, NextDecodeTask, PageEncoding, PriorityRange, ScheduledScanLine, - SchedulerContext, SchedulingJob, + LogicalPageDecoder, MessageType, NextDecodeTask, PageEncoding, PriorityRange, + ScheduledScanLine, SchedulerContext, SchedulingJob, }, encoder::{ ArrayEncoder, EncodeTask, EncodedArray, EncodedColumn, EncodedPage, FieldEncoder, @@ -357,14 +357,18 @@ async fn indirect_schedule_task( }); } let item_ranges = item_ranges.into_iter().collect::>(); + let num_items = item_ranges.iter().map(|r| r.end - r.start).sum::(); // Create a new root scheduler, which has one column, which is our items data let root_fields = Fields::from(vec![Field::new("item", items_type, true)]); let indirect_root_scheduler = SimpleStructScheduler::new(vec![items_scheduler], root_fields.clone()); - let mut indirect_scheduler = - DecodeBatchScheduler::from_scheduler(Arc::new(indirect_root_scheduler), root_fields, cache); - let mut root_decoder = indirect_scheduler.new_root_decoder_ranges(&item_ranges); + let mut indirect_scheduler = DecodeBatchScheduler::from_scheduler( + Arc::new(indirect_root_scheduler), + root_fields.clone(), + cache, + ); + let mut root_decoder = SimpleStructDecoder::new(root_fields, num_items); let priority = Box::new(ListPriorityRange::new(priority, offsets.clone())); @@ -378,6 +382,7 @@ async fn indirect_schedule_task( for message in indirect_messages { for decoder in message.decoders { + let decoder = decoder.into_legacy(); if !decoder.path.is_empty() { root_decoder.accept_child(decoder)?; } @@ -424,7 +429,7 @@ impl<'a> SchedulingJob for ListFieldSchedulingJob<'a> { &mut self, context: &mut SchedulerContext, priority: &dyn PriorityRange, - ) -> Result { + ) -> Result { let next_offsets = self.offsets.schedule_next(context, priority)?; let offsets_scheduled = next_offsets.rows_scheduled; let list_reqs = self.list_requests_iter.next(offsets_scheduled); @@ -441,7 +446,13 @@ impl<'a> SchedulingJob for ListFieldSchedulingJob<'a> { .all(|req| req.null_offset_adjustment == null_offset_adjustment)); let num_rows = list_reqs.iter().map(|req| req.num_lists).sum::(); // offsets is a uint64 which is guaranteed to create one decoder on each call to schedule_next - let next_offsets_decoder = next_offsets.decoders.into_iter().next().unwrap().decoder; + let next_offsets_decoder = next_offsets + .decoders + .into_iter() + .next() + .unwrap() + .into_legacy() + .decoder; let items_scheduler = self.scheduler.items_scheduler.clone(); let items_type = self.scheduler.items_field.data_type().clone(); @@ -475,7 +486,7 @@ impl<'a> SchedulingJob for ListFieldSchedulingJob<'a> { }); let decoder = context.locate_decoder(decoder); Ok(ScheduledScanLine { - decoders: vec![decoder], + decoders: 
vec![MessageType::DecoderReady(decoder)], rows_scheduled: num_rows, }) } @@ -1252,8 +1263,9 @@ mod tests { use arrow_buffer::{OffsetBuffer, ScalarBuffer}; use arrow_schema::{DataType, Field, Fields}; - use crate::testing::{ - check_round_trip_encoding_of_data, check_round_trip_encoding_random, TestCases, + use crate::{ + testing::{check_round_trip_encoding_of_data, check_round_trip_encoding_random, TestCases}, + version::LanceFileVersion, }; fn make_list_type(inner_type: DataType) -> DataType { @@ -1267,25 +1279,25 @@ mod tests { #[test_log::test(tokio::test)] async fn test_list() { let field = Field::new("", make_list_type(DataType::Int32), true); - check_round_trip_encoding_random(field).await; + check_round_trip_encoding_random(field, LanceFileVersion::V2_0).await; } #[test_log::test(tokio::test)] async fn test_large_list() { let field = Field::new("", make_large_list_type(DataType::Int32), true); - check_round_trip_encoding_random(field).await; + check_round_trip_encoding_random(field, LanceFileVersion::V2_0).await; } #[test_log::test(tokio::test)] async fn test_nested_strings() { let field = Field::new("", make_list_type(DataType::Utf8), true); - check_round_trip_encoding_random(field).await; + check_round_trip_encoding_random(field, LanceFileVersion::V2_0).await; } #[test_log::test(tokio::test)] async fn test_nested_list() { let field = Field::new("", make_list_type(make_list_type(DataType::Int32)), true); - check_round_trip_encoding_random(field).await; + check_round_trip_encoding_random(field, LanceFileVersion::V2_0).await; } #[test_log::test(tokio::test)] @@ -1297,7 +1309,7 @@ mod tests { )])); let field = Field::new("", make_list_type(struct_type), true); - check_round_trip_encoding_random(field).await; + check_round_trip_encoding_random(field, LanceFileVersion::V2_0).await; } #[test_log::test(tokio::test)] diff --git a/rust/lance-encoding/src/encodings/logical/primitive.rs b/rust/lance-encoding/src/encodings/logical/primitive.rs index c59067124b..6dc771be61 100644 --- a/rust/lance-encoding/src/encodings/logical/primitive.rs +++ b/rust/lance-encoding/src/encodings/logical/primitive.rs @@ -1,14 +1,14 @@ // SPDX-License-Identifier: Apache-2.0 // SPDX-FileCopyrightText: Copyright The Lance Authors -use std::{fmt::Debug, iter, ops::Range, sync::Arc, vec}; +use std::{collections::VecDeque, fmt::Debug, iter, ops::Range, sync::Arc, vec}; use arrow::array::AsArray; use arrow_array::{make_array, Array, ArrayRef}; use arrow_buffer::{bit_util, BooleanBuffer, NullBuffer}; -use arrow_schema::DataType; -use futures::{future::BoxFuture, FutureExt}; -use lance_arrow::deepcopy::deep_copy_array; +use arrow_schema::{DataType, Field as ArrowField}; +use futures::{future::BoxFuture, stream::FuturesUnordered, FutureExt, TryStreamExt}; +use lance_arrow::{deepcopy::deep_copy_array, DataTypeExt}; use log::{debug, trace}; use snafu::{location, Location}; @@ -16,11 +16,14 @@ use lance_core::{datatypes::Field, utils::tokio::spawn_cpu, Result}; use crate::{ buffer::LanceBuffer, - data::{BlockInfo, DataBlock, FixedWidthDataBlock, UsedEncoding}, + data::{BlockInfo, DataBlock, DataBlockBuilder, FixedWidthDataBlock, UsedEncoding}, decoder::{ - DecodeArrayTask, FieldScheduler, FilterExpression, LogicalPageDecoder, NextDecodeTask, - PageEncoding, PageInfo, PageScheduler, PrimitivePageDecoder, PriorityRange, - ScheduledScanLine, SchedulerContext, SchedulingJob, + BlockDecompressor, ColumnInfo, DecodeArrayTask, DecodePageTask, DecodedArray, DecodedPage, + DecompressorStrategy, FieldScheduler, FilterExpression, 
LoadedPage, LogicalPageDecoder, + MessageType, MiniBlockDecompressor, NextDecodeTask, PageEncoding, PageInfo, PageScheduler, + PrimitivePageDecoder, PriorityRange, ScheduledScanLine, SchedulerContext, SchedulingJob, + StructuralDecodeArrayTask, StructuralFieldDecoder, StructuralFieldScheduler, + StructuralPageDecoder, StructuralSchedulingJob, UnloadedPage, }, encoder::{ ArrayEncodingStrategy, CompressionStrategy, EncodeTask, EncodedColumn, EncodedPage, @@ -28,7 +31,8 @@ use crate::{ }, encodings::physical::{decoder_from_array_encoding, ColumnBuffers, PageBuffers}, format::{pb, ProtobufUtils}, - repdef::{LevelBuffer, RepDefBuilder}, + repdef::{LevelBuffer, RepDefBuilder, RepDefUnraveler}, + EncodingsIo, }; #[derive(Debug)] @@ -205,7 +209,7 @@ impl<'a> SchedulingJob for PrimitiveFieldSchedulingJob<'a> { let decoder = Box::new(logical_decoder); let decoder_ready = context.locate_decoder(decoder); Ok(ScheduledScanLine { - decoders: vec![decoder_ready], + decoders: vec![MessageType::DecoderReady(decoder_ready)], rows_scheduled: num_rows_in_next, }) } @@ -242,6 +246,727 @@ impl FieldScheduler for PrimitiveFieldScheduler { } } +/// A trait for figuring out how to schedule the data within +/// a single page. +trait StructuralPageScheduler: std::fmt::Debug + Send { + /// Fetches any metadata required for the page + fn initialize<'a>(&'a mut self, io: &Arc) -> BoxFuture<'a, Result<()>>; + /// Schedules the read of the given ranges in the page + fn schedule_ranges( + &self, + ranges: &[Range], + io: &dyn EncodingsIo, + ) -> Result>>>; +} + +/// Metadata describing the decoded size of a mini-block +#[derive(Debug)] +struct ChunkMeta { + num_values: u64, + chunk_size_bytes: u64, +} + +/// A task to decode a one or more mini-blocks of data into an output batch +/// +/// Note: Two batches might share the same mini-block of data. When this happens +/// then each batch gets a copy of the block and each batch decodes the block independently. +/// +/// This means we have duplicated work but it is necessary to avoid having to synchronize +/// the decoding of the block. (TODO: test this theory) +#[derive(Debug)] +struct DecodeMiniBlockTask { + // The decompressors for the rep, def, and value buffers + rep_decompressor: Arc, + def_decompressor: Arc, + value_decompressor: Arc, + // The mini-blocks to decode + // + // For each mini-block we also have the ranges of rows that we want to decode + // from that mini-block. For example, if the user asks for rows 10, 10000, and 20000 + // then we will have three chunks here and each chunk will have a small range of 1 row. + chunks: Vec, + // The offset into the first chunk that we want to start decoding from + offset_into_first_chunk: u64, + // The total number of rows that we are decoding + num_rows: u64, +} + +impl DecodeMiniBlockTask { + fn decode_levels( + rep_decompressor: &dyn BlockDecompressor, + levels: LanceBuffer, + ) -> Result>> { + let rep = rep_decompressor.decompress(levels)?; + match rep { + DataBlock::FixedWidth(mut rep) => Ok(Some(rep.data.borrow_to_typed_slice::())), + DataBlock::Constant(constant) => { + assert_eq!(constant.data.len(), 2); + if constant.data[0] == 0 && constant.data[1] == 0 { + Ok(None) + } else { + // Maybe in the future we will encode all-null def or + // constant rep (all 1-item lists?) in a constant encoding + // but that doesn't happen today so we don't need to worry. 
+ todo!() + } + } + _ => unreachable!(), + } + } + + // We are building a LevelBuffer (levels) and want to copy into it `total_len` + // values from `level_buf` starting at `offset`. + // + // We need to handle both the case where `levels` is None (no nulls encountered + // yet) and the case where `level_buf` is None (the input we are copying from has + // no nulls) + fn extend_levels( + offset: usize, + range: Range, + levels: &mut Option, + level_buf: &Option>, + dest_offset: usize, + ) { + if let Some(level_buf) = level_buf { + if levels.is_none() { + // This is the first non-empty def buf we've hit, fill in the past + // with 0 (valid) + let mut new_levels_vec = + LevelBuffer::with_capacity(offset + (range.end - range.start) as usize); + new_levels_vec.extend(iter::repeat(0).take(dest_offset)); + *levels = Some(new_levels_vec); + } + levels.as_mut().unwrap().extend( + level_buf.as_ref()[range.start as usize..range.end as usize] + .iter() + .copied(), + ); + } else if let Some(levels) = levels { + let num_values = (range.end - range.start) as usize; + // This is an all-valid level_buf but we had nulls earlier and so we + // need to materialize it + levels.extend(iter::repeat(0).take(num_values)); + } + } +} + +impl DecodePageTask for DecodeMiniBlockTask { + fn decode(self: Box) -> Result { + // First, we create output buffers for the rep and def and data + let mut repbuf: Option = None; + let mut defbuf: Option = None; + let rep_decompressor = self.rep_decompressor; + let def_decompressor = self.def_decompressor; + + let mut remaining = self.num_rows; + let estimated_size_bytes = self + .chunks + .iter() + .map(|chunk| chunk.data.len()) + .sum::() + * 2; + let mut data_builder = + DataBlockBuilder::with_capacity_estimate(estimated_size_bytes as u64); + let mut to_skip = self.offset_into_first_chunk; + // We need to keep track of the offset into repbuf/defbuf that we are building up + let mut level_offset = 0; + // Now we iterate through each chunk and decode the data into the output buffers + for chunk in self.chunks.into_iter() { + // We always decode the entire chunk + let buf = chunk.data.into_buffer(); + // The first 6 bytes describe the sizes of the rep, def, and value buffers + let bytes_rep = u16::from_le_bytes([buf[0], buf[1]]) as usize; + let bytes_def = u16::from_le_bytes([buf[2], buf[3]]) as usize; + let bytes_val = u16::from_le_bytes([buf[4], buf[5]]) as usize; + debug_assert!(buf.len() >= bytes_rep + bytes_def + bytes_val + 6); + debug_assert!( + buf.len() <= bytes_rep + bytes_def + bytes_val + 6 + MINIBLOCK_MAX_PADDING as usize + ); + let rep = buf.slice_with_length(6, bytes_rep); + let def = buf.slice_with_length(6 + bytes_rep, bytes_def); + let values = buf.slice_with_length(6 + bytes_rep + bytes_def, bytes_val); + + let values = self + .value_decompressor + .decompress(LanceBuffer::Borrowed(values), chunk.vals_in_chunk)?; + + let rep = Self::decode_levels(rep_decompressor.as_ref(), LanceBuffer::Borrowed(rep))?; + let def = Self::decode_levels(def_decompressor.as_ref(), LanceBuffer::Borrowed(def))?; + + // We've decoded the entire block. Now we need to factor in: + // - The offset into the first chunk + // - The ranges the user asked for + // - The total # of rows in this task + // + // From these we can figure out which values to keep.
+ // + // For example, maybe we are asked to decode 100 rows, with an offset of 50, from + // a block with 1024 values, and the user asked for the ranges 400..500 and 600..700 + // + // In this case we want to take the values 450..500 and 600..650 from the block. + let mut offset = to_skip; + for range in chunk.ranges { + if to_skip > range.end - range.start { + to_skip -= range.end - range.start; + continue; + } + // Subtract skip from start of range + let range = range.start + to_skip..range.end; + to_skip = 0; + + // Truncate range to fit remaining + let range_len = range.end - range.start; + let to_take = range_len.min(remaining); + let range = range.start..range.start + to_take; + + // Grab values and add to what we are building + Self::extend_levels( + offset as usize, + range.clone(), + &mut repbuf, + &rep, + level_offset, + ); + Self::extend_levels( + offset as usize, + range.clone(), + &mut defbuf, + &def, + level_offset, + ); + data_builder.append(&values, range); + remaining -= to_take; + offset += to_take; + level_offset += to_take as usize; + } + } + debug_assert_eq!(remaining, 0); + + let data = data_builder.finish(); + + Ok(DecodedPage { + data, + repetition: repbuf, + definition: defbuf, + }) + } +} + +/// Decodes mini-block formatted data. See [`PrimitiveStructuralEncoder`] for more +/// details on the different layouts. +#[derive(Debug)] +struct MiniBlockDecoder { + rep_decompressor: Arc, + def_decompressor: Arc, + value_decompressor: Arc, + data: VecDeque, + offset_in_current_chunk: u64, + num_rows: u64, +} + +impl StructuralPageDecoder for MiniBlockDecoder { + fn drain(&mut self, num_rows: u64) -> Result> { + let mut remaining = num_rows; + let mut chunks = Vec::new(); + let offset_into_first_chunk = self.offset_in_current_chunk; + while remaining > 0 { + if remaining >= self.data.front().unwrap().vals_in_chunk - self.offset_in_current_chunk + { + // We are fully consuming the next chunk + let chunk = self.data.pop_front().unwrap(); + remaining -= chunk.vals_in_chunk - self.offset_in_current_chunk; + chunks.push(chunk); + self.offset_in_current_chunk = 0; + } else { + // We are partially consuming the next chunk + let chunk = self.data.front().unwrap().clone(); + self.offset_in_current_chunk += remaining; + debug_assert!(self.offset_in_current_chunk > 0); + remaining = 0; + chunks.push(chunk); + } + } + Ok(Box::new(DecodeMiniBlockTask { + chunks, + rep_decompressor: self.rep_decompressor.clone(), + def_decompressor: self.def_decompressor.clone(), + value_decompressor: self.value_decompressor.clone(), + num_rows, + offset_into_first_chunk, + })) + } + + fn num_rows(&self) -> u64 { + self.num_rows + } +} + +/// A scheduler for all-null data +/// +/// Note that all-null data might still require buffers. If there are definition levels +/// then we need to distinguish between null structs and null values. If there are repetition +/// levels then we need to distinguish between null lists, lists of null, and empty lists.
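
As a small, standalone illustration of the distinction just described (an editor's sketch using plain arrow-rs, not code from this diff): both rows below decode to zero values, yet they are different nulls, and definition levels are what keep them apart.

use std::sync::Arc;
use arrow_array::{Array, ArrayRef, Int32Array, StructArray};
use arrow_buffer::NullBuffer;
use arrow_schema::{DataType, Field, Fields};

fn main() {
    // Row 0 is a null struct; row 1 is a valid struct holding a null child.
    let fields = Fields::from(vec![Field::new("a", DataType::Int32, true)]);
    let child: ArrayRef = Arc::new(Int32Array::from(vec![None::<i32>, None]));
    let validity = NullBuffer::from(vec![false, true]);
    let rows = StructArray::new(fields, vec![child], Some(validity));
    assert!(rows.is_null(0));
    assert!(rows.is_valid(1));
}
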
+#[derive(Debug, Default)] +pub struct AllNullScheduler {} + +impl StructuralPageScheduler for AllNullScheduler { + fn initialize<'a>(&'a mut self, _io: &Arc) -> BoxFuture<'a, Result<()>> { + std::future::ready(Ok(())).boxed() + } + + fn schedule_ranges( + &self, + _ranges: &[Range], + _io: &dyn EncodingsIo, + ) -> Result>>> { + Ok(std::future::ready(Ok( + Box::new(AllNullPageDecoder {}) as Box + )) + .boxed()) + } +} + +#[derive(Debug)] +struct AllNullDecodePageTask {} +impl DecodePageTask for AllNullDecodePageTask { + fn decode(self: Box) -> Result { + // TODO: Not that trivial, we might have rep/def that we need to encode / decode still + todo!() + } +} + +#[derive(Debug)] +pub struct AllNullPageDecoder {} + +impl StructuralPageDecoder for AllNullPageDecoder { + fn drain(&mut self, _num_rows: u64) -> Result> { + todo!() + } + + fn num_rows(&self) -> u64 { + todo!() + } +} + +/// A scheduler for a page that has been encoded with the mini-block layout +#[derive(Debug)] +pub struct MiniBlockScheduler { + // These come from the protobuf + meta_buf_position: u64, + meta_buf_size: u64, + data_buf_position: u64, + priority: u64, + rows_in_page: u64, + rep_decompressor: Arc, + def_decompressor: Arc, + value_decompressor: Arc, + // This is set after initialization + chunk_meta: Vec, +} + +impl MiniBlockScheduler { + fn try_new( + buffer_offsets_and_sizes: &[(u64, u64)], + priority: u64, + rows_in_page: u64, + layout: &pb::MiniBlockLayout, + decompressors: &dyn DecompressorStrategy, + ) -> Result { + let (meta_buf_position, meta_buf_size) = buffer_offsets_and_sizes[0]; + // We don't use the data buf size since we can get it from the metadata + let (data_buf_position, _) = buffer_offsets_and_sizes[1]; + let rep_decompressor = + decompressors.create_block_decompressor(layout.rep_compression.as_ref().unwrap())?; + let def_decompressor = + decompressors.create_block_decompressor(layout.def_compression.as_ref().unwrap())?; + let value_decompressor = decompressors + .create_miniblock_decompressor(layout.value_compression.as_ref().unwrap())?; + Ok(Self { + meta_buf_position, + meta_buf_size, + data_buf_position, + rep_decompressor: rep_decompressor.into(), + def_decompressor: def_decompressor.into(), + value_decompressor: value_decompressor.into(), + priority, + rows_in_page, + chunk_meta: Vec::new(), + }) + } + + /// Calculates the overlap between a user-supplied range and a chunk of mini-block data + fn calc_overlap( + range: &mut Range, + chunk: &ChunkMeta, + rows_offset: u64, + dst: &mut ScheduledChunk, + ) -> ChunkOverlap { + if range.start > chunk.num_values + rows_offset { + ChunkOverlap::RangeAfterChunk + } else { + let start_in_chunk = range.start - rows_offset; + let end_in_chunk = (start_in_chunk + range.end - range.start).min(chunk.num_values); + let rows_in_chunk = end_in_chunk - start_in_chunk; + range.start += rows_in_chunk; + dst.ranges.push(start_in_chunk..end_in_chunk); + ChunkOverlap::Overlap + } + } +} + +#[derive(Debug)] +struct ScheduledChunk { + data: LanceBuffer, + vals_in_chunk: u64, + ranges: Vec>, +} + +impl Clone for ScheduledChunk { + fn clone(&self) -> Self { + Self { + data: self.data.try_clone().unwrap(), + vals_in_chunk: self.vals_in_chunk, + ranges: self.ranges.clone(), + } + } +} + +pub enum ChunkOverlap { + RangeAfterChunk, + Overlap, +} + +impl StructuralPageScheduler for MiniBlockScheduler { + fn initialize<'a>(&'a mut self, io: &Arc) -> BoxFuture<'a, Result<()>> { + let metadata = io.submit_single( + self.meta_buf_position..self.meta_buf_position + 
self.meta_buf_size, + 0, + ); + async move { + let bytes = metadata.await?; + assert!(bytes.len() % 2 == 0); + let mut bytes = LanceBuffer::from_bytes(bytes, 2); + let words = bytes.borrow_to_typed_slice::(); + let words = words.as_ref(); + self.chunk_meta.reserve(words.len()); + let mut rows_counter = 0; + for (word_idx, word) in words.iter().enumerate() { + let log_num_values = word & 0x0F; + let divided_bytes = word >> 4; + let num_bytes = + divided_bytes as u64 * MINIBLOCK_SIZE_MULTIPLIER + MINIBLOCK_SIZE_MULTIPLIER; + debug_assert!(num_bytes > 0); + let num_values = if word_idx < words.len() - 1 { + debug_assert!(log_num_values > 0); + 1 << log_num_values + } else { + debug_assert_eq!(log_num_values, 0); + self.rows_in_page - rows_counter + }; + rows_counter += num_values; + + self.chunk_meta.push(ChunkMeta { + num_values, + chunk_size_bytes: num_bytes, + }); + } + Ok(()) + } + .boxed() + } + + fn schedule_ranges( + &self, + ranges: &[Range], + io: &dyn EncodingsIo, + ) -> Result>>> { + let mut chunk_meta_iter = self.chunk_meta.iter(); + let mut current_chunk = chunk_meta_iter.next().unwrap(); + let mut row_offset = 0; + let mut bytes_offset = 0; + + let mut scheduled_chunks = VecDeque::with_capacity(self.chunk_meta.len()); + let mut ranges_to_req = Vec::with_capacity(self.chunk_meta.len()); + let mut num_rows = 0; + + let mut current_scheduled_chunk = ScheduledChunk { + data: LanceBuffer::empty(), + ranges: Vec::new(), + vals_in_chunk: current_chunk.num_values, + }; + + // There can be both multiple ranges per chunk and multiple chunks per range + for range in ranges { + num_rows += range.end - range.start; + let mut range = range.clone(); + while !range.is_empty() { + Self::calc_overlap( + &mut range, + current_chunk, + row_offset, + &mut current_scheduled_chunk, + ); + // Might be empty if entire chunk is skipped + if !range.is_empty() { + if !current_scheduled_chunk.ranges.is_empty() { + scheduled_chunks.push_back(current_scheduled_chunk); + ranges_to_req.push( + (self.data_buf_position + bytes_offset) + ..(self.data_buf_position + + bytes_offset + + current_chunk.chunk_size_bytes), + ); + } + row_offset += current_chunk.num_values; + bytes_offset += current_chunk.chunk_size_bytes; + if let Some(next_chunk) = chunk_meta_iter.next() { + current_chunk = next_chunk; + } + current_scheduled_chunk = ScheduledChunk { + data: LanceBuffer::empty(), + ranges: Vec::new(), + vals_in_chunk: current_chunk.num_values, + }; + } + } + } + if !current_scheduled_chunk.ranges.is_empty() { + scheduled_chunks.push_back(current_scheduled_chunk); + ranges_to_req.push( + (self.data_buf_position + bytes_offset) + ..(self.data_buf_position + bytes_offset + current_chunk.chunk_size_bytes), + ); + } + + let data = io.submit_request(ranges_to_req, self.priority); + + let rep_decompressor = self.rep_decompressor.clone(); + let def_decompressor = self.def_decompressor.clone(); + let value_decompressor = self.value_decompressor.clone(); + + Ok(async move { + let data = data.await?; + for (chunk, data) in scheduled_chunks.iter_mut().zip(data) { + chunk.data = LanceBuffer::from_bytes(data, 1); + } + Ok(Box::new(MiniBlockDecoder { + rep_decompressor, + def_decompressor, + value_decompressor, + data: scheduled_chunks, + offset_in_current_chunk: 0, + num_rows, + }) as Box) + } + .boxed()) + } +} + +#[derive(Debug)] +struct StructuralPrimitiveFieldSchedulingJob<'a> { + scheduler: &'a StructuralPrimitiveFieldScheduler, + ranges: Vec>, + page_idx: usize, + range_idx: usize, + range_offset: u64, + global_row_offset: u64, 
+} + +impl<'a> StructuralPrimitiveFieldSchedulingJob<'a> { + pub fn new(scheduler: &'a StructuralPrimitiveFieldScheduler, ranges: Vec>) -> Self { + Self { + scheduler, + ranges, + page_idx: 0, + range_idx: 0, + range_offset: 0, + global_row_offset: 0, + } + } +} + +impl<'a> StructuralSchedulingJob for StructuralPrimitiveFieldSchedulingJob<'a> { + fn schedule_next( + &mut self, + context: &mut SchedulerContext, + ) -> Result> { + if self.range_idx >= self.ranges.len() { + return Ok(None); + } + // Get our current range + let mut range = self.ranges[self.range_idx].clone(); + range.start += self.range_offset; + let priority = range.start; + + let mut cur_page = &self.scheduler.page_schedulers[self.page_idx]; + trace!( + "Current range is {:?} and current page has {} rows", + range, + cur_page.num_rows + ); + // Skip entire pages until we have some overlap with our next range + while cur_page.num_rows + self.global_row_offset <= range.start { + self.global_row_offset += cur_page.num_rows; + self.page_idx += 1; + trace!("Skipping entire page of {} rows", cur_page.num_rows); + cur_page = &self.scheduler.page_schedulers[self.page_idx]; + } + + // Now the cur_page has overlap with range. Continue looping through ranges + // until we find a range that exceeds the current page + + let mut ranges_in_page = Vec::new(); + while cur_page.num_rows + self.global_row_offset > range.start { + range.start = range.start.max(self.global_row_offset); + let start_in_page = range.start - self.global_row_offset; + let end_in_page = start_in_page + (range.end - range.start); + let end_in_page = end_in_page.min(cur_page.num_rows); + let last_in_range = (end_in_page + self.global_row_offset) >= range.end; + + ranges_in_page.push(start_in_page..end_in_page); + if last_in_range { + self.range_idx += 1; + if self.range_idx == self.ranges.len() { + break; + } + range = self.ranges[self.range_idx].clone(); + } else { + break; + } + } + + let num_rows_in_next = ranges_in_page.iter().map(|r| r.end - r.start).sum(); + trace!( + "Scheduling {} rows across {} ranges from page with {} rows (priority={}, column_index={}, page_index={})", + num_rows_in_next, + ranges_in_page.len(), + cur_page.num_rows, + priority, + self.scheduler.column_index, + cur_page.page_index, + ); + + self.global_row_offset += cur_page.num_rows; + self.page_idx += 1; + + let page_decoder = cur_page + .scheduler + .schedule_ranges(&ranges_in_page, context.io().as_ref())?; + + let cur_path = context.current_path(); + let page_index = cur_page.page_index; + let unloaded_page = async move { + let page_decoder = page_decoder.await?; + Ok(LoadedPage { + decoder: page_decoder, + path: cur_path, + page_index, + }) + } + .boxed(); + + Ok(Some(ScheduledScanLine { + decoders: vec![MessageType::UnloadedPage(UnloadedPage(unloaded_page))], + rows_scheduled: num_rows_in_next, + })) + } +} + +#[derive(Debug)] +struct PageInfoAndScheduler { + page_index: usize, + num_rows: u64, + scheduler: Box, +} + +/// A scheduler for a leaf node +/// +/// Here we look at the layout of the various pages and delegate scheduling to a scheduler +/// appropriate for the layout of the page. 
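
The page-walking loop above boils down to a small piece of interval arithmetic. The following is an editor's sketch of that arithmetic in isolation; the helper name is hypothetical, nothing in this diff defines it.

use std::ops::Range;

// Split one global row range into (page_index, page-local range) pairs,
// the same intersection the scheduling job computes inline above.
fn split_range_by_pages(page_rows: &[u64], range: Range<u64>) -> Vec<(usize, Range<u64>)> {
    let mut out = Vec::new();
    let mut page_start = 0;
    for (idx, &rows) in page_rows.iter().enumerate() {
        let page_end = page_start + rows;
        if range.start < page_end && range.end > page_start {
            let start_in_page = range.start.max(page_start) - page_start;
            let end_in_page = range.end.min(page_end) - page_start;
            out.push((idx, start_in_page..end_in_page));
        }
        page_start = page_end;
        if page_start >= range.end {
            break;
        }
    }
    out
}

fn main() {
    // Three 100-row pages; rows 150..250 hit the back half of page 1 and
    // the front half of page 2.
    let split = split_range_by_pages(&[100, 100, 100], 150..250);
    assert_eq!(split, vec![(1, 50..100), (2, 0..50)]);
}
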
+#[derive(Debug)] +pub struct StructuralPrimitiveFieldScheduler { + page_schedulers: Vec, + column_index: u32, +} + +impl StructuralPrimitiveFieldScheduler { + pub fn try_new( + column_info: &ColumnInfo, + decompressors: &dyn DecompressorStrategy, + ) -> Result { + let page_schedulers = column_info + .page_infos + .iter() + .enumerate() + .map(|(page_index, page_info)| { + Self::page_info_to_scheduler(page_info, page_index, decompressors) + }) + .collect::>>()?; + Ok(Self { + page_schedulers, + column_index: column_info.index, + }) + } + + fn page_info_to_scheduler( + page_info: &PageInfo, + page_index: usize, + decompressors: &dyn DecompressorStrategy, + ) -> Result { + let scheduler: Box = + match page_info.encoding.as_structural().layout.as_ref() { + Some(pb::page_layout::Layout::MiniBlockLayout(mini_block)) => { + Box::new(MiniBlockScheduler::try_new( + &page_info.buffer_offsets_and_sizes, + page_info.priority, + page_info.num_rows, + mini_block, + decompressors, + )?) + } + Some(pb::page_layout::Layout::AllNullLayout(_)) => { + Box::new(AllNullScheduler::default()) as Box + } + _ => todo!(), + }; + Ok(PageInfoAndScheduler { + page_index, + num_rows: page_info.num_rows, + scheduler, + }) + } +} + +impl StructuralFieldScheduler for StructuralPrimitiveFieldScheduler { + fn initialize<'a>( + &'a mut self, + _filter: &'a FilterExpression, + context: &'a SchedulerContext, + ) -> BoxFuture<'a, Result<()>> { + let page_init = self + .page_schedulers + .iter_mut() + .map(|s| s.scheduler.initialize(context.io())) + .collect::>(); + async move { + page_init.try_collect::>().await?; + Ok(()) + } + .boxed() + } + + fn schedule_ranges<'a>( + &'a self, + ranges: &[Range], + _filter: &FilterExpression, + ) -> Result> { + let ranges = ranges.to_vec(); + Ok(Box::new(StructuralPrimitiveFieldSchedulingJob::new( + self, ranges, + ))) + } +} + pub struct PrimitiveFieldDecoder { data_type: DataType, unloaded_physical_decoder: Option>>>, @@ -396,6 +1121,140 @@ impl LogicalPageDecoder for PrimitiveFieldDecoder { } } +/// Takes the output from several page decoders and +/// concatenates them.
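
The stitching step at the heart of this task is a plain Arrow concat, the same arrow_select::concat call used in the decode() body below; a minimal standalone sketch:

use std::sync::Arc;
use arrow_array::{Array, ArrayRef, Int32Array};

fn main() {
    // Each page decodes to its own array; the composite task concatenates
    // them back into one array for the caller.
    let page0: ArrayRef = Arc::new(Int32Array::from(vec![1, 2, 3]));
    let page1: ArrayRef = Arc::new(Int32Array::from(vec![4, 5]));
    let parts = [page0.as_ref(), page1.as_ref()];
    let combined = arrow_select::concat::concat(&parts).unwrap();
    assert_eq!(combined.len(), 5);
}
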
+#[derive(Debug)] +pub struct StructuralCompositeDecodeArrayTask { + tasks: Vec>, + num_values: u64, + data_type: DataType, + should_validate: bool, +} + +impl StructuralDecodeArrayTask for StructuralCompositeDecodeArrayTask { + fn decode(self: Box) -> Result { + let mut arrays = Vec::with_capacity(self.tasks.len()); + let mut all_rep = LevelBuffer::with_capacity(self.num_values as usize); + let mut all_def = LevelBuffer::with_capacity(self.num_values as usize); + let mut offset = 0; + let mut has_def = false; + for task in self.tasks { + let decoded = task.decode()?; + + if let Some(rep) = &decoded.repetition { + // Note: if one chunk has repetition, all chunks will have repetition + // and so all_rep will either end up with len=num_values or len=0 + all_rep.extend(rep); + } + if let Some(def) = &decoded.definition { + if !has_def { + // This is the first validity we have seen, need to backfill with all-valid + // if we've processed any all-valid pages + has_def = true; + all_def.extend(iter::repeat(0).take(offset)); + } + all_def.extend(def); + } + + let array = make_array( + decoded + .data + .into_arrow(self.data_type.clone(), self.should_validate)?, + ); + + offset += array.len(); + arrays.push(array); + } + let array_refs = arrays.iter().map(|arr| arr.as_ref()).collect::>(); + let array = arrow_select::concat::concat(&array_refs)?; + let all_rep = if all_rep.is_empty() { + None + } else { + Some(all_rep) + }; + let all_def = if all_def.is_empty() { + None + } else { + Some(all_def) + }; + let mut repdef = RepDefUnraveler::new(all_rep, all_def); + + // The primitive array itself has a validity + let validity = repdef.unravel_validity(); + if let Some(validity) = validity.as_ref() { + assert!(validity.len() == array.len()); + } + // SAFETY: We are just replacing the validity and asserted it is the correct size + let array = make_array(unsafe { + array + .to_data() + .into_builder() + .nulls(validity) + .build_unchecked() + }); + Ok(DecodedArray { array, repdef }) + } +} + +#[derive(Debug)] +pub struct StructuralPrimitiveFieldDecoder { + field: Arc, + page_decoders: VecDeque>, + should_validate: bool, + rows_drained_in_current: u64, +} + +impl StructuralPrimitiveFieldDecoder { + pub fn new(field: &Arc, should_validate: bool) -> Self { + Self { + field: field.clone(), + page_decoders: VecDeque::new(), + should_validate, + rows_drained_in_current: 0, + } + } +} + +impl StructuralFieldDecoder for StructuralPrimitiveFieldDecoder { + fn accept_page(&mut self, child: LoadedPage) -> Result<()> { + assert!(child.path.is_empty()); + self.page_decoders.push_back(child.decoder); + Ok(()) + } + + fn drain(&mut self, num_rows: u64) -> Result> { + let mut remaining = num_rows; + let mut tasks = Vec::new(); + while remaining > 0 { + let cur_page = self.page_decoders.front_mut().unwrap(); + let num_in_page = cur_page.num_rows() - self.rows_drained_in_current; + let to_take = num_in_page.min(remaining); + + let task = cur_page.drain(to_take)?; + tasks.push(task); + + if to_take == num_in_page { + self.page_decoders.pop_front(); + self.rows_drained_in_current = 0; + } else { + self.rows_drained_in_current += to_take; + } + + remaining -= to_take; + } + Ok(Box::new(StructuralCompositeDecodeArrayTask { + tasks, + data_type: self.field.data_type().clone(), + should_validate: self.should_validate, + num_values: num_rows, + })) + } + + fn data_type(&self) -> &DataType { + self.field.data_type() + } +} + #[derive(Debug)] pub struct AccumulationQueue { cache_bytes: u64, @@ -606,7 +1465,7 @@ impl FieldEncoder 
for PrimitiveFieldEncoder { // 4KiB which is too small. As a compromise we divide the size by this // constant which gives us up to 24KiB but introduces some padding into each // miniblock. We want 24KiB so we can handle even the worst case of -// - 4Ki values compressed into an 8188 byte buffer +// - 4Ki values compressed into an 8186 byte buffer // - 6 bytes to describe rep, def, and value lengths // - 16KiB of rep & def buffer (this will almost never happen) const MINIBLOCK_SIZE_MULTIPLIER: u64 = 6; @@ -675,9 +1534,9 @@ impl PrimitiveStructuralEncoder { // As data gets wide then the # of values per block shrinks (very wide) // data doesn't even fit in a mini-block and the block overhead gets // too large and we prefer zipped. - fn is_narrow(num_rows: u64, num_bytes: u64) -> bool { - let avg_bytes_per_row = num_bytes as f64 / num_rows as f64; - avg_bytes_per_row < 128.0 + fn is_narrow(arrays: &[ArrayRef], data_type: &DataType) -> bool { + let avg_bytes_per_row = Self::get_avg_value_size(arrays, data_type); + avg_bytes_per_row < 128 } // Converts value data, repetition levels, and definition levels into a single @@ -685,7 +1544,7 @@ // which tells us the size of each block. // // Each chunk is serialized as: - // | rep_len (2 bytes) | def_len (2 bytes) | rep | def | values | + // | rep_len (2 bytes) | def_len (2 bytes) | values_len (2 bytes) | rep | def | values | // // Each block has a u16 word of metadata. The upper 12 bits contain 1/6 the // # of bytes in the block (if the block does not have an even number of bytes @@ -724,25 +1583,28 @@ let mut value_offset = 0; for ((chunk, rep), def) in miniblocks.chunks.into_iter().zip(rep).zip(def) { - let chunk_bytes = chunk.num_bytes as u64 + rep.len() as u64 + def.len() as u64 + 4; + let chunk_bytes = chunk.num_bytes as u64 + rep.len() as u64 + def.len() as u64 + 6; assert!(chunk_bytes <= 16 * 1024); assert!(chunk_bytes > 0); // We subtract 1 here from divided_bytes because we want to be able to express // a size of 24KiB and not (24Ki - 6)B which is what we'd get otherwise with // 0xFFF - let divided_bytes = (chunk_bytes - 1).div_ceil(MINIBLOCK_SIZE_MULTIPLIER); - let pad_bytes = (MINIBLOCK_SIZE_MULTIPLIER * divided_bytes) - (chunk_bytes - 1); + let divided_bytes = chunk_bytes.div_ceil(MINIBLOCK_SIZE_MULTIPLIER); + let pad_bytes = (MINIBLOCK_SIZE_MULTIPLIER * divided_bytes) - chunk_bytes; + let divided_bytes_minus_one = divided_bytes - 1; - let metadata = ((divided_bytes << 4) | chunk.log_num_values as u64) as u16; + let metadata = ((divided_bytes_minus_one << 4) | chunk.log_num_values as u64) as u16; meta_buffer.extend_from_slice(&metadata.to_le_bytes()); assert!(rep.len() < u16::MAX as usize); assert!(def.len() < u16::MAX as usize); let bytes_rep = rep.len() as u16; let bytes_def = def.len() as u16; + let bytes_val = chunk.num_bytes; data_buffer.extend_from_slice(&bytes_rep.to_le_bytes()); data_buffer.extend_from_slice(&bytes_def.to_le_bytes()); + data_buffer.extend_from_slice(&bytes_val.to_le_bytes()); data_buffer.extend_from_slice(&rep); data_buffer.extend_from_slice(&def); @@ -818,6 +1680,17 @@ } } + fn encode_all_null(column_idx: u32, num_rows: u64, row_number: u64) -> Result { + let description = ProtobufUtils::all_null_layout(); + Ok(EncodedPage { + column_idx, + data: vec![], + description: PageEncoding::Structural(description), + num_rows, + row_number, + }) + } + fn encode_miniblock( column_idx: u32, field: &Field, @@ -859,7 +1732,8 @@ impl
PrimitiveStructuralEncoder { let (block_value_buffer, block_meta_buffer) = Self::serialize_miniblocks(compressed_data, compressed_rep, compressed_def); - let description = ProtobufUtils::miniblock(rep_encoding, def_encoding, value_encoding); + let description = + ProtobufUtils::miniblock_layout(rep_encoding, def_encoding, value_encoding); Ok(EncodedPage { num_rows: num_values, column_idx, @@ -869,6 +1743,17 @@ impl PrimitiveStructuralEncoder { }) } + fn get_avg_value_size(_arrays: &[ArrayRef], data_type: &DataType) -> u64 { + // Simple types, we can infer avg size without looking at value + let byte_width = data_type.byte_width_opt(); + if let Some(byte_width) = byte_width { + return byte_width as u64; + } + + // Other types, we need to inspect buffers + todo!() + } + // Creates an encode task, consuming all buffered data fn do_flush( &mut self, @@ -881,14 +1766,14 @@ impl PrimitiveStructuralEncoder { let field = self.field.clone(); let task = spawn_cpu(move || { let num_values = arrays.iter().map(|arr| arr.len() as u64).sum(); - let num_bytes = arrays + let num_nulls = arrays .iter() - .map(|arr| arr.get_buffer_memory_size() as u64) - .sum(); - - // TODO: Calculation of statistics that can be used to choose compression algorithm + .map(|arr| arr.logical_nulls().map(|n| n.null_count()).unwrap_or(0) as u64) + .sum::(); - if Self::is_narrow(num_values, num_bytes) { + if num_values == num_nulls { + Self::encode_all_null(column_idx, num_values, row_number) + } else if Self::is_narrow(&arrays, &field.data_type()) { Self::encode_miniblock( column_idx, &field, diff --git a/rust/lance-encoding/src/encodings/logical/struct.rs b/rust/lance-encoding/src/encodings/logical/struct.rs index cac8b3e8ca..a4cc44afc7 100644 --- a/rust/lance-encoding/src/encodings/logical/struct.rs +++ b/rust/lance-encoding/src/encodings/logical/struct.rs @@ -20,9 +20,10 @@ use snafu::{location, Location}; use crate::{ decoder::{ - DecodeArrayTask, DecoderReady, FieldScheduler, FilterExpression, LogicalPageDecoder, - NextDecodeTask, PageEncoding, PriorityRange, ScheduledScanLine, SchedulerContext, - SchedulingJob, + DecodeArrayTask, DecodedArray, DecoderReady, FieldScheduler, FilterExpression, LoadedPage, + LogicalPageDecoder, MessageType, NextDecodeTask, PageEncoding, PriorityRange, + ScheduledScanLine, SchedulerContext, SchedulingJob, StructuralDecodeArrayTask, + StructuralFieldDecoder, StructuralFieldScheduler, StructuralSchedulingJob, }, encoder::{EncodeTask, EncodedColumn, EncodedPage, FieldEncoder, OutOfLineBuffers}, format::pb, @@ -30,6 +31,8 @@ use crate::{ }; use lance_core::{Error, Result}; +use super::primitive::StructuralPrimitiveFieldDecoder; + #[derive(Debug)] struct SchedulingJobWithStatus<'a> { col_idx: u32, @@ -118,7 +121,7 @@ impl<'a> SchedulingJob for SimpleStructSchedulerJob<'a> { self.num_rows, )); let struct_decoder = context.locate_decoder(struct_decoder); - decoders.push(struct_decoder); + decoders.push(MessageType::DecoderReady(struct_decoder)); self.initialized = true; } let old_rows_scheduled = self.rows_scheduled; @@ -227,6 +230,181 @@ impl FieldScheduler for SimpleStructScheduler { } } +#[derive(Debug)] +struct StructuralSchedulingJobWithStatus<'a> { + col_idx: u32, + col_name: &'a str, + job: Box, + rows_scheduled: u64, + rows_remaining: u64, +} + +impl<'a> PartialEq for StructuralSchedulingJobWithStatus<'a> { + fn eq(&self, other: &Self) -> bool { + self.col_idx == other.col_idx + } +} + +impl<'a> Eq for StructuralSchedulingJobWithStatus<'a> {} + +impl<'a> PartialOrd for 
StructuralSchedulingJobWithStatus<'a> { + fn partial_cmp(&self, other: &Self) -> Option { + Some(self.cmp(other)) + } +} + +impl<'a> Ord for StructuralSchedulingJobWithStatus<'a> { + fn cmp(&self, other: &Self) -> std::cmp::Ordering { + // Note this is reversed to make it min-heap + other.rows_scheduled.cmp(&self.rows_scheduled) + } +} + +/// Scheduling job for struct data +/// +/// The order in which we schedule the children is important. We want to schedule the child +/// with the least amount of data first. +/// +/// This allows us to decode entire rows as quickly as possible +#[derive(Debug)] +struct RepDefStructSchedulingJob<'a> { + /// A min-heap whose key is the # of rows currently scheduled + children: BinaryHeap>, + rows_scheduled: u64, +} + +impl<'a> RepDefStructSchedulingJob<'a> { + fn new( + scheduler: &'a StructuralStructScheduler, + children: Vec>, + num_rows: u64, + ) -> Self { + let children = children + .into_iter() + .enumerate() + .map(|(idx, job)| StructuralSchedulingJobWithStatus { + col_idx: idx as u32, + col_name: scheduler.child_fields[idx].name(), + job, + rows_scheduled: 0, + rows_remaining: num_rows, + }) + .collect::>(); + Self { + children, + rows_scheduled: 0, + } + } +} + +impl<'a> StructuralSchedulingJob for RepDefStructSchedulingJob<'a> { + fn schedule_next( + &mut self, + mut context: &mut SchedulerContext, + ) -> Result> { + let mut decoders = Vec::new(); + let old_rows_scheduled = self.rows_scheduled; + // Schedule as many children as we need to until we have scheduled at least one + // complete row + while old_rows_scheduled == self.rows_scheduled { + let mut next_child = self.children.pop().unwrap(); + let scoped = context.push(next_child.col_name, next_child.col_idx); + let child_scan = next_child.job.schedule_next(scoped.context)?; + // next_child is the least-scheduled child and, if it's done, that + // means we are completely done. + if child_scan.is_none() { + return Ok(None); + } + let child_scan = child_scan.unwrap(); + + trace!( + "Scheduled {} rows for child {}", + child_scan.rows_scheduled, + next_child.col_idx + ); + next_child.rows_scheduled += child_scan.rows_scheduled; + next_child.rows_remaining -= child_scan.rows_scheduled; + decoders.extend(child_scan.decoders); + self.children.push(next_child); + self.rows_scheduled = self.children.peek().unwrap().rows_scheduled; + context = scoped.pop(); + } + let struct_rows_scheduled = self.rows_scheduled - old_rows_scheduled; + Ok(Some(ScheduledScanLine { + decoders, + rows_scheduled: struct_rows_scheduled, + })) + } +} + +/// A scheduler for structs +/// +/// The implementation is actually a bit more tricky than one might initially think. We can't just +/// go through and schedule each column one after the other. This would mean our decode can't start +/// until nearly all the data has arrived (since we need data from each column to yield a batch) +/// +/// Instead, we schedule in row-major fashion +/// +/// Note: this scheduler is the starting point for all decoding. This is because we treat the top-level +/// record batch as a non-nullable struct. 
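
The reversed cmp above is what turns std's max-heap BinaryHeap into a min-heap keyed on rows_scheduled. The same effect can be seen with std's Reverse wrapper (an editor's sketch, not code from this diff): the child with the fewest scheduled rows always pops first, which is what keeps every column advancing together, row-major.

use std::cmp::Reverse;
use std::collections::BinaryHeap;

fn main() {
    // (rows_scheduled, column_index) pairs for three hypothetical children
    let mut children = BinaryHeap::new();
    for (col, rows_scheduled) in [(0u32, 40u64), (1, 10), (2, 25)] {
        children.push(Reverse((rows_scheduled, col)));
    }
    // The least-scheduled child (column 1, 10 rows) is scheduled next
    let Reverse((rows, col)) = children.pop().unwrap();
    assert_eq!((rows, col), (10, 1));
}
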
+#[derive(Debug)]
+pub struct StructuralStructScheduler {
+    children: Vec<Box<dyn StructuralFieldScheduler>>,
+    child_fields: Fields,
+}
+
+impl StructuralStructScheduler {
+    pub fn new(children: Vec<Box<dyn StructuralFieldScheduler>>, child_fields: Fields) -> Self {
+        debug_assert!(!children.is_empty());
+        Self {
+            children,
+            child_fields,
+        }
+    }
+}
+
+impl StructuralFieldScheduler for StructuralStructScheduler {
+    fn schedule_ranges<'a>(
+        &'a self,
+        ranges: &[Range<u64>],
+        filter: &FilterExpression,
+    ) -> Result<Box<dyn StructuralSchedulingJob + 'a>> {
+        let num_rows = ranges.iter().map(|r| r.end - r.start).sum();
+
+        let child_schedulers = self
+            .children
+            .iter()
+            .map(|child| child.schedule_ranges(ranges, filter))
+            .collect::<Result<Vec<_>>>()?;
+
+        Ok(Box::new(RepDefStructSchedulingJob::new(
+            self,
+            child_schedulers,
+            num_rows,
+        )))
+    }
+
+    fn initialize<'a>(
+        &'a mut self,
+        filter: &'a FilterExpression,
+        context: &'a SchedulerContext,
+    ) -> BoxFuture<'a, Result<()>> {
+        let children_initialization = self
+            .children
+            .iter_mut()
+            .map(|child| child.initialize(filter, context))
+            .collect::<FuturesUnordered<_>>();
+        async move {
+            children_initialization
+                .map(|res| res.map(|_| ()))
+                .try_collect::<Vec<_>>()
+                .await?;
+            Ok(())
+        }
+        .boxed()
+    }
+}
+
 #[derive(Debug)]
 struct ChildState {
     // As child decoders are scheduled they are added to this queue
@@ -400,6 +578,102 @@ impl PartialOrd for WaitOrder<'_> {
     }
 }
 
+#[derive(Debug)]
+pub struct StructuralStructDecoder {
+    children: Vec<Box<dyn StructuralFieldDecoder>>,
+    data_type: DataType,
+    child_fields: Fields,
+}
+
+impl StructuralStructDecoder {
+    pub fn new(fields: Fields, should_validate: bool) -> Self {
+        let children = fields
+            .iter()
+            .map(|field| Self::field_to_decoder(field, should_validate))
+            .collect();
+        let data_type = DataType::Struct(fields.clone());
+        Self {
+            data_type,
+            children,
+            child_fields: fields,
+        }
+    }
+
+    fn field_to_decoder(
+        field: &Arc<Field>,
+        should_validate: bool,
+    ) -> Box<dyn StructuralFieldDecoder> {
+        match field.data_type() {
+            DataType::Struct(fields) => Box::new(Self::new(fields.clone(), should_validate)),
+            DataType::List(_) | DataType::LargeList(_) => todo!(),
+            DataType::RunEndEncoded(_, _) => todo!(),
+            DataType::ListView(_) | DataType::LargeListView(_) => todo!(),
+            DataType::Map(_, _) => todo!(),
+            DataType::Union(_, _) => todo!(),
+            _ => Box::new(StructuralPrimitiveFieldDecoder::new(field, should_validate)),
+        }
+    }
+}
+
+impl StructuralFieldDecoder for StructuralStructDecoder {
+    fn accept_page(&mut self, mut child: LoadedPage) -> Result<()> {
+        // children with empty path should not be delivered to this method
+        let child_idx = child.path.pop_front().unwrap();
+        // This decoder is intended for one of our children
+        self.children[child_idx as usize].accept_page(child)?;
+        Ok(())
+    }
+
+    fn drain(&mut self, num_rows: u64) -> Result<Box<dyn StructuralDecodeArrayTask>> {
+        let child_tasks = self
+            .children
+            .iter_mut()
+            .map(|child| child.drain(num_rows))
+            .collect::<Result<Vec<_>>>()?;
+        Ok(Box::new(RepDefStructDecodeTask {
+            children: child_tasks,
+            child_fields: self.child_fields.clone(),
+        }))
+    }
+
+    fn data_type(&self) -> &DataType {
+        &self.data_type
+    }
+}
+
+#[derive(Debug)]
+struct RepDefStructDecodeTask {
+    children: Vec<Box<dyn StructuralDecodeArrayTask>>,
+    child_fields: Fields,
+}
+
+impl StructuralDecodeArrayTask for RepDefStructDecodeTask {
+    fn decode(self: Box<Self>) -> Result<DecodedArray> {
+        let arrays = self
+            .children
+            .into_iter()
+            .map(|task| task.decode())
+            .collect::<Result<Vec<_>>>()?;
+        let mut children = Vec::with_capacity(arrays.len());
+        let mut arrays_iter = arrays.into_iter();
+        let first_array = arrays_iter.next().unwrap();
+
+        // The repdef should be identical across all children at this point
+        let mut repdef = first_array.repdef;
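+        // Keep the first child's repdef and gather every child's decoded array;
+        // unravel_validity() below then peels this struct's own definition layer
+        // off the shared repdef to recover the struct-level null buffer.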
+        children.push(first_array.array);
+        for array in arrays_iter {
+            children.push(array.array);
+        }
+
+        let validity = repdef.unravel_validity();
+        let array = StructArray::new(self.child_fields, children, validity);
+        Ok(DecodedArray {
+            array: Arc::new(array),
+            repdef,
+        })
+    }
+}
+
 #[derive(Debug)]
 pub struct SimpleStructDecoder {
     children: Vec<ChildState>,
@@ -716,10 +990,12 @@ mod tests {
     use arrow::{
         builder::{Int32Builder, ListBuilder},
         Array, ArrayRef, Int32Array, StructArray,
     };
+    use arrow_buffer::NullBuffer;
     use arrow_schema::{DataType, Field, Fields};
 
-    use crate::testing::{
-        check_round_trip_encoding_of_data, check_round_trip_encoding_random, TestCases,
+    use crate::{
+        testing::{check_round_trip_encoding_of_data, check_round_trip_encoding_random, TestCases},
+        version::LanceFileVersion,
     };
 
     #[test_log::test(tokio::test)]
@@ -729,7 +1005,59 @@ mod tests {
             Field::new("b", DataType::Int32, false),
         ]));
         let field = Field::new("", data_type, false);
-        check_round_trip_encoding_random(field).await;
+        check_round_trip_encoding_random(field, LanceFileVersion::V2_0).await;
+    }
+
+    #[test_log::test(tokio::test)]
+    async fn test_nullable_struct() {
+        // Test data struct<score: int32, location: struct<x: int32, y: int32>>
+        // - score: null
+        //   location:
+        //     x: 1
+        //     y: 6
+        // - score: 12
+        //   location:
+        //     x: 2
+        //     y: null
+        // - score: 13
+        //   location:
+        //     x: 3
+        //     y: 8
+        // - score: 14
+        //   location: null
+        // - null
+        //
+        let inner_fields = Fields::from(vec![
+            Field::new("x", DataType::Int32, false),
+            Field::new("y", DataType::Int32, true),
+        ]);
+        let inner_struct = DataType::Struct(inner_fields.clone());
+        let outer_fields = Fields::from(vec![
+            Field::new("score", DataType::Int32, true),
+            Field::new("location", inner_struct, true),
+        ]);
+
+        let x_vals = Int32Array::from(vec![Some(1), Some(2), Some(3), Some(4), Some(5)]);
+        let y_vals = Int32Array::from(vec![Some(6), None, Some(8), Some(9), Some(10)]);
+        let scores = Int32Array::from(vec![None, Some(12), Some(13), Some(14), Some(15)]);
+
+        let location_validity = NullBuffer::from(vec![true, true, true, false, true]);
+        let locations = StructArray::new(
+            inner_fields,
+            vec![Arc::new(x_vals), Arc::new(y_vals)],
+            Some(location_validity),
+        );
+
+        let rows_validity = NullBuffer::from(vec![true, true, true, true, false]);
+        let rows = StructArray::new(
+            outer_fields,
+            vec![Arc::new(scores), Arc::new(locations)],
+            Some(rows_validity),
+        );
+
+        let test_cases = TestCases::default().with_file_version(LanceFileVersion::V2_1);
+
+        check_round_trip_encoding_of_data(vec![Arc::new(rows)], &test_cases, HashMap::new()).await;
     }
 
     #[test_log::test(tokio::test)]
@@ -743,7 +1071,7 @@ mod tests {
             Field::new("outer_int", DataType::Int32, true),
         ]));
         let field = Field::new("row", data_type, false);
-        check_round_trip_encoding_random(field).await;
+        check_round_trip_encoding_random(field, LanceFileVersion::V2_0).await;
     }
 
     #[test_log::test(tokio::test)]
@@ -765,7 +1093,7 @@ mod tests {
             Field::new("outer_binary", DataType::Binary, true),
         ]));
         let field = Field::new("row", data_type, false);
-        check_round_trip_encoding_random(field).await;
+        check_round_trip_encoding_random(field, LanceFileVersion::V2_0).await;
     }
 
     #[test_log::test(tokio::test)]
diff --git a/rust/lance-encoding/src/encodings/physical/binary.rs b/rust/lance-encoding/src/encodings/physical/binary.rs
index 91dc5f133d..9964db9b38 100644
--- a/rust/lance-encoding/src/encodings/physical/binary.rs
+++ b/rust/lance-encoding/src/encodings/physical/binary.rs
@@ -539,6 +539,7 @@ pub mod tests {
         buffer::LanceBuffer,
         data::DataBlock,
testing::{check_round_trip_encoding_of_data, check_round_trip_encoding_random, TestCases}, + version::LanceFileVersion, }; use super::get_indices_from_string_arrays; @@ -546,7 +547,7 @@ pub mod tests { #[test_log::test(tokio::test)] async fn test_utf8_binary() { let field = Field::new("", DataType::Utf8, false); - check_round_trip_encoding_random(field).await; + check_round_trip_encoding_random(field, LanceFileVersion::V2_0).await; } #[test] @@ -583,19 +584,19 @@ pub mod tests { #[test_log::test(tokio::test)] async fn test_binary() { let field = Field::new("", DataType::Binary, false); - check_round_trip_encoding_random(field).await; + check_round_trip_encoding_random(field, LanceFileVersion::V2_0).await; } #[test_log::test(tokio::test)] async fn test_large_binary() { let field = Field::new("", DataType::LargeBinary, true); - check_round_trip_encoding_random(field).await; + check_round_trip_encoding_random(field, LanceFileVersion::V2_0).await; } #[test_log::test(tokio::test)] async fn test_large_utf8() { let field = Field::new("", DataType::LargeUtf8, true); - check_round_trip_encoding_random(field).await; + check_round_trip_encoding_random(field, LanceFileVersion::V2_0).await; } #[test_log::test(tokio::test)] diff --git a/rust/lance-encoding/src/encodings/physical/bitmap.rs b/rust/lance-encoding/src/encodings/physical/bitmap.rs index 2f30a95beb..4a6c88969b 100644 --- a/rust/lance-encoding/src/encodings/physical/bitmap.rs +++ b/rust/lance-encoding/src/encodings/physical/bitmap.rs @@ -133,13 +133,14 @@ mod tests { use crate::decoder::PrimitivePageDecoder; use crate::encodings::physical::bitmap::BitmapData; use crate::testing::check_round_trip_encoding_random; + use crate::version::LanceFileVersion; use super::BitmapDecoder; #[test_log::test(tokio::test)] async fn test_bitmap_boolean() { let field = Field::new("", DataType::Boolean, false); - check_round_trip_encoding_random(field).await; + check_round_trip_encoding_random(field, LanceFileVersion::V2_0).await; } #[test] diff --git a/rust/lance-encoding/src/encodings/physical/bitpack_fastlanes.rs b/rust/lance-encoding/src/encodings/physical/bitpack_fastlanes.rs index 2c14f06888..8900418e68 100644 --- a/rust/lance-encoding/src/encodings/physical/bitpack_fastlanes.rs +++ b/rust/lance-encoding/src/encodings/physical/bitpack_fastlanes.rs @@ -597,966 +597,966 @@ fn bitpacked_for_non_neg_decode( #[cfg(test)] mod tests { - use super::*; - use arrow::array::{ - Int16Array, Int32Array, Int64Array, Int8Array, UInt16Array, UInt32Array, UInt64Array, - UInt8Array, - }; - use arrow::datatypes::DataType; - - #[test_log::test(tokio::test)] - async fn test_compute_compressed_bit_width_for_non_neg() {} - - use std::collections::HashMap; - - use lance_datagen::RowCount; - - use crate::testing::{check_round_trip_encoding_of_data, TestCases}; - use crate::version::LanceFileVersion; - - async fn check_round_trip_bitpacked(array: Arc) { - let test_cases = TestCases::default().with_file_version(LanceFileVersion::V2_1); - check_round_trip_encoding_of_data(vec![array], &test_cases, HashMap::new()).await; - } - - #[test_log::test(tokio::test)] - async fn test_bitpack_fastlanes_u8() { - let values: Vec = vec![5; 1024]; - let array = UInt8Array::from(values); - let array: Arc = Arc::new(array); - check_round_trip_bitpacked(array).await; - - let values: Vec = vec![66; 1000]; - let array = UInt8Array::from(values); - let array: Arc = Arc::new(array); - - check_round_trip_bitpacked(array).await; - - let values: Vec = vec![77; 2000]; - let array = UInt8Array::from(values); - 
let array: Arc = Arc::new(array); - - check_round_trip_bitpacked(array).await; - - let values: Vec = vec![0; 10000]; - let array = UInt8Array::from(values); - let arr = Arc::new(array) as ArrayRef; - check_round_trip_bitpacked(arr).await; - - let values: Vec = vec![88; 10000]; - let array = UInt8Array::from(values); - let arr = Arc::new(array) as ArrayRef; - check_round_trip_bitpacked(arr).await; - - let arr = lance_datagen::gen() - .anon_col(lance_datagen::array::rand_type(&DataType::UInt8)) - .into_batch_rows(RowCount::from(1)) - .unwrap() - .column(0) - .clone(); - check_round_trip_bitpacked(arr).await; - - let arr = lance_datagen::gen() - .anon_col(lance_datagen::array::rand_type(&DataType::UInt8)) - .into_batch_rows(RowCount::from(20)) - .unwrap() - .column(0) - .clone(); - check_round_trip_bitpacked(arr).await; - - let arr = lance_datagen::gen() - .anon_col(lance_datagen::array::rand_type(&DataType::UInt8)) - .into_batch_rows(RowCount::from(50)) - .unwrap() - .column(0) - .clone(); - check_round_trip_bitpacked(arr).await; - - let arr = lance_datagen::gen() - .anon_col(lance_datagen::array::rand_type(&DataType::UInt8)) - .into_batch_rows(RowCount::from(100)) - .unwrap() - .column(0) - .clone(); - check_round_trip_bitpacked(arr).await; - - let arr = lance_datagen::gen() - .anon_col(lance_datagen::array::rand_type(&DataType::UInt8)) - .into_batch_rows(RowCount::from(1000)) - .unwrap() - .column(0) - .clone(); - check_round_trip_bitpacked(arr).await; - - let arr = lance_datagen::gen() - .anon_col(lance_datagen::array::rand_type(&DataType::UInt8)) - .into_batch_rows(RowCount::from(1024)) - .unwrap() - .column(0) - .clone(); - check_round_trip_bitpacked(arr).await; - - let arr = lance_datagen::gen() - .anon_col(lance_datagen::array::rand_type(&DataType::UInt8)) - .into_batch_rows(RowCount::from(2000)) - .unwrap() - .column(0) - .clone(); - check_round_trip_bitpacked(arr).await; - - let arr = lance_datagen::gen() - .anon_col(lance_datagen::array::rand_type(&DataType::UInt8)) - .into_batch_rows(RowCount::from(3000)) - .unwrap() - .column(0) - .clone(); - check_round_trip_bitpacked(arr).await; - } - - #[test_log::test(tokio::test)] - async fn test_bitpack_fastlanes_u16() { - let values: Vec = vec![5; 1024]; - let array = UInt16Array::from(values); - let array: Arc = Arc::new(array); - check_round_trip_bitpacked(array).await; - - let values: Vec = vec![66; 1000]; - let array = UInt16Array::from(values); - let array: Arc = Arc::new(array); - - check_round_trip_bitpacked(array).await; - - let values: Vec = vec![77; 2000]; - let array = UInt16Array::from(values); - let array: Arc = Arc::new(array); - - check_round_trip_bitpacked(array).await; - - let values: Vec = vec![0; 10000]; - let array = UInt16Array::from(values); - let arr = Arc::new(array) as ArrayRef; - check_round_trip_bitpacked(arr).await; - - let values: Vec = vec![88; 10000]; - let array = UInt16Array::from(values); - let arr = Arc::new(array) as ArrayRef; - check_round_trip_bitpacked(arr).await; - - let values: Vec = vec![300; 100]; - let array = UInt16Array::from(values); - let arr = Arc::new(array) as ArrayRef; - check_round_trip_bitpacked(arr).await; - - let values: Vec = vec![800; 100]; - let array = UInt16Array::from(values); - let arr = Arc::new(array) as ArrayRef; - check_round_trip_bitpacked(arr).await; - - let arr = lance_datagen::gen() - .anon_col(lance_datagen::array::rand_type(&DataType::UInt16)) - .into_batch_rows(RowCount::from(1)) - .unwrap() - .column(0) - .clone(); - check_round_trip_bitpacked(arr).await; - - let 
arr = lance_datagen::gen() - .anon_col(lance_datagen::array::rand_type(&DataType::UInt16)) - .into_batch_rows(RowCount::from(20)) - .unwrap() - .column(0) - .clone(); - check_round_trip_bitpacked(arr).await; - - let arr = lance_datagen::gen() - .anon_col(lance_datagen::array::rand_type(&DataType::UInt16)) - .into_batch_rows(RowCount::from(100)) - .unwrap() - .column(0) - .clone(); - check_round_trip_bitpacked(arr).await; - - let arr = lance_datagen::gen() - .anon_col(lance_datagen::array::rand_type(&DataType::UInt16)) - .into_batch_rows(RowCount::from(1000)) - .unwrap() - .column(0) - .clone(); - check_round_trip_bitpacked(arr).await; - - let arr = lance_datagen::gen() - .anon_col(lance_datagen::array::rand_type(&DataType::UInt16)) - .into_batch_rows(RowCount::from(1024)) - .unwrap() - .column(0) - .clone(); - check_round_trip_bitpacked(arr).await; - - let arr = lance_datagen::gen() - .anon_col(lance_datagen::array::rand_type(&DataType::UInt16)) - .into_batch_rows(RowCount::from(2000)) - .unwrap() - .column(0) - .clone(); - check_round_trip_bitpacked(arr).await; - - let arr = lance_datagen::gen() - .anon_col(lance_datagen::array::rand_type(&DataType::UInt16)) - .into_batch_rows(RowCount::from(3000)) - .unwrap() - .column(0) - .clone(); - check_round_trip_bitpacked(arr).await; - } - - #[test_log::test(tokio::test)] - async fn test_bitpack_fastlanes_u32() { - let values: Vec = vec![5; 1024]; - let array = UInt32Array::from(values); - let array: Arc = Arc::new(array); - check_round_trip_bitpacked(array).await; - - let values: Vec = vec![7; 2000]; - let array = UInt32Array::from(values); - let array: Arc = Arc::new(array); - check_round_trip_bitpacked(array).await; - - let values: Vec = vec![66; 1000]; - let array = UInt32Array::from(values); - let array: Arc = Arc::new(array); - check_round_trip_bitpacked(array).await; - - let values: Vec = vec![666; 1000]; - let array = UInt32Array::from(values); - let array: Arc = Arc::new(array); - check_round_trip_bitpacked(array).await; - - let values: Vec = vec![77; 2000]; - let array = UInt32Array::from(values); - let array: Arc = Arc::new(array); - check_round_trip_bitpacked(array).await; - - let values: Vec = vec![0; 10000]; - let array = UInt32Array::from(values); - let arr = Arc::new(array) as ArrayRef; - check_round_trip_bitpacked(arr).await; - - let values: Vec = vec![1; 10000]; - let array = UInt32Array::from(values); - let arr = Arc::new(array) as ArrayRef; - check_round_trip_bitpacked(arr).await; - - let values: Vec = vec![88; 10000]; - let array = UInt32Array::from(values); - let arr = Arc::new(array) as ArrayRef; - check_round_trip_bitpacked(arr).await; - - let values: Vec = vec![300; 100]; - let array = UInt32Array::from(values); - let arr = Arc::new(array) as ArrayRef; - check_round_trip_bitpacked(arr).await; - - let values: Vec = vec![3000; 100]; - let array = UInt32Array::from(values); - let arr = Arc::new(array) as ArrayRef; - check_round_trip_bitpacked(arr).await; - - let values: Vec = vec![800; 100]; - let array = UInt32Array::from(values); - let arr = Arc::new(array) as ArrayRef; - check_round_trip_bitpacked(arr).await; - - let values: Vec = vec![8000; 100]; - let array = UInt32Array::from(values); - let arr = Arc::new(array) as ArrayRef; - check_round_trip_bitpacked(arr).await; - - let values: Vec = vec![65536; 100]; - let array = UInt32Array::from(values); - let arr = Arc::new(array) as ArrayRef; - check_round_trip_bitpacked(arr).await; - - let values: Vec = vec![655360; 100]; - let array = UInt32Array::from(values); - let arr = 
Arc::new(array) as ArrayRef; - check_round_trip_bitpacked(arr).await; - - let arr = lance_datagen::gen() - .anon_col(lance_datagen::array::rand_type(&DataType::UInt32)) - .into_batch_rows(RowCount::from(1)) - .unwrap() - .column(0) - .clone(); - check_round_trip_bitpacked(arr).await; - - let arr = lance_datagen::gen() - .anon_col(lance_datagen::array::rand_type(&DataType::UInt32)) - .into_batch_rows(RowCount::from(20)) - .unwrap() - .column(0) - .clone(); - check_round_trip_bitpacked(arr).await; - - let arr = lance_datagen::gen() - .anon_col(lance_datagen::array::rand_type(&DataType::UInt32)) - .into_batch_rows(RowCount::from(50)) - .unwrap() - .column(0) - .clone(); - check_round_trip_bitpacked(arr).await; - - let arr = lance_datagen::gen() - .anon_col(lance_datagen::array::rand_type(&DataType::UInt32)) - .into_batch_rows(RowCount::from(100)) - .unwrap() - .column(0) - .clone(); - check_round_trip_bitpacked(arr).await; - - let arr = lance_datagen::gen() - .anon_col(lance_datagen::array::rand_type(&DataType::UInt32)) - .into_batch_rows(RowCount::from(1000)) - .unwrap() - .column(0) - .clone(); - check_round_trip_bitpacked(arr).await; - - let arr = lance_datagen::gen() - .anon_col(lance_datagen::array::rand_type(&DataType::UInt32)) - .into_batch_rows(RowCount::from(1024)) - .unwrap() - .column(0) - .clone(); - check_round_trip_bitpacked(arr).await; - - let arr = lance_datagen::gen() - .anon_col(lance_datagen::array::rand_type(&DataType::UInt32)) - .into_batch_rows(RowCount::from(2000)) - .unwrap() - .column(0) - .clone(); - check_round_trip_bitpacked(arr).await; - - let arr = lance_datagen::gen() - .anon_col(lance_datagen::array::rand_type(&DataType::UInt32)) - .into_batch_rows(RowCount::from(3000)) - .unwrap() - .column(0) - .clone(); - check_round_trip_bitpacked(arr).await; - } - - #[test_log::test(tokio::test)] - async fn test_bitpack_fastlanes_u64() { - let values: Vec = vec![5; 1024]; - let array = UInt64Array::from(values); - let array: Arc = Arc::new(array); - check_round_trip_bitpacked(array).await; - - let values: Vec = vec![7; 2000]; - let array = UInt64Array::from(values); - let array: Arc = Arc::new(array); - check_round_trip_bitpacked(array).await; - - let values: Vec = vec![66; 1000]; - let array = UInt64Array::from(values); - let array: Arc = Arc::new(array); - check_round_trip_bitpacked(array).await; - - let values: Vec = vec![666; 1000]; - let array = UInt64Array::from(values); - let array: Arc = Arc::new(array); - check_round_trip_bitpacked(array).await; - - let values: Vec = vec![77; 2000]; - let array = UInt64Array::from(values); - let array: Arc = Arc::new(array); - check_round_trip_bitpacked(array).await; - - let values: Vec = vec![0; 10000]; - let array = UInt64Array::from(values); - let arr = Arc::new(array) as ArrayRef; - check_round_trip_bitpacked(arr).await; - - let values: Vec = vec![1; 10000]; - let array = UInt64Array::from(values); - let arr = Arc::new(array) as ArrayRef; - check_round_trip_bitpacked(arr).await; - - let values: Vec = vec![88; 10000]; - let array = UInt64Array::from(values); - let arr = Arc::new(array) as ArrayRef; - check_round_trip_bitpacked(arr).await; - - let values: Vec = vec![300; 100]; - let array = UInt64Array::from(values); - let arr = Arc::new(array) as ArrayRef; - check_round_trip_bitpacked(arr).await; - - let values: Vec = vec![3000; 100]; - let array = UInt64Array::from(values); - let arr = Arc::new(array) as ArrayRef; - check_round_trip_bitpacked(arr).await; - - let values: Vec = vec![800; 100]; - let array = 
UInt64Array::from(values); - let arr = Arc::new(array) as ArrayRef; - check_round_trip_bitpacked(arr).await; - - let values: Vec = vec![8000; 100]; - let array = UInt64Array::from(values); - let arr = Arc::new(array) as ArrayRef; - check_round_trip_bitpacked(arr).await; - - let values: Vec = vec![65536; 100]; - let array = UInt64Array::from(values); - let arr = Arc::new(array) as ArrayRef; - check_round_trip_bitpacked(arr).await; - - let values: Vec = vec![655360; 100]; - let array = UInt64Array::from(values); - let arr = Arc::new(array) as ArrayRef; - check_round_trip_bitpacked(arr).await; - - let arr = lance_datagen::gen() - .anon_col(lance_datagen::array::rand_type(&DataType::UInt64)) - .into_batch_rows(RowCount::from(1)) - .unwrap() - .column(0) - .clone(); - check_round_trip_bitpacked(arr).await; - - let arr = lance_datagen::gen() - .anon_col(lance_datagen::array::rand_type(&DataType::UInt64)) - .into_batch_rows(RowCount::from(20)) - .unwrap() - .column(0) - .clone(); - check_round_trip_bitpacked(arr).await; - - let arr = lance_datagen::gen() - .anon_col(lance_datagen::array::rand_type(&DataType::UInt64)) - .into_batch_rows(RowCount::from(50)) - .unwrap() - .column(0) - .clone(); - check_round_trip_bitpacked(arr).await; - - let arr = lance_datagen::gen() - .anon_col(lance_datagen::array::rand_type(&DataType::UInt64)) - .into_batch_rows(RowCount::from(100)) - .unwrap() - .column(0) - .clone(); - check_round_trip_bitpacked(arr).await; - - let arr = lance_datagen::gen() - .anon_col(lance_datagen::array::rand_type(&DataType::UInt64)) - .into_batch_rows(RowCount::from(1000)) - .unwrap() - .column(0) - .clone(); - check_round_trip_bitpacked(arr).await; - - let arr = lance_datagen::gen() - .anon_col(lance_datagen::array::rand_type(&DataType::UInt64)) - .into_batch_rows(RowCount::from(1024)) - .unwrap() - .column(0) - .clone(); - check_round_trip_bitpacked(arr).await; - - let arr = lance_datagen::gen() - .anon_col(lance_datagen::array::rand_type(&DataType::UInt64)) - .into_batch_rows(RowCount::from(2000)) - .unwrap() - .column(0) - .clone(); - check_round_trip_bitpacked(arr).await; - - let arr = lance_datagen::gen() - .anon_col(lance_datagen::array::rand_type(&DataType::UInt64)) - .into_batch_rows(RowCount::from(3000)) - .unwrap() - .column(0) - .clone(); - check_round_trip_bitpacked(arr).await; - } - - #[test_log::test(tokio::test)] - async fn test_bitpack_fastlanes_i8() { - let values: Vec = vec![-5; 1024]; - let array = Int8Array::from(values); - let array: Arc = Arc::new(array); - check_round_trip_bitpacked(array).await; - - let values: Vec = vec![66; 1000]; - let array = Int8Array::from(values); - let array: Arc = Arc::new(array); - - check_round_trip_bitpacked(array).await; - - let values: Vec = vec![77; 2000]; - let array = Int8Array::from(values); - let array: Arc = Arc::new(array); - - check_round_trip_bitpacked(array).await; - - let values: Vec = vec![0; 10000]; - let array = Int8Array::from(values); - let arr = Arc::new(array) as ArrayRef; - check_round_trip_bitpacked(arr).await; - - let values: Vec = vec![88; 10000]; - let array = Int8Array::from(values); - let arr = Arc::new(array) as ArrayRef; - check_round_trip_bitpacked(arr).await; - - let values: Vec = vec![-88; 10000]; - let array = Int8Array::from(values); - let arr = Arc::new(array) as ArrayRef; - check_round_trip_bitpacked(arr).await; - - let arr = lance_datagen::gen() - .anon_col(lance_datagen::array::rand_type(&DataType::Int8)) - .into_batch_rows(RowCount::from(1)) - .unwrap() - .column(0) - .clone(); - 
check_round_trip_bitpacked(arr).await; - - let arr = lance_datagen::gen() - .anon_col(lance_datagen::array::rand_type(&DataType::Int8)) - .into_batch_rows(RowCount::from(20)) - .unwrap() - .column(0) - .clone(); - check_round_trip_bitpacked(arr).await; - - let arr = lance_datagen::gen() - .anon_col(lance_datagen::array::rand_type(&DataType::Int8)) - .into_batch_rows(RowCount::from(50)) - .unwrap() - .column(0) - .clone(); - check_round_trip_bitpacked(arr).await; - - let arr = lance_datagen::gen() - .anon_col(lance_datagen::array::rand_type(&DataType::Int8)) - .into_batch_rows(RowCount::from(100)) - .unwrap() - .column(0) - .clone(); - check_round_trip_bitpacked(arr).await; - - let arr = lance_datagen::gen() - .anon_col(lance_datagen::array::rand_type(&DataType::Int8)) - .into_batch_rows(RowCount::from(1000)) - .unwrap() - .column(0) - .clone(); - check_round_trip_bitpacked(arr).await; - - let arr = lance_datagen::gen() - .anon_col(lance_datagen::array::rand_type(&DataType::Int8)) - .into_batch_rows(RowCount::from(1024)) - .unwrap() - .column(0) - .clone(); - check_round_trip_bitpacked(arr).await; - - let arr = lance_datagen::gen() - .anon_col(lance_datagen::array::rand_type(&DataType::Int8)) - .into_batch_rows(RowCount::from(2000)) - .unwrap() - .column(0) - .clone(); - check_round_trip_bitpacked(arr).await; - - let arr = lance_datagen::gen() - .anon_col(lance_datagen::array::rand_type(&DataType::Int8)) - .into_batch_rows(RowCount::from(3000)) - .unwrap() - .column(0) - .clone(); - check_round_trip_bitpacked(arr).await; - } - - #[test_log::test(tokio::test)] - async fn test_bitpack_fastlanes_i16() { - let values: Vec = vec![-5; 1024]; - let array = Int16Array::from(values); - let array: Arc = Arc::new(array); - check_round_trip_bitpacked(array).await; - - let values: Vec = vec![66; 1000]; - let array = Int16Array::from(values); - let array: Arc = Arc::new(array); - - check_round_trip_bitpacked(array).await; - - let values: Vec = vec![77; 2000]; - let array = Int16Array::from(values); - let array: Arc = Arc::new(array); - - check_round_trip_bitpacked(array).await; - - let values: Vec = vec![0; 10000]; - let array = Int16Array::from(values); - let arr = Arc::new(array) as ArrayRef; - check_round_trip_bitpacked(arr).await; - - let values: Vec = vec![88; 10000]; - let array = Int16Array::from(values); - let arr = Arc::new(array) as ArrayRef; - check_round_trip_bitpacked(arr).await; - - let values: Vec = vec![300; 100]; - let array = Int16Array::from(values); - let arr = Arc::new(array) as ArrayRef; - check_round_trip_bitpacked(arr).await; - - let values: Vec = vec![800; 100]; - let array = Int16Array::from(values); - let arr = Arc::new(array) as ArrayRef; - check_round_trip_bitpacked(arr).await; - - let arr = lance_datagen::gen() - .anon_col(lance_datagen::array::rand_type(&DataType::Int16)) - .into_batch_rows(RowCount::from(1)) - .unwrap() - .column(0) - .clone(); - check_round_trip_bitpacked(arr).await; - - let arr = lance_datagen::gen() - .anon_col(lance_datagen::array::rand_type(&DataType::Int16)) - .into_batch_rows(RowCount::from(20)) - .unwrap() - .column(0) - .clone(); - check_round_trip_bitpacked(arr).await; - - let arr = lance_datagen::gen() - .anon_col(lance_datagen::array::rand_type(&DataType::Int16)) - .into_batch_rows(RowCount::from(50)) - .unwrap() - .column(0) - .clone(); - check_round_trip_bitpacked(arr).await; - - let arr = lance_datagen::gen() - .anon_col(lance_datagen::array::rand_type(&DataType::Int16)) - .into_batch_rows(RowCount::from(100)) - .unwrap() - .column(0) - 
.clone(); - check_round_trip_bitpacked(arr).await; - - let arr = lance_datagen::gen() - .anon_col(lance_datagen::array::rand_type(&DataType::Int16)) - .into_batch_rows(RowCount::from(1000)) - .unwrap() - .column(0) - .clone(); - check_round_trip_bitpacked(arr).await; - - let arr = lance_datagen::gen() - .anon_col(lance_datagen::array::rand_type(&DataType::Int16)) - .into_batch_rows(RowCount::from(1024)) - .unwrap() - .column(0) - .clone(); - check_round_trip_bitpacked(arr).await; - - let arr = lance_datagen::gen() - .anon_col(lance_datagen::array::rand_type(&DataType::Int16)) - .into_batch_rows(RowCount::from(2000)) - .unwrap() - .column(0) - .clone(); - check_round_trip_bitpacked(arr).await; - - let arr = lance_datagen::gen() - .anon_col(lance_datagen::array::rand_type(&DataType::Int16)) - .into_batch_rows(RowCount::from(3000)) - .unwrap() - .column(0) - .clone(); - check_round_trip_bitpacked(arr).await; - } - - #[test_log::test(tokio::test)] - async fn test_bitpack_fastlanes_i32() { - let values: Vec = vec![-5; 1024]; - let array = Int32Array::from(values); - let array: Arc = Arc::new(array); - check_round_trip_bitpacked(array).await; - - let values: Vec = vec![66; 1000]; - let array = Int32Array::from(values); - let array: Arc = Arc::new(array); - check_round_trip_bitpacked(array).await; - - let values: Vec = vec![-66; 1000]; - let array = Int32Array::from(values); - let array: Arc = Arc::new(array); - check_round_trip_bitpacked(array).await; - - let values: Vec = vec![77; 2000]; - let array = Int32Array::from(values); - let array: Arc = Arc::new(array); - check_round_trip_bitpacked(array).await; - - let values: Vec = vec![-77; 2000]; - let array = Int32Array::from(values); - let array: Arc = Arc::new(array); - check_round_trip_bitpacked(array).await; - - let values: Vec = vec![0; 10000]; - let array = Int32Array::from(values); - let arr = Arc::new(array) as ArrayRef; - check_round_trip_bitpacked(arr).await; - - let values: Vec = vec![88; 10000]; - let array = Int32Array::from(values); - let arr = Arc::new(array) as ArrayRef; - check_round_trip_bitpacked(arr).await; - - let values: Vec = vec![-88; 10000]; - let array = Int32Array::from(values); - let arr = Arc::new(array) as ArrayRef; - check_round_trip_bitpacked(arr).await; - - let values: Vec = vec![300; 100]; - let array = Int32Array::from(values); - let arr = Arc::new(array) as ArrayRef; - check_round_trip_bitpacked(arr).await; - - let values: Vec = vec![-300; 100]; - let array = Int32Array::from(values); - let arr = Arc::new(array) as ArrayRef; - check_round_trip_bitpacked(arr).await; - - let values: Vec = vec![800; 100]; - let array = Int32Array::from(values); - let arr = Arc::new(array) as ArrayRef; - check_round_trip_bitpacked(arr).await; - - let values: Vec = vec![-800; 100]; - let array = Int32Array::from(values); - let arr = Arc::new(array) as ArrayRef; - check_round_trip_bitpacked(arr).await; - - let values: Vec = vec![65536; 100]; - let array = Int32Array::from(values); - let arr = Arc::new(array) as ArrayRef; - check_round_trip_bitpacked(arr).await; - - let values: Vec = vec![-65536; 100]; - let array = Int32Array::from(values); - let arr = Arc::new(array) as ArrayRef; - check_round_trip_bitpacked(arr).await; - - let arr = lance_datagen::gen() - .anon_col(lance_datagen::array::rand_type(&DataType::Int32)) - .into_batch_rows(RowCount::from(1)) - .unwrap() - .column(0) - .clone(); - check_round_trip_bitpacked(arr).await; - - let arr = lance_datagen::gen() - .anon_col(lance_datagen::array::rand_type(&DataType::Int32)) - 
.into_batch_rows(RowCount::from(20)) - .unwrap() - .column(0) - .clone(); - check_round_trip_bitpacked(arr).await; - - let arr = lance_datagen::gen() - .anon_col(lance_datagen::array::rand_type(&DataType::Int32)) - .into_batch_rows(RowCount::from(50)) - .unwrap() - .column(0) - .clone(); - check_round_trip_bitpacked(arr).await; - - let arr = lance_datagen::gen() - .anon_col(lance_datagen::array::rand_type(&DataType::Int32)) - .into_batch_rows(RowCount::from(100)) - .unwrap() - .column(0) - .clone(); - check_round_trip_bitpacked(arr).await; - - let arr = lance_datagen::gen() - .anon_col(lance_datagen::array::rand_type(&DataType::Int32)) - .into_batch_rows(RowCount::from(1000)) - .unwrap() - .column(0) - .clone(); - check_round_trip_bitpacked(arr).await; - - let arr = lance_datagen::gen() - .anon_col(lance_datagen::array::rand_type(&DataType::Int32)) - .into_batch_rows(RowCount::from(1024)) - .unwrap() - .column(0) - .clone(); - check_round_trip_bitpacked(arr).await; - - let arr = lance_datagen::gen() - .anon_col(lance_datagen::array::rand_type(&DataType::Int32)) - .into_batch_rows(RowCount::from(2000)) - .unwrap() - .column(0) - .clone(); - check_round_trip_bitpacked(arr).await; - - let arr = lance_datagen::gen() - .anon_col(lance_datagen::array::rand_type(&DataType::Int32)) - .into_batch_rows(RowCount::from(3000)) - .unwrap() - .column(0) - .clone(); - check_round_trip_bitpacked(arr).await; - } - - #[test_log::test(tokio::test)] - async fn test_bitpack_fastlanes_i64() { - let values: Vec = vec![-5; 1024]; - let array = Int64Array::from(values); - let array: Arc = Arc::new(array); - check_round_trip_bitpacked(array).await; - - let values: Vec = vec![66; 1000]; - let array = Int64Array::from(values); - let array: Arc = Arc::new(array); - check_round_trip_bitpacked(array).await; - - let values: Vec = vec![-66; 1000]; - let array = Int64Array::from(values); - let array: Arc = Arc::new(array); - check_round_trip_bitpacked(array).await; - - let values: Vec = vec![77; 2000]; - let array = Int64Array::from(values); - let array: Arc = Arc::new(array); - check_round_trip_bitpacked(array).await; - - let values: Vec = vec![-77; 2000]; - let array = Int64Array::from(values); - let array: Arc = Arc::new(array); - check_round_trip_bitpacked(array).await; - - let values: Vec = vec![0; 10000]; - let array = Int64Array::from(values); - let arr = Arc::new(array) as ArrayRef; - check_round_trip_bitpacked(arr).await; - - let values: Vec = vec![88; 10000]; - let array = Int64Array::from(values); - let arr = Arc::new(array) as ArrayRef; - check_round_trip_bitpacked(arr).await; - - let values: Vec = vec![-88; 10000]; - let array = Int64Array::from(values); - let arr = Arc::new(array) as ArrayRef; - check_round_trip_bitpacked(arr).await; - - let values: Vec = vec![300; 100]; - let array = Int64Array::from(values); - let arr = Arc::new(array) as ArrayRef; - check_round_trip_bitpacked(arr).await; - - let values: Vec = vec![-300; 100]; - let array = Int64Array::from(values); - let arr = Arc::new(array) as ArrayRef; - check_round_trip_bitpacked(arr).await; - - let values: Vec = vec![800; 100]; - let array = Int64Array::from(values); - let arr = Arc::new(array) as ArrayRef; - check_round_trip_bitpacked(arr).await; - - let values: Vec = vec![-800; 100]; - let array = Int64Array::from(values); - let arr = Arc::new(array) as ArrayRef; - check_round_trip_bitpacked(arr).await; - - let values: Vec = vec![65536; 100]; - let array = Int64Array::from(values); - let arr = Arc::new(array) as ArrayRef; - 
check_round_trip_bitpacked(arr).await; - - let values: Vec = vec![-65536; 100]; - let array = Int64Array::from(values); - let arr = Arc::new(array) as ArrayRef; - check_round_trip_bitpacked(arr).await; - - let arr = lance_datagen::gen() - .anon_col(lance_datagen::array::rand_type(&DataType::Int64)) - .into_batch_rows(RowCount::from(1)) - .unwrap() - .column(0) - .clone(); - check_round_trip_bitpacked(arr).await; - - let arr = lance_datagen::gen() - .anon_col(lance_datagen::array::rand_type(&DataType::Int64)) - .into_batch_rows(RowCount::from(20)) - .unwrap() - .column(0) - .clone(); - check_round_trip_bitpacked(arr).await; - - let arr = lance_datagen::gen() - .anon_col(lance_datagen::array::rand_type(&DataType::Int64)) - .into_batch_rows(RowCount::from(50)) - .unwrap() - .column(0) - .clone(); - check_round_trip_bitpacked(arr).await; - - let arr = lance_datagen::gen() - .anon_col(lance_datagen::array::rand_type(&DataType::Int64)) - .into_batch_rows(RowCount::from(100)) - .unwrap() - .column(0) - .clone(); - check_round_trip_bitpacked(arr).await; - - let arr = lance_datagen::gen() - .anon_col(lance_datagen::array::rand_type(&DataType::Int64)) - .into_batch_rows(RowCount::from(1000)) - .unwrap() - .column(0) - .clone(); - check_round_trip_bitpacked(arr).await; - - let arr = lance_datagen::gen() - .anon_col(lance_datagen::array::rand_type(&DataType::Int64)) - .into_batch_rows(RowCount::from(1024)) - .unwrap() - .column(0) - .clone(); - check_round_trip_bitpacked(arr).await; - - let arr = lance_datagen::gen() - .anon_col(lance_datagen::array::rand_type(&DataType::Int64)) - .into_batch_rows(RowCount::from(2000)) - .unwrap() - .column(0) - .clone(); - check_round_trip_bitpacked(arr).await; - - let arr = lance_datagen::gen() - .anon_col(lance_datagen::array::rand_type(&DataType::Int64)) - .into_batch_rows(RowCount::from(3000)) - .unwrap() - .column(0) - .clone(); - check_round_trip_bitpacked(arr).await; - } + // use super::*; + // use arrow::array::{ + // Int16Array, Int32Array, Int64Array, Int8Array, UInt16Array, UInt32Array, UInt64Array, + // UInt8Array, + // }; + // use arrow::datatypes::DataType; + + // #[test_log::test(tokio::test)] + // async fn test_compute_compressed_bit_width_for_non_neg() {} + + // use std::collections::HashMap; + + // use lance_datagen::RowCount; + + // use crate::testing::{check_round_trip_encoding_of_data, TestCases}; + // use crate::version::LanceFileVersion; + + // async fn check_round_trip_bitpacked(array: Arc) { + // let test_cases = TestCases::default().with_file_version(LanceFileVersion::V2_1); + // check_round_trip_encoding_of_data(vec![array], &test_cases, HashMap::new()).await; + // } + + // #[test_log::test(tokio::test)] + // async fn test_bitpack_fastlanes_u8() { + // let values: Vec = vec![5; 1024]; + // let array = UInt8Array::from(values); + // let array: Arc = Arc::new(array); + // check_round_trip_bitpacked(array).await; + + // let values: Vec = vec![66; 1000]; + // let array = UInt8Array::from(values); + // let array: Arc = Arc::new(array); + + // check_round_trip_bitpacked(array).await; + + // let values: Vec = vec![77; 2000]; + // let array = UInt8Array::from(values); + // let array: Arc = Arc::new(array); + + // check_round_trip_bitpacked(array).await; + + // let values: Vec = vec![0; 10000]; + // let array = UInt8Array::from(values); + // let arr = Arc::new(array) as ArrayRef; + // check_round_trip_bitpacked(arr).await; + + // let values: Vec = vec![88; 10000]; + // let array = UInt8Array::from(values); + // let arr = Arc::new(array) as ArrayRef; 
+ // check_round_trip_bitpacked(arr).await; + + // let arr = lance_datagen::gen() + // .anon_col(lance_datagen::array::rand_type(&DataType::UInt8)) + // .into_batch_rows(RowCount::from(1)) + // .unwrap() + // .column(0) + // .clone(); + // check_round_trip_bitpacked(arr).await; + + // let arr = lance_datagen::gen() + // .anon_col(lance_datagen::array::rand_type(&DataType::UInt8)) + // .into_batch_rows(RowCount::from(20)) + // .unwrap() + // .column(0) + // .clone(); + // check_round_trip_bitpacked(arr).await; + + // let arr = lance_datagen::gen() + // .anon_col(lance_datagen::array::rand_type(&DataType::UInt8)) + // .into_batch_rows(RowCount::from(50)) + // .unwrap() + // .column(0) + // .clone(); + // check_round_trip_bitpacked(arr).await; + + // let arr = lance_datagen::gen() + // .anon_col(lance_datagen::array::rand_type(&DataType::UInt8)) + // .into_batch_rows(RowCount::from(100)) + // .unwrap() + // .column(0) + // .clone(); + // check_round_trip_bitpacked(arr).await; + + // let arr = lance_datagen::gen() + // .anon_col(lance_datagen::array::rand_type(&DataType::UInt8)) + // .into_batch_rows(RowCount::from(1000)) + // .unwrap() + // .column(0) + // .clone(); + // check_round_trip_bitpacked(arr).await; + + // let arr = lance_datagen::gen() + // .anon_col(lance_datagen::array::rand_type(&DataType::UInt8)) + // .into_batch_rows(RowCount::from(1024)) + // .unwrap() + // .column(0) + // .clone(); + // check_round_trip_bitpacked(arr).await; + + // let arr = lance_datagen::gen() + // .anon_col(lance_datagen::array::rand_type(&DataType::UInt8)) + // .into_batch_rows(RowCount::from(2000)) + // .unwrap() + // .column(0) + // .clone(); + // check_round_trip_bitpacked(arr).await; + + // let arr = lance_datagen::gen() + // .anon_col(lance_datagen::array::rand_type(&DataType::UInt8)) + // .into_batch_rows(RowCount::from(3000)) + // .unwrap() + // .column(0) + // .clone(); + // check_round_trip_bitpacked(arr).await; + // } + + // #[test_log::test(tokio::test)] + // async fn test_bitpack_fastlanes_u16() { + // let values: Vec = vec![5; 1024]; + // let array = UInt16Array::from(values); + // let array: Arc = Arc::new(array); + // check_round_trip_bitpacked(array).await; + + // let values: Vec = vec![66; 1000]; + // let array = UInt16Array::from(values); + // let array: Arc = Arc::new(array); + + // check_round_trip_bitpacked(array).await; + + // let values: Vec = vec![77; 2000]; + // let array = UInt16Array::from(values); + // let array: Arc = Arc::new(array); + + // check_round_trip_bitpacked(array).await; + + // let values: Vec = vec![0; 10000]; + // let array = UInt16Array::from(values); + // let arr = Arc::new(array) as ArrayRef; + // check_round_trip_bitpacked(arr).await; + + // let values: Vec = vec![88; 10000]; + // let array = UInt16Array::from(values); + // let arr = Arc::new(array) as ArrayRef; + // check_round_trip_bitpacked(arr).await; + + // let values: Vec = vec![300; 100]; + // let array = UInt16Array::from(values); + // let arr = Arc::new(array) as ArrayRef; + // check_round_trip_bitpacked(arr).await; + + // let values: Vec = vec![800; 100]; + // let array = UInt16Array::from(values); + // let arr = Arc::new(array) as ArrayRef; + // check_round_trip_bitpacked(arr).await; + + // let arr = lance_datagen::gen() + // .anon_col(lance_datagen::array::rand_type(&DataType::UInt16)) + // .into_batch_rows(RowCount::from(1)) + // .unwrap() + // .column(0) + // .clone(); + // check_round_trip_bitpacked(arr).await; + + // let arr = lance_datagen::gen() + // 
.anon_col(lance_datagen::array::rand_type(&DataType::UInt16)) + // .into_batch_rows(RowCount::from(20)) + // .unwrap() + // .column(0) + // .clone(); + // check_round_trip_bitpacked(arr).await; + + // let arr = lance_datagen::gen() + // .anon_col(lance_datagen::array::rand_type(&DataType::UInt16)) + // .into_batch_rows(RowCount::from(100)) + // .unwrap() + // .column(0) + // .clone(); + // check_round_trip_bitpacked(arr).await; + + // let arr = lance_datagen::gen() + // .anon_col(lance_datagen::array::rand_type(&DataType::UInt16)) + // .into_batch_rows(RowCount::from(1000)) + // .unwrap() + // .column(0) + // .clone(); + // check_round_trip_bitpacked(arr).await; + + // let arr = lance_datagen::gen() + // .anon_col(lance_datagen::array::rand_type(&DataType::UInt16)) + // .into_batch_rows(RowCount::from(1024)) + // .unwrap() + // .column(0) + // .clone(); + // check_round_trip_bitpacked(arr).await; + + // let arr = lance_datagen::gen() + // .anon_col(lance_datagen::array::rand_type(&DataType::UInt16)) + // .into_batch_rows(RowCount::from(2000)) + // .unwrap() + // .column(0) + // .clone(); + // check_round_trip_bitpacked(arr).await; + + // let arr = lance_datagen::gen() + // .anon_col(lance_datagen::array::rand_type(&DataType::UInt16)) + // .into_batch_rows(RowCount::from(3000)) + // .unwrap() + // .column(0) + // .clone(); + // check_round_trip_bitpacked(arr).await; + // } + + // #[test_log::test(tokio::test)] + // async fn test_bitpack_fastlanes_u32() { + // let values: Vec = vec![5; 1024]; + // let array = UInt32Array::from(values); + // let array: Arc = Arc::new(array); + // check_round_trip_bitpacked(array).await; + + // let values: Vec = vec![7; 2000]; + // let array = UInt32Array::from(values); + // let array: Arc = Arc::new(array); + // check_round_trip_bitpacked(array).await; + + // let values: Vec = vec![66; 1000]; + // let array = UInt32Array::from(values); + // let array: Arc = Arc::new(array); + // check_round_trip_bitpacked(array).await; + + // let values: Vec = vec![666; 1000]; + // let array = UInt32Array::from(values); + // let array: Arc = Arc::new(array); + // check_round_trip_bitpacked(array).await; + + // let values: Vec = vec![77; 2000]; + // let array = UInt32Array::from(values); + // let array: Arc = Arc::new(array); + // check_round_trip_bitpacked(array).await; + + // let values: Vec = vec![0; 10000]; + // let array = UInt32Array::from(values); + // let arr = Arc::new(array) as ArrayRef; + // check_round_trip_bitpacked(arr).await; + + // let values: Vec = vec![1; 10000]; + // let array = UInt32Array::from(values); + // let arr = Arc::new(array) as ArrayRef; + // check_round_trip_bitpacked(arr).await; + + // let values: Vec = vec![88; 10000]; + // let array = UInt32Array::from(values); + // let arr = Arc::new(array) as ArrayRef; + // check_round_trip_bitpacked(arr).await; + + // let values: Vec = vec![300; 100]; + // let array = UInt32Array::from(values); + // let arr = Arc::new(array) as ArrayRef; + // check_round_trip_bitpacked(arr).await; + + // let values: Vec = vec![3000; 100]; + // let array = UInt32Array::from(values); + // let arr = Arc::new(array) as ArrayRef; + // check_round_trip_bitpacked(arr).await; + + // let values: Vec = vec![800; 100]; + // let array = UInt32Array::from(values); + // let arr = Arc::new(array) as ArrayRef; + // check_round_trip_bitpacked(arr).await; + + // let values: Vec = vec![8000; 100]; + // let array = UInt32Array::from(values); + // let arr = Arc::new(array) as ArrayRef; + // check_round_trip_bitpacked(arr).await; + + // let 
values: Vec = vec![65536; 100]; + // let array = UInt32Array::from(values); + // let arr = Arc::new(array) as ArrayRef; + // check_round_trip_bitpacked(arr).await; + + // let values: Vec = vec![655360; 100]; + // let array = UInt32Array::from(values); + // let arr = Arc::new(array) as ArrayRef; + // check_round_trip_bitpacked(arr).await; + + // let arr = lance_datagen::gen() + // .anon_col(lance_datagen::array::rand_type(&DataType::UInt32)) + // .into_batch_rows(RowCount::from(1)) + // .unwrap() + // .column(0) + // .clone(); + // check_round_trip_bitpacked(arr).await; + + // let arr = lance_datagen::gen() + // .anon_col(lance_datagen::array::rand_type(&DataType::UInt32)) + // .into_batch_rows(RowCount::from(20)) + // .unwrap() + // .column(0) + // .clone(); + // check_round_trip_bitpacked(arr).await; + + // let arr = lance_datagen::gen() + // .anon_col(lance_datagen::array::rand_type(&DataType::UInt32)) + // .into_batch_rows(RowCount::from(50)) + // .unwrap() + // .column(0) + // .clone(); + // check_round_trip_bitpacked(arr).await; + + // let arr = lance_datagen::gen() + // .anon_col(lance_datagen::array::rand_type(&DataType::UInt32)) + // .into_batch_rows(RowCount::from(100)) + // .unwrap() + // .column(0) + // .clone(); + // check_round_trip_bitpacked(arr).await; + + // let arr = lance_datagen::gen() + // .anon_col(lance_datagen::array::rand_type(&DataType::UInt32)) + // .into_batch_rows(RowCount::from(1000)) + // .unwrap() + // .column(0) + // .clone(); + // check_round_trip_bitpacked(arr).await; + + // let arr = lance_datagen::gen() + // .anon_col(lance_datagen::array::rand_type(&DataType::UInt32)) + // .into_batch_rows(RowCount::from(1024)) + // .unwrap() + // .column(0) + // .clone(); + // check_round_trip_bitpacked(arr).await; + + // let arr = lance_datagen::gen() + // .anon_col(lance_datagen::array::rand_type(&DataType::UInt32)) + // .into_batch_rows(RowCount::from(2000)) + // .unwrap() + // .column(0) + // .clone(); + // check_round_trip_bitpacked(arr).await; + + // let arr = lance_datagen::gen() + // .anon_col(lance_datagen::array::rand_type(&DataType::UInt32)) + // .into_batch_rows(RowCount::from(3000)) + // .unwrap() + // .column(0) + // .clone(); + // check_round_trip_bitpacked(arr).await; + // } + + // #[test_log::test(tokio::test)] + // async fn test_bitpack_fastlanes_u64() { + // let values: Vec = vec![5; 1024]; + // let array = UInt64Array::from(values); + // let array: Arc = Arc::new(array); + // check_round_trip_bitpacked(array).await; + + // let values: Vec = vec![7; 2000]; + // let array = UInt64Array::from(values); + // let array: Arc = Arc::new(array); + // check_round_trip_bitpacked(array).await; + + // let values: Vec = vec![66; 1000]; + // let array = UInt64Array::from(values); + // let array: Arc = Arc::new(array); + // check_round_trip_bitpacked(array).await; + + // let values: Vec = vec![666; 1000]; + // let array = UInt64Array::from(values); + // let array: Arc = Arc::new(array); + // check_round_trip_bitpacked(array).await; + + // let values: Vec = vec![77; 2000]; + // let array = UInt64Array::from(values); + // let array: Arc = Arc::new(array); + // check_round_trip_bitpacked(array).await; + + // let values: Vec = vec![0; 10000]; + // let array = UInt64Array::from(values); + // let arr = Arc::new(array) as ArrayRef; + // check_round_trip_bitpacked(arr).await; + + // let values: Vec = vec![1; 10000]; + // let array = UInt64Array::from(values); + // let arr = Arc::new(array) as ArrayRef; + // check_round_trip_bitpacked(arr).await; + + // let values: Vec = 
vec![88; 10000]; + // let array = UInt64Array::from(values); + // let arr = Arc::new(array) as ArrayRef; + // check_round_trip_bitpacked(arr).await; + + // let values: Vec = vec![300; 100]; + // let array = UInt64Array::from(values); + // let arr = Arc::new(array) as ArrayRef; + // check_round_trip_bitpacked(arr).await; + + // let values: Vec = vec![3000; 100]; + // let array = UInt64Array::from(values); + // let arr = Arc::new(array) as ArrayRef; + // check_round_trip_bitpacked(arr).await; + + // let values: Vec = vec![800; 100]; + // let array = UInt64Array::from(values); + // let arr = Arc::new(array) as ArrayRef; + // check_round_trip_bitpacked(arr).await; + + // let values: Vec = vec![8000; 100]; + // let array = UInt64Array::from(values); + // let arr = Arc::new(array) as ArrayRef; + // check_round_trip_bitpacked(arr).await; + + // let values: Vec = vec![65536; 100]; + // let array = UInt64Array::from(values); + // let arr = Arc::new(array) as ArrayRef; + // check_round_trip_bitpacked(arr).await; + + // let values: Vec = vec![655360; 100]; + // let array = UInt64Array::from(values); + // let arr = Arc::new(array) as ArrayRef; + // check_round_trip_bitpacked(arr).await; + + // let arr = lance_datagen::gen() + // .anon_col(lance_datagen::array::rand_type(&DataType::UInt64)) + // .into_batch_rows(RowCount::from(1)) + // .unwrap() + // .column(0) + // .clone(); + // check_round_trip_bitpacked(arr).await; + + // let arr = lance_datagen::gen() + // .anon_col(lance_datagen::array::rand_type(&DataType::UInt64)) + // .into_batch_rows(RowCount::from(20)) + // .unwrap() + // .column(0) + // .clone(); + // check_round_trip_bitpacked(arr).await; + + // let arr = lance_datagen::gen() + // .anon_col(lance_datagen::array::rand_type(&DataType::UInt64)) + // .into_batch_rows(RowCount::from(50)) + // .unwrap() + // .column(0) + // .clone(); + // check_round_trip_bitpacked(arr).await; + + // let arr = lance_datagen::gen() + // .anon_col(lance_datagen::array::rand_type(&DataType::UInt64)) + // .into_batch_rows(RowCount::from(100)) + // .unwrap() + // .column(0) + // .clone(); + // check_round_trip_bitpacked(arr).await; + + // let arr = lance_datagen::gen() + // .anon_col(lance_datagen::array::rand_type(&DataType::UInt64)) + // .into_batch_rows(RowCount::from(1000)) + // .unwrap() + // .column(0) + // .clone(); + // check_round_trip_bitpacked(arr).await; + + // let arr = lance_datagen::gen() + // .anon_col(lance_datagen::array::rand_type(&DataType::UInt64)) + // .into_batch_rows(RowCount::from(1024)) + // .unwrap() + // .column(0) + // .clone(); + // check_round_trip_bitpacked(arr).await; + + // let arr = lance_datagen::gen() + // .anon_col(lance_datagen::array::rand_type(&DataType::UInt64)) + // .into_batch_rows(RowCount::from(2000)) + // .unwrap() + // .column(0) + // .clone(); + // check_round_trip_bitpacked(arr).await; + + // let arr = lance_datagen::gen() + // .anon_col(lance_datagen::array::rand_type(&DataType::UInt64)) + // .into_batch_rows(RowCount::from(3000)) + // .unwrap() + // .column(0) + // .clone(); + // check_round_trip_bitpacked(arr).await; + // } + + // #[test_log::test(tokio::test)] + // async fn test_bitpack_fastlanes_i8() { + // let values: Vec = vec![-5; 1024]; + // let array = Int8Array::from(values); + // let array: Arc = Arc::new(array); + // check_round_trip_bitpacked(array).await; + + // let values: Vec = vec![66; 1000]; + // let array = Int8Array::from(values); + // let array: Arc = Arc::new(array); + + // check_round_trip_bitpacked(array).await; + + // let values: Vec = 
vec![77; 2000];
+    //     let array = Int8Array::from(values);
+    //     let array: Arc<dyn Array> = Arc::new(array);
+
+    //     check_round_trip_bitpacked(array).await;
+
+    //     let values: Vec<i8> = vec![0; 10000];
+    //     let array = Int8Array::from(values);
+    //     let arr = Arc::new(array) as ArrayRef;
+    //     check_round_trip_bitpacked(arr).await;
+
+    //     let values: Vec<i8> = vec![88; 10000];
+    //     let array = Int8Array::from(values);
+    //     let arr = Arc::new(array) as ArrayRef;
+    //     check_round_trip_bitpacked(arr).await;
+
+    //     let values: Vec<i8> = vec![-88; 10000];
+    //     let array = Int8Array::from(values);
+    //     let arr = Arc::new(array) as ArrayRef;
+    //     check_round_trip_bitpacked(arr).await;
+
+    //     let arr = lance_datagen::gen()
+    //         .anon_col(lance_datagen::array::rand_type(&DataType::Int8))
+    //         .into_batch_rows(RowCount::from(1))
+    //         .unwrap()
+    //         .column(0)
+    //         .clone();
+    //     check_round_trip_bitpacked(arr).await;
+
+    //     let arr = lance_datagen::gen()
+    //         .anon_col(lance_datagen::array::rand_type(&DataType::Int8))
+    //         .into_batch_rows(RowCount::from(20))
+    //         .unwrap()
+    //         .column(0)
+    //         .clone();
+    //     check_round_trip_bitpacked(arr).await;
+
+    //     let arr = lance_datagen::gen()
+    //         .anon_col(lance_datagen::array::rand_type(&DataType::Int8))
+    //         .into_batch_rows(RowCount::from(50))
+    //         .unwrap()
+    //         .column(0)
+    //         .clone();
+    //     check_round_trip_bitpacked(arr).await;
+
+    //     let arr = lance_datagen::gen()
+    //         .anon_col(lance_datagen::array::rand_type(&DataType::Int8))
+    //         .into_batch_rows(RowCount::from(100))
+    //         .unwrap()
+    //         .column(0)
+    //         .clone();
+    //     check_round_trip_bitpacked(arr).await;
+
+    //     let arr = lance_datagen::gen()
+    //         .anon_col(lance_datagen::array::rand_type(&DataType::Int8))
+    //         .into_batch_rows(RowCount::from(1000))
+    //         .unwrap()
+    //         .column(0)
+    //         .clone();
+    //     check_round_trip_bitpacked(arr).await;
+
+    //     let arr = lance_datagen::gen()
+    //         .anon_col(lance_datagen::array::rand_type(&DataType::Int8))
+    //         .into_batch_rows(RowCount::from(1024))
+    //         .unwrap()
+    //         .column(0)
+    //         .clone();
+    //     check_round_trip_bitpacked(arr).await;
+
+    //     let arr = lance_datagen::gen()
+    //         .anon_col(lance_datagen::array::rand_type(&DataType::Int8))
+    //         .into_batch_rows(RowCount::from(2000))
+    //         .unwrap()
+    //         .column(0)
+    //         .clone();
+    //     check_round_trip_bitpacked(arr).await;
+
+    //     let arr = lance_datagen::gen()
+    //         .anon_col(lance_datagen::array::rand_type(&DataType::Int8))
+    //         .into_batch_rows(RowCount::from(3000))
+    //         .unwrap()
+    //         .column(0)
+    //         .clone();
+    //     check_round_trip_bitpacked(arr).await;
+    // }
+
+    // #[test_log::test(tokio::test)]
+    // async fn test_bitpack_fastlanes_i16() {
+    //     let values: Vec<i16> = vec![-5; 1024];
+    //     let array = Int16Array::from(values);
+    //     let array: Arc<dyn Array> = Arc::new(array);
+    //     check_round_trip_bitpacked(array).await;
+
+    //     let values: Vec<i16> = vec![66; 1000];
+    //     let array = Int16Array::from(values);
+    //     let array: Arc<dyn Array> = Arc::new(array);
+
+    //     check_round_trip_bitpacked(array).await;
+
+    //     let values: Vec<i16> = vec![77; 2000];
+    //     let array = Int16Array::from(values);
+    //     let array: Arc<dyn Array> = Arc::new(array);
+
+    //     check_round_trip_bitpacked(array).await;
+
+    //     let values: Vec<i16> = vec![0; 10000];
+    //     let array = Int16Array::from(values);
+    //     let arr = Arc::new(array) as ArrayRef;
+    //     check_round_trip_bitpacked(arr).await;
+
+    //     let values: Vec<i16> = vec![88; 10000];
+    //     let array = Int16Array::from(values);
+    //     let arr = Arc::new(array) as ArrayRef;
+    //     check_round_trip_bitpacked(arr).await;
+
+    //     let values: Vec<i16> = vec![300; 100];
+    //     let array = Int16Array::from(values);
+    //     let arr = Arc::new(array) as ArrayRef;
+    //     check_round_trip_bitpacked(arr).await;
+
+    //     let values: Vec<i16> = vec![800; 100];
+    //     let array = Int16Array::from(values);
+    //     let arr = Arc::new(array) as ArrayRef;
+    //     check_round_trip_bitpacked(arr).await;
+
+    //     let arr = lance_datagen::gen()
+    //         .anon_col(lance_datagen::array::rand_type(&DataType::Int16))
+    //         .into_batch_rows(RowCount::from(1))
+    //         .unwrap()
+    //         .column(0)
+    //         .clone();
+    //     check_round_trip_bitpacked(arr).await;
+
+    //     let arr = lance_datagen::gen()
+    //         .anon_col(lance_datagen::array::rand_type(&DataType::Int16))
+    //         .into_batch_rows(RowCount::from(20))
+    //         .unwrap()
+    //         .column(0)
+    //         .clone();
+    //     check_round_trip_bitpacked(arr).await;
+
+    //     let arr = lance_datagen::gen()
+    //         .anon_col(lance_datagen::array::rand_type(&DataType::Int16))
+    //         .into_batch_rows(RowCount::from(50))
+    //         .unwrap()
+    //         .column(0)
+    //         .clone();
+    //     check_round_trip_bitpacked(arr).await;
+
+    //     let arr = lance_datagen::gen()
+    //         .anon_col(lance_datagen::array::rand_type(&DataType::Int16))
+    //         .into_batch_rows(RowCount::from(100))
+    //         .unwrap()
+    //         .column(0)
+    //         .clone();
+    //     check_round_trip_bitpacked(arr).await;
+
+    //     let arr = lance_datagen::gen()
+    //         .anon_col(lance_datagen::array::rand_type(&DataType::Int16))
+    //         .into_batch_rows(RowCount::from(1000))
+    //         .unwrap()
+    //         .column(0)
+    //         .clone();
+    //     check_round_trip_bitpacked(arr).await;
+
+    //     let arr = lance_datagen::gen()
+    //         .anon_col(lance_datagen::array::rand_type(&DataType::Int16))
+    //         .into_batch_rows(RowCount::from(1024))
+    //         .unwrap()
+    //         .column(0)
+    //         .clone();
+    //     check_round_trip_bitpacked(arr).await;
+
+    //     let arr = lance_datagen::gen()
+    //         .anon_col(lance_datagen::array::rand_type(&DataType::Int16))
+    //         .into_batch_rows(RowCount::from(2000))
+    //         .unwrap()
+    //         .column(0)
+    //         .clone();
+    //     check_round_trip_bitpacked(arr).await;
+
+    //     let arr = lance_datagen::gen()
+    //         .anon_col(lance_datagen::array::rand_type(&DataType::Int16))
+    //         .into_batch_rows(RowCount::from(3000))
+    //         .unwrap()
+    //         .column(0)
+    //         .clone();
+    //     check_round_trip_bitpacked(arr).await;
+    // }
+
+    // #[test_log::test(tokio::test)]
+    // async fn test_bitpack_fastlanes_i32() {
+    //     let values: Vec<i32> = vec![-5; 1024];
+    //     let array = Int32Array::from(values);
+    //     let array: Arc<dyn Array> = Arc::new(array);
+    //     check_round_trip_bitpacked(array).await;
+
+    //     let values: Vec<i32> = vec![66; 1000];
+    //     let array = Int32Array::from(values);
+    //     let array: Arc<dyn Array> = Arc::new(array);
+    //     check_round_trip_bitpacked(array).await;
+
+    //     let values: Vec<i32> = vec![-66; 1000];
+    //     let array = Int32Array::from(values);
+    //     let array: Arc<dyn Array> = Arc::new(array);
+    //     check_round_trip_bitpacked(array).await;
+
+    //     let values: Vec<i32> = vec![77; 2000];
+    //     let array = Int32Array::from(values);
+    //     let array: Arc<dyn Array> = Arc::new(array);
+    //     check_round_trip_bitpacked(array).await;
+
+    //     let values: Vec<i32> = vec![-77; 2000];
+    //     let array = Int32Array::from(values);
+    //     let array: Arc<dyn Array> = Arc::new(array);
+    //     check_round_trip_bitpacked(array).await;
+
+    //     let values: Vec<i32> = vec![0; 10000];
+    //     let array = Int32Array::from(values);
+    //     let arr = Arc::new(array) as ArrayRef;
+    //     check_round_trip_bitpacked(arr).await;
+
+    //     let values: Vec<i32> = vec![88; 10000];
+    //     let array = Int32Array::from(values);
+    //     let arr = Arc::new(array) as ArrayRef;
+    //     check_round_trip_bitpacked(arr).await;
+
+    //     let values: Vec<i32> = vec![-88; 10000];
+    //     let array = Int32Array::from(values);
+    //     let arr = Arc::new(array) as ArrayRef;
+    //     check_round_trip_bitpacked(arr).await;
+
+    //     let values: Vec<i32> = vec![300; 100];
+    //     let array = Int32Array::from(values);
+    //     let arr = Arc::new(array) as ArrayRef;
+    //     check_round_trip_bitpacked(arr).await;
+
+    //     let values: Vec<i32> = vec![-300; 100];
+    //     let array = Int32Array::from(values);
+    //     let arr = Arc::new(array) as ArrayRef;
+    //     check_round_trip_bitpacked(arr).await;
+
+    //     let values: Vec<i32> = vec![800; 100];
+    //     let array = Int32Array::from(values);
+    //     let arr = Arc::new(array) as ArrayRef;
+    //     check_round_trip_bitpacked(arr).await;
+
+    //     let values: Vec<i32> = vec![-800; 100];
+    //     let array = Int32Array::from(values);
+    //     let arr = Arc::new(array) as ArrayRef;
+    //     check_round_trip_bitpacked(arr).await;
+
+    //     let values: Vec<i32> = vec![65536; 100];
+    //     let array = Int32Array::from(values);
+    //     let arr = Arc::new(array) as ArrayRef;
+    //     check_round_trip_bitpacked(arr).await;
+
+    //     let values: Vec<i32> = vec![-65536; 100];
+    //     let array = Int32Array::from(values);
+    //     let arr = Arc::new(array) as ArrayRef;
+    //     check_round_trip_bitpacked(arr).await;
+
+    //     let arr = lance_datagen::gen()
+    //         .anon_col(lance_datagen::array::rand_type(&DataType::Int32))
+    //         .into_batch_rows(RowCount::from(1))
+    //         .unwrap()
+    //         .column(0)
+    //         .clone();
+    //     check_round_trip_bitpacked(arr).await;
+
+    //     let arr = lance_datagen::gen()
+    //         .anon_col(lance_datagen::array::rand_type(&DataType::Int32))
+    //         .into_batch_rows(RowCount::from(20))
+    //         .unwrap()
+    //         .column(0)
+    //         .clone();
+    //     check_round_trip_bitpacked(arr).await;
+
+    //     let arr = lance_datagen::gen()
+    //         .anon_col(lance_datagen::array::rand_type(&DataType::Int32))
+    //         .into_batch_rows(RowCount::from(50))
+    //         .unwrap()
+    //         .column(0)
+    //         .clone();
+    //     check_round_trip_bitpacked(arr).await;
+
+    //     let arr = lance_datagen::gen()
+    //         .anon_col(lance_datagen::array::rand_type(&DataType::Int32))
+    //         .into_batch_rows(RowCount::from(100))
+    //         .unwrap()
+    //         .column(0)
+    //         .clone();
+    //     check_round_trip_bitpacked(arr).await;
+
+    //     let arr = lance_datagen::gen()
+    //         .anon_col(lance_datagen::array::rand_type(&DataType::Int32))
+    //         .into_batch_rows(RowCount::from(1000))
+    //         .unwrap()
+    //         .column(0)
+    //         .clone();
+    //     check_round_trip_bitpacked(arr).await;
+
+    //     let arr = lance_datagen::gen()
+    //         .anon_col(lance_datagen::array::rand_type(&DataType::Int32))
+    //         .into_batch_rows(RowCount::from(1024))
+    //         .unwrap()
+    //         .column(0)
+    //         .clone();
+    //     check_round_trip_bitpacked(arr).await;
+
+    //     let arr = lance_datagen::gen()
+    //         .anon_col(lance_datagen::array::rand_type(&DataType::Int32))
+    //         .into_batch_rows(RowCount::from(2000))
+    //         .unwrap()
+    //         .column(0)
+    //         .clone();
+    //     check_round_trip_bitpacked(arr).await;
+
+    //     let arr = lance_datagen::gen()
+    //         .anon_col(lance_datagen::array::rand_type(&DataType::Int32))
+    //         .into_batch_rows(RowCount::from(3000))
+    //         .unwrap()
+    //         .column(0)
+    //         .clone();
+    //     check_round_trip_bitpacked(arr).await;
+    // }
+
+    // #[test_log::test(tokio::test)]
+    // async fn test_bitpack_fastlanes_i64() {
+    //     let values: Vec<i64> = vec![-5; 1024];
+    //     let array = Int64Array::from(values);
+    //     let array: Arc<dyn Array> = Arc::new(array);
+    //     check_round_trip_bitpacked(array).await;
+
+    //     let values: Vec<i64> = vec![66; 1000];
+    //     let array = Int64Array::from(values);
+    //     let array: Arc<dyn Array> = Arc::new(array);
+    //     check_round_trip_bitpacked(array).await;
+
+    //     let values: Vec<i64> = vec![-66; 1000];
+    //     let array = Int64Array::from(values);
+    //     let array: Arc<dyn Array> = Arc::new(array);
+    //     check_round_trip_bitpacked(array).await;
+
+    //     let values: Vec<i64> = vec![77; 2000];
+    //     let array = Int64Array::from(values);
+    //     let array: Arc<dyn Array> = Arc::new(array);
+    //     check_round_trip_bitpacked(array).await;
+
+    //     let values: Vec<i64> = vec![-77; 2000];
+    //     let array = Int64Array::from(values);
+    //     let array: Arc<dyn Array> = Arc::new(array);
+    //     check_round_trip_bitpacked(array).await;
+
+    //     let values: Vec<i64> = vec![0; 10000];
+    //     let array = Int64Array::from(values);
+    //     let arr = Arc::new(array) as ArrayRef;
+    //     check_round_trip_bitpacked(arr).await;
+
+    //     let values: Vec<i64> = vec![88; 10000];
+    //     let array = Int64Array::from(values);
+    //     let arr = Arc::new(array) as ArrayRef;
+    //     check_round_trip_bitpacked(arr).await;
+
+    //     let values: Vec<i64> = vec![-88; 10000];
+    //     let array = Int64Array::from(values);
+    //     let arr = Arc::new(array) as ArrayRef;
+    //     check_round_trip_bitpacked(arr).await;
+
+    //     let values: Vec<i64> = vec![300; 100];
+    //     let array = Int64Array::from(values);
+    //     let arr = Arc::new(array) as ArrayRef;
+    //     check_round_trip_bitpacked(arr).await;
+
+    //     let values: Vec<i64> = vec![-300; 100];
+    //     let array = Int64Array::from(values);
+    //     let arr = Arc::new(array) as ArrayRef;
+    //     check_round_trip_bitpacked(arr).await;
+
+    //     let values: Vec<i64> = vec![800; 100];
+    //     let array = Int64Array::from(values);
+    //     let arr = Arc::new(array) as ArrayRef;
+    //     check_round_trip_bitpacked(arr).await;
+
+    //     let values: Vec<i64> = vec![-800; 100];
+    //     let array = Int64Array::from(values);
+    //     let arr = Arc::new(array) as ArrayRef;
+    //     check_round_trip_bitpacked(arr).await;
+
+    //     let values: Vec<i64> = vec![65536; 100];
+    //     let array = Int64Array::from(values);
+    //     let arr = Arc::new(array) as ArrayRef;
+    //     check_round_trip_bitpacked(arr).await;
+
+    //     let values: Vec<i64> = vec![-65536; 100];
+    //     let array = Int64Array::from(values);
+    //     let arr = Arc::new(array) as ArrayRef;
+    //     check_round_trip_bitpacked(arr).await;
+
+    //     let arr = lance_datagen::gen()
+    //         .anon_col(lance_datagen::array::rand_type(&DataType::Int64))
+    //         .into_batch_rows(RowCount::from(1))
+    //         .unwrap()
+    //         .column(0)
+    //         .clone();
+    //     check_round_trip_bitpacked(arr).await;
+
+    //     let arr = lance_datagen::gen()
+    //         .anon_col(lance_datagen::array::rand_type(&DataType::Int64))
+    //         .into_batch_rows(RowCount::from(20))
+    //         .unwrap()
+    //         .column(0)
+    //         .clone();
+    //     check_round_trip_bitpacked(arr).await;
+
+    //     let arr = lance_datagen::gen()
+    //         .anon_col(lance_datagen::array::rand_type(&DataType::Int64))
+    //         .into_batch_rows(RowCount::from(50))
+    //         .unwrap()
+    //         .column(0)
+    //         .clone();
+    //     check_round_trip_bitpacked(arr).await;
+
+    //     let arr = lance_datagen::gen()
+    //         .anon_col(lance_datagen::array::rand_type(&DataType::Int64))
+    //         .into_batch_rows(RowCount::from(100))
+    //         .unwrap()
+    //         .column(0)
+    //         .clone();
+    //     check_round_trip_bitpacked(arr).await;
+
+    //     let arr = lance_datagen::gen()
+    //         .anon_col(lance_datagen::array::rand_type(&DataType::Int64))
+    //         .into_batch_rows(RowCount::from(1000))
+    //         .unwrap()
+    //         .column(0)
+    //         .clone();
+    //     check_round_trip_bitpacked(arr).await;
+
+    //     let arr = lance_datagen::gen()
+    //         .anon_col(lance_datagen::array::rand_type(&DataType::Int64))
+    //         .into_batch_rows(RowCount::from(1024))
+    //         .unwrap()
+    //         .column(0)
+    //         .clone();
+    //     check_round_trip_bitpacked(arr).await;
+
+    //     let arr = lance_datagen::gen()
+    //         .anon_col(lance_datagen::array::rand_type(&DataType::Int64))
+    //         .into_batch_rows(RowCount::from(2000))
+    //         .unwrap()
+    //         .column(0)
+    //         .clone();
+    //     check_round_trip_bitpacked(arr).await;
+
+    //     let arr = lance_datagen::gen()
+    //         .anon_col(lance_datagen::array::rand_type(&DataType::Int64))
+    //         .into_batch_rows(RowCount::from(3000))
+    //         .unwrap()
+    //         .column(0)
+    //         .clone();
+    //     check_round_trip_bitpacked(arr).await;
+    // }
 }
diff --git a/rust/lance-encoding/src/encodings/physical/dictionary.rs b/rust/lance-encoding/src/encodings/physical/dictionary.rs
index 133eeb5857..21933bf29f 100644
--- a/rust/lance-encoding/src/encodings/physical/dictionary.rs
+++ b/rust/lance-encoding/src/encodings/physical/dictionary.rs
@@ -419,8 +419,9 @@ pub mod tests {
     use arrow_schema::{DataType, Field};
     use std::{collections::HashMap, sync::Arc, vec};

-    use crate::testing::{
-        check_round_trip_encoding_of_data, check_round_trip_encoding_random, TestCases,
+    use crate::{
+        testing::{check_round_trip_encoding_of_data, check_round_trip_encoding_random, TestCases},
+        version::LanceFileVersion,
     };

     use super::encode_dict_indices_and_items;
@@ -452,25 +453,25 @@ pub mod tests {
     #[test_log::test(tokio::test)]
     async fn test_utf8() {
         let field = Field::new("", DataType::Utf8, false);
-        check_round_trip_encoding_random(field).await;
+        check_round_trip_encoding_random(field, LanceFileVersion::V2_0).await;
     }

     #[test_log::test(tokio::test)]
     async fn test_binary() {
         let field = Field::new("", DataType::Binary, false);
-        check_round_trip_encoding_random(field).await;
+        check_round_trip_encoding_random(field, LanceFileVersion::V2_0).await;
     }

     #[test_log::test(tokio::test)]
     async fn test_large_binary() {
         let field = Field::new("", DataType::LargeBinary, true);
-        check_round_trip_encoding_random(field).await;
+        check_round_trip_encoding_random(field, LanceFileVersion::V2_0).await;
     }

     #[test_log::test(tokio::test)]
     async fn test_large_utf8() {
         let field = Field::new("", DataType::LargeUtf8, true);
-        check_round_trip_encoding_random(field).await;
+        check_round_trip_encoding_random(field, LanceFileVersion::V2_0).await;
     }

     #[test_log::test(tokio::test)]
@@ -576,6 +577,6 @@ pub mod tests {
             DataType::Dictionary(Box::new(DataType::UInt16), Box::new(DataType::Utf8)),
             false,
         );
-        check_round_trip_encoding_random(dict_field).await;
+        check_round_trip_encoding_random(dict_field, LanceFileVersion::V2_0).await;
     }
 }
diff --git a/rust/lance-encoding/src/encodings/physical/fixed_size_binary.rs b/rust/lance-encoding/src/encodings/physical/fixed_size_binary.rs
index 5eca735a31..3a19560a9a 100644
--- a/rust/lance-encoding/src/encodings/physical/fixed_size_binary.rs
+++ b/rust/lance-encoding/src/encodings/physical/fixed_size_binary.rs
@@ -172,33 +172,34 @@ mod tests {
     use arrow_array::{ArrayRef, LargeStringArray, StringArray};
     use arrow_schema::{DataType, Field};

-    use crate::testing::{
-        check_round_trip_encoding_of_data, check_round_trip_encoding_random, TestCases,
+    use crate::{
+        testing::{check_round_trip_encoding_of_data, check_round_trip_encoding_random, TestCases},
+        version::LanceFileVersion,
     };

     #[test_log::test(tokio::test)]
     async fn test_fixed_size_utf8_binary() {
         let field = Field::new("", DataType::Utf8, false);
         // This test only generates fixed size binary arrays anyway
-        check_round_trip_encoding_random(field).await;
+        check_round_trip_encoding_random(field, LanceFileVersion::V2_0).await;
     }

     #[test_log::test(tokio::test)]
     async fn test_fixed_size_binary() {
         let field = Field::new("", DataType::Binary, false);
-        check_round_trip_encoding_random(field).await;
+        check_round_trip_encoding_random(field, LanceFileVersion::V2_0).await;
     }
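
    // Illustration (editor's sketch, not part of the patch): the round-trip helpers above now
    // take the file version explicitly, so the same field could also be exercised against the
    // 2.1 structural path, e.g.:
    //
    //     let field = Field::new("", DataType::Binary, false);
    //     check_round_trip_encoding_random(field, LanceFileVersion::V2_1).await;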
     #[test_log::test(tokio::test)]
     async fn test_fixed_size_large_binary() {
         let field = Field::new("", DataType::LargeBinary, true);
-        check_round_trip_encoding_random(field).await;
+        check_round_trip_encoding_random(field, LanceFileVersion::V2_0).await;
     }

     #[test_log::test(tokio::test)]
     async fn test_fixed_size_large_utf8() {
         let field = Field::new("", DataType::LargeUtf8, true);
-        check_round_trip_encoding_random(field).await;
+        check_round_trip_encoding_random(field, LanceFileVersion::V2_0).await;
     }

     #[test_log::test(tokio::test)]
diff --git a/rust/lance-encoding/src/encodings/physical/fixed_size_list.rs b/rust/lance-encoding/src/encodings/physical/fixed_size_list.rs
index 535a7ba207..a6c63d0c40 100644
--- a/rust/lance-encoding/src/encodings/physical/fixed_size_list.rs
+++ b/rust/lance-encoding/src/encodings/physical/fixed_size_list.rs
@@ -134,7 +134,7 @@ mod tests {

     use arrow_schema::{DataType, Field};

-    use crate::testing::check_round_trip_encoding_random;
+    use crate::{testing::check_round_trip_encoding_random, version::LanceFileVersion};

     const PRIMITIVE_TYPES: &[DataType] = &[DataType::Int8, DataType::Float32, DataType::Float64];

@@ -144,7 +144,7 @@ mod tests {
         let inner_field = Field::new("item", data_type.clone(), true);
         let data_type = DataType::FixedSizeList(Arc::new(inner_field), 16);
         let field = Field::new("", data_type, false);
-        check_round_trip_encoding_random(field).await;
+        check_round_trip_encoding_random(field, LanceFileVersion::V2_0).await;
     }
 }
 }
diff --git a/rust/lance-encoding/src/encodings/physical/packed_struct.rs b/rust/lance-encoding/src/encodings/physical/packed_struct.rs
index 78927caa4a..ab87913baa 100644
--- a/rust/lance-encoding/src/encodings/physical/packed_struct.rs
+++ b/rust/lance-encoding/src/encodings/physical/packed_struct.rs
@@ -265,8 +265,9 @@ pub mod tests {
     use arrow_schema::{DataType, Field, Fields};
     use std::{collections::HashMap, sync::Arc, vec};

-    use crate::testing::{
-        check_round_trip_encoding_of_data, check_round_trip_encoding_random, TestCases,
+    use crate::{
+        testing::{check_round_trip_encoding_of_data, check_round_trip_encoding_random, TestCases},
+        version::LanceFileVersion,
     };

     #[test_log::test(tokio::test)]
@@ -280,7 +281,7 @@ pub mod tests {

         let field = Field::new("", data_type, false).with_metadata(metadata);

-        check_round_trip_encoding_random(field).await;
+        check_round_trip_encoding_random(field, LanceFileVersion::V2_0).await;
     }

     #[test_log::test(tokio::test)]
diff --git a/rust/lance-encoding/src/encodings/physical/value.rs b/rust/lance-encoding/src/encodings/physical/value.rs
index d549a0b031..6e2355031e 100644
--- a/rust/lance-encoding/src/encodings/physical/value.rs
+++ b/rust/lance-encoding/src/encodings/physical/value.rs
@@ -11,12 +11,13 @@ use std::ops::Range;
 use std::sync::{Arc, Mutex};

 use crate::buffer::LanceBuffer;
-use crate::data::{BlockInfo, DataBlock, FixedWidthDataBlock, UsedEncoding};
+use crate::data::{BlockInfo, ConstantDataBlock, DataBlock, FixedWidthDataBlock, UsedEncoding};
+use crate::decoder::{BlockDecompressor, FixedPerValueDecompressor, MiniBlockDecompressor};
 use crate::encoder::{
     BlockCompressor, FixedPerValueCompressor, MiniBlockChunk, MiniBlockCompressed,
     MiniBlockCompressor, MAX_MINIBLOCK_BYTES, MAX_MINIBLOCK_VALUES,
 };
-use crate::format::pb::ArrayEncoding;
+use crate::format::pb::{self, ArrayEncoding};
 use crate::format::ProtobufUtils;
 use crate::{
     decoder::{PageScheduler, PrimitivePageDecoder},
@@ -352,6 +353,85 @@ impl MiniBlockCompressor for ValueEncoder {
     }
 }

+/// A decompressor for constant-encoded data
+#[derive(Debug)]
+pub struct ConstantDecompressor {
+    scalar: LanceBuffer,
+    num_values: u64,
+}
+
+impl ConstantDecompressor {
+    pub fn new(scalar: LanceBuffer, num_values: u64) -> Self {
+        Self {
+            scalar: scalar.into_borrowed(),
+            num_values,
+        }
+    }
+}
+
+impl BlockDecompressor for ConstantDecompressor {
+    fn decompress(&self, _data: LanceBuffer) -> Result<DataBlock> {
+        Ok(DataBlock::Constant(ConstantDataBlock {
+            data: self.scalar.try_clone().unwrap(),
+            num_values: self.num_values,
+        }))
+    }
+}
+
+/// A decompressor for fixed-width data that has
+/// been written, as-is, to disk in single contiguous array
+#[derive(Debug)]
+pub struct ValueDecompressor {
+    bytes_per_value: u64,
+}
+
+impl ValueDecompressor {
+    pub fn new(description: &pb::Flat) -> Self {
+        assert!(description.bits_per_value % 8 == 0);
+        Self {
+            bytes_per_value: description.bits_per_value / 8,
+        }
+    }
+}
+
+impl BlockDecompressor for ValueDecompressor {
+    fn decompress(&self, data: LanceBuffer) -> Result<DataBlock> {
+        let num_values = data.len() as u64 / self.bytes_per_value;
+        assert_eq!(data.len() as u64 % self.bytes_per_value, 0);
+        Ok(DataBlock::FixedWidth(FixedWidthDataBlock {
+            bits_per_value: self.bytes_per_value * 8,
+            data,
+            num_values,
+            block_info: BlockInfo::new(),
+            used_encoding: UsedEncoding::new(),
+        }))
+    }
+}
+
+impl MiniBlockDecompressor for ValueDecompressor {
+    fn decompress(&self, data: LanceBuffer, num_values: u64) -> Result<DataBlock> {
+        debug_assert!(data.len() as u64 >= num_values * self.bytes_per_value);
+
+        Ok(DataBlock::FixedWidth(FixedWidthDataBlock {
+            data,
+            bits_per_value: self.bytes_per_value * 8,
+            num_values,
+            block_info: BlockInfo::new(),
+            used_encoding: UsedEncoding::new(),
+        }))
+    }
+}
+
+impl FixedPerValueDecompressor for ValueDecompressor {
+    fn decompress(&self, data: LanceBuffer, num_values: u64) -> Result<DataBlock> {
+        MiniBlockDecompressor::decompress(self, data, num_values)
+    }
+
+    fn bits_per_value(&self) -> u64 {
+        self.bytes_per_value * 8
+    }
+}
+
 impl FixedPerValueCompressor for ValueEncoder {
     fn compress(&self, data: DataBlock) -> Result<(FixedWidthDataBlock, ArrayEncoding)> {
         let (data, encoding) = match data {
@@ -372,9 +452,16 @@ impl FixedPerValueCompressor for ValueEncoder {

 #[cfg(test)]
 pub(crate) mod tests {
+    use std::{collections::HashMap, sync::Arc};
+
+    use arrow_array::{Array, Int32Array};
     use arrow_schema::{DataType, Field, TimeUnit};
+    use rstest::rstest;

-    use crate::testing::check_round_trip_encoding_random;
+    use crate::{
+        testing::{check_round_trip_encoding_of_data, check_round_trip_encoding_random, TestCases},
+        version::LanceFileVersion,
+    };

     const PRIMITIVE_TYPES: &[DataType] = &[
         DataType::Null,
@@ -403,12 +490,64 @@ pub(crate) mod tests {
         // DataType::Interval(IntervalUnit::DayTime),
     ];

+    #[rstest]
     #[test_log::test(tokio::test)]
-    async fn test_value_primitive() {
+    async fn test_value_primitive(
+        #[values(LanceFileVersion::V2_0, LanceFileVersion::V2_1)] version: LanceFileVersion,
+    ) {
         for data_type in PRIMITIVE_TYPES {
             log::info!("Testing encoding for {:?}", data_type);
             let field = Field::new("", data_type.clone(), false);
-            check_round_trip_encoding_random(field).await;
+            check_round_trip_encoding_random(field, version).await;
         }
     }
+
+    #[test_log::test(tokio::test)]
+    async fn test_miniblock_stress() {
+        // Tests for strange page sizes and batch sizes and validity scenarios for miniblock
+
+        // 10K integers, 100 per array, all valid
+        let data1 = (0..100)
+            .map(|_| Arc::new(Int32Array::from_iter_values(0..100)) as Arc<dyn Array>)
+            .collect::<Vec<_>>();
+
+        // Same as above but with mixed validity
+        let data2 = (0..100)
+            .map(|_| {
+                Arc::new(Int32Array::from_iter((0..100).map(|i| {
+                    if i % 2 == 0 {
+                        Some(i)
+                    } else {
+                        None
+                    }
+                }))) as Arc<dyn Array>
+            })
+            .collect::<Vec<_>>();
+
+        // Same as above but with all null for first half then all valid
+        // TODO: Re-enable once the all-null path is complete
+        let _data3 = (0..100)
+            .map(|chunk_idx| {
+                Arc::new(Int32Array::from_iter((0..100).map(|i| {
+                    if chunk_idx < 50 {
+                        None
+                    } else {
+                        Some(i)
+                    }
+                }))) as Arc<dyn Array>
+            })
+            .collect::<Vec<_>>();
+
+        for data in [data1, data2 /*data3*/] {
+            for batch_size in [10, 100, 1500, 15000] {
+                // 40000 bytes of data
+                let test_cases = TestCases::default()
+                    .with_page_sizes(vec![1000, 2000, 3000, 60000])
+                    .with_batch_size(batch_size)
+                    .with_file_version(LanceFileVersion::V2_1);

+                check_round_trip_encoding_of_data(data.clone(), &test_cases, HashMap::new()).await;
+            }
+        }
+    }
+}
diff --git a/rust/lance-encoding/src/format.rs b/rust/lance-encoding/src/format.rs
index a34bc1d931..295841b645 100644
--- a/rust/lance-encoding/src/format.rs
+++ b/rust/lance-encoding/src/format.rs
@@ -19,8 +19,9 @@ use pb::{
     buffer::BufferType,
     nullable::{AllNull, NoNull, Nullability, SomeNull},
     page_layout::Layout,
-    ArrayEncoding, Binary, Bitpacked, BitpackedForNonNeg, Dictionary, FixedSizeBinary,
-    FixedSizeList, Flat, Fsst, MiniBlockLayout, Nullable, PackedStruct, PageLayout,
+    AllNullLayout, ArrayEncoding, Binary, Bitpacked, BitpackedForNonNeg, Dictionary,
+    FixedSizeBinary, FixedSizeList, Flat, Fsst, MiniBlockLayout, Nullable, PackedStruct,
+    PageLayout,
 };

 use crate::encodings::physical::block_compress::CompressionScheme;
@@ -196,7 +197,7 @@ impl ProtobufUtils {
         }
     }

-    pub fn miniblock(
+    pub fn miniblock_layout(
         rep_encoding: ArrayEncoding,
         def_encoding: ArrayEncoding,
         value_encoding: ArrayEncoding,
@@ -209,4 +210,10 @@ impl ProtobufUtils {
             })),
         }
     }
+
+    pub fn all_null_layout() -> PageLayout {
+        PageLayout {
+            layout: Some(Layout::AllNullLayout(AllNullLayout {})),
+        }
+    }
 }
diff --git a/rust/lance-encoding/src/testing.rs b/rust/lance-encoding/src/testing.rs
index 8fc15343f9..d4f6becbe3 100644
--- a/rust/lance-encoding/src/testing.rs
+++ b/rust/lance-encoding/src/testing.rs
@@ -21,14 +21,14 @@ use lance_datagen::{array, gen, ArrayGenerator, RowCount, Seed};
 use crate::{
     buffer::LanceBuffer,
     decoder::{
-        BatchDecodeStream, ColumnInfo, CoreFieldDecoderStrategy, DecodeBatchScheduler,
-        DecoderMessage, DecoderMiddlewareChain, FilterExpression, PageInfo,
+        create_decode_stream, ColumnInfo, DecodeBatchScheduler, DecoderMessage, DecoderPlugins,
+        FilterExpression, PageInfo,
     },
     encoder::{
-        ColumnIndexSequence, CoreArrayEncodingStrategy, CoreFieldEncodingStrategy, EncodedColumn,
-        EncodedPage, EncodingOptions, FieldEncoder, FieldEncodingStrategy, OutOfLineBuffers,
+        default_encoding_strategy, ColumnIndexSequence, CoreArrayEncodingStrategy,
+        CoreFieldEncodingStrategy, EncodedColumn, EncodedPage, EncodingOptions, FieldEncoder,
+        FieldEncodingStrategy, OutOfLineBuffers,
     },
-    encodings::logical::r#struct::SimpleStructDecoder,
     repdef::RepDefBuilder,
     version::LanceFileVersion,
     EncodingsIo,
@@ -72,37 +72,81 @@ fn column_indices_from_schema_helper(
     fields: &[FieldRef],
     column_indices: &mut Vec<u32>,
     column_counter: &mut u32,
+    is_structural_encoding: bool,
 ) {
-    column_indices.push(*column_counter);
-    *column_counter += 1;
+    // In the old style, every field except FSL gets its own column. In the new style only primitive
+    // leaf fields get their own column.
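    // Worked example (hypothetical schema, added for illustration): given
    // Struct<a: Int32, b: List<Int64>>, the old style assigns columns
    // [0: struct, 1: a, 2: list, 3: items], while the structural style assigns
    // [0: a, 1: items], since only the primitive leaves are counted.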
     for field in fields {
         match field.data_type() {
             DataType::Struct(fields) => {
-                column_indices_from_schema_helper(fields.as_ref(), column_indices, column_counter);
+                if !is_structural_encoding {
+                    column_indices.push(*column_counter);
+                    *column_counter += 1;
+                }
+                column_indices_from_schema_helper(
+                    fields.as_ref(),
+                    column_indices,
+                    column_counter,
+                    is_structural_encoding,
+                );
             }
             DataType::List(inner) => {
-                column_indices_from_schema_helper(&[inner.clone()], column_indices, column_counter);
+                if !is_structural_encoding {
+                    column_indices.push(*column_counter);
+                    *column_counter += 1;
+                }
+                column_indices_from_schema_helper(
+                    &[inner.clone()],
+                    column_indices,
+                    column_counter,
+                    is_structural_encoding,
+                );
             }
             DataType::LargeList(inner) => {
-                column_indices_from_schema_helper(&[inner.clone()], column_indices, column_counter);
+                if !is_structural_encoding {
+                    column_indices.push(*column_counter);
+                    *column_counter += 1;
+                }
+                column_indices_from_schema_helper(
+                    &[inner.clone()],
+                    column_indices,
+                    column_counter,
+                    is_structural_encoding,
+                );
             }
             DataType::FixedSizeList(inner, _) => {
-                // FSL(primitive) does not get its own column
-                column_indices.pop();
-                *column_counter -= 1;
-                column_indices_from_schema_helper(&[inner.clone()], column_indices, column_counter);
+                // FSL(primitive) does not get its own column in either approach
+                column_indices_from_schema_helper(
+                    &[inner.clone()],
+                    column_indices,
+                    column_counter,
+                    is_structural_encoding,
+                );
             }
             _ => {
-                column_indices_from_schema_helper(&[], column_indices, column_counter);
+                column_indices.push(*column_counter);
+                *column_counter += 1;
+
+                column_indices_from_schema_helper(
+                    &[],
+                    column_indices,
+                    column_counter,
+                    is_structural_encoding,
+                );
             }
         }
     }
 }

-fn column_indices_from_schema(schema: &Schema) -> Vec<u32> {
+fn column_indices_from_schema(schema: &Schema, is_structural_encoding: bool) -> Vec<u32> {
     let mut column_indices = Vec::new();
     let mut column_counter = 0;
-    column_indices_from_schema_helper(schema.fields(), &mut column_indices, &mut column_counter);
+    column_indices_from_schema_helper(
+        schema.fields(),
+        &mut column_indices,
+        &mut column_counter,
+        is_structural_encoding,
+    );
     column_indices
 }

@@ -114,29 +158,25 @@ async fn test_decode(
     column_infos: &[Arc<ColumnInfo>],
     expected: Option<Arc<dyn Array>>,
     io: Arc<dyn EncodingsIo>,
+    is_structural_encoding: bool,
     schedule_fn: impl FnOnce(
         DecodeBatchScheduler,
         UnboundedSender<DecoderMessage>,
-    ) -> (SimpleStructDecoder, BoxFuture<'static, ()>),
+    ) -> BoxFuture<'static, ()>,
 ) {
     let lance_schema = lance_core::datatypes::Schema::try_from(schema).unwrap();
-    let decode_and_validate = Arc::new(DecoderMiddlewareChain::new().add_strategy(Arc::new(
-        CoreFieldDecoderStrategy {
-            validate_data: true,
-        },
-    )));
     let cache = Arc::new(FileMetadataCache::with_capacity(
         128 * 1024 * 1024,
         CapacityMode::Bytes,
     ));
-    let column_indices = column_indices_from_schema(schema);
+    let column_indices = column_indices_from_schema(schema, is_structural_encoding);
     let decode_scheduler = DecodeBatchScheduler::try_new(
         &lance_schema,
         &column_indices,
         column_infos,
         &Vec::new(),
         num_rows,
-        decode_and_validate,
+        Arc::<DecoderPlugins>::default(),
         io,
         cache,
         &FilterExpression::no_filter(),
     )

     let (tx, rx) = mpsc::unbounded_channel();

-    let (decoder, scheduler_fut) = schedule_fn(decode_scheduler, tx);
+    let scheduler_fut = schedule_fn(decode_scheduler, tx);

     scheduler_fut.await;

-    let mut decode_stream = BatchDecodeStream::new(rx, batch_size, num_rows, decoder).into_stream();
+    let mut decode_stream = create_decode_stream(
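        // The schedule_fn above no longer returns a root decoder; the decode stream is built
        // directly from the channel, and the two flags below select the 2.1 structural decode
        // path and enable validation.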
+        &lance_schema,
+        num_rows,
+        batch_size,
+        is_structural_encoding,
+        /*should_validate=*/ true,
+        rx,
+    );

     let mut offset = 0;
     while let Some(batch) = decode_stream.next().await {
@@ -209,16 +256,11 @@ impl ArrayGeneratorProvider for RandomArrayGeneratorProvider {
     }
 }

 /// Given a field this will test the round trip encoding and decoding of random data
-pub async fn check_round_trip_encoding_random(field: Field) {
+pub async fn check_round_trip_encoding_random(field: Field, version: LanceFileVersion) {
     let array_generator_provider = RandomArrayGeneratorProvider {
         field: field.clone(),
     };
-    check_round_trip_encoding_generated(
-        field,
-        Box::new(array_generator_provider),
-        LanceFileVersion::default(),
-    )
-    .await;
+    check_round_trip_encoding_generated(field, Box::new(array_generator_provider), version).await;
 }

 pub async fn check_round_trip_encoding_generated(
@@ -276,6 +318,7 @@ pub struct TestCases {
     batch_size: u32,
     skip_validation: bool,
     max_page_size: Option<u64>,
+    page_sizes: Vec<u64>,
     file_version: LanceFileVersion,
     verify_encoding: Option>,
 }
@@ -288,6 +331,7 @@ impl Default for TestCases {
             indices: Vec::new(),
             skip_validation: false,
             max_page_size: None,
+            page_sizes: vec![4096, 1024 * 1024],
             file_version: LanceFileVersion::default(),
             verify_encoding: None,
         }
     }
@@ -315,6 +359,16 @@ impl TestCases {
         self
     }

+    pub fn with_file_version(mut self, version: LanceFileVersion) -> Self {
+        self.file_version = version;
+        self
+    }
+
+    pub fn with_page_sizes(mut self, page_sizes: Vec<u64>) -> Self {
+        self.page_sizes = page_sizes;
+        self
+    }
+
     pub fn with_max_page_size(mut self, max_page_size: u64) -> Self {
         self.max_page_size = Some(max_page_size);
         self
@@ -324,11 +378,6 @@ impl TestCases {
         self.max_page_size.unwrap_or(MAX_PAGE_BYTES)
     }

-    pub fn with_file_version(mut self, version: LanceFileVersion) -> Self {
-        self.file_version = version;
-        self
-    }
-
     pub fn with_verify_encoding(mut self, verify_encoding: Arc) -> Self {
         self.verify_encoding = Some(verify_encoding);
         self
@@ -356,22 +405,17 @@ pub async fn check_round_trip_encoding_of_data(
     let mut field = Field::new("", example_data.data_type().clone(), true);
     field = field.with_metadata(metadata);
     let lance_field = lance_core::datatypes::Field::try_from(&field).unwrap();
-    for page_size in [4096, 1024 * 1024] {
-        let encoding_strategy = CoreFieldEncodingStrategy {
-            array_encoding_strategy: Arc::new(CoreArrayEncodingStrategy {
-                version: test_cases.file_version,
-            }),
-            version: test_cases.file_version,
-        };
+    for page_size in test_cases.page_sizes.iter() {
+        let encoding_strategy = default_encoding_strategy(test_cases.file_version);
         let mut column_index_seq = ColumnIndexSequence::default();
         let encoding_options = EncodingOptions {
-            cache_bytes_per_column: page_size,
+            cache_bytes_per_column: *page_size,
             max_page_bytes: test_cases.get_max_page_size(),
             keep_original_array: true,
         };
         let encoder = encoding_strategy
             .create_field_encoder(
-                &encoding_strategy,
+                encoding_strategy.as_ref(),
                 &lance_field,
                 &mut column_index_seq,
                 &encoding_options,
             )
@@ -495,8 +539,9 @@ async fn check_round_trip_encoding_inner(
         column_infos.push(Arc::new(column_info));
     }

-    let scheduler =
-        Arc::new(SimulatedScheduler::new(writer.encoded_data.freeze())) as Arc<dyn EncodingsIo>;
+    let encoded_data = writer.encoded_data.freeze();
+
+    let scheduler = Arc::new(SimulatedScheduler::new(encoded_data)) as Arc<dyn EncodingsIo>;

     let schema = Schema::new(vec![field.clone()]);

@@ -507,6 +552,8 @@ async fn check_round_trip_encoding_inner(
         Some(concat(&data.iter().map(|arr| arr.as_ref()).collect::<Vec<_>>()).unwrap())
     };

+    let is_structural_encoding = test_cases.file_version >= LanceFileVersion::V2_1;
+
     debug!("Testing full decode");
     let scheduler_copy = scheduler.clone();
     test_decode(
@@ -516,21 +563,17 @@ async fn check_round_trip_encoding_inner(
         &column_infos,
         concat_data.clone(),
         scheduler_copy.clone(),
+        is_structural_encoding,
         |mut decode_scheduler, tx| {
-            #[allow(clippy::single_range_in_vec_init)]
-            let root_decoder = decode_scheduler.new_root_decoder_ranges(&[0..num_rows]);
-            (
-                root_decoder,
-                async move {
-                    decode_scheduler.schedule_range(
-                        0..num_rows,
-                        &FilterExpression::no_filter(),
-                        tx,
-                        scheduler_copy,
-                    )
-                }
-                .boxed(),
-            )
+            async move {
+                decode_scheduler.schedule_range(
+                    0..num_rows,
+                    &FilterExpression::no_filter(),
+                    tx,
+                    scheduler_copy,
+                )
+            }
+            .boxed()
         },
     )
     .await;
@@ -551,21 +594,17 @@ async fn check_round_trip_encoding_inner(
         &column_infos,
         expected,
         scheduler.clone(),
+        is_structural_encoding,
         |mut decode_scheduler, tx| {
-            #[allow(clippy::single_range_in_vec_init)]
-            let root_decoder = decode_scheduler.new_root_decoder_ranges(&[0..num_rows]);
-            (
-                root_decoder,
-                async move {
-                    decode_scheduler.schedule_range(
-                        range,
-                        &FilterExpression::no_filter(),
-                        tx,
-                        scheduler,
-                    )
-                }
-                .boxed(),
-            )
+            async move {
+                decode_scheduler.schedule_range(
+                    range,
+                    &FilterExpression::no_filter(),
+                    tx,
+                    scheduler,
+                )
+            }
+            .boxed()
         },
     )
     .await;
@@ -597,20 +636,17 @@ async fn check_round_trip_encoding_inner(
         &column_infos,
         expected,
         scheduler.clone(),
+        is_structural_encoding,
         |mut decode_scheduler, tx| {
-            let root_decoder = decode_scheduler.new_root_decoder_indices(&indices);
-            (
-                root_decoder,
-                async move {
-                    decode_scheduler.schedule_take(
-                        &indices,
-                        &FilterExpression::no_filter(),
-                        tx,
-                        scheduler,
-                    )
-                }
-                .boxed(),
-            )
+            async move {
+                decode_scheduler.schedule_take(
+                    &indices,
+                    &FilterExpression::no_filter(),
+                    tx,
+                    scheduler,
+                )
+            }
+            .boxed()
         },
     )
     .await;
diff --git a/rust/lance-file/benches/reader.rs b/rust/lance-file/benches/reader.rs
index 1e6d613c56..1762b3a064 100644
--- a/rust/lance-file/benches/reader.rs
+++ b/rust/lance-file/benches/reader.rs
@@ -1,15 +1,19 @@
 // SPDX-License-Identifier: Apache-2.0
 // SPDX-FileCopyrightText: Copyright The Lance Authors
-use std::sync::Arc;
+use std::sync::{Arc, Mutex};

+use arrow_array::{cast::AsArray, types::Int32Type};
 use arrow_schema::DataType;
-use criterion::{criterion_group, criterion_main, Criterion};
-use futures::StreamExt;
-use lance_encoding::decoder::{DecoderMiddlewareChain, FilterExpression};
-use lance_file::v2::{
-    reader::FileReader,
-    testing::test_cache,
-    writer::{FileWriter, FileWriterOptions},
+use criterion::{black_box, criterion_group, criterion_main, Criterion};
+use futures::{FutureExt, StreamExt};
+use lance_encoding::decoder::{DecoderPlugins, FilterExpression};
+use lance_file::{
+    v2::{
+        reader::{FileReader, FileReaderOptions},
+        testing::test_cache,
+        writer::{FileWriter, FileWriterOptions},
+    },
+    version::LanceFileVersion,
 };
 use lance_io::{
     object_store::ObjectStore,
@@ -17,66 +21,94 @@ use lance_io::{
 };

 fn bench_reader(c: &mut Criterion) {
-    let mut group = c.benchmark_group("reader");
-    let data = lance_datagen::gen()
-        .anon_col(lance_datagen::array::rand_type(&DataType::Int32))
-        .into_batch_rows(lance_datagen::RowCount::from(1024 * 1024))
-        .unwrap();
-    let rt = tokio::runtime::Runtime::new().unwrap();
+    for version in [LanceFileVersion::V2_0, LanceFileVersion::V2_1] {
+        let mut group = c.benchmark_group(&format!("reader_{}", version));
+        let data = lance_datagen::gen()
+            .anon_col(lance_datagen::array::rand_type(&DataType::Int32))
+            .into_batch_rows(lance_datagen::RowCount::from(2 * 1024 * 1024))
+            .unwrap();
+        let rt = tokio::runtime::Runtime::new().unwrap();

-    let tempdir = tempfile::tempdir().unwrap();
-    let test_path = tempdir.path();
-    let (object_store, base_path) =
-        ObjectStore::from_path(test_path.as_os_str().to_str().unwrap()).unwrap();
-    let file_path = base_path.child("foo.lance");
-    let object_writer = rt.block_on(object_store.create(&file_path)).unwrap();
+        let tempdir = tempfile::tempdir().unwrap();
+        let test_path = tempdir.path();
+        let (object_store, base_path) =
+            ObjectStore::from_path(test_path.as_os_str().to_str().unwrap()).unwrap();
+        let file_path = base_path.child("foo.lance");
+        let object_writer = rt.block_on(object_store.create(&file_path)).unwrap();

-    let mut writer = FileWriter::try_new(
-        object_writer,
-        data.schema().as_ref().try_into().unwrap(),
-        FileWriterOptions::default(),
-    )
-    .unwrap();
-    rt.block_on(writer.write_batch(&data)).unwrap();
-    rt.block_on(writer.finish()).unwrap();
-    group.throughput(criterion::Throughput::Bytes(
-        data.get_array_memory_size() as u64
-    ));
-    group.bench_function("decode", |b| {
-        b.iter(|| {
-            let object_store = &object_store;
-            let file_path = &file_path;
-            let data = &data;
-            rt.block_on(async move {
-                let store_scheduler = ScanScheduler::new(
-                    Arc::new(object_store.clone()),
-                    SchedulerConfig::default_for_testing(),
-                );
-                let scheduler = store_scheduler.open_file(file_path).await.unwrap();
-                let reader = FileReader::try_open(
-                    scheduler.clone(),
-                    None,
-                    Arc::<DecoderMiddlewareChain>::default(),
-                    &test_cache(),
-                )
-                .await
-                .unwrap();
-                let mut stream = reader
-                    .read_stream(
-                        lance_io::ReadBatchParams::RangeFull,
-                        16 * 1024,
-                        16,
-                        FilterExpression::no_filter(),
+        let mut writer = FileWriter::try_new(
+            object_writer,
+            data.schema().as_ref().try_into().unwrap(),
+            FileWriterOptions {
+                format_version: Some(version),
+                ..Default::default()
+            },
+        )
+        .unwrap();
+        rt.block_on(writer.write_batch(&data)).unwrap();
+        rt.block_on(writer.finish()).unwrap();
+        group.throughput(criterion::Throughput::Bytes(
+            data.get_array_memory_size() as u64
+        ));
+        group.bench_function("decode", |b| {
+            b.iter(|| {
+                let object_store = &object_store;
+                let file_path = &file_path;
+                let data = &data;
+                rt.block_on(async move {
+                    let store_scheduler = ScanScheduler::new(
+                        Arc::new(object_store.clone()),
+                        SchedulerConfig::default_for_testing(),
+                    );
+                    let scheduler = store_scheduler.open_file(file_path).await.unwrap();
+                    let reader = FileReader::try_open(
+                        scheduler.clone(),
+                        None,
+                        Arc::<DecoderPlugins>::default(),
+                        &test_cache(),
+                        FileReaderOptions::default(),
                     )
+                    .await
                     .unwrap();
-                let mut row_count = 0;
-                while let Some(batch) = stream.next().await {
-                    row_count += batch.unwrap().num_rows();
-                }
-                assert_eq!(data.num_rows(), row_count);
-            });
-        })
-    });
+                    let stream = reader
+                        .read_tasks(
+                            lance_io::ReadBatchParams::RangeFull,
+                            16 * 1024,
+                            None,
+                            FilterExpression::no_filter(),
+                        )
+                        .unwrap();
+                    let stats = Arc::new(Mutex::new((0, 0)));
+                    let mut stream = stream
+                        .map(|batch_task| {
+                            let stats = stats.clone();
+                            async move {
+                                let batch = batch_task.task.await.unwrap();
+                                let row_count = batch.num_rows();
+                                let sum = batch
+                                    .column(0)
+                                    .as_primitive::<Int32Type>()
+                                    .values()
+                                    .iter()
+                                    .map(|v| *v as i64)
+                                    .sum::<i64>();
+                                let mut stats = stats.lock().unwrap();
+                                stats.0 += row_count;
+                                stats.1 += sum;
+                            }
+                            .boxed()
+                        })
+                        .buffer_unordered(16);
+                    while let Some(_) = stream.next().await {}
+                    let stats = stats.lock().unwrap();
+                    let row_count = stats.0;
+                    let sum = stats.1;
+                    assert_eq!(data.num_rows(), row_count);
+                    black_box(sum);
+                });
+            })
+        });
+    }
 }

 #[cfg(target_os = "linux")]
diff --git a/rust/lance-file/src/v2/reader.rs b/rust/lance-file/src/v2/reader.rs
index 59a4e092ae..292d52fa03 100644
--- a/rust/lance-file/src/v2/reader.rs
+++ b/rust/lance-file/src/v2/reader.rs
@@ -16,8 +16,8 @@ use deepsize::{Context, DeepSizeOf};
 use futures::{stream::BoxStream, Stream, StreamExt};
 use lance_encoding::{
     decoder::{
-        schedule_and_decode, ColumnInfo, DecoderMiddlewareChain, FilterExpression, PageEncoding,
-        PageInfo, ReadBatchTask, RequestedRows, SchedulerDecoderConfig,
+        schedule_and_decode, ColumnInfo, DecoderPlugins, FilterExpression, PageEncoding, PageInfo,
+        ReadBatchTask, RequestedRows, SchedulerDecoderConfig,
     },
     encoder::EncodedBatch,
     version::LanceFileVersion,
@@ -166,19 +166,27 @@ pub struct ReaderProjection {
 impl ReaderProjection {
     fn from_field_ids_helper<'a>(
+        reader: &FileReader,
         fields: impl Iterator<Item = &'a Field>,
         field_id_to_column_index: &BTreeMap<u32, u32>,
         column_indices: &mut Vec<u32>,
     ) -> Result<()> {
         for field in fields {
-            if let Some(column_idx) = field_id_to_column_index.get(&(field.id as u32)).copied() {
-                column_indices.push(column_idx);
-                Self::from_field_ids_helper(
-                    field.children.iter(),
-                    field_id_to_column_index,
-                    column_indices,
-                )?;
+            let is_structural = reader.metadata.version() >= LanceFileVersion::V2_1;
+            // In the 2.0 system we needed ids for intermediate fields. In 2.1+
+            // we only need ids for leaf fields.
+            if !is_structural || field.children.is_empty() {
+                if let Some(column_idx) = field_id_to_column_index.get(&(field.id as u32)).copied()
+                {
+                    column_indices.push(column_idx);
+                }
             }
+            Self::from_field_ids_helper(
+                reader,
+                field.children.iter(),
+                field_id_to_column_index,
+                column_indices,
+            )?;
         }
         Ok(())
     }
@@ -188,11 +196,13 @@ impl ReaderProjection {
     /// You can obtain such a mapping when the file is written using the
     /// [`crate::v2::writer::FileWriter::field_id_to_column_indices`] method.
     pub fn from_field_ids(
+        reader: &FileReader,
         schema: &Schema,
         field_id_to_column_index: &BTreeMap<u32, u32>,
     ) -> Result<Self> {
         let mut column_indices = Vec::new();
         Self::from_field_ids_helper(
+            reader,
             schema.fields.iter(),
             field_id_to_column_index,
             &mut column_indices,
@@ -207,12 +217,22 @@ impl ReaderProjection {
     ///
     /// If the schema provided is not the schema of the entire file then
     /// the projection will be invalid and the read will fail.
-    pub fn from_whole_schema(schema: &Schema) -> Self {
+    pub fn from_whole_schema(schema: &Schema, version: LanceFileVersion) -> Self {
         let schema = Arc::new(schema.clone());
+        let is_structural = version >= LanceFileVersion::V2_1;
+        let mut counter = 0;
+        let counter = &mut counter;
         let column_indices = schema
             .fields_pre_order()
-            .enumerate()
-            .map(|(idx, _)| idx as u32)
+            .filter_map(|field| {
+                if field.children.is_empty() || !is_structural {
+                    let col_idx = *counter;
+                    *counter += 1;
+                    Some(col_idx)
+                } else {
+                    None
+                }
+            })
             .collect::<Vec<_>>();
         Self {
             schema,
@@ -244,6 +264,11 @@ impl ReaderProjection {
     }
 }

+#[derive(Debug, Default)]
+pub struct FileReaderOptions {
+    validate_on_decode: bool,
+}
+
 #[derive(Debug)]
 pub struct FileReader {
     scheduler: Arc<dyn EncodingsIo>,
     base_projection: ReaderProjection,
     num_rows: u64,
     metadata: Arc<CachedFileMetadata>,
-    decoder_strategy: Arc<DecoderMiddlewareChain>,
+    decoder_plugins: Arc<DecoderPlugins>,
     cache: Arc<FileMetadataCache>,
+    options: FileReaderOptions,
 }
-
 #[derive(Debug)]
 struct Footer {
     #[allow(dead_code)]
@@ -630,8 +655,9 @@ impl FileReader {
     pub async fn try_open(
         scheduler: FileScheduler,
         base_projection: Option<ReaderProjection>,
-        decoder_strategy: Arc<DecoderMiddlewareChain>,
+        decoder_strategy: Arc<DecoderPlugins>,
         cache: &FileMetadataCache,
+        options: FileReaderOptions,
     ) -> Result<Self> {
         let file_metadata = Arc::new(Self::read_all_metadata(&scheduler).await?);
         Self::try_open_with_file_metadata(
@@ -640,6 +666,7 @@ impl FileReader {
             decoder_strategy,
             file_metadata,
             cache,
+            options,
         )
         .await
     }
@@ -648,9 +675,10 @@ impl FileReader {
     pub async fn try_open_with_file_metadata(
         scheduler: FileScheduler,
         base_projection: Option<ReaderProjection>,
-        decoder_strategy: Arc<DecoderMiddlewareChain>,
+        decoder_plugins: Arc<DecoderPlugins>,
         file_metadata: Arc<CachedFileMetadata>,
         cache: &FileMetadataCache,
+        options: FileReaderOptions,
     ) -> Result<Self> {
         let cache = Arc::new(cache.with_base_path(scheduler.reader().path().clone()));
@@ -662,11 +690,13 @@ impl FileReader {
             scheduler: Arc::new(LanceEncodingsIo(scheduler)),
             base_projection: base_projection.unwrap_or(ReaderProjection::from_whole_schema(
                 file_metadata.file_schema.as_ref(),
+                file_metadata.version(),
             )),
             num_rows,
             metadata: file_metadata,
-            decoder_strategy,
+            decoder_plugins,
             cache,
+            options,
         })
     }
@@ -696,11 +726,12 @@ impl FileReader {
         io: Arc<dyn EncodingsIo>,
         cache: Arc<FileMetadataCache>,
         num_rows: u64,
-        decoder_strategy: Arc<DecoderMiddlewareChain>,
+        decoder_plugins: Arc<DecoderPlugins>,
         range: Range<u64>,
         batch_size: u32,
         projection: ReaderProjection,
         filter: FilterExpression,
+        should_validate: bool,
     ) -> Result<BoxStream<'static, ReadBatchTask>> {
         debug!(
             "Reading range {:?} with batch_size {} from file with {} rows and {} columns into schema with {} columns",
@@ -714,8 +745,9 @@ impl FileReader {
         let config = SchedulerDecoderConfig {
             batch_size,
             cache,
-            decoder_strategy,
+            decoder_plugins,
             io,
+            should_validate,
         };

         let requested_rows = RequestedRows::Ranges(vec![range]);
@@ -743,11 +775,12 @@ impl FileReader {
             self.scheduler.clone(),
             self.cache.clone(),
             self.num_rows,
-            self.decoder_strategy.clone(),
+            self.decoder_plugins.clone(),
             range,
             batch_size,
             projection,
             filter,
+            self.options.validate_on_decode,
         )
     }
@@ -756,11 +789,12 @@ impl FileReader {
         column_infos: Vec<Arc<ColumnInfo>>,
         io: Arc<dyn EncodingsIo>,
         cache: Arc<FileMetadataCache>,
-        decoder_strategy: Arc<DecoderMiddlewareChain>,
+        decoder_plugins: Arc<DecoderPlugins>,
         indices: Vec<u64>,
         batch_size: u32,
         projection: ReaderProjection,
         filter: FilterExpression,
+        should_validate: bool,
     ) -> Result<BoxStream<'static, ReadBatchTask>> {
         debug!(
             "Taking {} rows spread across range {}..{} with batch_size {} from columns {:?}",
@@ -774,8 +808,9 @@ impl FileReader {
         let config = SchedulerDecoderConfig {
             batch_size,
             cache,
-            decoder_strategy,
+            decoder_plugins,
             io,
+            should_validate,
         };

         let requested_rows = RequestedRows::Indices(indices);
@@ -801,11 +836,12 @@ impl FileReader {
             self.collect_columns_from_projection(&projection)?,
             self.scheduler.clone(),
             self.cache.clone(),
-            self.decoder_strategy.clone(),
+            self.decoder_plugins.clone(),
             indices,
             batch_size,
             projection,
             FilterExpression::no_filter(),
+            self.options.validate_on_decode,
         )
     }
@@ -823,9 +859,10 @@ impl FileReader {
         &self,
         params: ReadBatchParams,
         batch_size: u32,
-        projection: ReaderProjection,
+        projection: Option<ReaderProjection>,
         filter: FilterExpression,
     ) -> Result<Pin<Box<dyn Stream<Item = ReadBatchTask> + Send>>> {
+        let projection = projection.unwrap_or_else(|| self.base_projection.clone());
         Self::validate_projection(&projection, &self.metadata)?;
         let verify_bound = |params: &ReadBatchParams, bound: u64, inclusive: bool| {
             if bound > self.num_rows || bound == self.num_rows && inclusive {
@@ -916,7 +953,7 @@ impl FileReader {
         filter: FilterExpression,
     ) -> Result<Pin<Box<dyn RecordBatchStream>>> {
         let arrow_schema = Arc::new(ArrowSchema::from(projection.schema.as_ref()));
-        let tasks_stream = self.read_tasks(params, batch_size, projection, filter)?;
+        let tasks_stream = self.read_tasks(params, batch_size, Some(projection), filter)?;
         let batch_stream = tasks_stream
             .map(|task| task.task)
             .buffered(batch_readahead as usize)
@@ -993,7 +1030,11 @@ pub fn describe_encoding(page: &pbfile::column_metadata::Page) -> String {
 }

 pub trait EncodedBatchReaderExt {
-    fn try_from_mini_lance(bytes: Bytes, schema: &Schema) -> Result<Self>
+    fn try_from_mini_lance(
+        bytes: Bytes,
+        schema: &Schema,
+        version: LanceFileVersion,
+    ) -> Result<Self>
     where
         Self: Sized;
     fn try_from_self_described_lance(bytes: Bytes) -> Result<Self>
     where
         Self: Sized;
 }

 impl EncodedBatchReaderExt for EncodedBatch {
-    fn try_from_mini_lance(bytes: Bytes, schema: &Schema) -> Result<Self>
+    fn try_from_mini_lance(
+        bytes: Bytes,
+        schema: &Schema,
+        file_version: LanceFileVersion,
+    ) -> Result<Self>
     where
         Self: Sized,
     {
-        let projection = ReaderProjection::from_whole_schema(schema);
+        let projection = ReaderProjection::from_whole_schema(schema, file_version);
         let footer = FileReader::decode_footer(&bytes)?;

         // Next, read the metadata for the columns
@@ -1041,6 +1086,10 @@ impl EncodedBatchReaderExt for EncodedBatch {
         Self: Sized,
     {
         let footer = FileReader::decode_footer(&bytes)?;
+        let file_version = LanceFileVersion::try_from_major_minor(
+            footer.major_version as u32,
+            footer.minor_version as u32,
+        )?;

         let gbo_table = FileReader::do_decode_gbo_table(
             &bytes.slice(footer.global_buff_offsets_start as usize..),
@@ -1057,7 +1106,7 @@ impl EncodedBatchReaderExt for EncodedBatch {
         let schema_bytes = bytes.slice(schema_start..(schema_start + schema_size));
         let (_, schema) = FileReader::decode_schema(schema_bytes)?;

-        let projection = ReaderProjection::from_whole_schema(&schema);
+        let projection = ReaderProjection::from_whole_schema(&schema, file_version);

         // Next, read the metadata for the columns
         // This is both the column metadata and the CMO table
@@ -1067,11 +1116,6 @@ impl EncodedBatchReaderExt for EncodedBatch {
         let column_metadatas =
             FileReader::read_all_column_metadata(column_metadata_bytes, &footer)?;

-        let file_version = LanceFileVersion::try_from_major_minor(
-            footer.major_version as u32,
-            footer.minor_version as u32,
-        )?;
-
         let page_table = FileReader::meta_to_col_infos(&column_metadatas, file_version);

         Ok(Self {
@@ -1102,15 +1146,16 @@ pub mod tests {
     use lance_core::datatypes::Schema;
     use lance_datagen::{array, gen, BatchCount, ByteCount, RowCount};
     use lance_encoding::{
-        decoder::{decode_batch, DecodeBatchScheduler, DecoderMiddlewareChain, FilterExpression},
+        decoder::{decode_batch, DecodeBatchScheduler, DecoderPlugins, FilterExpression},
         encoder::{encode_batch, CoreFieldEncodingStrategy, EncodedBatch, EncodingOptions},
+        version::LanceFileVersion,
     };
     use lance_io::stream::RecordBatchStream;
     use log::debug;
     use tokio::sync::mpsc;

     use crate::v2::{
-        reader::{EncodedBatchReaderExt, FileReader, ReaderProjection},
+        reader::{EncodedBatchReaderExt, FileReader, FileReaderOptions, ReaderProjection},
         testing::{test_cache, write_lance_file, FsFixture, WrittenFile},
         writer::{EncodedBatchWriteExt, FileWriter, FileWriterOptions},
     };
@@ -1196,8 +1241,9 @@ pub mod tests {
         let file_reader = FileReader::try_open(
             file_scheduler,
             None,
-            Arc::<DecoderMiddlewareChain>::default(),
+            Arc::<DecoderPlugins>::default(),
             &test_cache(),
+            FileReaderOptions::default(),
         )
         .await
         .unwrap();
@@ -1250,7 +1296,8 @@ pub mod tests {
         let decoded = decode_batch(
             &decoded_batch,
             &FilterExpression::no_filter(),
-            Arc::<DecoderMiddlewareChain>::default(),
+            Arc::<DecoderPlugins>::default(),
+            false,
         )
         .await
         .unwrap();
@@ -1260,11 +1307,13 @@ pub mod tests {
         // Test mini
         let bytes = encoded_batch.try_to_mini_lance().unwrap();
         let decoded_batch =
-            EncodedBatch::try_from_mini_lance(bytes, lance_schema.as_ref()).unwrap();
+            EncodedBatch::try_from_mini_lance(bytes, lance_schema.as_ref(), LanceFileVersion::V2_0)
+                .unwrap();
         let decoded = decode_batch(
             &decoded_batch,
             &FilterExpression::no_filter(),
-            Arc::<DecoderMiddlewareChain>::default(),
+            Arc::<DecoderPlugins>::default(),
+            false,
         )
         .await
         .unwrap();
@@ -1300,15 +1349,20 @@ pub mod tests {
         let file_reader = FileReader::try_open(
             file_scheduler.clone(),
             None,
-            Arc::<DecoderMiddlewareChain>::default(),
+            Arc::<DecoderPlugins>::default(),
             &test_cache(),
+            FileReaderOptions::default(),
         )
         .await
         .unwrap();

         let projected_schema = written_file.schema.project(&columns).unwrap();
-        let projection =
-            ReaderProjection::from_field_ids(&projected_schema, &field_id_mapping).unwrap();
+        let projection = ReaderProjection::from_field_ids(
+            &file_reader,
+            &projected_schema,
+            &field_id_mapping,
+        )
+        .unwrap();

         let batch_stream = file_reader
             .read_stream_projected(
@@ -1335,8 +1389,9 @@ pub mod tests {
         let file_reader = FileReader::try_open(
             file_scheduler.clone(),
             Some(projection.clone()),
-            Arc::<DecoderMiddlewareChain>::default(),
+            Arc::<DecoderPlugins>::default(),
             &test_cache(),
+            FileReaderOptions::default(),
         )
         .await
         .unwrap();
@@ -1370,8 +1425,9 @@ pub mod tests {
         assert!(FileReader::try_open(
             file_scheduler.clone(),
             Some(empty_projection),
-            Arc::<DecoderMiddlewareChain>::default(),
-            &test_cache()
+            Arc::<DecoderPlugins>::default(),
+            &test_cache(),
+            FileReaderOptions::default(),
         )
         .await
         .is_err());
@@ -1390,8 +1446,9 @@ pub mod tests {
         assert!(FileReader::try_open(
             file_scheduler.clone(),
             Some(projection_with_dupes),
-            Arc::<DecoderMiddlewareChain>::default(),
-            &test_cache()
+            Arc::<DecoderPlugins>::default(),
+            &test_cache(),
+            FileReaderOptions::default(),
         )
         .await
         .is_err());
@@ -1408,8 +1465,9 @@ pub mod tests {
         let file_reader = FileReader::try_open(
             file_scheduler.clone(),
             None,
-            Arc::<DecoderMiddlewareChain>::default(),
+            Arc::<DecoderPlugins>::default(),
             &test_cache(),
+            FileReaderOptions::default(),
         )
         .await
         .unwrap();
@@ -1457,8 +1515,9 @@ pub mod tests {
         let file_reader = FileReader::try_open(
             file_scheduler.clone(),
             None,
-            Arc::<DecoderMiddlewareChain>::default(),
+            Arc::<DecoderPlugins>::default(),
             &test_cache(),
+            FileReaderOptions::default(),
         )
         .await
         .unwrap();
@@ -1488,8 +1547,9 @@ pub mod tests {
         let file_reader = FileReader::try_open(
             file_scheduler.clone(),
             None,
-            Arc::<DecoderMiddlewareChain>::default(),
+            Arc::<DecoderPlugins>::default(),
             &test_cache(),
+            FileReaderOptions::default(),
         )
         .await
         .unwrap();
@@ -1535,13 +1595,15 @@ pub mod tests {
         let file_reader = FileReader::try_open(
             file_scheduler.clone(),
             None,
-            Arc::<DecoderMiddlewareChain>::default(),
+            Arc::<DecoderPlugins>::default(),
             &test_cache(),
+            FileReaderOptions::default(),
         )
         .await
         .unwrap();

-        let projection = ReaderProjection::from_whole_schema(&written_file.schema);
+        let projection =
+            ReaderProjection::from_whole_schema(&written_file.schema, LanceFileVersion::V2_0);
         let column_infos = file_reader
             .collect_columns_from_projection(&projection)
             .unwrap();
@@ -1551,7 +1613,7 @@ pub mod tests {
             &column_infos,
             &vec![],
             total_rows as u64,
-            Arc::<DecoderMiddlewareChain>::default(),
+            Arc::<DecoderPlugins>::default(),
             file_reader.scheduler.clone(),
             test_cache(),
             &FilterExpression::no_filter(),
@@ -1609,8 +1671,9 @@ pub mod tests {
         let file_reader = FileReader::try_open(
             file_scheduler.clone(),
             None,
-            Arc::<DecoderMiddlewareChain>::default(),
+            Arc::<DecoderPlugins>::default(),
             &test_cache(),
+            FileReaderOptions::default(),
         )
         .await
         .unwrap();
diff --git a/rust/lance-file/src/v2/testing.rs b/rust/lance-file/src/v2/testing.rs
index a484e85b43..af6205d042 100644
--- a/rust/lance-file/src/v2/testing.rs
+++ b/rust/lance-file/src/v2/testing.rs
@@ -10,7 +10,7 @@ use lance_core::{
     cache::{CapacityMode, FileMetadataCache},
     datatypes::Schema,
 };
-use lance_encoding::decoder::{DecoderMiddlewareChain, FilterExpression};
+use lance_encoding::decoder::{DecoderPlugins, FilterExpression};
 use lance_io::{
     object_store::ObjectStore,
     scheduler::{ScanScheduler, SchedulerConfig},
@@ -19,7 +19,7 @@ use lance_io::{
 use object_store::path::Path;
 use tempfile::TempDir;

-use crate::v2::reader::FileReader;
+use crate::v2::reader::{FileReader, FileReaderOptions};

 use super::writer::{FileWriter, FileWriterOptions};

@@ -91,13 +91,19 @@ pub fn test_cache() -> Arc<FileMetadataCache> {

 pub async fn read_lance_file(
     fs: &FsFixture,
-    decoder_middleware: Arc<DecoderMiddlewareChain>,
+    decoder_middleware: Arc<DecoderPlugins>,
     filter: FilterExpression,
 ) -> Vec<RecordBatch> {
     let file_scheduler = fs.scheduler.open_file(&fs.tmp_path).await.unwrap();
-    let file_reader = FileReader::try_open(file_scheduler, None, decoder_middleware, &test_cache())
-        .await
-        .unwrap();
+    let file_reader = FileReader::try_open(
+        file_scheduler,
+        None,
+        decoder_middleware,
+        &test_cache(),
+        FileReaderOptions::default(),
+    )
+    .await
+    .unwrap();

     let schema = file_reader.schema();
     assert_eq!(schema.metadata.get("foo").unwrap(), "bar");
@@ -111,7 +117,7 @@ pub async fn read_lance_file(

 pub async fn count_lance_file(
     fs: &FsFixture,
-    decoder_middleware: Arc<DecoderMiddlewareChain>,
+    decoder_middleware: Arc<DecoderPlugins>,
     filter: FilterExpression,
 ) -> usize {
     read_lance_file(fs, decoder_middleware, filter)
diff --git a/rust/lance-file/src/v2/writer.rs b/rust/lance-file/src/v2/writer.rs
index 30155a5b35..3e12d3dbbb 100644
--- a/rust/lance-file/src/v2/writer.rs
+++ b/rust/lance-file/src/v2/writer.rs
@@ -14,8 +14,8 @@ use lance_core::datatypes::Schema as LanceSchema;
 use lance_core::{Error, Result};
 use lance_encoding::decoder::PageEncoding;
 use lance_encoding::encoder::{
-    BatchEncoder, CoreArrayEncodingStrategy, CoreFieldEncodingStrategy, EncodeTask, EncodedBatch,
-    EncodedPage, EncodingOptions, FieldEncoder, FieldEncodingStrategy, OutOfLineBuffers,
+    default_encoding_strategy, BatchEncoder, EncodeTask, EncodedBatch, EncodedPage,
+    EncodingOptions, FieldEncoder, FieldEncodingStrategy, OutOfLineBuffers,
 };
 use lance_encoding::repdef::RepDefBuilder;
 use lance_encoding::version::LanceFileVersion;
@@ -219,10 +219,7 @@ impl FileWriter {
         let keep_original_array = self.options.keep_original_array.unwrap_or(false);
         let encoding_strategy = self.options.encoding_strategy.clone().unwrap_or_else(|| {
             let version = self.version();
-            Arc::new(CoreFieldEncodingStrategy {
-                array_encoding_strategy: Arc::new(CoreArrayEncodingStrategy { version }),
-                version,
-            })
+            default_encoding_strategy(version).into()
         });
         let encoding_options = EncodingOptions {
diff --git a/rust/lance-index/src/scalar/lance_format.rs b/rust/lance-index/src/scalar/lance_format.rs
index 09a6329784..8c708dd589 100644
--- a/rust/lance-index/src/scalar/lance_format.rs
+++ b/rust/lance-index/src/scalar/lance_format.rs
@@ -13,8 +13,9 @@ use async_trait::async_trait;
 use deepsize::DeepSizeOf;
 use futures::TryStreamExt;
 use lance_core::{cache::FileMetadataCache, Error, Result};
-use lance_encoding::decoder::{DecoderMiddlewareChain, FilterExpression};
+use lance_encoding::decoder::{DecoderPlugins, FilterExpression};
 use lance_file::v2;
+use lance_file::v2::reader::FileReaderOptions;
 use lance_file::writer::FileWriterOptions;
 use lance_file::{
     reader::FileReader,
@@ -162,7 +163,10 @@ impl IndexReader for v2::reader::FileReader {
         let projection = if let Some(projection) = projection {
             v2::reader::ReaderProjection::from_column_names(self.schema(), projection)?
         } else {
-            v2::reader::ReaderProjection::from_whole_schema(self.schema())
+            v2::reader::ReaderProjection::from_whole_schema(
+                self.schema(),
+                self.metadata().version(),
+            )
         };
         let batches = self
             .read_stream_projected(
@@ -236,8 +240,9 @@ impl IndexStore for LanceIndexStore {
         match v2::reader::FileReader::try_open(
             file_scheduler,
             None,
-            Arc::<DecoderMiddlewareChain>::default(),
+            Arc::<DecoderPlugins>::default(),
             &self.metadata_cache,
+            FileReaderOptions::default(),
         )
         .await
         {
diff --git a/rust/lance-index/src/vector/ivf/shuffler.rs b/rust/lance-index/src/vector/ivf/shuffler.rs
index f5998e4c3c..fa1d7dc2ca 100644
--- a/rust/lance-index/src/vector/ivf/shuffler.rs
+++ b/rust/lance-index/src/vector/ivf/shuffler.rs
@@ -30,9 +30,9 @@ use lance_arrow::RecordBatchExt;
 use lance_core::cache::{CapacityMode, FileMetadataCache};
 use lance_core::utils::tokio::get_num_compute_intensive_cpus;
 use lance_core::{datatypes::Schema, Error, Result, ROW_ID};
-use lance_encoding::decoder::{DecoderMiddlewareChain, FilterExpression};
+use lance_encoding::decoder::{DecoderPlugins, FilterExpression};
 use lance_file::reader::FileReader;
-use lance_file::v2::reader::FileReader as Lancev2FileReader;
+use lance_file::v2::reader::{FileReader as Lancev2FileReader, FileReaderOptions};
 use lance_file::v2::writer::FileWriterOptions;
 use lance_file::writer::FileWriter;
 use lance_io::object_store::ObjectStore;
@@ -515,8 +515,14 @@ impl IvfShuffler {
             let cache = FileMetadataCache::with_capacity(128 * 1024 * 1024, CapacityMode::Bytes);
-            let reader =
-                Lancev2FileReader::try_open(file, None, Default::default(), &cache).await?;
+            let reader = Lancev2FileReader::try_open(
+                file,
+                None,
+                Default::default(),
+                &cache,
+                FileReaderOptions::default(),
+            )
+            .await?;
             let num_batches = reader.metadata().num_rows / (SHUFFLE_BATCH_SIZE as u64);
             total_batches.push(num_batches as usize);
         }
@@ -569,6 +575,7 @@ impl IvfShuffler {
                 None,
                 Default::default(),
                 &FileMetadataCache::no_cache(),
+                FileReaderOptions::default(),
             )
             .await?;
             let mut stream = reader
@@ -640,6 +647,7 @@ impl IvfShuffler {
                 None,
                 Default::default(),
                 &FileMetadataCache::no_cache(),
+                FileReaderOptions::default(),
             )
             .await?;
             reader
@@ -812,8 +820,9 @@ impl IvfShuffler {
             let reader = lance_file::v2::reader::FileReader::try_open(
                 file_scheduler,
                 None,
-                Arc::<DecoderMiddlewareChain>::default(),
+                Arc::<DecoderPlugins>::default(),
                 &FileMetadataCache::no_cache(),
+                FileReaderOptions::default(),
             )
             .await?;
             let stream = reader
diff --git a/rust/lance-index/src/vector/v3/shuffler.rs b/rust/lance-index/src/vector/v3/shuffler.rs
index b35d2d4a3a..f4fcd1020b 100644
--- a/rust/lance-index/src/vector/v3/shuffler.rs
+++ b/rust/lance-index/src/vector/v3/shuffler.rs
@@ -16,8 +16,11 @@ use lance_core::{
     utils::tokio::{get_num_compute_intensive_cpus, spawn_cpu},
     Error, Result,
 };
-use lance_encoding::decoder::{DecoderMiddlewareChain, FilterExpression};
-use lance_file::v2::{reader::FileReader, writer::FileWriter};
+use lance_encoding::decoder::{DecoderPlugins, FilterExpression};
+use lance_file::v2::{
+    reader::{FileReader, FileReaderOptions},
+    writer::FileWriter,
+};
 use lance_io::{
     object_store::ObjectStore,
     scheduler::{ScanScheduler, SchedulerConfig},
@@ -248,8 +251,9 @@ impl ShuffleReader for IvfShufflerReader {
         let reader = FileReader::try_open(
             self.scheduler.open_file(&partition_path).await?,
             None,
-            Arc::<DecoderMiddlewareChain>::default(),
+            Arc::<DecoderPlugins>::default(),
             &FileMetadataCache::no_cache(),
+            FileReaderOptions::default(),
         )
         .await?;
         let schema = reader.schema().as_ref().into();
diff --git a/rust/lance/Cargo.toml b/rust/lance/Cargo.toml
index d1242558ea..2e6b71aead 100644
--- a/rust/lance/Cargo.toml
+++ b/rust/lance/Cargo.toml
@@ -95,7 +95,7 @@ lance-testing = { workspace = true }
 tracing-subscriber = { version = "0.3.17", features = ["env-filter"] }
 env_logger = "0.10.0"
 tracing-chrome = "0.7.1"
-rstest = "0.19.0"
+rstest = { workspace = true }
 random_word = { version = "0.4.3", features = ["en"] }
diff --git a/rust/lance/src/dataset/fragment.rs b/rust/lance/src/dataset/fragment.rs
index 97f16f2224..8439f51270 100644
--- a/rust/lance/src/dataset/fragment.rs
+++ b/rust/lance/src/dataset/fragment.rs
@@ -23,9 +23,9 @@ use lance_core::utils::deletion::DeletionVector;
 use lance_core::utils::tokio::get_num_compute_intensive_cpus;
 use lance_core::{datatypes::Schema, Error, Result};
 use lance_core::{ROW_ADDR, ROW_ADDR_FIELD, ROW_ID_FIELD};
-use lance_encoding::decoder::DecoderMiddlewareChain;
+use lance_encoding::decoder::DecoderPlugins;
 use lance_file::reader::{read_batch, FileReader};
-use lance_file::v2::reader::{CachedFileMetadata, ReaderProjection};
+use lance_file::v2::reader::{CachedFileMetadata, FileReaderOptions, ReaderProjection};
 use lance_file::version::LanceFileVersion;
 use lance_file::{determine_file_version, v2};
 use lance_io::object_store::ObjectStore;
@@ -291,6 +291,7 @@ mod v2_adapter {
         projection: Arc<Schema>,
     ) -> Result {
         let projection = ReaderProjection::from_field_ids(
+            self.reader.as_ref(),
             projection.as_ref(),
             self.field_id_to_column_idx.as_ref(),
         )?;
@@ -299,7 +300,7 @@ mod v2_adapter {
             .read_tasks(
                 ReadBatchParams::Range(range.start as usize..range.end as usize),
                 batch_size,
-                projection,
+                Some(projection),
                 FilterExpression::no_filter(),
             )?
@@ -315,6 +316,7 @@ mod v2_adapter {
         projection: Arc<Schema>,
     ) -> Result {
         let projection = ReaderProjection::from_field_ids(
+            self.reader.as_ref(),
             projection.as_ref(),
             self.field_id_to_column_idx.as_ref(),
         )?;
@@ -323,7 +325,7 @@ mod v2_adapter {
             .read_tasks(
                 ReadBatchParams::RangeFull,
                 batch_size,
-                projection,
+                Some(projection),
                 FilterExpression::no_filter(),
             )?
             .map(|v2_task| ReadBatchTask {
@@ -341,6 +343,7 @@ mod v2_adapter {
     ) -> Result {
         let indices = UInt32Array::from(indices.to_vec());
         let projection = ReaderProjection::from_field_ids(
+            self.reader.as_ref(),
             projection.as_ref(),
             self.field_id_to_column_idx.as_ref(),
         )?;
@@ -349,7 +352,7 @@ mod v2_adapter {
             .read_tasks(
                 ReadBatchParams::Indices(indices),
                 batch_size,
-                projection,
+                Some(projection),
                 FilterExpression::no_filter(),
             )?
diff --git a/rust/lance/src/dataset/fragment.rs b/rust/lance/src/dataset/fragment.rs
index 97f16f2224..8439f51270 100644
--- a/rust/lance/src/dataset/fragment.rs
+++ b/rust/lance/src/dataset/fragment.rs
@@ -23,9 +23,9 @@ use lance_core::utils::deletion::DeletionVector;
 use lance_core::utils::tokio::get_num_compute_intensive_cpus;
 use lance_core::{datatypes::Schema, Error, Result};
 use lance_core::{ROW_ADDR, ROW_ADDR_FIELD, ROW_ID_FIELD};
-use lance_encoding::decoder::DecoderMiddlewareChain;
+use lance_encoding::decoder::DecoderPlugins;
 use lance_file::reader::{read_batch, FileReader};
-use lance_file::v2::reader::{CachedFileMetadata, ReaderProjection};
+use lance_file::v2::reader::{CachedFileMetadata, FileReaderOptions, ReaderProjection};
 use lance_file::version::LanceFileVersion;
 use lance_file::{determine_file_version, v2};
 use lance_io::object_store::ObjectStore;
@@ -291,6 +291,7 @@ mod v2_adapter {
         projection: Arc<Schema>,
     ) -> Result<ReadBatchTaskStream> {
         let projection = ReaderProjection::from_field_ids(
+            self.reader.as_ref(),
             projection.as_ref(),
             self.field_id_to_column_idx.as_ref(),
         )?;
@@ -299,7 +300,7 @@ mod v2_adapter {
             .read_tasks(
                 ReadBatchParams::Range(range.start as usize..range.end as usize),
                 batch_size,
-                projection,
+                Some(projection),
                 FilterExpression::no_filter(),
             )?
             .map(|v2_task| ReadBatchTask {
@@ -315,6 +316,7 @@
         projection: Arc<Schema>,
     ) -> Result<ReadBatchTaskStream> {
         let projection = ReaderProjection::from_field_ids(
+            self.reader.as_ref(),
             projection.as_ref(),
             self.field_id_to_column_idx.as_ref(),
         )?;
@@ -323,7 +325,7 @@
             .read_tasks(
                 ReadBatchParams::RangeFull,
                 batch_size,
-                projection,
+                Some(projection),
                 FilterExpression::no_filter(),
             )?
             .map(|v2_task| ReadBatchTask {
@@ -341,6 +343,7 @@
     ) -> Result<ReadBatchTaskStream> {
         let indices = UInt32Array::from(indices.to_vec());
         let projection = ReaderProjection::from_field_ids(
+            self.reader.as_ref(),
             projection.as_ref(),
             self.field_id_to_column_idx.as_ref(),
         )?;
@@ -349,7 +352,7 @@
             .read_tasks(
                 ReadBatchParams::Indices(indices),
                 batch_size,
-                projection,
+                Some(projection),
                 FilterExpression::no_filter(),
             )?
             .map(|v2_task| ReadBatchTask {
@@ -474,15 +477,19 @@ impl FileFragment {
         let reader = v2::reader::FileReader::try_open(
             file_scheduler,
             None,
-            Arc::<DecoderMiddlewareChain>::default(),
+            Arc::<DecoderPlugins>::default(),
             &dataset.session.file_metadata_cache,
+            FileReaderOptions::default(),
         )
         .await?;
         // If the schemas are not compatible we can't calculate field id offsets
         reader
             .schema()
             .check_compatible(dataset.schema(), &SchemaCompareOptions::default())?;
-        let projection = v2::reader::ReaderProjection::from_whole_schema(dataset.schema());
+        let projection = v2::reader::ReaderProjection::from_whole_schema(
+            dataset.schema(),
+            reader.metadata().version(),
+        );
         let physical_rows = reader.metadata().num_rows as usize;
         frag.physical_rows = Some(physical_rows);
         frag.id = fragment_id as u64;
@@ -666,9 +673,10 @@ impl FileFragment {
             v2::reader::FileReader::try_open_with_file_metadata(
                 file_scheduler,
                 None,
-                Arc::<DecoderMiddlewareChain>::default(),
+                Arc::<DecoderPlugins>::default(),
                 file_metadata,
                 &self.dataset.session.file_metadata_cache,
+                FileReaderOptions::default(),
             )
             .await?,
         );
diff --git a/rust/lance/src/index.rs b/rust/lance/src/index.rs
index d468084571..829325e399 100644
--- a/rust/lance/src/index.rs
+++ b/rust/lance/src/index.rs
@@ -13,6 +13,7 @@ use futures::{stream, StreamExt, TryStreamExt};
 use itertools::Itertools;
 use lance_file::reader::FileReader;
 use lance_file::v2;
+use lance_file::v2::reader::FileReaderOptions;
 use lance_index::optimize::OptimizeOptions;
 use lance_index::pb::index::Implementation;
 use lance_index::scalar::expression::{
@@ -693,6 +694,7 @@ impl DatasetIndexInternalExt for Dataset {
             None,
             Default::default(),
             &self.session.file_metadata_cache,
+            FileReaderOptions::default(),
         )
         .await?;
         let index_metadata = reader
diff --git a/rust/lance/src/index/vector/builder.rs b/rust/lance/src/index/vector/builder.rs
index 1dcee9c493..63048f206e 100644
--- a/rust/lance/src/index/vector/builder.rs
+++ b/rust/lance/src/index/vector/builder.rs
@@ -11,7 +11,8 @@ use lance_arrow::RecordBatchExt;
 use lance_core::cache::FileMetadataCache;
 use lance_core::utils::tokio::get_num_compute_intensive_cpus;
 use lance_core::{Error, Result, ROW_ID_FIELD};
-use lance_encoding::decoder::{DecoderMiddlewareChain, FilterExpression};
+use lance_encoding::decoder::{DecoderPlugins, FilterExpression};
+use lance_file::v2::reader::FileReaderOptions;
 use lance_file::v2::{reader::FileReader, writer::FileWriter};
 use lance_index::vector::flat::storage::FlatStorage;
 use lance_index::vector::ivf::storage::IvfModel;
@@ -500,8 +501,9 @@ impl IvfIndexBuilde
             let reader = FileReader::try_open(
                 scheduler.open_file(&storage_part_path).await?,
                 None,
-                Arc::<DecoderMiddlewareChain>::default(),
+                Arc::<DecoderPlugins>::default(),
                 &FileMetadataCache::no_cache(),
+                FileReaderOptions::default(),
             )
             .await?;
             let batches = reader
@@ -533,8 +535,9 @@ impl IvfIndexBuilde
             let reader = FileReader::try_open(
                 scheduler.open_file(&index_part_path).await?,
                 None,
-                Arc::<DecoderMiddlewareChain>::default(),
+                Arc::<DecoderPlugins>::default(),
                 &FileMetadataCache::no_cache(),
+                FileReaderOptions::default(),
             )
             .await?;
             let batches = reader
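Three related signature changes run through the fragment reader above: `ReaderProjection::from_whole_schema` now takes the file version (presumably because the schema-to-column mapping depends on whether the file is 2.0 or 2.1; the diff itself does not say), `ReaderProjection::from_field_ids` takes the reader as a new first argument, and `read_tasks` accepts the projection as an `Option`. A minimal sketch of a whole-schema scan under the new API, using only calls that appear in this diff:

```rust
use lance_encoding::decoder::FilterExpression;
use lance_file::v2::reader::{FileReader, ReaderProjection};
use lance_io::ReadBatchParams;

// Scan every row with a projection over the file's full schema. The version
// argument and the `Some(...)` wrapper are the new parts of the API.
fn scan_whole_file(reader: &FileReader, batch_size: u32) -> lance_core::Result<()> {
    let projection = ReaderProjection::from_whole_schema(
        reader.schema(),
        reader.metadata().version(),
    );
    let _tasks = reader.read_tasks(
        ReadBatchParams::RangeFull,
        batch_size,
        Some(projection),
        FilterExpression::no_filter(),
    )?;
    // `_tasks` yields batch-read tasks; driving them to completion is omitted.
    Ok(())
}
```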
diff --git a/rust/lance/src/index/vector/ivf/v2.rs b/rust/lance/src/index/vector/ivf/v2.rs
index 2f359aa610..d9161b6cdd 100644
--- a/rust/lance/src/index/vector/ivf/v2.rs
+++ b/rust/lance/src/index/vector/ivf/v2.rs
@@ -24,8 +24,8 @@ use lance_arrow::RecordBatchExt;
 use lance_core::cache::FileMetadataCache;
 use lance_core::utils::tokio::get_num_compute_intensive_cpus;
 use lance_core::{Error, Result};
-use lance_encoding::decoder::{DecoderMiddlewareChain, FilterExpression};
-use lance_file::v2::reader::FileReader;
+use lance_encoding::decoder::{DecoderPlugins, FilterExpression};
+use lance_file::v2::reader::{FileReader, FileReaderOptions};
 use lance_index::vector::flat::index::{FlatIndex, FlatQuantizer};
 use lance_index::vector::hnsw::HNSW;
 use lance_index::vector::ivf::storage::IvfModel;
@@ -128,8 +128,9 @@ impl IVFIndex {
             .open_file(&index_dir.child(uuid.as_str()).child(INDEX_FILE_NAME))
             .await?,
             None,
-            Arc::<DecoderMiddlewareChain>::default(),
+            Arc::<DecoderPlugins>::default(),
             &file_metadata_cache,
+            FileReaderOptions::default(),
         )
         .await?;
         let index_metadata: IndexMetadata = serde_json::from_str(
@@ -180,8 +181,9 @@ impl IVFIndex {
             )
             .await?,
             None,
-            Arc::<DecoderMiddlewareChain>::default(),
+            Arc::<DecoderPlugins>::default(),
             &file_metadata_cache,
+            FileReaderOptions::default(),
         )
         .await?;
         let storage = IvfQuantizationStorage::try_new(storage_reader).await?;
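Taken together, every `try_open` call site in this diff gets the same mechanical rewrite: the middleware-chain argument becomes `Arc::<DecoderPlugins>::default()`, and `FileReaderOptions::default()` is appended as a new final argument. A condensed before/after sketch; the wrapper function is illustrative, and passing the default options is assumed to preserve the pre-change behavior:

```rust
use std::sync::Arc;

use lance_core::cache::FileMetadataCache;
use lance_encoding::decoder::DecoderPlugins;
use lance_file::v2::reader::{FileReader, FileReaderOptions};
use lance_io::scheduler::FileScheduler;

async fn open_index_file(
    file: FileScheduler,
    cache: &FileMetadataCache,
) -> lance_core::Result<FileReader> {
    // Before: FileReader::try_open(file, None,
    //     Arc::<DecoderMiddlewareChain>::default(), cache).await
    // After: the plugins registry replaces the middleware chain, and the
    // options struct is a new, explicit fifth argument.
    FileReader::try_open(
        file,
        None,
        Arc::<DecoderPlugins>::default(),
        cache,
        FileReaderOptions::default(),
    )
    .await
}
```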