-
Notifications
You must be signed in to change notification settings - Fork 149
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
[PERF] Native Parquet Bulk Reader (#1233)
* Adds a Parquet bulk reader that parallelizes both I/O and compute across files. * Yields up to a 5.3x speedup compared to Daft's non-bulk reader. ![Daft Bulk Reader Relative Speedup (Single Column)](https://github.com/Eventual-Inc/Daft/assets/2550285/be310944-2348-4f85-9441-9695f5b676d9)
- Loading branch information
Showing
6 changed files
with
176 additions
and
10 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,36 @@ | ||
from __future__ import annotations | ||
|
||
import pytest | ||
|
||
# S3 URL of the single Parquet file used by every benchmark in this file;
# the tests repeat this same path `num_files` times to build the bulk-read input.
PATH = ( | ||
"s3://eventual-dev-benchmarking-fixtures/parquet-benchmarking/tpch/200MB-2RG/daft_200MB_lineitem_chunk.RG-2.parquet" | ||
) | ||
|
||
|
||
@pytest.mark.benchmark(group="num_files_single_column")
@pytest.mark.parametrize("num_files", [1, 2, 4, 8])
def test_read_parquet_num_files_single_column(num_files, bulk_read_fn, benchmark):
    """Benchmark a bulk read of ``num_files`` copies of PATH, one column only.

    ``bulk_read_fn`` is invoked through ``benchmark`` with a list holding
    ``PATH`` repeated ``num_files`` times and ``columns=["L_ORDERKEY"]``.
    Both are presumably fixtures supplied elsewhere (e.g. a conftest) --
    they are not defined in this file.
    """
    paths = [PATH] * num_files
    tables = benchmark(bulk_read_fn, paths, columns=["L_ORDERKEY"])

    # One result per requested path.
    assert len(tables) == num_files

    # Every result must hold only the requested column and the full row count.
    for idx in range(num_files):
        table = tables[idx]
        assert table.column_names == ["L_ORDERKEY"]
        assert len(table) == 5515199
|
||
|
||
# The group label names what this test actually varies (the number of files),
# matching the "num_files_single_column" group used by the sibling test.
@pytest.mark.benchmark(group="num_files_all_columns")
@pytest.mark.parametrize(
    "num_files",
    [1, 2, 4],
)
def test_read_parquet_num_files_all_columns(num_files, bulk_read_fn, benchmark):
    """Benchmark a bulk read of ``num_files`` copies of PATH with all columns.

    ``bulk_read_fn`` is invoked through ``benchmark`` with a list holding
    ``PATH`` repeated ``num_files`` times and no column selection.
    Both are presumably fixtures supplied elsewhere (e.g. a conftest) --
    they are not defined in this file.
    """
    data = benchmark(bulk_read_fn, [PATH] * num_files)
    assert len(data) == num_files

    # Make sure the data is correct: every table has all 16 columns and the
    # full row count.
    for table in data:
        assert len(table.column_names) == 16
        assert len(table) == 5515199
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters