diff --git a/python/pyarrow/_dataset_parquet.pyx b/python/pyarrow/_dataset_parquet.pyx
index 79bd270ce54d2..cf5c44c1c964a 100644
--- a/python/pyarrow/_dataset_parquet.pyx
+++ b/python/pyarrow/_dataset_parquet.pyx
@@ -595,6 +595,10 @@ cdef class ParquetFileWriteOptions(FileWriteOptions):
             ),
             column_encoding=self._properties["column_encoding"],
             data_page_version=self._properties["data_page_version"],
+            encryption_properties=self._properties["encryption_properties"],
+            write_batch_size=self._properties["write_batch_size"],
+            dictionary_pagesize_limit=self._properties["dictionary_pagesize_limit"],
+            write_page_index=self._properties["write_page_index"],
         )
 
     def _set_arrow_properties(self):
@@ -631,6 +635,10 @@ cdef class ParquetFileWriteOptions(FileWriteOptions):
             coerce_timestamps=None,
             allow_truncated_timestamps=False,
             use_compliant_nested_type=True,
+            encryption_properties=None,
+            write_batch_size=None,
+            dictionary_pagesize_limit=None,
+            write_page_index=False,
         )
         self._set_properties()
         self._set_arrow_properties()
diff --git a/python/pyarrow/tests/test_dataset.py b/python/pyarrow/tests/test_dataset.py
index b8a0c38089980..e0988f2752033 100644
--- a/python/pyarrow/tests/test_dataset.py
+++ b/python/pyarrow/tests/test_dataset.py
@@ -5291,6 +5291,47 @@ def test_write_dataset_preserve_field_metadata(tempdir):
     assert dataset.to_table().schema.equals(schema_metadata, check_metadata=True)
 
 
+@pytest.mark.parametrize('write_statistics', [True, False])
+@pytest.mark.parametrize('write_page_index', [True, False])
+def test_write_dataset_write_page_index(tempdir, write_statistics,
+                                        write_page_index):
+    """Check that page-index write options reach the Parquet metadata.
+
+    An offset index is expected whenever ``write_page_index`` is set; a
+    column index is expected only when statistics are written as well.
+    """
+    schema = pa.schema([
+        pa.field("x", pa.int64()),
+        pa.field("y", pa.int64())])
+    table = pa.Table.from_arrays([[1, 2, 3], [None, 5, None]], schema=schema)
+
+    # One directory per combination so no case can read another's files.
+    base_dir = tempdir / (
+        f"write_statistics_{write_statistics}"
+        f"_write_page_index_{write_page_index}")
+    file_format = ds.ParquetFileFormat()
+    ds.write_dataset(
+        table,
+        base_dir,
+        format="parquet",
+        file_options=file_format.make_write_options(
+            write_statistics=write_statistics,
+            write_page_index=write_page_index,
+        ),
+        existing_data_behavior='overwrite_or_ignore',
+    )
+    written = ds.dataset(base_dir, format="parquet")
+
+    # Guard against the loop below passing vacuously.
+    assert written.files
+    for path in written.files:
+        # The page index flags are exposed on the column chunk metadata.
+        column_chunk = pq.read_metadata(path).row_group(0).column(0)
+        assert column_chunk.has_offset_index is write_page_index
+        assert column_chunk.has_column_index is (
+            write_page_index and write_statistics)
+
+
 @pytest.mark.parametrize('dstype', [
     "fs", "mem"
 ])