Commit

Permalink
GH-23870: [Python] Ensure parquet.write_to_dataset doesn't create empty files for non-observed dictionary (category) values (#36465)

### What changes are included in this PR?

If we partition on a categorical variable with "unobserved" categories (values present in the dictionary, but not in the actual data), the legacy path in `pq.write_to_dataset` currently creates empty files for those categories. The new dataset-based path already has the preferred behavior; this PR fixes the legacy path to match and adds a test covering both paths.

This also fixes one of the pandas deprecation warnings listed in #36412.

### Are these changes tested?

Yes

### Are there any user-facing changes?

Yes: the legacy path no longer creates a hive-style directory containing a single empty file (a Parquet file with 0 rows) for each unobserved category. This aligns the legacy path with the new, default dataset-based path.
* Closes: #23870

Authored-by: Joris Van den Bossche <[email protected]>
Signed-off-by: Joris Van den Bossche <[email protected]>
jorisvandenbossche authored Jul 5, 2023
1 parent b116b8a commit 20d5c31
Showing 2 changed files with 22 additions and 1 deletion.
2 changes: 1 addition & 1 deletion python/pyarrow/parquet/core.py
@@ -3468,7 +3468,7 @@ def file_visitor(written_file):
         if len(partition_keys) == 1:
             partition_keys = partition_keys[0]
 
-        for keys, subgroup in data_df.groupby(partition_keys):
+        for keys, subgroup in data_df.groupby(partition_keys, observed=True):
             if not isinstance(keys, tuple):
                 keys = (keys,)
             subdir = '/'.join(
21 changes: 21 additions & 0 deletions python/pyarrow/tests/parquet/test_dataset.py
@@ -1932,3 +1932,24 @@ def test_write_to_dataset_kwargs_passed(tempdir, write_dataset_kwarg):
         pq.write_to_dataset(table, path, **{key: arg})
     _name, _args, kwargs = mock_write_dataset.mock_calls[0]
     assert kwargs[key] == arg
+
+
+@pytest.mark.pandas
+@parametrize_legacy_dataset
+def test_write_to_dataset_category_observed(tempdir, use_legacy_dataset):
+    # if we partition on a categorical variable with "unobserved" categories
+    # (values present in the dictionary, but not in the actual data)
+    # ensure those are not creating empty files/directories
+    df = pd.DataFrame({
+        "cat": pd.Categorical(["a", "b", "a"], categories=["a", "b", "c"]),
+        "col": [1, 2, 3]
+    })
+    table = pa.table(df)
+    path = tempdir / "dataset"
+    pq.write_to_dataset(
+        table, tempdir / "dataset", partition_cols=["cat"],
+        use_legacy_dataset=use_legacy_dataset
+    )
+    subdirs = [f.name for f in path.iterdir() if f.is_dir()]
+    assert len(subdirs) == 2
+    assert "cat=c" not in subdirs
