GH-43684: [Python][Dataset] Python / Cython interface to C++ arrow::dataset::Partitioning::Format (#43740)

See #43684

* GitHub Issue: #43684

Lead-authored-by: feiyang <[email protected]>
Co-authored-by: Feiyang472 <[email protected]>
Co-authored-by: Bryce Mecum <[email protected]>
Signed-off-by: Bryce Mecum <[email protected]>
Feiyang472 and amoeba authored Oct 8, 2024
1 parent 505e3e3 commit 44fb439
Showing 3 changed files with 109 additions and 0 deletions.
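
For orientation before the diffs, a minimal usage sketch of the new Partitioning.format method added by this commit, put together from the docstring example and tests below (the printed tuples are the values those examples show):

import pyarrow as pa
import pyarrow.compute as pc
import pyarrow.dataset as ds

# A directory partitioning over (year, month), as in the docstring example below.
part = ds.partitioning(pa.schema([("year", pa.int16()),
                                  ("month", pa.string())]))

# format() maps a filter expression to the (directory, filename) pair that the
# partitioning scheme would use for the matching data.
directory, filename = part.format(
    (pc.field("year") == 1862) & (pc.field("month") == "Jan")
)
print((directory, filename))  # ('1862/Jan', '')

# The existing parse() method goes the other way, from a path back to an
# expression equivalent to the filter above.
expr = part.parse(directory)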
37 changes: 37 additions & 0 deletions python/pyarrow/_dataset.pyx
@@ -2505,6 +2505,43 @@ cdef class Partitioning(_Weakrefable):
        result = self.partitioning.Parse(tobytes(path))
        return Expression.wrap(GetResultValue(result))

    def format(self, expr):
        """
        Convert a filter expression into a tuple of (directory, filename) using
        the current partitioning scheme.

        Parameters
        ----------
        expr : pyarrow.dataset.Expression

        Returns
        -------
        tuple[str, str]

        Examples
        --------
        Specify the Schema for paths like "/2009/June":

        >>> import pyarrow as pa
        >>> import pyarrow.dataset as ds
        >>> import pyarrow.compute as pc
        >>> part = ds.partitioning(pa.schema([("year", pa.int16()),
        ...                                   ("month", pa.string())]))
        >>> part.format(
        ...     (pc.field("year") == 1862) & (pc.field("month") == "Jan")
        ... )
        ('1862/Jan', '')
        """
        cdef:
            CPartitionPathFormat result

        result = GetResultValue(self.partitioning.Format(
            Expression.unwrap(expr)
        ))

        return frombytes(result.directory), frombytes(result.filename)

    @property
    def schema(self):
        """The arrow Schema attached to the partitioning."""
5 changes: 5 additions & 0 deletions python/pyarrow/includes/libarrow_dataset.pxd
@@ -285,9 +285,14 @@ cdef extern from "arrow/dataset/api.h" namespace "arrow::dataset" nogil:
        CJSONParseOptions parse_options
        CJSONReadOptions read_options

    cdef struct CPartitionPathFormat "arrow::dataset::PartitionPathFormat":
        c_string directory
        c_string filename

    cdef cppclass CPartitioning "arrow::dataset::Partitioning":
        c_string type_name() const
        CResult[CExpression] Parse(const c_string & path) const
        CResult[CPartitionPathFormat] Format(const CExpression & expr) const
        const shared_ptr[CSchema] & schema()
        c_bool Equals(const CPartitioning& other) const

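The CPartitionPathFormat struct declared above mirrors the C++ arrow::dataset::PartitionPathFormat result type, with a directory and a filename component. A minimal sketch of how the two components are filled for the different partitioning flavors, with the expected tuples taken from the test file below:

import pyarrow as pa
import pyarrow.compute as pc
import pyarrow.dataset as ds

schema = pa.schema([("foo", pa.string()), ("bar", pa.string())])
expr = (pc.field("foo") == "A") & (pc.field("bar") == "ant bee")

# Hive and directory partitionings return the encoded keys in the directory slot
# and leave the filename empty...
print(ds.HivePartitioning(schema).format(expr))       # ('foo=A/bar=ant%20bee', '')
print(ds.DirectoryPartitioning(schema).format(expr))  # ('A/ant bee', '')

# ...while filename partitioning encodes them into the filename prefix instead.
print(ds.FilenamePartitioning(schema).format(expr))   # ('', 'A_ant bee_')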
67 changes: 67 additions & 0 deletions python/pyarrow/tests/test_dataset.py
@@ -734,6 +734,73 @@ def test_partitioning_pickling(pickle_module):
    assert pickle_module.loads(pickle_module.dumps(part)) == part


@pytest.mark.parametrize(
    "flavor, expected_defined_partition, expected_undefined_partition",
    [
        (ds.HivePartitioning, (r"foo=A/bar=ant%20bee", ""), ("", "")),
        (ds.DirectoryPartitioning, (r"A/ant bee", ""), ("", "")),
        (ds.FilenamePartitioning, ("", r"A_ant bee_"), ("", "_")),
    ],
)
def test_dataset_partitioning_format(
    flavor: "ds.Partitioning",
    expected_defined_partition: tuple,
    expected_undefined_partition: tuple,
):

    partitioning_schema = pa.schema([("foo", pa.string()), ("bar", pa.string())])

    partitioning = flavor(schema=partitioning_schema)

    # test forward transformation (format)
    assert (
        partitioning.format((pc.field("bar") == "ant bee") & (pc.field("foo") == "A"))
        == expected_defined_partition
    )

    # test backward transformation (parse)
    assert partitioning.parse("/".join(expected_defined_partition)).equals(
        (pc.field("foo") == "A") & (pc.field("bar") == "ant bee")
    )

    # test a complex expression can still be formatted into a useful directory/path
    assert (
        partitioning.format(
            ((pc.field("bar") == "ant bee") & (pc.field("foo") == "A"))
            & ((pc.field("bar") == "ant bee") & (pc.field("foo") == "A"))
        )
        == expected_defined_partition
    )

    # test a different complex expression cannot be formatted into a directory/path
    # and just returns the same value as if no filter were applied.
    assert (
        partitioning.format(
            ((pc.field("bar") == "ant bee") & (pc.field("foo") == "A"))
            | ((pc.field("bar") == "ant bee") & (pc.field("foo") == "A"))
        )
        == expected_undefined_partition
    )

    if flavor != ds.HivePartitioning:
        # Raises an error upon filtering for a lower-level partition without
        # filtering for the higher-level partition
        with pytest.raises(
            pa.ArrowInvalid,
            match=(
                "No partition key for foo but a key was provided"
                " subsequently for bar"
            )
        ):
            partitioning.format(pc.field("bar") == "ant bee")
    else:
        # Hive partitioning allows this to pass
        assert partitioning.format(pc.field("bar") == "ant bee") == (
            r"bar=ant%20bee",
            "",
        )


def test_expression_arithmetic_operators():
    dataset = ds.dataset(pa.table({'a': [1, 2, 3], 'b': [2, 2, 2]}))
    a = ds.field("a")
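As the test_dataset_partitioning_format test above encodes, format() also validates that directory-style partitionings receive a key for every leading field. A short sketch of that failure mode versus Hive's more permissive behaviour, reusing the schema and error message from the test:

import pyarrow as pa
import pyarrow.compute as pc
import pyarrow.dataset as ds

schema = pa.schema([("foo", pa.string()), ("bar", pa.string())])

# Filtering only on the second key fails for a directory-style partitioning,
# because the leading "foo" path segment cannot be constructed.
try:
    ds.DirectoryPartitioning(schema).format(pc.field("bar") == "ant bee")
except pa.ArrowInvalid as exc:
    print(exc)  # No partition key for foo but a key was provided subsequently for bar

# Hive paths are self-describing key=value segments, so only the known key is emitted.
print(ds.HivePartitioning(schema).format(pc.field("bar") == "ant bee"))
# ('bar=ant%20bee', '')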
