GH-43684: [Python][Dataset] Python / Cython interface to C++ arrow::dataset::Partitioning::Format (#43740)

See #43684

* GitHub Issue: #43684

Lead-authored-by: feiyang <[email protected]>
Co-authored-by: Feiyang472 <[email protected]>
Co-authored-by: Bryce Mecum <[email protected]>
Signed-off-by: Bryce Mecum <[email protected]>
Feiyang472 and amoeba authored Oct 8, 2024
1 parent 505e3e3 commit 44fb439
Showing 3 changed files with 109 additions and 0 deletions.
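
For orientation before the diffs, a minimal usage sketch of the new Partitioning.format method added by this commit, put together from the docstring example and tests below (the printed tuples are the values those examples show):

import pyarrow as pa
import pyarrow.compute as pc
import pyarrow.dataset as ds

# A directory partitioning over (year, month), as in the docstring example below.
part = ds.partitioning(pa.schema([("year", pa.int16()),
                                  ("month", pa.string())]))

# format() maps a filter expression to the (directory, filename) pair that the
# partitioning scheme would use for the matching data.
directory, filename = part.format(
    (pc.field("year") == 1862) & (pc.field("month") == "Jan")
)
print((directory, filename))  # ('1862/Jan', '')

# The existing parse() method goes the other way, from a path back to an
# expression equivalent to the filter above.
expr = part.parse(directory)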
37 changes: 37 additions & 0 deletions python/pyarrow/_dataset.pyx
@@ -2505,6 +2505,43 @@ cdef class Partitioning(_Weakrefable):
        result = self.partitioning.Parse(tobytes(path))
        return Expression.wrap(GetResultValue(result))

    def format(self, expr):
        """
        Convert a filter expression into a tuple of (directory, filename) using
        the current partitioning scheme.

        Parameters
        ----------
        expr : pyarrow.dataset.Expression

        Returns
        -------
        tuple[str, str]

        Examples
        --------
        Specify the Schema for paths like "/2009/June":

        >>> import pyarrow as pa
        >>> import pyarrow.dataset as ds
        >>> import pyarrow.compute as pc
        >>> part = ds.partitioning(pa.schema([("year", pa.int16()),
        ...                                   ("month", pa.string())]))
        >>> part.format(
        ...     (pc.field("year") == 1862) & (pc.field("month") == "Jan")
        ... )
        ('1862/Jan', '')
        """
        cdef:
            CPartitionPathFormat result

        result = GetResultValue(self.partitioning.Format(
            Expression.unwrap(expr)
        ))

        return frombytes(result.directory), frombytes(result.filename)

    @property
    def schema(self):
        """The arrow Schema attached to the partitioning."""
5 changes: 5 additions & 0 deletions python/pyarrow/includes/libarrow_dataset.pxd
@@ -285,9 +285,14 @@ cdef extern from "arrow/dataset/api.h" namespace "arrow::dataset" nogil:
        CJSONParseOptions parse_options
        CJSONReadOptions read_options

    cdef struct CPartitionPathFormat "arrow::dataset::PartitionPathFormat":
        c_string directory
        c_string filename

    cdef cppclass CPartitioning "arrow::dataset::Partitioning":
        c_string type_name() const
        CResult[CExpression] Parse(const c_string & path) const
        CResult[CPartitionPathFormat] Format(const CExpression & expr) const
        const shared_ptr[CSchema] & schema()
        c_bool Equals(const CPartitioning& other) const

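The CPartitionPathFormat struct declared above mirrors the C++ arrow::dataset::PartitionPathFormat result type, with a directory and a filename component. A minimal sketch of how the two components are filled for the different partitioning flavors, with the expected tuples taken from the test file below:

import pyarrow as pa
import pyarrow.compute as pc
import pyarrow.dataset as ds

schema = pa.schema([("foo", pa.string()), ("bar", pa.string())])
expr = (pc.field("foo") == "A") & (pc.field("bar") == "ant bee")

# Hive and directory partitionings return the encoded keys in the directory slot
# and leave the filename empty...
print(ds.HivePartitioning(schema).format(expr))       # ('foo=A/bar=ant%20bee', '')
print(ds.DirectoryPartitioning(schema).format(expr))  # ('A/ant bee', '')

# ...while filename partitioning encodes them into the filename prefix instead.
print(ds.FilenamePartitioning(schema).format(expr))   # ('', 'A_ant bee_')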
67 changes: 67 additions & 0 deletions python/pyarrow/tests/test_dataset.py
@@ -734,6 +734,73 @@ def test_partitioning_pickling(pickle_module):
    assert pickle_module.loads(pickle_module.dumps(part)) == part


@pytest.mark.parametrize(
    "flavor, expected_defined_partition, expected_undefined_partition",
    [
        (ds.HivePartitioning, (r"foo=A/bar=ant%20bee", ""), ("", "")),
        (ds.DirectoryPartitioning, (r"A/ant bee", ""), ("", "")),
        (ds.FilenamePartitioning, ("", r"A_ant bee_"), ("", "_")),
    ],
)
def test_dataset_partitioning_format(
    flavor: "ds.Partitioning",
    expected_defined_partition: tuple,
    expected_undefined_partition: tuple,
):

    partitioning_schema = pa.schema([("foo", pa.string()), ("bar", pa.string())])

    partitioning = flavor(schema=partitioning_schema)

    # test forward transformation (format)
    assert (
        partitioning.format((pc.field("bar") == "ant bee") & (pc.field("foo") == "A"))
        == expected_defined_partition
    )

    # test backward transformation (parse)
    assert partitioning.parse("/".join(expected_defined_partition)).equals(
        (pc.field("foo") == "A") & (pc.field("bar") == "ant bee")
    )

    # test a complex expression can still be formatted into a useful directory/path
    assert (
        partitioning.format(
            ((pc.field("bar") == "ant bee") & (pc.field("foo") == "A"))
            & ((pc.field("bar") == "ant bee") & (pc.field("foo") == "A"))
        )
        == expected_defined_partition
    )

    # test a different complex expression cannot be formatted into a directory/path
    # and just returns the same value as if no filter were applied.
    assert (
        partitioning.format(
            ((pc.field("bar") == "ant bee") & (pc.field("foo") == "A"))
            | ((pc.field("bar") == "ant bee") & (pc.field("foo") == "A"))
        )
        == expected_undefined_partition
    )

    if flavor != ds.HivePartitioning:
        # Raises an error upon filtering for a lower-level partition without
        # filtering for the higher-level partition
        with pytest.raises(
            pa.ArrowInvalid,
            match=(
                "No partition key for foo but a key was provided"
                " subsequently for bar"
            )
        ):
            partitioning.format(pc.field("bar") == "ant bee")
    else:
        # Hive partitioning allows this to pass
        assert partitioning.format(pc.field("bar") == "ant bee") == (
            r"bar=ant%20bee",
            "",
        )


def test_expression_arithmetic_operators():
    dataset = ds.dataset(pa.table({'a': [1, 2, 3], 'b': [2, 2, 2]}))
    a = ds.field("a")
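As the test_dataset_partitioning_format test above encodes, format() also validates that directory-style partitionings receive a key for every leading field. A short sketch of that failure mode versus Hive's more permissive behaviour, reusing the schema and error message from the test:

import pyarrow as pa
import pyarrow.compute as pc
import pyarrow.dataset as ds

schema = pa.schema([("foo", pa.string()), ("bar", pa.string())])

# Filtering only on the second key fails for a directory-style partitioning,
# because the leading "foo" path segment cannot be constructed.
try:
    ds.DirectoryPartitioning(schema).format(pc.field("bar") == "ant bee")
except pa.ArrowInvalid as exc:
    print(exc)  # No partition key for foo but a key was provided subsequently for bar

# Hive paths are self-describing key=value segments, so only the known key is emitted.
print(ds.HivePartitioning(schema).format(pc.field("bar") == "ant bee"))
# ('bar=ant%20bee', '')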
