From 68d8007da3f81260c438865f43c8860268af0423 Mon Sep 17 00:00:00 2001 From: Matthew Murray Date: Fri, 9 Aug 2024 18:21:36 -0700 Subject: [PATCH] Initial commit --- python/cudf/cudf/core/column_accessor.py | 12 ++++-- python/cudf/cudf/core/groupby/groupby.py | 39 +++++++++++-------- python/cudf/cudf/tests/groupby/test_agg.py | 30 ++++++++++++++ .../cudf/cudf/tests/test_column_accessor.py | 4 ++ 4 files changed, 65 insertions(+), 20 deletions(-) diff --git a/python/cudf/cudf/core/column_accessor.py b/python/cudf/cudf/core/column_accessor.py index 819d351b2c4..92b6e5dcefd 100644 --- a/python/cudf/cudf/core/column_accessor.py +++ b/python/cudf/cudf/core/column_accessor.py @@ -4,6 +4,7 @@ import itertools import sys +import warnings from collections import abc from functools import cached_property, reduce from typing import TYPE_CHECKING, Any, Callable, Mapping @@ -606,7 +607,7 @@ def _pad_key(self, key: Any, pad_value="") -> Any: return key + (pad_value,) * (self.nlevels - len(key)) def rename_levels( - self, mapper: Mapping[Any, Any] | Callable, level: int | None + self, mapper: Mapping[Any, Any] | Callable, level: int | None = None ) -> ColumnAccessor: """ Rename the specified levels of the given ColumnAccessor @@ -649,10 +650,13 @@ def rename_column(x): return x if level is None: - raise NotImplementedError( - "Renaming columns with a MultiIndex and level=None is" - "not supported" + warnings.warn( + "Renaming columns with MultiIndex assuming level=0. " + "Specify the level keyword argument to rename using " + "a different level eg. df.rename(..., level=1)", + UserWarning, ) + level = 0 new_col_names = (rename_column(k) for k in self.keys()) else: diff --git a/python/cudf/cudf/core/groupby/groupby.py b/python/cudf/cudf/core/groupby/groupby.py index 92c4b73ceaa..f2bcd0d4af3 100644 --- a/python/cudf/cudf/core/groupby/groupby.py +++ b/python/cudf/cudf/core/groupby/groupby.py @@ -548,7 +548,7 @@ def _groupby(self): ) @_performance_tracking - def agg(self, func, *args, engine=None, engine_kwargs=None, **kwargs): + def agg(self, func=None, *args, engine=None, engine_kwargs=None, **kwargs): """ Apply aggregation(s) to the groups. @@ -648,11 +648,10 @@ def agg(self, func, *args, engine=None, engine_kwargs=None, **kwargs): raise NotImplementedError( "Passing args to func is currently not supported." ) - if kwargs: - raise NotImplementedError( - "Passing kwargs to func is currently not supported." - ) - column_names, columns, normalized_aggs = self._normalize_aggs(func) + + column_names, columns, normalized_aggs = self._normalize_aggs( + func, **kwargs + ) orig_dtypes = tuple(c.dtype for c in columns) # Note: When there are no key columns, the below produces @@ -1266,11 +1265,11 @@ def _grouped(self, *, include_groups: bool = True): return (group_names, offsets, grouped_keys, grouped_values) def _normalize_aggs( - self, aggs: MultiColumnAggType + self, aggs: MultiColumnAggType, **kwargs ) -> tuple[Iterable[Any], tuple[ColumnBase, ...], list[list[AggType]]]: """ Normalize aggs to a list of list of aggregations, where `out[i]` - is a list of aggregations for column `self.obj[i]`. We support three + is a list of aggregations for column `self.obj[i]`. We support four different form of `aggs` input here: - A single agg, such as "sum". This agg is applied to all value columns. @@ -1279,18 +1278,26 @@ def _normalize_aggs( - A mapping of column name to aggs, such as {"a": ["sum"], "b": ["mean"]}, the aggs are applied to specified column. + - Pairs of column name and agg tuples passed as kwargs + eg. col1=("a", "sum"), col2=("b", "prod"). The output column names are + the keys. The aggs are applied to the corresponding column in the tuple. Each agg can be string or lambda functions. """ aggs_per_column: Iterable[AggType | Iterable[AggType]] - if isinstance(aggs, dict): - column_names, aggs_per_column = aggs.keys(), aggs.values() - columns = tuple(self.obj._data[col] for col in column_names) - else: - values = self.grouping.values - column_names = values._column_names - columns = values._columns - aggs_per_column = (aggs,) * len(columns) + if aggs: + if isinstance(aggs, dict): + column_names, aggs_per_column = aggs.keys(), aggs.values() + columns = tuple(self.obj._data[col] for col in column_names) + else: + values = self.grouping.values + column_names = values._column_names + columns = values._columns + aggs_per_column = (aggs,) * len(columns) + elif not aggs and kwargs: + column_names, aggs_per_column = kwargs.keys(), kwargs.values() + columns = tuple(self.obj._data[x[1][0]] for x in kwargs.items()) + aggs_per_column = [x[1] for x in kwargs.values()] # is_list_like performs type narrowing but type-checkers don't # know it. One could add a TypeGuard annotation to diff --git a/python/cudf/cudf/tests/groupby/test_agg.py b/python/cudf/cudf/tests/groupby/test_agg.py index f8847f02d5a..2351c68a02e 100644 --- a/python/cudf/cudf/tests/groupby/test_agg.py +++ b/python/cudf/cudf/tests/groupby/test_agg.py @@ -1,5 +1,6 @@ # Copyright (c) 2023-2024, NVIDIA CORPORATION. import numpy as np +import pandas as pd import pytest import cudf @@ -26,3 +27,32 @@ def test_series_agg(attr): pd_agg = getattr(pdf.groupby(["a"])["a"], attr)("count") assert agg.ndim == pd_agg.ndim + + +@pytest.mark.parametrize("func", ["sum", "prod", "mean", "count"]) +@pytest.mark.parametrize("attr", ["agg", "aggregate"]) +def test_dataframe_agg(attr, func): + df = cudf.DataFrame({"a": [1, 2, 1, 2], "b": [0, 0, 0, 0]}) + pdf = df.to_pandas() + + agg = getattr(df.groupby("a"), attr)(func) + pd_agg = getattr(pdf.groupby(["a"]), attr)(func) + + pd.testing.assert_frame_equal(agg.to_pandas(), pd_agg) + + agg = getattr(df.groupby("a"), attr)({"b": func}) + pd_agg = getattr(pdf.groupby(["a"]), attr)({"b": func}) + + pd.testing.assert_frame_equal(agg.to_pandas(), pd_agg) + + agg = getattr(df.groupby("a"), attr)([func]) + pd_agg = getattr(pdf.groupby(["a"]), attr)([func]) + + pd.testing.assert_frame_equal(agg.to_pandas(), pd_agg) + + agg = getattr(df.groupby("a"), attr)(foo=("b", func), bar=("a", func)) + pd_agg = getattr(pdf.groupby(["a"]), attr)( + foo=("b", func), bar=("a", func) + ) + + pd.testing.assert_frame_equal(agg.to_pandas(), pd_agg) diff --git a/python/cudf/cudf/tests/test_column_accessor.py b/python/cudf/cudf/tests/test_column_accessor.py index e84e1433c10..246f5fe2cad 100644 --- a/python/cudf/cudf/tests/test_column_accessor.py +++ b/python/cudf/cudf/tests/test_column_accessor.py @@ -362,6 +362,10 @@ def test_replace_level_values_MultiColumn(): got = ca.rename_levels(mapper={"a": "f"}, level=0) check_ca_equal(expect, got) + with pytest.raises(UserWarning): + got = ca.rename_levels(mapper={"a": "f"}) + check_ca_equal(expect, got) + def test_clear_nrows_empty_before(): ca = ColumnAccessor({})