From 68d8007da3f81260c438865f43c8860268af0423 Mon Sep 17 00:00:00 2001 From: Matthew Murray Date: Fri, 9 Aug 2024 18:21:36 -0700 Subject: [PATCH 01/12] Initial commit --- python/cudf/cudf/core/column_accessor.py | 12 ++++-- python/cudf/cudf/core/groupby/groupby.py | 39 +++++++++++-------- python/cudf/cudf/tests/groupby/test_agg.py | 30 ++++++++++++++ .../cudf/cudf/tests/test_column_accessor.py | 4 ++ 4 files changed, 65 insertions(+), 20 deletions(-) diff --git a/python/cudf/cudf/core/column_accessor.py b/python/cudf/cudf/core/column_accessor.py index 819d351b2c4..92b6e5dcefd 100644 --- a/python/cudf/cudf/core/column_accessor.py +++ b/python/cudf/cudf/core/column_accessor.py @@ -4,6 +4,7 @@ import itertools import sys +import warnings from collections import abc from functools import cached_property, reduce from typing import TYPE_CHECKING, Any, Callable, Mapping @@ -606,7 +607,7 @@ def _pad_key(self, key: Any, pad_value="") -> Any: return key + (pad_value,) * (self.nlevels - len(key)) def rename_levels( - self, mapper: Mapping[Any, Any] | Callable, level: int | None + self, mapper: Mapping[Any, Any] | Callable, level: int | None = None ) -> ColumnAccessor: """ Rename the specified levels of the given ColumnAccessor @@ -649,10 +650,13 @@ def rename_column(x): return x if level is None: - raise NotImplementedError( - "Renaming columns with a MultiIndex and level=None is" - "not supported" + warnings.warn( + "Renaming columns with MultiIndex assuming level=0. " + "Specify the level keyword argument to rename using " + "a different level eg. df.rename(..., level=1)", + UserWarning, ) + level = 0 new_col_names = (rename_column(k) for k in self.keys()) else: diff --git a/python/cudf/cudf/core/groupby/groupby.py b/python/cudf/cudf/core/groupby/groupby.py index 92c4b73ceaa..f2bcd0d4af3 100644 --- a/python/cudf/cudf/core/groupby/groupby.py +++ b/python/cudf/cudf/core/groupby/groupby.py @@ -548,7 +548,7 @@ def _groupby(self): ) @_performance_tracking - def agg(self, func, *args, engine=None, engine_kwargs=None, **kwargs): + def agg(self, func=None, *args, engine=None, engine_kwargs=None, **kwargs): """ Apply aggregation(s) to the groups. @@ -648,11 +648,10 @@ def agg(self, func, *args, engine=None, engine_kwargs=None, **kwargs): raise NotImplementedError( "Passing args to func is currently not supported." ) - if kwargs: - raise NotImplementedError( - "Passing kwargs to func is currently not supported." - ) - column_names, columns, normalized_aggs = self._normalize_aggs(func) + + column_names, columns, normalized_aggs = self._normalize_aggs( + func, **kwargs + ) orig_dtypes = tuple(c.dtype for c in columns) # Note: When there are no key columns, the below produces @@ -1266,11 +1265,11 @@ def _grouped(self, *, include_groups: bool = True): return (group_names, offsets, grouped_keys, grouped_values) def _normalize_aggs( - self, aggs: MultiColumnAggType + self, aggs: MultiColumnAggType, **kwargs ) -> tuple[Iterable[Any], tuple[ColumnBase, ...], list[list[AggType]]]: """ Normalize aggs to a list of list of aggregations, where `out[i]` - is a list of aggregations for column `self.obj[i]`. We support three + is a list of aggregations for column `self.obj[i]`. We support four different form of `aggs` input here: - A single agg, such as "sum". This agg is applied to all value columns. @@ -1279,18 +1278,26 @@ def _normalize_aggs( - A mapping of column name to aggs, such as {"a": ["sum"], "b": ["mean"]}, the aggs are applied to specified column. + - Pairs of column name and agg tuples passed as kwargs + eg. col1=("a", "sum"), col2=("b", "prod"). The output column names are + the keys. The aggs are applied to the corresponding column in the tuple. Each agg can be string or lambda functions. """ aggs_per_column: Iterable[AggType | Iterable[AggType]] - if isinstance(aggs, dict): - column_names, aggs_per_column = aggs.keys(), aggs.values() - columns = tuple(self.obj._data[col] for col in column_names) - else: - values = self.grouping.values - column_names = values._column_names - columns = values._columns - aggs_per_column = (aggs,) * len(columns) + if aggs: + if isinstance(aggs, dict): + column_names, aggs_per_column = aggs.keys(), aggs.values() + columns = tuple(self.obj._data[col] for col in column_names) + else: + values = self.grouping.values + column_names = values._column_names + columns = values._columns + aggs_per_column = (aggs,) * len(columns) + elif not aggs and kwargs: + column_names, aggs_per_column = kwargs.keys(), kwargs.values() + columns = tuple(self.obj._data[x[1][0]] for x in kwargs.items()) + aggs_per_column = [x[1] for x in kwargs.values()] # is_list_like performs type narrowing but type-checkers don't # know it. One could add a TypeGuard annotation to diff --git a/python/cudf/cudf/tests/groupby/test_agg.py b/python/cudf/cudf/tests/groupby/test_agg.py index f8847f02d5a..2351c68a02e 100644 --- a/python/cudf/cudf/tests/groupby/test_agg.py +++ b/python/cudf/cudf/tests/groupby/test_agg.py @@ -1,5 +1,6 @@ # Copyright (c) 2023-2024, NVIDIA CORPORATION. import numpy as np +import pandas as pd import pytest import cudf @@ -26,3 +27,32 @@ def test_series_agg(attr): pd_agg = getattr(pdf.groupby(["a"])["a"], attr)("count") assert agg.ndim == pd_agg.ndim + + +@pytest.mark.parametrize("func", ["sum", "prod", "mean", "count"]) +@pytest.mark.parametrize("attr", ["agg", "aggregate"]) +def test_dataframe_agg(attr, func): + df = cudf.DataFrame({"a": [1, 2, 1, 2], "b": [0, 0, 0, 0]}) + pdf = df.to_pandas() + + agg = getattr(df.groupby("a"), attr)(func) + pd_agg = getattr(pdf.groupby(["a"]), attr)(func) + + pd.testing.assert_frame_equal(agg.to_pandas(), pd_agg) + + agg = getattr(df.groupby("a"), attr)({"b": func}) + pd_agg = getattr(pdf.groupby(["a"]), attr)({"b": func}) + + pd.testing.assert_frame_equal(agg.to_pandas(), pd_agg) + + agg = getattr(df.groupby("a"), attr)([func]) + pd_agg = getattr(pdf.groupby(["a"]), attr)([func]) + + pd.testing.assert_frame_equal(agg.to_pandas(), pd_agg) + + agg = getattr(df.groupby("a"), attr)(foo=("b", func), bar=("a", func)) + pd_agg = getattr(pdf.groupby(["a"]), attr)( + foo=("b", func), bar=("a", func) + ) + + pd.testing.assert_frame_equal(agg.to_pandas(), pd_agg) diff --git a/python/cudf/cudf/tests/test_column_accessor.py b/python/cudf/cudf/tests/test_column_accessor.py index e84e1433c10..246f5fe2cad 100644 --- a/python/cudf/cudf/tests/test_column_accessor.py +++ b/python/cudf/cudf/tests/test_column_accessor.py @@ -362,6 +362,10 @@ def test_replace_level_values_MultiColumn(): got = ca.rename_levels(mapper={"a": "f"}, level=0) check_ca_equal(expect, got) + with pytest.raises(UserWarning): + got = ca.rename_levels(mapper={"a": "f"}) + check_ca_equal(expect, got) + def test_clear_nrows_empty_before(): ca = ColumnAccessor({}) From fada974ad92adeaab1275934d0b376048b0826c4 Mon Sep 17 00:00:00 2001 From: Matthew Murray Date: Sun, 11 Aug 2024 12:24:47 -0700 Subject: [PATCH 02/12] proivide >= 1 agg --- python/cudf/cudf/core/groupby/groupby.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/python/cudf/cudf/core/groupby/groupby.py b/python/cudf/cudf/core/groupby/groupby.py index f2bcd0d4af3..55dab17ce65 100644 --- a/python/cudf/cudf/core/groupby/groupby.py +++ b/python/cudf/cudf/core/groupby/groupby.py @@ -1298,6 +1298,8 @@ def _normalize_aggs( column_names, aggs_per_column = kwargs.keys(), kwargs.values() columns = tuple(self.obj._data[x[1][0]] for x in kwargs.items()) aggs_per_column = [x[1] for x in kwargs.values()] + else: + raise TypeError("Must provide at least one aggregation function.") # is_list_like performs type narrowing but type-checkers don't # know it. One could add a TypeGuard annotation to From a80bb28c8c67f9ff1d54e1f9e81c24d4c7e9fdc2 Mon Sep 17 00:00:00 2001 From: Matthew Murray Date: Mon, 12 Aug 2024 04:51:26 -0700 Subject: [PATCH 03/12] remove test_groupy_all_columns --- .../dask_cudf/dask_cudf/tests/test_groupby.py | 58 +++++++++---------- 1 file changed, 29 insertions(+), 29 deletions(-) diff --git a/python/dask_cudf/dask_cudf/tests/test_groupby.py b/python/dask_cudf/dask_cudf/tests/test_groupby.py index cf916b713b2..94c4cdd3361 100644 --- a/python/dask_cudf/dask_cudf/tests/test_groupby.py +++ b/python/dask_cudf/dask_cudf/tests/test_groupby.py @@ -774,35 +774,35 @@ def test_groupby_nested_dict(func): dd.assert_eq(a, b) -@pytest.mark.parametrize( - "func", - [ - lambda df: df.groupby(["x", "y"]).min(), - pytest.param( - lambda df: df.groupby(["x", "y"]).agg("min"), - marks=pytest.mark.skip( - reason="https://github.com/dask/dask/issues/9093" - ), - ), - lambda df: df.groupby(["x", "y"]).y.min(), - lambda df: df.groupby(["x", "y"]).y.agg("min"), - ], -) -def test_groupby_all_columns(func): - pdf = pd.DataFrame( - { - "x": np.random.randint(0, 5, size=10000), - "y": np.random.normal(size=10000), - } - ) - - ddf = dd.from_pandas(pdf, npartitions=5) - gddf = ddf.to_backend("cudf") - - expect = func(ddf) - actual = func(gddf) - - dd.assert_eq(expect, actual, check_names=not QUERY_PLANNING_ON) +# @pytest.mark.parametrize( +# "func", +# [ +# lambda df: df.groupby(["x", "y"]).min(), +# pytest.param( +# lambda df: df.groupby(["x", "y"]).agg("min"), +# marks=pytest.mark.skip( +# reason="https://github.com/dask/dask/issues/9093" +# ), +# ), +# lambda df: df.groupby(["x", "y"]).y.min(), +# lambda df: df.groupby(["x", "y"]).y.agg("min"), +# ], +# ) +# def test_groupby_all_columns(func): +# pdf = pd.DataFrame( +# { +# "x": np.random.randint(0, 5, size=10000), +# "y": np.random.normal(size=10000), +# } +# ) + +# ddf = dd.from_pandas(pdf, npartitions=5) +# gddf = ddf.to_backend("cudf") + +# expect = func(ddf) +# actual = func(gddf) + +# dd.assert_eq(expect, actual, check_names=not QUERY_PLANNING_ON) def test_groupby_shuffle(): From aa819c5a5ac5daccbd65a05a26411f644041db59 Mon Sep 17 00:00:00 2001 From: Matthew Murray Date: Mon, 12 Aug 2024 05:41:52 -0700 Subject: [PATCH 04/12] refactor --- python/cudf/cudf/core/groupby/groupby.py | 2 +- .../dask_cudf/dask_cudf/tests/test_groupby.py | 58 +++++++++---------- 2 files changed, 30 insertions(+), 30 deletions(-) diff --git a/python/cudf/cudf/core/groupby/groupby.py b/python/cudf/cudf/core/groupby/groupby.py index 55dab17ce65..8b318ca2ba3 100644 --- a/python/cudf/cudf/core/groupby/groupby.py +++ b/python/cudf/cudf/core/groupby/groupby.py @@ -1297,7 +1297,7 @@ def _normalize_aggs( elif not aggs and kwargs: column_names, aggs_per_column = kwargs.keys(), kwargs.values() columns = tuple(self.obj._data[x[1][0]] for x in kwargs.items()) - aggs_per_column = [x[1] for x in kwargs.values()] + aggs_per_column = tuple(x[1] for x in kwargs.values()) else: raise TypeError("Must provide at least one aggregation function.") diff --git a/python/dask_cudf/dask_cudf/tests/test_groupby.py b/python/dask_cudf/dask_cudf/tests/test_groupby.py index 94c4cdd3361..cf916b713b2 100644 --- a/python/dask_cudf/dask_cudf/tests/test_groupby.py +++ b/python/dask_cudf/dask_cudf/tests/test_groupby.py @@ -774,35 +774,35 @@ def test_groupby_nested_dict(func): dd.assert_eq(a, b) -# @pytest.mark.parametrize( -# "func", -# [ -# lambda df: df.groupby(["x", "y"]).min(), -# pytest.param( -# lambda df: df.groupby(["x", "y"]).agg("min"), -# marks=pytest.mark.skip( -# reason="https://github.com/dask/dask/issues/9093" -# ), -# ), -# lambda df: df.groupby(["x", "y"]).y.min(), -# lambda df: df.groupby(["x", "y"]).y.agg("min"), -# ], -# ) -# def test_groupby_all_columns(func): -# pdf = pd.DataFrame( -# { -# "x": np.random.randint(0, 5, size=10000), -# "y": np.random.normal(size=10000), -# } -# ) - -# ddf = dd.from_pandas(pdf, npartitions=5) -# gddf = ddf.to_backend("cudf") - -# expect = func(ddf) -# actual = func(gddf) - -# dd.assert_eq(expect, actual, check_names=not QUERY_PLANNING_ON) +@pytest.mark.parametrize( + "func", + [ + lambda df: df.groupby(["x", "y"]).min(), + pytest.param( + lambda df: df.groupby(["x", "y"]).agg("min"), + marks=pytest.mark.skip( + reason="https://github.com/dask/dask/issues/9093" + ), + ), + lambda df: df.groupby(["x", "y"]).y.min(), + lambda df: df.groupby(["x", "y"]).y.agg("min"), + ], +) +def test_groupby_all_columns(func): + pdf = pd.DataFrame( + { + "x": np.random.randint(0, 5, size=10000), + "y": np.random.normal(size=10000), + } + ) + + ddf = dd.from_pandas(pdf, npartitions=5) + gddf = ddf.to_backend("cudf") + + expect = func(ddf) + actual = func(gddf) + + dd.assert_eq(expect, actual, check_names=not QUERY_PLANNING_ON) def test_groupby_shuffle(): From 3f3f6da421e7c98cb2110771e52095bcaa7c0e05 Mon Sep 17 00:00:00 2001 From: Matthew Murray Date: Mon, 12 Aug 2024 08:47:50 -0700 Subject: [PATCH 05/12] Add kwargs to dask_cudf groupby.agg --- python/dask_cudf/dask_cudf/groupby.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/python/dask_cudf/dask_cudf/groupby.py b/python/dask_cudf/dask_cudf/groupby.py index bbbcde17b51..4b70550ffd7 100644 --- a/python/dask_cudf/dask_cudf/groupby.py +++ b/python/dask_cudf/dask_cudf/groupby.py @@ -196,7 +196,7 @@ def last(self, split_every=None, split_out=1): @_deprecate_shuffle_kwarg @_dask_cudf_performance_tracking def aggregate( - self, arg, split_every=None, split_out=1, shuffle_method=None + self, arg, split_every=None, split_out=1, shuffle_method=None, **kwargs ): if arg == "size": return self.size() @@ -341,7 +341,7 @@ def last(self, split_every=None, split_out=1): @_deprecate_shuffle_kwarg @_dask_cudf_performance_tracking def aggregate( - self, arg, split_every=None, split_out=1, shuffle_method=None + self, arg, split_every=None, split_out=1, shuffle_method=None, **kwargs ): if arg == "size": return self.size() From 2fcff3a3575dfbe1795e92e9d52dace903894f75 Mon Sep 17 00:00:00 2001 From: Matthew Murray Date: Mon, 12 Aug 2024 12:24:01 -0700 Subject: [PATCH 06/12] comment out test case --- python/dask_cudf/dask_cudf/groupby.py | 4 ++-- python/dask_cudf/dask_cudf/tests/test_groupby.py | 12 ++++++------ 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/python/dask_cudf/dask_cudf/groupby.py b/python/dask_cudf/dask_cudf/groupby.py index 4b70550ffd7..bbbcde17b51 100644 --- a/python/dask_cudf/dask_cudf/groupby.py +++ b/python/dask_cudf/dask_cudf/groupby.py @@ -196,7 +196,7 @@ def last(self, split_every=None, split_out=1): @_deprecate_shuffle_kwarg @_dask_cudf_performance_tracking def aggregate( - self, arg, split_every=None, split_out=1, shuffle_method=None, **kwargs + self, arg, split_every=None, split_out=1, shuffle_method=None ): if arg == "size": return self.size() @@ -341,7 +341,7 @@ def last(self, split_every=None, split_out=1): @_deprecate_shuffle_kwarg @_dask_cudf_performance_tracking def aggregate( - self, arg, split_every=None, split_out=1, shuffle_method=None, **kwargs + self, arg, split_every=None, split_out=1, shuffle_method=None ): if arg == "size": return self.size() diff --git a/python/dask_cudf/dask_cudf/tests/test_groupby.py b/python/dask_cudf/dask_cudf/tests/test_groupby.py index cf916b713b2..d67d8b84c56 100644 --- a/python/dask_cudf/dask_cudf/tests/test_groupby.py +++ b/python/dask_cudf/dask_cudf/tests/test_groupby.py @@ -778,12 +778,12 @@ def test_groupby_nested_dict(func): "func", [ lambda df: df.groupby(["x", "y"]).min(), - pytest.param( - lambda df: df.groupby(["x", "y"]).agg("min"), - marks=pytest.mark.skip( - reason="https://github.com/dask/dask/issues/9093" - ), - ), + # pytest.param( + # lambda df: df.groupby(["x", "y"]).agg("min"), + # marks=pytest.mark.skip( + # reason="https://github.com/dask/dask/issues/9093" + # ), + # ), lambda df: df.groupby(["x", "y"]).y.min(), lambda df: df.groupby(["x", "y"]).y.agg("min"), ], From 9deb3c980612163dce158f8269179bf7c7d94e41 Mon Sep 17 00:00:00 2001 From: Matthew Murray Date: Mon, 12 Aug 2024 19:38:40 -0700 Subject: [PATCH 07/12] add isinstance check for dict --- python/cudf/cudf/core/groupby/groupby.py | 2 +- python/dask_cudf/dask_cudf/tests/test_groupby.py | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/python/cudf/cudf/core/groupby/groupby.py b/python/cudf/cudf/core/groupby/groupby.py index 8b318ca2ba3..38bf91f1fd2 100644 --- a/python/cudf/cudf/core/groupby/groupby.py +++ b/python/cudf/cudf/core/groupby/groupby.py @@ -1285,7 +1285,7 @@ def _normalize_aggs( """ aggs_per_column: Iterable[AggType | Iterable[AggType]] - if aggs: + if aggs or isinstance(aggs, dict): if isinstance(aggs, dict): column_names, aggs_per_column = aggs.keys(), aggs.values() columns = tuple(self.obj._data[col] for col in column_names) diff --git a/python/dask_cudf/dask_cudf/tests/test_groupby.py b/python/dask_cudf/dask_cudf/tests/test_groupby.py index d67d8b84c56..00840f6157f 100644 --- a/python/dask_cudf/dask_cudf/tests/test_groupby.py +++ b/python/dask_cudf/dask_cudf/tests/test_groupby.py @@ -784,8 +784,8 @@ def test_groupby_nested_dict(func): # reason="https://github.com/dask/dask/issues/9093" # ), # ), - lambda df: df.groupby(["x", "y"]).y.min(), - lambda df: df.groupby(["x", "y"]).y.agg("min"), + # lambda df: df.groupby(["x", "y"]).y.min(), + # lambda df: df.groupby(["x", "y"]).y.agg("min"), ], ) def test_groupby_all_columns(func): From 8cf926d2ce2682152c2c971aaf614293d3c74320 Mon Sep 17 00:00:00 2001 From: Matthew Murray Date: Tue, 13 Aug 2024 14:47:57 -0700 Subject: [PATCH 08/12] add todo --- python/cudf/cudf/core/groupby/groupby.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/python/cudf/cudf/core/groupby/groupby.py b/python/cudf/cudf/core/groupby/groupby.py index 38bf91f1fd2..691062dfd7e 100644 --- a/python/cudf/cudf/core/groupby/groupby.py +++ b/python/cudf/cudf/core/groupby/groupby.py @@ -1285,6 +1285,8 @@ def _normalize_aggs( """ aggs_per_column: Iterable[AggType | Iterable[AggType]] + # TODO: Remove isinstance condition when the legacy dask_cudf API is removed. + # See https://github.com/rapidsai/cudf/pull/16528#discussion_r1715482302 for information. if aggs or isinstance(aggs, dict): if isinstance(aggs, dict): column_names, aggs_per_column = aggs.keys(), aggs.values() From dbb63981c4baf3468fcd0e9f8d00bde63ba39d22 Mon Sep 17 00:00:00 2001 From: Matthew Murray Date: Tue, 13 Aug 2024 14:52:17 -0700 Subject: [PATCH 09/12] uncomment test cases --- python/dask_cudf/dask_cudf/tests/test_groupby.py | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/python/dask_cudf/dask_cudf/tests/test_groupby.py b/python/dask_cudf/dask_cudf/tests/test_groupby.py index 00840f6157f..cf916b713b2 100644 --- a/python/dask_cudf/dask_cudf/tests/test_groupby.py +++ b/python/dask_cudf/dask_cudf/tests/test_groupby.py @@ -778,14 +778,14 @@ def test_groupby_nested_dict(func): "func", [ lambda df: df.groupby(["x", "y"]).min(), - # pytest.param( - # lambda df: df.groupby(["x", "y"]).agg("min"), - # marks=pytest.mark.skip( - # reason="https://github.com/dask/dask/issues/9093" - # ), - # ), - # lambda df: df.groupby(["x", "y"]).y.min(), - # lambda df: df.groupby(["x", "y"]).y.agg("min"), + pytest.param( + lambda df: df.groupby(["x", "y"]).agg("min"), + marks=pytest.mark.skip( + reason="https://github.com/dask/dask/issues/9093" + ), + ), + lambda df: df.groupby(["x", "y"]).y.min(), + lambda df: df.groupby(["x", "y"]).y.agg("min"), ], ) def test_groupby_all_columns(func): From 107bfc749d1d498482eff19b4bc707a683d298d3 Mon Sep 17 00:00:00 2001 From: Matthew Murray Date: Wed, 14 Aug 2024 05:48:56 -0700 Subject: [PATCH 10/12] Address review --- python/cudf/cudf/core/column_accessor.py | 7 ------- python/cudf/cudf/core/groupby/groupby.py | 2 +- python/cudf/cudf/tests/groupby/test_agg.py | 10 +++++----- 3 files changed, 6 insertions(+), 13 deletions(-) diff --git a/python/cudf/cudf/core/column_accessor.py b/python/cudf/cudf/core/column_accessor.py index 2346695984a..48bc84070b1 100644 --- a/python/cudf/cudf/core/column_accessor.py +++ b/python/cudf/cudf/core/column_accessor.py @@ -4,7 +4,6 @@ import itertools import sys -import warnings from collections import abc from functools import cached_property, reduce from typing import TYPE_CHECKING, Any, Callable, Mapping @@ -654,12 +653,6 @@ def rename_column(x): return x if level is None: - warnings.warn( - "Renaming columns with MultiIndex assuming level=0. " - "Specify the level keyword argument to rename using " - "a different level eg. df.rename(..., level=1)", - UserWarning, - ) level = 0 new_col_names = (rename_column(k) for k in self.keys()) diff --git a/python/cudf/cudf/core/groupby/groupby.py b/python/cudf/cudf/core/groupby/groupby.py index 691062dfd7e..9b71ea57f1f 100644 --- a/python/cudf/cudf/core/groupby/groupby.py +++ b/python/cudf/cudf/core/groupby/groupby.py @@ -1298,7 +1298,7 @@ def _normalize_aggs( aggs_per_column = (aggs,) * len(columns) elif not aggs and kwargs: column_names, aggs_per_column = kwargs.keys(), kwargs.values() - columns = tuple(self.obj._data[x[1][0]] for x in kwargs.items()) + columns = tuple(self.obj._data[x[0]] for x in kwargs.values()) aggs_per_column = tuple(x[1] for x in kwargs.values()) else: raise TypeError("Must provide at least one aggregation function.") diff --git a/python/cudf/cudf/tests/groupby/test_agg.py b/python/cudf/cudf/tests/groupby/test_agg.py index 2351c68a02e..99e7523031b 100644 --- a/python/cudf/cudf/tests/groupby/test_agg.py +++ b/python/cudf/cudf/tests/groupby/test_agg.py @@ -1,9 +1,9 @@ # Copyright (c) 2023-2024, NVIDIA CORPORATION. import numpy as np -import pandas as pd import pytest import cudf +from cudf.testing import assert_eq @pytest.mark.parametrize( @@ -38,21 +38,21 @@ def test_dataframe_agg(attr, func): agg = getattr(df.groupby("a"), attr)(func) pd_agg = getattr(pdf.groupby(["a"]), attr)(func) - pd.testing.assert_frame_equal(agg.to_pandas(), pd_agg) + assert_eq(agg, pd_agg) agg = getattr(df.groupby("a"), attr)({"b": func}) pd_agg = getattr(pdf.groupby(["a"]), attr)({"b": func}) - pd.testing.assert_frame_equal(agg.to_pandas(), pd_agg) + assert_eq(agg, pd_agg) agg = getattr(df.groupby("a"), attr)([func]) pd_agg = getattr(pdf.groupby(["a"]), attr)([func]) - pd.testing.assert_frame_equal(agg.to_pandas(), pd_agg) + assert_eq(agg, pd_agg) agg = getattr(df.groupby("a"), attr)(foo=("b", func), bar=("a", func)) pd_agg = getattr(pdf.groupby(["a"]), attr)( foo=("b", func), bar=("a", func) ) - pd.testing.assert_frame_equal(agg.to_pandas(), pd_agg) + assert_eq(agg, pd_agg) From 7ee6fa2f4185bbcd640db1efe40a160a0aa6c052 Mon Sep 17 00:00:00 2001 From: Matthew Murray Date: Wed, 14 Aug 2024 09:47:07 -0700 Subject: [PATCH 11/12] forgot to remote expected warning --- python/cudf/cudf/tests/test_column_accessor.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/python/cudf/cudf/tests/test_column_accessor.py b/python/cudf/cudf/tests/test_column_accessor.py index 246f5fe2cad..2d7bc809d4d 100644 --- a/python/cudf/cudf/tests/test_column_accessor.py +++ b/python/cudf/cudf/tests/test_column_accessor.py @@ -362,9 +362,9 @@ def test_replace_level_values_MultiColumn(): got = ca.rename_levels(mapper={"a": "f"}, level=0) check_ca_equal(expect, got) - with pytest.raises(UserWarning): - got = ca.rename_levels(mapper={"a": "f"}) - check_ca_equal(expect, got) + # passing without level kwarg assumes level=0 + got = ca.rename_levels(mapper={"a": "f"}) + check_ca_equal(expect, got) def test_clear_nrows_empty_before(): From 97f0ce6b6e6dc3628955f32147ddcbdab8828f64 Mon Sep 17 00:00:00 2001 From: Matthew Murray Date: Wed, 14 Aug 2024 10:54:02 -0700 Subject: [PATCH 12/12] xfail test passes now --- python/cudf/cudf/tests/test_dataframe.py | 1 - 1 file changed, 1 deletion(-) diff --git a/python/cudf/cudf/tests/test_dataframe.py b/python/cudf/cudf/tests/test_dataframe.py index 2c59253d500..89eb5a12c71 100644 --- a/python/cudf/cudf/tests/test_dataframe.py +++ b/python/cudf/cudf/tests/test_dataframe.py @@ -9409,7 +9409,6 @@ def test_rename_for_level_RangeIndex_dataframe(): assert_eq(expect, got) -@pytest_xfail(reason="level=None not implemented yet") def test_rename_for_level_is_None_MC(): gdf = cudf.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6], "c": [7, 8, 9]}) gdf.columns = pd.MultiIndex.from_tuples([("a", 1), ("a", 2), ("b", 1)])