From 789a02eef9524ddb154443cc11ce229ac08085f2 Mon Sep 17 00:00:00 2001 From: Devin Petersohn Date: Mon, 7 Nov 2022 10:24:45 -0600 Subject: [PATCH 01/12] FIX-#5200: Use squeeze parameter instead of SeriesGroupby Signed-off-by: Devin Petersohn --- modin/pandas/groupby.py | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/modin/pandas/groupby.py b/modin/pandas/groupby.py index dcd990f13b2..8d6f6c2b628 100644 --- a/modin/pandas/groupby.py +++ b/modin/pandas/groupby.py @@ -425,13 +425,15 @@ def __getitem__(self, key): if is_list_like(key): make_dataframe = True else: + key = [key] if self._as_index: make_dataframe = False else: make_dataframe = True - key = [key] + internal_by = frozenset(self._internal_by) + cols_to_grab = internal_by.union(key) + key = [col for col in self._df.columns if col in cols_to_grab] if make_dataframe: - internal_by = frozenset(self._internal_by) if len(internal_by.intersection(key)) != 0: ErrorMessage.missmatch_with_pandas( operation="GroupBy.__getitem__", @@ -443,8 +445,6 @@ def __getitem__(self, key): + "df.groupby(df['by_column'].copy())['by_column']" ), ) - cols_to_grab = internal_by.union(key) - key = [col for col in self._df.columns if col in cols_to_grab] return DataFrameGroupBy( self._df[key], drop=self._drop, @@ -459,7 +459,8 @@ def __getitem__(self, key): "Column lookups on GroupBy with arbitrary Series in by" + " is not yet supported." ) - return SeriesGroupBy( + kwargs["squeeze"] = True + return DataFrameGroupBy( self._df[key], drop=False, **kwargs, From e48fdd4b980742c978a454dd17d7ce10f291b591 Mon Sep 17 00:00:00 2001 From: Devin Petersohn Date: Thu, 17 Nov 2022 14:36:30 -0600 Subject: [PATCH 02/12] Move code Signed-off-by: Devin Petersohn --- modin/pandas/groupby.py | 21 +++++++++++++++++++-- 1 file changed, 19 insertions(+), 2 deletions(-) diff --git a/modin/pandas/groupby.py b/modin/pandas/groupby.py index 8d6f6c2b628..d18e1509c70 100644 --- a/modin/pandas/groupby.py +++ b/modin/pandas/groupby.py @@ -459,8 +459,7 @@ def __getitem__(self, key): "Column lookups on GroupBy with arbitrary Series in by" + " is not yet supported." ) - kwargs["squeeze"] = True - return DataFrameGroupBy( + return SeriesGroupBy( self._df[key], drop=False, **kwargs, @@ -1195,6 +1194,24 @@ def groupby_on_multiple_columns(df, *args, **kwargs): @_inherit_docstrings(pandas.core.groupby.SeriesGroupBy) class SeriesGroupBy(SeriesGroupByCompat, DataFrameGroupBy): + + def __init__( + self, + df, + by, + axis, + level, + as_index, + sort, + group_keys, + squeeze, + idx_name, + drop, + **kwargs, + ): + super(SeriesGroupBy, self).__init__(df, by, axis, level, as_index, sort, group_keys, squeeze, idx_name, drop, **kwargs) + self._squeeze = True + @property def ndim(self): """ From 268f9da8dfc52c0e5575c1117b189c92951dde2d Mon Sep 17 00:00:00 2001 From: Devin Petersohn Date: Tue, 22 Nov 2022 11:48:12 -0600 Subject: [PATCH 03/12] Lint Signed-off-by: Devin Petersohn --- modin/pandas/groupby.py | 19 +++++++++++++++---- 1 file changed, 15 insertions(+), 4 deletions(-) diff --git a/modin/pandas/groupby.py b/modin/pandas/groupby.py index d18e1509c70..48df124c32b 100644 --- a/modin/pandas/groupby.py +++ b/modin/pandas/groupby.py @@ -1194,9 +1194,21 @@ def groupby_on_multiple_columns(df, *args, **kwargs): @_inherit_docstrings(pandas.core.groupby.SeriesGroupBy) class SeriesGroupBy(SeriesGroupByCompat, DataFrameGroupBy): - def __init__( - self, + self, + df, + by, + axis, + level, + as_index, + sort, + group_keys, + squeeze, + idx_name, + drop, + **kwargs, + ): + super(SeriesGroupBy, self).__init__( df, by, axis, @@ -1208,8 +1220,7 @@ def __init__( idx_name, drop, **kwargs, - ): - super(SeriesGroupBy, self).__init__(df, by, axis, level, as_index, sort, group_keys, squeeze, idx_name, drop, **kwargs) + ) self._squeeze = True @property From 1c9ff0ab67695f770324155339dc8866a5c05970 Mon Sep 17 00:00:00 2001 From: Devin Petersohn Date: Mon, 28 Nov 2022 10:50:36 -0600 Subject: [PATCH 04/12] Fix the bugs Signed-off-by: Devin Petersohn --- modin/pandas/groupby.py | 16 +++++++++++++++- modin/pandas/test/utils.py | 1 + 2 files changed, 16 insertions(+), 1 deletion(-) diff --git a/modin/pandas/groupby.py b/modin/pandas/groupby.py index 48df124c32b..e1acc608cea 100644 --- a/modin/pandas/groupby.py +++ b/modin/pandas/groupby.py @@ -461,7 +461,7 @@ def __getitem__(self, key): ) return SeriesGroupBy( self._df[key], - drop=False, + drop=True, **kwargs, ) @@ -1223,6 +1223,12 @@ def __init__( ) self._squeeze = True + def _default_to_pandas(self, f, *args, **kwargs): + intermediate = super(SeriesGroupBy, self)._default_to_pandas(f, *args, **kwargs) + if not isinstance(intermediate, Series) and self._squeeze: + return intermediate.squeeze(axis=1) + return intermediate + @property def ndim(self): """ @@ -1276,6 +1282,14 @@ def _iter(self): for k in (sorted(group_ids) if self._sort else group_ids) ) + def aggregate(self, func=None, *args, **kwargs): + if isinstance(func, (list, dict)): + self._squeeze = False + result = super(SeriesGroupBy, self).aggregate(func, *args, **kwargs) + self._squeeze = True + return result + + agg = aggregate if IsExperimental.get(): from modin.experimental.cloud.meta_magic import make_wrapped_class diff --git a/modin/pandas/test/utils.py b/modin/pandas/test/utils.py index 21565286309..9287607f8a8 100644 --- a/modin/pandas/test/utils.py +++ b/modin/pandas/test/utils.py @@ -615,6 +615,7 @@ def df_equals(df1, df2): elif isinstance(df1, np.recarray) and isinstance(df2, np.recarray): np.testing.assert_array_equal(df1, df2) else: + raise ValueError(f"type {type(df1)} not equal type {type(df2)}\n\n{df1}\n\n{df2}") if df1 != df2: np.testing.assert_almost_equal(df1, df2) From d7c621e29cacf34fa24bcd176d1db216350ed873 Mon Sep 17 00:00:00 2001 From: Devin Petersohn Date: Mon, 28 Nov 2022 10:53:49 -0600 Subject: [PATCH 05/12] lint Signed-off-by: Devin Petersohn --- modin/pandas/groupby.py | 1 + modin/pandas/test/utils.py | 1 - 2 files changed, 1 insertion(+), 1 deletion(-) diff --git a/modin/pandas/groupby.py b/modin/pandas/groupby.py index e1acc608cea..b143c9b7536 100644 --- a/modin/pandas/groupby.py +++ b/modin/pandas/groupby.py @@ -1291,6 +1291,7 @@ def aggregate(self, func=None, *args, **kwargs): agg = aggregate + if IsExperimental.get(): from modin.experimental.cloud.meta_magic import make_wrapped_class diff --git a/modin/pandas/test/utils.py b/modin/pandas/test/utils.py index 156c1860a90..412ebed29bd 100644 --- a/modin/pandas/test/utils.py +++ b/modin/pandas/test/utils.py @@ -626,7 +626,6 @@ def df_equals(df1, df2): elif isinstance(df1, np.recarray) and isinstance(df2, np.recarray): np.testing.assert_array_equal(df1, df2) else: - raise ValueError(f"type {type(df1)} not equal type {type(df2)}\n\n{df1}\n\n{df2}") if df1 != df2: np.testing.assert_almost_equal(df1, df2) From 79d57c723d2c01e45eb32c445ab9920eee00bac1 Mon Sep 17 00:00:00 2001 From: Devin Petersohn Date: Mon, 28 Nov 2022 10:59:38 -0600 Subject: [PATCH 06/12] Add docstring Signed-off-by: Devin Petersohn --- modin/pandas/groupby.py | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/modin/pandas/groupby.py b/modin/pandas/groupby.py index b143c9b7536..a09d8892c17 100644 --- a/modin/pandas/groupby.py +++ b/modin/pandas/groupby.py @@ -1224,6 +1224,23 @@ def __init__( self._squeeze = True def _default_to_pandas(self, f, *args, **kwargs): + """ + Execute function `f` in default-to-pandas way. + + Parameters + ---------- + f : callable + The function to apply to each group. + *args : list + Extra positional arguments to pass to `f`. + **kwargs : dict + Extra keyword arguments to pass to `f`. + + Returns + ------- + modin.pandas.Series or modin.pandas.DataFrame + A new Modin Series or DataFrame with the result of the pandas function. + """ intermediate = super(SeriesGroupBy, self)._default_to_pandas(f, *args, **kwargs) if not isinstance(intermediate, Series) and self._squeeze: return intermediate.squeeze(axis=1) From ab4948ab4d924e0521155075995f0ea9e11e68d9 Mon Sep 17 00:00:00 2001 From: Devin Petersohn Date: Mon, 28 Nov 2022 13:56:01 -0600 Subject: [PATCH 07/12] Fix bug Signed-off-by: Devin Petersohn --- modin/pandas/groupby.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/modin/pandas/groupby.py b/modin/pandas/groupby.py index a09d8892c17..55bdffd1bb4 100644 --- a/modin/pandas/groupby.py +++ b/modin/pandas/groupby.py @@ -1243,7 +1243,7 @@ def _default_to_pandas(self, f, *args, **kwargs): """ intermediate = super(SeriesGroupBy, self)._default_to_pandas(f, *args, **kwargs) if not isinstance(intermediate, Series) and self._squeeze: - return intermediate.squeeze(axis=1) + return intermediate.squeeze(axis=self._axis ^ 1) return intermediate @property From 1e59dd58e9a674ee08b40f803d4a713f4e177e1f Mon Sep 17 00:00:00 2001 From: Devin Petersohn Date: Wed, 30 Nov 2022 10:32:36 -0600 Subject: [PATCH 08/12] Fix squeeze issue Signed-off-by: Devin Petersohn --- modin/pandas/groupby.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/modin/pandas/groupby.py b/modin/pandas/groupby.py index 55bdffd1bb4..de5d01d9db9 100644 --- a/modin/pandas/groupby.py +++ b/modin/pandas/groupby.py @@ -1241,8 +1241,11 @@ def _default_to_pandas(self, f, *args, **kwargs): modin.pandas.Series or modin.pandas.DataFrame A new Modin Series or DataFrame with the result of the pandas function. """ + squeeze = self._squeeze + self._squeeze = False intermediate = super(SeriesGroupBy, self)._default_to_pandas(f, *args, **kwargs) - if not isinstance(intermediate, Series) and self._squeeze: + self._squeeze = squeeze + if not isinstance(intermediate, Series) and squeeze: return intermediate.squeeze(axis=self._axis ^ 1) return intermediate From e436f3e4cffc2282df7aa0cf1121b041df9845b9 Mon Sep 17 00:00:00 2001 From: Devin Petersohn Date: Wed, 30 Nov 2022 10:47:39 -0600 Subject: [PATCH 09/12] Fix Signed-off-by: Devin Petersohn --- modin/pandas/groupby.py | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/modin/pandas/groupby.py b/modin/pandas/groupby.py index de5d01d9db9..df1e1c7c47c 100644 --- a/modin/pandas/groupby.py +++ b/modin/pandas/groupby.py @@ -687,8 +687,6 @@ def size(self): if MODIN_UNNAMED_SERIES_LABEL in result.columns else result ) - elif isinstance(self._df, Series): - result.name = self._df.name else: result.name = None return result.fillna(0) @@ -1265,6 +1263,14 @@ def ndim(self): """ return 1 # ndim is always 1 for Series + def size(self): + intermediate = super(SeriesGroupBy, self).size() + if isinstance(self._df, Series): + intermediate.name = self._df.name + else: + intermediate.name = next(col_name for col_name in self._df.columns if col_name not in self._internal_by) + return intermediate + @property def _iter(self): """ From d0c8b932738d3af21f353193c9e514b98a36b688 Mon Sep 17 00:00:00 2001 From: Devin Petersohn Date: Wed, 30 Nov 2022 11:19:46 -0600 Subject: [PATCH 10/12] Lint Signed-off-by: Devin Petersohn --- modin/pandas/groupby.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/modin/pandas/groupby.py b/modin/pandas/groupby.py index df1e1c7c47c..c26e28b66e3 100644 --- a/modin/pandas/groupby.py +++ b/modin/pandas/groupby.py @@ -1268,7 +1268,11 @@ def size(self): if isinstance(self._df, Series): intermediate.name = self._df.name else: - intermediate.name = next(col_name for col_name in self._df.columns if col_name not in self._internal_by) + intermediate.name = next( + col_name + for col_name in self._df.columns + if col_name not in self._internal_by + ) return intermediate @property From 982847108ddbe46ac1342bc0d5d4b595ab8fc458 Mon Sep 17 00:00:00 2001 From: Devin Petersohn Date: Wed, 30 Nov 2022 12:43:00 -0600 Subject: [PATCH 11/12] Fixes Signed-off-by: Devin Petersohn --- modin/pandas/groupby.py | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/modin/pandas/groupby.py b/modin/pandas/groupby.py index c26e28b66e3..3674141321b 100644 --- a/modin/pandas/groupby.py +++ b/modin/pandas/groupby.py @@ -1239,12 +1239,14 @@ def _default_to_pandas(self, f, *args, **kwargs): modin.pandas.Series or modin.pandas.DataFrame A new Modin Series or DataFrame with the result of the pandas function. """ - squeeze = self._squeeze - self._squeeze = False + self._df = self._df[ + next( + col_name + for col_name in self._df.columns + if col_name not in self._internal_by + ) + ] intermediate = super(SeriesGroupBy, self)._default_to_pandas(f, *args, **kwargs) - self._squeeze = squeeze - if not isinstance(intermediate, Series) and squeeze: - return intermediate.squeeze(axis=self._axis ^ 1) return intermediate @property From f1981652fa517a861554b3a0c1a7bea40542f5e3 Mon Sep 17 00:00:00 2001 From: Devin Petersohn Date: Wed, 30 Nov 2022 12:45:28 -0600 Subject: [PATCH 12/12] Cache old df Signed-off-by: Devin Petersohn --- modin/pandas/groupby.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/modin/pandas/groupby.py b/modin/pandas/groupby.py index 3674141321b..a2c447da777 100644 --- a/modin/pandas/groupby.py +++ b/modin/pandas/groupby.py @@ -1239,6 +1239,7 @@ def _default_to_pandas(self, f, *args, **kwargs): modin.pandas.Series or modin.pandas.DataFrame A new Modin Series or DataFrame with the result of the pandas function. """ + old_df = self._df self._df = self._df[ next( col_name @@ -1247,6 +1248,7 @@ def _default_to_pandas(self, f, *args, **kwargs): ) ] intermediate = super(SeriesGroupBy, self)._default_to_pandas(f, *args, **kwargs) + self._df = old_df return intermediate @property