-
-
Notifications
You must be signed in to change notification settings - Fork 18.5k
ENH/BUG: Use Kleene logic for groupby any/all #40819
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from 31 commits
088ca14
2554921
9a8f9c9
5ca9c4b
68fd995
6530491
26146c2
20f475d
924b38e
423f43f
b1408ac
47ef037
4415060
bb04c1c
9c90886
ef3fbe2
f4c8a8a
1c3cb7d
7cbf85b
809b8a4
58fd33a
80a65bb
c9b9d5f
7514568
740ad7b
a116bed
8a428d4
8e3c5be
b627618
23b3b64
a30496c
3051a99
4cd2833
98cd401
a92c637
7c5c8e6
c66d1fd
0950234
c81c1a5
d2b8ad0
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -3,6 +3,7 @@ | |
import numpy as np | ||
import pytest | ||
|
||
import pandas as pd | ||
from pandas import ( | ||
DataFrame, | ||
Index, | ||
|
@@ -68,3 +69,91 @@ def test_bool_aggs_dup_column_labels(bool_agg_func): | |
|
||
expected = df | ||
tm.assert_frame_equal(result, expected) | ||
|
||
|
||
@pytest.mark.parametrize("bool_agg_func", ["any", "all"]) | ||
@pytest.mark.parametrize("skipna", [True, False]) | ||
@pytest.mark.parametrize( | ||
# expected_data indexed as [[skipna=False/any, skipna=False/all], | ||
# [skipna=True/any, skipna=True/all]] | ||
"data,expected_data", | ||
[ | ||
([False, False, False], [[False, False], [False, False]]), | ||
([True, True, True], [[True, True], [True, True]]), | ||
([pd.NA, pd.NA, pd.NA], [[pd.NA, pd.NA], [False, True]]), | ||
([False, pd.NA, False], [[pd.NA, False], [False, False]]), | ||
([True, pd.NA, True], [[True, pd.NA], [True, True]]), | ||
([True, pd.NA, False], [[True, False], [True, False]]), | ||
], | ||
) | ||
def test_masked_kleene_logic(bool_agg_func, data, expected_data, skipna): | ||
# GH#37506 | ||
df = DataFrame(data, dtype="boolean") | ||
expected = DataFrame( | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. This `expected` is really hard to parse; can you build it in multiple steps / make it simpler? There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Addressing your comment above makes this simpler; let me know if you think any piece is still confusing. |
||
[expected_data[skipna][bool_agg_func == "all"]], dtype="boolean", index=[1] | ||
) | ||
|
||
result = df.groupby([1, 1, 1]).agg(bool_agg_func, skipna=skipna) | ||
tm.assert_frame_equal(result, expected) | ||
jorisvandenbossche marked this conversation as resolved.
Show resolved
Hide resolved
|
||
|
||
# The expected result we compared to should match aggregating on the whole | ||
# series | ||
result = getattr(df[0], bool_agg_func)(skipna=skipna) | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. This check is really hard to parse. There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. What this is essentially doing is There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. There is a complicated `expected`, then a really complicated assert. This needs to be greatly simplified. It is impossible to grok with all of this logic. My suggestion is to break this up. There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. To be very concrete, you mean breaking it up into 2 lines, like
? There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Have tried to simplify by moving this validation of "expected" into a separate test in |
||
expected = expected_data[skipna][bool_agg_func == "all"] | ||
assert (result is pd.NA and expected is pd.NA) or result == expected | ||
|
||
|
||
@pytest.mark.parametrize( | ||
"dtype1,dtype2,exp_col1,exp_col2", | ||
[ | ||
( | ||
"float", | ||
"Float64", | ||
pd.array([True], dtype=bool), | ||
mzeitlin11 marked this conversation as resolved.
Show resolved
Hide resolved
|
||
pd.array([pd.NA], dtype="boolean"), | ||
), | ||
( | ||
"Int64", | ||
"float", | ||
pd.array([pd.NA], dtype="boolean"), | ||
pd.array([True], dtype=bool), | ||
mzeitlin11 marked this conversation as resolved.
Show resolved
Hide resolved
|
||
), | ||
( | ||
"Int64", | ||
"Int64", | ||
pd.array([pd.NA], dtype="boolean"), | ||
pd.array([pd.NA], dtype="boolean"), | ||
), | ||
( | ||
"Float64", | ||
"boolean", | ||
pd.array([pd.NA], dtype="boolean"), | ||
pd.array([pd.NA], dtype="boolean"), | ||
), | ||
], | ||
) | ||
def test_masked_mixed_types(dtype1, dtype2, exp_col1, exp_col2): | ||
# GH#37506 | ||
data = [1.0, np.nan] | ||
rhshadrach marked this conversation as resolved.
Show resolved
Hide resolved
|
||
df = DataFrame( | ||
{"col1": pd.array(data, dtype=dtype1), "col2": pd.array(data, dtype=dtype2)} | ||
) | ||
result = df.groupby([1, 1]).agg("all", skipna=False) | ||
|
||
expected = DataFrame({"col1": exp_col1, "col2": exp_col2}, index=[1]) | ||
tm.assert_frame_equal(result, expected) | ||
|
||
|
||
@pytest.mark.parametrize("bool_agg_func", ["any", "all"]) | ||
@pytest.mark.parametrize("dtype", ["Int64", "Float64", "boolean"]) | ||
@pytest.mark.parametrize("skipna", [True, False]) | ||
def test_masked_bool_aggs_skipna(bool_agg_func, dtype, skipna, frame_or_series): | ||
# GH#40585 | ||
obj = frame_or_series([pd.NA, 1], dtype=dtype) | ||
expected_res = True | ||
if not skipna and bool_agg_func == "all": | ||
expected_res = pd.NA | ||
expected = frame_or_series([expected_res], index=[1], dtype="boolean") | ||
|
||
result = obj.groupby([1, 1]).agg(bool_agg_func, skipna=skipna) | ||
tm.assert_equal(result, expected) |
Uh oh!
There was an error while loading. Please reload this page.