-
-
Notifications
You must be signed in to change notification settings - Fork 18.5k
PERF: fix assert_frame_equal can be very slow #38202
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from 4 commits
5970d41
0be8f99
fd5dc65
e8fe687
fbfbcdd
f778a7d
e0cf672
59a178d
82cb7a3
932ec28
bf18386
59bc277
3401d8d
62058bf
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -1358,6 +1358,55 @@ def assert_series_equal( | |
""" | ||
__tracebackhide__ = True | ||
|
||
_assert_series_values_equal( | ||
left, | ||
right, | ||
check_dtype=check_dtype, | ||
check_series_type=check_series_type, | ||
check_less_precise=check_less_precise, | ||
check_names=check_names, | ||
check_exact=check_exact, | ||
check_datetimelike_compat=check_datetimelike_compat, | ||
check_categorical=check_categorical, | ||
check_category_order=check_category_order, | ||
check_freq=check_freq, | ||
check_flags=check_flags, | ||
rtol=rtol, | ||
atol=atol, | ||
obj=obj, | ||
) | ||
|
||
assert_index_equal( | ||
left.index, | ||
right.index, | ||
exact=check_index_type, | ||
check_names=check_names, | ||
check_exact=check_exact, | ||
check_categorical=check_categorical, | ||
rtol=rtol, | ||
atol=atol, | ||
obj=f"{obj}.index", | ||
) | ||
|
||
|
||
def _assert_series_values_equal( | ||
left, | ||
right, | ||
*, | ||
check_dtype=True, | ||
check_series_type=True, | ||
check_less_precise=no_default, | ||
check_names=True, | ||
check_exact=False, | ||
check_datetimelike_compat=False, | ||
check_categorical=True, | ||
check_category_order=True, | ||
check_freq=True, | ||
check_flags=True, | ||
rtol=1.0e-5, | ||
atol=1.0e-8, | ||
obj="Series", | ||
): | ||
if check_less_precise is not no_default: | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
|
||
warnings.warn( | ||
"The 'check_less_precise' keyword in testing.assert_*_equal " | ||
|
@@ -1383,18 +1432,6 @@ def assert_series_equal( | |
if check_flags: | ||
assert left.flags == right.flags, f"{repr(left.flags)} != {repr(right.flags)}" | ||
|
||
# index comparison | ||
assert_index_equal( | ||
left.index, | ||
right.index, | ||
exact=check_index_type, | ||
check_names=check_names, | ||
check_exact=check_exact, | ||
check_categorical=check_categorical, | ||
rtol=rtol, | ||
atol=atol, | ||
obj=f"{obj}.index", | ||
) | ||
if check_freq and isinstance(left.index, (pd.DatetimeIndex, pd.TimedeltaIndex)): | ||
lidx = left.index | ||
ridx = right.index | ||
|
@@ -1690,11 +1727,10 @@ def assert_frame_equal( | |
assert col in right | ||
lcol = left.iloc[:, i] | ||
rcol = right.iloc[:, i] | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. marginally faster to do |
||
assert_series_equal( | ||
_assert_series_values_equal( | ||
lcol, | ||
rcol, | ||
check_dtype=check_dtype, | ||
check_index_type=check_index_type, | ||
check_exact=check_exact, | ||
check_names=check_names, | ||
check_datetimelike_compat=check_datetimelike_compat, | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,3 +1,5 @@ | ||
from unittest.mock import patch | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. i think elsewhere we use pytest's monkeypatch fixture, does that not provide the call_count check? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I did not find this functionality there. |
||
|
||
import pytest | ||
|
||
import pandas as pd | ||
|
@@ -285,3 +287,15 @@ def test_allows_duplicate_labels(): | |
|
||
with pytest.raises(AssertionError, match="<Flags"): | ||
tm.assert_frame_equal(left, right) | ||
|
||
|
||
def test_assert_frame_equal_checks_index_exactly_twice(): | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. i dont think this necessarily needs a test, maybe an asv. as long as there is a comment in tm.assert_frame_equal about why we are doing what we're doing that should be enough There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I am removing the test because it relies on |
||
data = {"col1": [1, 2, 3], "col2": [4, 5, 6]} | ||
left = DataFrame(data, index=["a", "b", "c"]) | ||
right = DataFrame(data, index=["a", "b", "c"]) | ||
with patch("pandas._testing.assert_index_equal", return_value=None) as mock: | ||
tm.assert_frame_equal(left, right) | ||
# Expect exactly two calls: | ||
# 1. Index equality | ||
# 2. Columns equality | ||
assert mock.call_count == 2 |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
yeah let's revert this and use the check_index=False
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Reverted to use of
check_index
.Uh oh!
There was an error while loading. Please reload this page.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
But personally I prefer the way involving splitting of
assert_series_equal
into two functions.Indeed, this function does two major things: check index equivalence and check values equivalence, so it is reasonable to split it like that.
I see that there are multiple parameters passed to
_assert_series_values_equal
, which make this solution look not very elegant.But there is a benefit that we do not touch public API, by not adding new parameter
check_index
.Please suggest which solution you prefer (@jbrockmendel suggested that we do not introduce new kwarg).
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
the new kwarg isnt the worst thing in the world. lets make it keyword-only to start the process of #38222
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
ok i think this is fine (reason I don't mind
check_index
is that we have this elsewhere publicly). ok let's make this kwarg only fo r the new paramater, otherwise lgtm.There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I make this kwarg keyword-only.