Skip to content

Fix MultiIndex.difference not working with PyArrow timestamps (#61382), and some formatting fixes #61388

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Closed
wants to merge 1 commit into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
54 changes: 54 additions & 0 deletions pandas/core/indexes/multi.py
Original file line number Diff line number Diff line change
Expand Up @@ -3891,6 +3891,60 @@ def equal_levels(self, other: MultiIndex) -> bool:
# --------------------------------------------------------------------
# Set Methods

def difference(self, other, sort=None):
    """
    Return a new MultiIndex with elements from the index not in `other`.

    Membership is decided on integer codes: every row of `other` is
    re-encoded against this index's levels and compared with this index's
    own code tuples, so no label tuples need to be hashed.

    Parameters
    ----------
    other : MultiIndex or array-like
        Rows to remove. Anything that is not a MultiIndex is converted with
        ``MultiIndex.from_tuples`` using this index's names.
    sort : bool or None, default None
        Whether to sort the resulting index. ``None`` and ``True`` attempt
        to sort and fall back to the unsorted result when the labels cannot
        be ordered; any other value keeps the original order.

    Returns
    -------
    MultiIndex
        Rows of this index whose label tuple does not occur in `other`,
        sharing this index's levels and names.
    """
    if not isinstance(other, MultiIndex):
        other = MultiIndex.from_tuples(other, names=self.names)

    # Re-express every row of `other` as codes relative to self's levels.
    other_codes = []
    for i, lev in enumerate(self.levels):
        level_vals = other.get_level_values(i)
        codes = lev.get_indexer(level_vals)
        # get_indexer reports -1 for labels absent from the level, which is
        # also the code a MultiIndex stores for missing values.  Re-mark
        # labels that are absent (and not missing) as -2 so that they can
        # never be confused with a missing-value entry of self.
        absent = (codes == -1) & ~np.asarray(level_vals.isna())
        other_codes.append(np.where(absent, -2, codes))

    # Code tuples to drop; tuples containing -2 cannot match any row of self.
    excluded = set(zip(*(codes.tolist() for codes in other_codes)))

    # Keep exactly the rows of self whose code tuple is not excluded.
    own_rows = zip(*(np.asarray(codes).tolist() for codes in self.codes))
    mask = np.array([row not in excluded for row in own_rows], dtype=bool)

    new_codes = [code[mask] for code in self.codes]
    result = MultiIndex(
        levels=self.levels,
        codes=new_codes,
        names=self.names,
        verify_integrity=False,
    )
    if sort is None or sort is True:
        try:
            return result.sort_values()
        except TypeError:
            # Labels that cannot be compared cannot be sorted; deliberately
            # fall through and return the result in its original order.
            pass
    return result

def _union(self, other, sort) -> MultiIndex:
other, result_names = self._convert_can_do_setop(other)
if other.has_duplicates:
Expand Down
22 changes: 6 additions & 16 deletions pandas/tests/frame/test_query_eval.py
Original file line number Diff line number Diff line change
Expand Up @@ -160,21 +160,13 @@ def test_query_empty_string(self):
df.query("")

def test_query_duplicate_column_name(self, engine, parser):
df = DataFrame(
{
"A": range(3),
"B": range(3),
"C": range(3)
}
).rename(columns={"B": "A"})
df = DataFrame({"A": range(3), "B": range(3), "C": range(3)}).rename(
columns={"B": "A"}
)

res = df.query('C == 1', engine=engine, parser=parser)
res = df.query("C == 1", engine=engine, parser=parser)

expect = DataFrame(
[[1, 1, 1]],
columns=["A", "A", "C"],
index=[1]
)
expect = DataFrame([[1, 1, 1]], columns=["A", "A", "C"], index=[1])

tm.assert_frame_equal(res, expect)

Expand Down Expand Up @@ -1140,9 +1132,7 @@ def test_query_with_nested_special_character(self, parser, engine):
[">=", operator.ge],
],
)
def test_query_lex_compare_strings(
self, parser, engine, op, func
):
def test_query_lex_compare_strings(self, parser, engine, op, func):
a = Series(np.random.default_rng(2).choice(list("abcde"), 20))
b = Series(np.arange(a.size))
df = DataFrame({"X": a, "Y": b})
Expand Down
33 changes: 33 additions & 0 deletions pandas/tests/indexes/multi/test_setops.py
Original file line number Diff line number Diff line change
Expand Up @@ -195,6 +195,39 @@ def test_difference(idx, sort):
first.difference([1, 2, 3, 4, 5], sort=sort)


def test_multiindex_difference_pyarrow_timestamp():
    # GH#61382
    # Only the presence of pyarrow matters; the module object itself is unused.
    pytest.importorskip("pyarrow")

    # Two-row frame whose index levels are pyarrow-backed int64 / timestamp[ns].
    df = (
        DataFrame(
            [(1, "1900-01-01", "a"), (2, "1900-01-01", "b")],
            columns=["id", "date", "val"],
        )
        .astype(
            {
                "id": "int64[pyarrow]",
                "date": "timestamp[ns][pyarrow]",
                "val": "string[pyarrow]",
            }
        )
        .set_index(["id", "date"])
    )

    idx = df.index
    idx_val = idx[0]

    # Assert the value exists in the original index
    assert idx_val in idx

    # Remove idx_val using difference()
    new_idx = idx.difference([idx_val])

    # Verify the result: only the second row may remain
    assert len(new_idx) == 1
    assert idx_val not in new_idx
    assert new_idx.equals(MultiIndex.from_tuples([(2, pd.Timestamp("1900-01-01"))]))


def test_difference_sort_special():
# GH-24959
idx = MultiIndex.from_product([[1, 0], ["a", "b"]])
Expand Down
3 changes: 1 addition & 2 deletions scripts/check_for_inconsistent_pandas_namespace.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,8 +30,7 @@
from typing import NamedTuple

ERROR_MESSAGE = (
"{path}:{lineno}:{col_offset}: "
"Found both '{prefix}.{name}' and '{name}' in {path}"
"{path}:{lineno}:{col_offset}: Found both '{prefix}.{name}' and '{name}' in {path}"
)


Expand Down
1 change: 1 addition & 0 deletions scripts/check_test_naming.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
NOTE: if this finds a false positive, you can add the comment `# not a test` to the
class or function definition. Though hopefully that shouldn't be necessary.
"""

from __future__ import annotations

import argparse
Expand Down
1 change: 1 addition & 0 deletions scripts/generate_pip_deps_from_conda.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@
generated with this script:
$ python scripts/generate_pip_deps_from_conda.py --compare
"""

import argparse
import pathlib
import re
Expand Down
1 change: 1 addition & 0 deletions scripts/pandas_errors_documented.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@

pre-commit run pandas-errors-documented --all-files
"""

from __future__ import annotations

import argparse
Expand Down
1 change: 1 addition & 0 deletions scripts/sort_whatsnew_note.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@

pre-commit run sort-whatsnew-items --all-files
"""

from __future__ import annotations

import argparse
Expand Down
5 changes: 1 addition & 4 deletions scripts/tests/test_check_test_naming.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,10 +24,7 @@
0,
),
(
"class Foo: # not a test\n"
" pass\n"
"def test_foo():\n"
" Class.foo()\n",
"class Foo: # not a test\n pass\ndef test_foo():\n Class.foo()\n",
"",
0,
),
Expand Down
8 changes: 2 additions & 6 deletions scripts/tests/test_inconsistent_namespace_check.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,14 +5,10 @@
)

BAD_FILE_0 = (
"from pandas import Categorical\n"
"cat_0 = Categorical()\n"
"cat_1 = pd.Categorical()"
"from pandas import Categorical\ncat_0 = Categorical()\ncat_1 = pd.Categorical()"
)
BAD_FILE_1 = (
"from pandas import Categorical\n"
"cat_0 = pd.Categorical()\n"
"cat_1 = Categorical()"
"from pandas import Categorical\ncat_0 = pd.Categorical()\ncat_1 = Categorical()"
)
BAD_FILE_2 = (
"from pandas import Categorical\n"
Expand Down
20 changes: 9 additions & 11 deletions scripts/tests/test_validate_docstrings.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,8 +34,7 @@ def redundant_import(self, paramx=None, paramy=None) -> None:
--------
>>> import numpy as np
>>> import pandas as pd
>>> df = pd.DataFrame(np.ones((3, 3)),
... columns=('a', 'b', 'c'))
>>> df = pd.DataFrame(np.ones((3, 3)), columns=("a", "b", "c"))
>>> df.all(axis=1)
0 True
1 True
Expand All @@ -50,14 +49,14 @@ def unused_import(self) -> None:
Examples
--------
>>> import pandas as pdf
>>> df = pd.DataFrame(np.ones((3, 3)), columns=('a', 'b', 'c'))
>>> df = pd.DataFrame(np.ones((3, 3)), columns=("a", "b", "c"))
"""

def missing_whitespace_around_arithmetic_operator(self) -> None:
"""
Examples
--------
>>> 2+5
>>> 2 + 5
7
"""

Expand All @@ -66,14 +65,14 @@ def indentation_is_not_a_multiple_of_four(self) -> None:
Examples
--------
>>> if 2 + 5:
... pass
... pass
"""

def missing_whitespace_after_comma(self) -> None:
"""
Examples
--------
>>> df = pd.DataFrame(np.ones((3,3)),columns=('a','b', 'c'))
>>> df = pd.DataFrame(np.ones((3, 3)), columns=("a", "b", "c"))
"""

def write_array_like_with_hyphen_not_underscore(self) -> None:
Expand Down Expand Up @@ -227,13 +226,13 @@ def test_validate_all_ignore_errors(self, monkeypatch):
"errors": [
("ER01", "err desc"),
("ER02", "err desc"),
("ER03", "err desc")
("ER03", "err desc"),
],
"warnings": [],
"examples_errors": "",
"deprecated": True,
"file": "file1",
"file_line": "file_line1"
"file_line": "file_line1",
},
)
monkeypatch.setattr(
Expand Down Expand Up @@ -272,14 +271,13 @@ def test_validate_all_ignore_errors(self, monkeypatch):
None: {"ER03"},
"pandas.DataFrame.align": {"ER01"},
# ignoring an error that is not requested should be of no effect
"pandas.Index.all": {"ER03"}
}
"pandas.Index.all": {"ER03"},
},
)
# two functions * two not global ignored errors - one function ignored error
assert exit_status == 2 * 2 - 1



class TestApiItems:
@property
def api_doc(self):
Expand Down
Loading
Loading