Skip to content

BUG: IntervalIndex.get_indexer incorrectly matching ints to datetimes #54964

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 7 commits into from
Sep 20, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions doc/source/whatsnew/v2.2.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -261,6 +261,8 @@ Strings
Interval
^^^^^^^^
- Bug in :class:`Interval` ``__repr__`` not displaying UTC offsets for :class:`Timestamp` bounds. Additionally the hour, minute and second components will now be shown. (:issue:`55015`)
- Bug in :meth:`IntervalIndex.get_indexer` with datetime or timedelta intervals incorrectly matching on integer targets (:issue:`47772`)
- Bug in :meth:`IntervalIndex.get_indexer` with timezone-aware datetime intervals incorrectly matching on a sequence of timezone-naive targets (:issue:`47772`)
-

Indexing
Expand Down Expand Up @@ -321,6 +323,7 @@ Styler

Other
^^^^^
- Bug in :func:`cut` incorrectly allowing cutting of timezone-aware datetimes with timezone-naive bins (:issue:`54964`)

.. ***DO NOT USE THIS SECTION***

Expand Down
6 changes: 1 addition & 5 deletions pandas/core/indexes/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -3938,12 +3938,8 @@ def _should_partial_index(self, target: Index) -> bool:
if isinstance(self.dtype, IntervalDtype):
if isinstance(target.dtype, IntervalDtype):
return False
# See https://github.com/pandas-dev/pandas/issues/47772 the commented
# out code can be restored (instead of hardcoding `return True`)
# once that issue is fixed
# "Index" has no attribute "left"
# return self.left._should_compare(target) # type: ignore[attr-defined]
return True
return self.left._should_compare(target) # type: ignore[attr-defined]
return False

@final
Expand Down
168 changes: 65 additions & 103 deletions pandas/core/reshape/tile.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,10 +17,8 @@
Timestamp,
lib,
)
from pandas._libs.lib import infer_dtype

from pandas.core.dtypes.common import (
DT64NS_DTYPE,
ensure_platform_int,
is_bool_dtype,
is_integer,
Expand Down Expand Up @@ -243,7 +241,7 @@ def cut(

original = x
x_idx = _preprocess_for_cut(x)
x_idx, dtype = _coerce_to_type(x_idx)
x_idx, _ = _coerce_to_type(x_idx)

if not np.iterable(bins):
bins = _nbins_to_bins(x_idx, bins, right)
Expand All @@ -254,16 +252,8 @@ def cut(

else:
bins = Index(bins)
if isinstance(getattr(bins, "dtype", None), DatetimeTZDtype):
bins = np.asarray(bins, dtype=DT64NS_DTYPE)
else:
bins = np.asarray(bins)
bins = _convert_bin_to_numeric_type(bins, dtype)

# GH 26045: cast to float64 to avoid an overflow
if (np.diff(bins.astype("float64")) < 0).any():
if not bins.is_monotonic_increasing:
raise ValueError("bins must increase monotonically.")
bins = Index(bins)

fac, bins = _bins_to_cuts(
x_idx,
Expand All @@ -272,12 +262,11 @@ def cut(
labels=labels,
precision=precision,
include_lowest=include_lowest,
dtype=dtype,
duplicates=duplicates,
ordered=ordered,
)

return _postprocess_for_cut(fac, bins, retbins, dtype, original)
return _postprocess_for_cut(fac, bins, retbins, original)


def qcut(
Expand Down Expand Up @@ -343,25 +332,22 @@ def qcut(
"""
original = x
x_idx = _preprocess_for_cut(x)
x_idx, dtype = _coerce_to_type(x_idx)
x_idx, _ = _coerce_to_type(x_idx)

quantiles = np.linspace(0, 1, q + 1) if is_integer(q) else q

x_np = np.asarray(x_idx)
x_np = x_np[~np.isnan(x_np)]
bins = np.quantile(x_np, quantiles)
bins = x_idx.to_series().dropna().quantile(quantiles)

fac, bins = _bins_to_cuts(
x_idx,
Index(bins),
labels=labels,
precision=precision,
include_lowest=True,
dtype=dtype,
duplicates=duplicates,
)

return _postprocess_for_cut(fac, bins, retbins, dtype, original)
return _postprocess_for_cut(fac, bins, retbins, original)


def _nbins_to_bins(x_idx: Index, nbins: int, right: bool) -> Index:
Expand All @@ -378,18 +364,41 @@ def _nbins_to_bins(x_idx: Index, nbins: int, right: bool) -> Index:
rng = (x_idx.min(), x_idx.max())
mn, mx = rng

if np.isinf(mn) or np.isinf(mx):
is_dt_or_td = lib.is_np_dtype(x_idx.dtype, "mM") or isinstance(
x_idx.dtype, DatetimeTZDtype
)

if is_numeric_dtype(x_idx.dtype) and (np.isinf(mn) or np.isinf(mx)):
# GH#24314
raise ValueError(
"cannot specify integer `bins` when input data contains infinity"
)

if mn == mx: # adjust end points before binning
mn -= 0.001 * abs(mn) if mn != 0 else 0.001
mx += 0.001 * abs(mx) if mx != 0 else 0.001
bins = np.linspace(mn, mx, nbins + 1, endpoint=True)
if is_dt_or_td:
# using seconds=1 is pretty arbitrary here
td = Timedelta(seconds=1)
# Use DatetimeArray/TimedeltaArray method instead of linspace
# error: Item "ExtensionArray" of "ExtensionArray | ndarray[Any, Any]"
# has no attribute "_generate_range"
bins = x_idx._values._generate_range( # type: ignore[union-attr]
start=mn - td, end=mx + td, periods=nbins + 1, freq=None
)
else:
mn -= 0.001 * abs(mn) if mn != 0 else 0.001
mx += 0.001 * abs(mx) if mx != 0 else 0.001

bins = np.linspace(mn, mx, nbins + 1, endpoint=True)
else: # adjust end points after binning
bins = np.linspace(mn, mx, nbins + 1, endpoint=True)
if is_dt_or_td:
# Use DatetimeArray/TimedeltaArray method instead of linspace
# error: Item "ExtensionArray" of "ExtensionArray | ndarray[Any, Any]"
# has no attribute "_generate_range"
bins = x_idx._values._generate_range( # type: ignore[union-attr]
start=mn, end=mx, periods=nbins + 1, freq=None
)
else:
bins = np.linspace(mn, mx, nbins + 1, endpoint=True)
adj = (mx - mn) * 0.001 # 0.1% of the range
if right:
bins[0] -= adj
Expand All @@ -400,13 +409,12 @@ def _nbins_to_bins(x_idx: Index, nbins: int, right: bool) -> Index:


def _bins_to_cuts(
x: Index,
x_idx: Index,
bins: Index,
right: bool = True,
labels=None,
precision: int = 3,
include_lowest: bool = False,
dtype: DtypeObj | None = None,
duplicates: str = "raise",
ordered: bool = True,
):
Expand All @@ -422,7 +430,7 @@ def _bins_to_cuts(

if isinstance(bins, IntervalIndex):
# we have a fast-path here
ids = bins.get_indexer(x)
ids = bins.get_indexer(x_idx)
cat_dtype = CategoricalDtype(bins, ordered=True)
result = Categorical.from_codes(ids, dtype=cat_dtype, validate=False)
return result, bins
Expand All @@ -437,12 +445,29 @@ def _bins_to_cuts(
bins = unique_bins

side: Literal["left", "right"] = "left" if right else "right"
ids = ensure_platform_int(bins.searchsorted(x, side=side))

try:
ids = bins.searchsorted(x_idx, side=side)
except TypeError as err:
# e.g. test_datetime_nan_error if bins are DatetimeArray and x_idx
# is integers
if x_idx.dtype.kind == "m":
raise ValueError("bins must be of timedelta64 dtype") from err
elif x_idx.dtype.kind == bins.dtype.kind == "M":
raise ValueError(
"Cannot use timezone-naive bins with timezone-aware values, "
"or vice-versa"
) from err
elif x_idx.dtype.kind == "M":
raise ValueError("bins must be of datetime64 dtype") from err
else:
raise
ids = ensure_platform_int(ids)

if include_lowest:
ids[np.asarray(x) == bins[0]] = 1
ids[x_idx == bins[0]] = 1

na_mask = isna(x) | (ids == len(bins)) | (ids == 0)
na_mask = isna(x_idx) | (ids == len(bins)) | (ids == 0)
has_nas = na_mask.any()

if labels is not False:
Expand All @@ -454,7 +479,7 @@ def _bins_to_cuts(

if labels is None:
labels = _format_labels(
bins, precision, right=right, include_lowest=include_lowest, dtype=dtype
bins, precision, right=right, include_lowest=include_lowest
)
elif ordered and len(set(labels)) != len(labels):
raise ValueError(
Expand Down Expand Up @@ -513,90 +538,28 @@ def _coerce_to_type(x: Index) -> tuple[Index, DtypeObj | None]:
x_arr = x.to_numpy(dtype=np.float64, na_value=np.nan)
x = Index(x_arr)

if dtype is not None:
# GH 19768: force NaT to NaN during integer conversion
x_arr = np.where(x.notna(), x.view(np.int64), np.nan)
x = Index(x_arr)

return x, dtype


def _convert_bin_to_numeric_type(bins, dtype: DtypeObj | None):
"""
if the passed bin is of datetime/timedelta type,
this method converts it to integer

Parameters
----------
bins : list-like of bins
dtype : dtype of data

Raises
------
ValueError if bins are not of a compat dtype to dtype
"""
bins_dtype = infer_dtype(bins, skipna=False)
if lib.is_np_dtype(dtype, "m"):
if bins_dtype in ["timedelta", "timedelta64"]:
bins = to_timedelta(bins).view(np.int64)
else:
raise ValueError("bins must be of timedelta64 dtype")
elif lib.is_np_dtype(dtype, "M") or isinstance(dtype, DatetimeTZDtype):
if bins_dtype in ["datetime", "datetime64"]:
bins = to_datetime(bins)
if lib.is_np_dtype(bins.dtype, "M"):
# As of 2.0, to_datetime may give non-nano, so we need to convert
# here until the rest of this file recognizes non-nano
bins = bins.astype("datetime64[ns]", copy=False)
bins = bins.view(np.int64)
else:
raise ValueError("bins must be of datetime64 dtype")

return bins


def _convert_bin_to_datelike_type(bins, dtype: DtypeObj | None):
"""
Convert bins to a DatetimeIndex or TimedeltaIndex if the original dtype is
datelike

Parameters
----------
bins : list-like of bins
dtype : dtype of data

Returns
-------
bins : Array-like of bins, DatetimeIndex or TimedeltaIndex if dtype is
datelike
"""
if isinstance(dtype, DatetimeTZDtype):
bins = to_datetime(bins.astype(np.int64), utc=True).tz_convert(dtype.tz)
elif lib.is_np_dtype(dtype, "mM"):
bins = Index(bins.astype(np.int64), dtype=dtype)
return bins
return Index(x), dtype


def _format_labels(
bins: Index,
precision: int,
right: bool = True,
include_lowest: bool = False,
dtype: DtypeObj | None = None,
):
"""based on the dtype, return our labels"""
closed: IntervalLeftRight = "right" if right else "left"

formatter: Callable[[Any], Timestamp] | Callable[[Any], Timedelta]

if isinstance(dtype, DatetimeTZDtype):
formatter = lambda x: Timestamp(x, tz=dtype.tz)
if isinstance(bins.dtype, DatetimeTZDtype):
formatter = lambda x: x
adjust = lambda x: x - Timedelta("1ns")
elif lib.is_np_dtype(dtype, "M"):
formatter = Timestamp
elif lib.is_np_dtype(bins.dtype, "M"):
formatter = lambda x: x
adjust = lambda x: x - Timedelta("1ns")
elif lib.is_np_dtype(dtype, "m"):
formatter = Timedelta
elif lib.is_np_dtype(bins.dtype, "m"):
formatter = lambda x: x
adjust = lambda x: x - Timedelta("1ns")
else:
precision = _infer_precision(precision, bins)
Expand Down Expand Up @@ -628,7 +591,7 @@ def _preprocess_for_cut(x) -> Index:
return Index(x)


def _postprocess_for_cut(fac, bins, retbins: bool, dtype: DtypeObj | None, original):
def _postprocess_for_cut(fac, bins, retbins: bool, original):
"""
handles post processing for the cut method where
we combine the index information if the originally passed
Expand All @@ -640,7 +603,6 @@ def _postprocess_for_cut(fac, bins, retbins: bool, dtype: DtypeObj | None, origi
if not retbins:
return fac

bins = _convert_bin_to_datelike_type(bins, dtype)
if isinstance(bins, Index) and is_numeric_dtype(bins.dtype):
bins = bins._values

Expand Down
2 changes: 1 addition & 1 deletion pandas/tests/indexes/interval/test_indexing.py
Original file line number Diff line number Diff line change
Expand Up @@ -307,9 +307,9 @@ def test_get_indexer_datetime(self):
result = ii.get_indexer(DatetimeIndex(["2018-01-02"]).astype(str))
tm.assert_numpy_array_equal(result, expected)

# TODO this should probably be deprecated?
# https://github.com/pandas-dev/pandas/issues/47772
result = ii.get_indexer(DatetimeIndex(["2018-01-02"]).asi8)
expected = np.array([-1], dtype=np.intp)
tm.assert_numpy_array_equal(result, expected)

@pytest.mark.parametrize(
Expand Down
26 changes: 22 additions & 4 deletions pandas/tests/reshape/test_cut.py
Original file line number Diff line number Diff line change
Expand Up @@ -495,15 +495,33 @@ def test_datetime_cut(data):
tm.assert_series_equal(Series(result), expected)


@pytest.mark.parametrize(
"bins",
[
3,
@pytest.mark.parametrize("box", [list, np.array, Index, Series])
def test_datetime_tz_cut_mismatched_tzawareness(box):
# GH#54964
bins = box(
[
Timestamp("2013-01-01 04:57:07.200000"),
Timestamp("2013-01-01 21:00:00"),
Timestamp("2013-01-02 13:00:00"),
Timestamp("2013-01-03 05:00:00"),
]
)
ser = Series(date_range("20130101", periods=3, tz="US/Eastern"))

msg = "Cannot use timezone-naive bins with timezone-aware values"
with pytest.raises(ValueError, match=msg):
cut(ser, bins)


@pytest.mark.parametrize(
"bins",
[
3,
[
Timestamp("2013-01-01 04:57:07.200000", tz="UTC").tz_convert("US/Eastern"),
Timestamp("2013-01-01 21:00:00", tz="UTC").tz_convert("US/Eastern"),
Timestamp("2013-01-02 13:00:00", tz="UTC").tz_convert("US/Eastern"),
Timestamp("2013-01-03 05:00:00", tz="UTC").tz_convert("US/Eastern"),
],
],
)
Expand Down