Skip to content

Commit c0a269b

Browse files
Backport PR #57061 on branch 2.2.x (REGR: non-unique, masked dtype index raising IndexError) (#57142)
Backport PR #57061: REGR: non-unique, masked dtype index raising IndexError Co-authored-by: Luke Manley <[email protected]>
1 parent acd914d commit c0a269b

File tree

3 files changed

+44
-32
lines changed

3 files changed

+44
-32
lines changed

doc/source/whatsnew/v2.2.1.rst

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,7 @@ Fixed regressions
1717
- Fixed performance regression in :meth:`Series.combine_first` (:issue:`55845`)
1818
- Fixed regression in :func:`merge_ordered` raising ``TypeError`` for ``fill_method="ffill"`` and ``how="left"`` (:issue:`57010`)
1919
- Fixed regression in :func:`wide_to_long` raising an ``AttributeError`` for string columns (:issue:`57066`)
20+
- Fixed regression in :meth:`DataFrame.loc` raising ``IndexError`` for non-unique, masked dtype indexes where the result has more than 10,000 rows (:issue:`57027`)
2021
- Fixed regression in :meth:`DataFrameGroupBy.idxmin`, :meth:`DataFrameGroupBy.idxmax`, :meth:`SeriesGroupBy.idxmin`, :meth:`SeriesGroupBy.idxmax` ignoring the ``skipna`` argument (:issue:`57040`)
2122
- Fixed regression in :meth:`DataFrameGroupBy.idxmin`, :meth:`DataFrameGroupBy.idxmax`, :meth:`SeriesGroupBy.idxmin`, :meth:`SeriesGroupBy.idxmax` where values containing the minimum or maximum value for the dtype could produce incorrect results (:issue:`57040`)
2223
- Fixed regression in :meth:`Index.join` raising ``TypeError`` when joining an empty index to a non-empty index containing mixed dtype values (:issue:`57048`)

pandas/_libs/index.pyx

Lines changed: 31 additions & 32 deletions
Original file line numberDiff line numberDiff line change
@@ -96,6 +96,20 @@ cdef ndarray _get_bool_indexer(ndarray values, object val, ndarray mask = None):
9696
return indexer.view(bool)
9797

9898

99+
# Growth helper for the result buffers built by the non-unique indexer loops:
# returns an array in which position ``loc`` can be written, enlarging
# ``values`` when needed but never beyond ``max_length`` elements.
# Callers must rebind their buffer to the return value, because np.resize
# hands back a new array rather than growing ``values`` in place.
cdef _maybe_resize_array(ndarray values, Py_ssize_t loc, Py_ssize_t max_length):
100+
"""
101+
Resize array if loc is out of bounds.
102+
"""
103+
cdef:
104+
Py_ssize_t n = len(values)
105+
106+
if loc >= n:
107+
# Keep doubling the candidate length until ``loc`` fits, so a single
# call can absorb a whole run of matches instead of one resize per element.
# NOTE(review): if ``values`` is empty (n == 0) doubling never increases n
# and this loop would not terminate -- presumably callers always pass a
# non-empty buffer; confirm.
while loc >= n:
108+
n *= 2
109+
# Cap the new length at max_length; the existing contents are carried over
# into the front of the resized array.
values = np.resize(values, min(n, max_length))
110+
return values
111+
112+
99113
# Don't populate hash tables in monotonic indexes larger than this
100114
_SIZE_CUTOFF = 1_000_000
101115

@@ -450,27 +464,18 @@ cdef class IndexEngine:
450464
# found
451465
if val in d:
452466
key = val
453-
467+
result = _maybe_resize_array(
468+
result,
469+
count + len(d[key]) - 1,
470+
max_alloc
471+
)
454472
for j in d[key]:
455-
456-
# realloc if needed
457-
if count >= n_alloc:
458-
n_alloc *= 2
459-
if n_alloc > max_alloc:
460-
n_alloc = max_alloc
461-
result = np.resize(result, n_alloc)
462-
463473
result[count] = j
464474
count += 1
465475

466476
# value not found
467477
else:
468-
469-
if count >= n_alloc:
470-
n_alloc *= 2
471-
if n_alloc > max_alloc:
472-
n_alloc = max_alloc
473-
result = np.resize(result, n_alloc)
478+
result = _maybe_resize_array(result, count, max_alloc)
474479
result[count] = -1
475480
count += 1
476481
missing[count_missing] = i
@@ -1193,37 +1198,31 @@ cdef class MaskedIndexEngine(IndexEngine):
11931198

11941199
if PySequence_GetItem(target_mask, i):
11951200
if na_pos:
1201+
result = _maybe_resize_array(
1202+
result,
1203+
count + len(na_pos) - 1,
1204+
max_alloc,
1205+
)
11961206
for na_idx in na_pos:
1197-
# realloc if needed
1198-
if count >= n_alloc:
1199-
n_alloc *= 2
1200-
if n_alloc > max_alloc:
1201-
n_alloc = max_alloc
1202-
12031207
result[count] = na_idx
12041208
count += 1
12051209
continue
12061210

12071211
elif val in d:
12081212
# found
12091213
key = val
1210-
1214+
result = _maybe_resize_array(
1215+
result,
1216+
count + len(d[key]) - 1,
1217+
max_alloc,
1218+
)
12111219
for j in d[key]:
1212-
1213-
# realloc if needed
1214-
if count >= n_alloc:
1215-
n_alloc *= 2
1216-
if n_alloc > max_alloc:
1217-
n_alloc = max_alloc
1218-
12191220
result[count] = j
12201221
count += 1
12211222
continue
12221223

12231224
# value not found
1224-
if count >= n_alloc:
1225-
n_alloc += 10_000
1226-
result = np.resize(result, n_alloc)
1225+
result = _maybe_resize_array(result, count, max_alloc)
12271226
result[count] = -1
12281227
count += 1
12291228
missing[count_missing] = i

pandas/tests/indexing/test_loc.py

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3364,3 +3364,15 @@ def test_getitem_loc_str_periodindex(self):
33643364
index = pd.period_range(start="2000", periods=20, freq="B")
33653365
series = Series(range(20), index=index)
33663366
assert series.loc["2000-01-14"] == 9
3367+
3368+
# Regression test for .loc with a list of labels on a non-unique, masked
# ("Int64") index whose selection result is larger than 10,000 rows.
def test_loc_nonunique_masked_index(self):
3369+
# GH 57027
3370+
# 11 distinct labels, each repeated 1000 times -> 11,000 index entries.
ids = list(range(11))
3371+
index = Index(ids * 1000, dtype="Int64")
3372+
# "val" stores each row's original position so the row order of the
# result can be checked exactly.
df = DataFrame({"val": np.arange(len(index), dtype=np.intp)}, index=index)
3373+
# Selecting every label should return all rows, grouped label by label.
result = df.loc[ids]
3374+
# A stable argsort of the index gives, for label 0, then 1, ..., the
# original positions of that label in their original relative order.
expected = DataFrame(
3375+
{"val": index.argsort(kind="stable").astype(np.intp)},
3376+
index=Index(np.array(ids).repeat(1000), dtype="Int64"),
3377+
)
3378+
tm.assert_frame_equal(result, expected)

0 commit comments

Comments
 (0)