TYP: core.sorting #41285

Merged 3 commits on May 4, 2021
11 changes: 9 additions & 2 deletions pandas/core/frame.py
@@ -6146,7 +6146,7 @@ def duplicated(
        if self.empty:
            return self._constructor_sliced(dtype=bool)

-        def f(vals):
+        def f(vals) -> tuple[np.ndarray, int]:
            labels, shape = algorithms.factorize(vals, size_hint=len(self))
            return labels.astype("i8", copy=False), len(shape)

@@ -6173,7 +6173,14 @@ def f(vals):
        vals = (col.values for name, col in self.items() if name in subset)
        labels, shape = map(list, zip(*map(f, vals)))

-        ids = get_group_index(labels, shape, sort=False, xnull=False)
+        ids = get_group_index(
+            labels,
+            # error: Argument 1 to "tuple" has incompatible type "List[_T]";
+            # expected "Iterable[int]"
+            tuple(shape),  # type: ignore[arg-type]
+            sort=False,
+            xnull=False,
+        )
        result = self._constructor_sliced(duplicated_int64(ids, keep), index=self.index)
        return result.__finalize__(self, method="duplicated")
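For readers outside the pandas internals, a minimal sketch of the scheme this hunk annotates, using only public API (`pd.factorize` in place of the internal `algorithms.factorize`, and ignoring the null handling and int64-overflow chunking the real helpers perform; `duplicated_rows_sketch` is a hypothetical name):

```python
import numpy as np
import pandas as pd

def duplicated_rows_sketch(df: pd.DataFrame, keep="first") -> np.ndarray:
    # Factorize each column into integer codes plus a count of uniques.
    codes, sizes = [], []
    for _, col in df.items():
        lab, uniques = pd.factorize(col)
        codes.append(lab.astype("i8"))
        sizes.append(len(uniques))

    # Combine the per-column codes into one flat id per row: an offset
    # into the cartesian product of column values, the same idea
    # get_group_index implements (without the overflow chunking).
    ids = np.zeros(len(df), dtype="i8")
    for lab, size in zip(codes, sizes):
        ids = ids * size + lab

    # A row is a duplicate when its flat id has been seen before.
    return pd.Series(ids).duplicated(keep=keep).to_numpy()

df = pd.DataFrame({"a": [1, 1, 2], "b": ["x", "x", "y"]})
print(duplicated_rows_sketch(df))  # [False  True False]
```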
2 changes: 1 addition & 1 deletion pandas/core/indexes/multi.py
@@ -1611,7 +1611,7 @@ def _inferred_type_levels(self) -> list[str]:

    @doc(Index.duplicated)
    def duplicated(self, keep="first") -> np.ndarray:
-        shape = map(len, self.levels)
+        shape = tuple(len(lev) for lev in self.levels)
        ids = get_group_index(self.codes, shape, sort=False, xnull=False)

        return duplicated_int64(ids, keep)
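The observable behavior of the method is unchanged; a quick check of `MultiIndex.duplicated` for reference:

```python
import pandas as pd

mi = pd.MultiIndex.from_arrays([[1, 1, 2], ["a", "a", "b"]])
print(mi.duplicated())             # [False  True False]
print(mi.duplicated(keep="last"))  # [ True False False]
print(mi.duplicated(keep=False))   # [ True  True False]
```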
6 changes: 3 additions & 3 deletions pandas/core/reshape/reshape.py
@@ -142,7 +142,7 @@ def _indexer_and_to_sort(
        codes = list(self.index.codes)
        levs = list(self.index.levels)
        to_sort = codes[:v] + codes[v + 1 :] + [codes[v]]
-        sizes = [len(x) for x in levs[:v] + levs[v + 1 :] + [levs[v]]]
+        sizes = tuple(len(x) for x in levs[:v] + levs[v + 1 :] + [levs[v]])

        comp_index, obs_ids = get_compressed_ids(to_sort, sizes)
        ngroups = len(obs_ids)
@@ -166,7 +166,7 @@ def _make_selectors(self):

        # make the mask
        remaining_labels = self.sorted_labels[:-1]
-        level_sizes = [len(x) for x in new_levels]
+        level_sizes = tuple(len(x) for x in new_levels)

        comp_index, obs_ids = get_compressed_ids(remaining_labels, level_sizes)
        ngroups = len(obs_ids)
@@ -353,7 +353,7 @@ def _unstack_multiple(data, clocs, fill_value=None):
    rcodes = [index.codes[i] for i in rlocs]
    rnames = [index.names[i] for i in rlocs]

-    shape = [len(x) for x in clevels]
+    shape = tuple(len(x) for x in clevels)
    group_index = get_group_index(ccodes, shape, sort=False, xnull=False)

    comp_ids, obs_ids = compress_group_index(group_index, sort=False)
49 changes: 27 additions & 22 deletions pandas/core/sorting.py
@@ -18,7 +18,10 @@
    lib,
)
from pandas._libs.hashtable import unique_label_indices
-from pandas._typing import IndexKeyFunc
+from pandas._typing import (
+    IndexKeyFunc,
+    Shape,
+)

from pandas.core.dtypes.common import (
    ensure_int64,
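For context, `Shape` is an alias defined in `pandas._typing`; at the time of this PR it is (paraphrasing the source):

```python
from typing import Tuple

Shape = Tuple[int, ...]  # a variadic tuple of ints, e.g. an ndarray.shape
```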
@@ -93,7 +96,7 @@ def get_indexer_indexer(
    return indexer


-def get_group_index(labels, shape, sort: bool, xnull: bool):
+def get_group_index(labels, shape: Shape, sort: bool, xnull: bool):
    """
    For the particular label_list, gets the offsets into the hypothetical list
    representing the totally ordered cartesian product of all possible label
@@ -108,7 +111,7 @@ def get_group_index(labels, shape: Shape, sort: bool, xnull: bool):
    ----------
    labels : sequence of arrays
        Integers identifying levels at each location
-    shape : sequence of ints
+    shape : tuple[int, ...]
        Number of unique levels at each location
    sort : bool
        If the ranks of returned ids should match lexical ranks of labels
@@ -134,33 +137,36 @@ def _int64_cut_off(shape) -> int:
                return i
        return len(shape)

-    def maybe_lift(lab, size):
+    def maybe_lift(lab, size) -> tuple[np.ndarray, int]:
        # promote nan values (assigned -1 label in lab array)
        # so that all output values are non-negative
        return (lab + 1, size + 1) if (lab == -1).any() else (lab, size)

-    labels = map(ensure_int64, labels)
+    labels = [ensure_int64(x) for x in labels]
+    lshape = list(shape)
    if not xnull:
-        labels, shape = map(list, zip(*map(maybe_lift, labels, shape)))
-
-    labels = list(labels)
-    shape = list(shape)
+        for i, (lab, size) in enumerate(zip(labels, shape)):
+            lab, size = maybe_lift(lab, size)
+            labels[i] = lab
+            lshape[i] = size

    # Iteratively process all the labels in chunks sized so less
    # than _INT64_MAX unique int ids will be required for each chunk
    while True:
        # how many levels can be done without overflow:
-        nlev = _int64_cut_off(shape)
+        nlev = _int64_cut_off(lshape)

        # compute flat ids for the first `nlev` levels
-        stride = np.prod(shape[1:nlev], dtype="i8")
+        stride = np.prod(lshape[1:nlev], dtype="i8")
        out = stride * labels[0].astype("i8", subok=False, copy=False)

        for i in range(1, nlev):
-            if shape[i] == 0:
-                stride = 0
+            if lshape[i] == 0:
+                stride = np.int64(0)
            else:
-                stride //= shape[i]
+                stride //= lshape[i]
            out += labels[i] * stride

        if xnull:  # exclude nulls
@@ -169,20 +175,20 @@ def maybe_lift(lab, size):
                mask |= lab == -1
            out[mask] = -1

-        if nlev == len(shape):  # all levels done!
+        if nlev == len(lshape):  # all levels done!
            break

        # compress what has been done so far in order to avoid overflow
        # to retain lexical ranks, obs_ids should be sorted
        comp_ids, obs_ids = compress_group_index(out, sort=sort)

        labels = [comp_ids] + labels[nlev:]
-        shape = [len(obs_ids)] + shape[nlev:]
+        lshape = [len(obs_ids)] + lshape[nlev:]

    return out
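A worked example of the flat-id computation (this is an internal helper, so the import path is not stable API):

```python
import numpy as np
from pandas.core.sorting import get_group_index

# Two levels: the first has 2 possible labels, the second has 3.
labels = [np.array([0, 0, 1, 1]), np.array([0, 2, 0, 2])]
shape = (2, 3)

# Flat id = labels[0] * 3 + labels[1]: an offset into the 2 x 3
# cartesian product of level values.
ids = get_group_index(labels, shape, sort=True, xnull=True)
print(ids)  # [0 2 3 5]
```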


-def get_compressed_ids(labels, sizes) -> tuple[np.ndarray, np.ndarray]:
+def get_compressed_ids(labels, sizes: Shape) -> tuple[np.ndarray, np.ndarray]:
    """
    Group_index is offsets into cartesian product of all possible labels. This
    space can be huge, so this function compresses it, by computing offsets
@@ -191,7 +197,7 @@ def get_compressed_ids(labels, sizes: Shape) -> tuple[np.ndarray, np.ndarray]:
    Parameters
    ----------
    labels : list of label arrays
-    sizes : list of size of the levels
+    sizes : tuple[int] of size of the levels

    Returns
    -------
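And what the compression step yields on a tiny input (again internal API, shown for illustration):

```python
import numpy as np
from pandas.core.sorting import get_compressed_ids

labels = [np.array([0, 1, 0, 1]), np.array([1, 1, 1, 1])]
sizes = (2, 2)

# Flat ids here are [1, 3, 1, 3]; only 2 of the 4 possible groups occur.
comp_ids, obs_ids = get_compressed_ids(labels, sizes)
print(comp_ids)  # [0 1 0 1] -- dense ids into the observed groups
print(obs_ids)   # [1 3]     -- the observed flat ids
```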
@@ -252,12 +258,11 @@ def decons_obs_group_ids(comp_ids: np.ndarray, obs_ids, shape, labels, xnull: bool):
        return out if xnull or not lift.any() else [x - y for x, y in zip(out, lift)]

    # TODO: unique_label_indices only used here, should take ndarray[np.intp]
-    i = unique_label_indices(ensure_int64(comp_ids))
-    i8copy = lambda a: a.astype("i8", subok=False, copy=True)
-    return [i8copy(lab[i]) for lab in labels]
+    indexer = unique_label_indices(ensure_int64(comp_ids))
+    return [lab[indexer].astype(np.intp, subok=False, copy=True) for lab in labels]
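The deconstruction this function performs is essentially integer division and modulo by the level strides; a pure-numpy sketch of the no-null case (the real `decons_group_index` in this module also handles the lifted-null encoding; `decons_sketch` is a hypothetical name):

```python
import numpy as np

def decons_sketch(ids, shape):
    # Invert flat group ids back into per-level codes via level strides.
    out = []
    stride = int(np.prod(shape))
    for size in shape:
        stride //= size
        out.append((ids // stride) % size)
    return out

print(decons_sketch(np.array([1, 3, 1, 3]), (2, 2)))
# [array([0, 1, 0, 1]), array([1, 1, 1, 1])]
```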


-def indexer_from_factorized(labels, shape, compress: bool = True) -> np.ndarray:
+def indexer_from_factorized(labels, shape: Shape, compress: bool = True) -> np.ndarray:
    # returned ndarray is np.intp
    ids = get_group_index(labels, shape, sort=True, xnull=False)
@@ -334,7 +339,7 @@ def lexsort_indexer(
        shape.append(n)
        labels.append(codes)

-    return indexer_from_factorized(labels, shape)
+    return indexer_from_factorized(labels, tuple(shape))


def nargsort(
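For integer codes, `indexer_from_factorized` agrees with `np.lexsort`, which treats the *last* key as primary, hence the reversal below (illustrative only; internal import path):

```python
import numpy as np
from pandas.core.sorting import indexer_from_factorized

labels = [np.array([1, 0, 1, 0]), np.array([0, 1, 1, 0])]
shape = (2, 2)

indexer = indexer_from_factorized(labels, shape)
expected = np.lexsort(labels[::-1])  # sort by labels[0], then labels[1]
print(indexer, expected)  # [3 1 0 2] [3 1 0 2]
```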
@@ -576,7 +581,7 @@ def get_indexer_dict(
    """
    shape = [len(x) for x in keys]

-    group_index = get_group_index(label_list, shape, sort=True, xnull=True)
+    group_index = get_group_index(label_list, tuple(shape), sort=True, xnull=True)
    if np.all(group_index == -1):
        # Short-circuit, lib.indices_fast will return the same
        return {}
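A rough pure-Python equivalent of what `get_indexer_dict` computes, ignoring the cython fast path (`lib.indices_fast`) and assuming tuple keys for multiple levels (`indexer_dict_sketch` is a hypothetical name):

```python
from collections import defaultdict

import numpy as np

def indexer_dict_sketch(label_list, keys):
    # Map each observed key combination to the positions where it occurs,
    # skipping any row with a -1 code (nulls; xnull=True above).
    out = defaultdict(list)
    for pos, codes in enumerate(zip(*label_list)):
        if any(c == -1 for c in codes):
            continue
        key = tuple(lvl[c] for lvl, c in zip(keys, codes))
        out[key if len(key) > 1 else key[0]].append(pos)
    return {k: np.asarray(v, dtype=np.intp) for k, v in out.items()}

labels = [np.array([0, 1, 0]), np.array([0, 0, 0])]
keys = [np.array(["a", "b"]), np.array(["x"])]
print(indexer_dict_sketch(labels, keys))
# {('a', 'x'): array([0, 2]), ('b', 'x'): array([1])}
```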