-
-
Notifications
You must be signed in to change notification settings - Fork 18.5k
BUG: algorithms.factorize moves null values when sort=False #46601
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from 10 commits
670c2e8
98c6c18
007329b
ffaf20c
58e5556
c600e9a
f7326bd
b0ec48a
395c9cf
d0796ed
351eb0d
cadab86
f44b7f3
f93f968
ef49c74
2a439eb
8378ba0
f2e24df
dc20283
b51e88f
4a36bf0
6db0685
ca53df0
372efe7
cf56135
0b85a3d
bc3f426
57a05a7
9c35dd0
a7c3538
b27bda0
b45ace7
c4cfbc6
ecb182c
7143a52
82b61b6
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -869,16 +869,43 @@ def searchsorted( | |
return self._data.searchsorted(value, side=side, sorter=sorter) | ||
|
||
@doc(ExtensionArray.factorize) | ||
def factorize(self, na_sentinel: int = -1) -> tuple[np.ndarray, ExtensionArray]: | ||
def factorize( | ||
self, na_sentinel: int = -1, dropna: bool = True | ||
) -> tuple[np.ndarray, ExtensionArray]: | ||
arr = self._data | ||
mask = self._mask | ||
|
||
codes, uniques = factorize_array(arr, na_sentinel=na_sentinel, mask=mask) | ||
codes, uniques = factorize_array( | ||
arr, na_sentinel=na_sentinel, mask=mask, dropna=True | ||
) | ||
|
||
# check that factorize_array correctly preserves dtype. | ||
assert uniques.dtype == self.dtype.numpy_dtype, (uniques.dtype, self.dtype) | ||
|
||
uniques_ea = type(self)(uniques, np.zeros(len(uniques), dtype=bool)) | ||
# Make room for a null value if we're not ignoring it and it exists | ||
Review comment: Would it make sense to share any of this with the ArrowArray version? Not for this PR, but it could have a TODO. Reply: Yes, will add a TODO. Once we drop support for pyarrow < 4.0 we won't need this logic in ArrowArray, but 4.0 is only a year old at this point, so that will be a while. |
||
size = len(uniques) if dropna or not mask.any() else len(uniques) + 1 | ||
uniques_mask = np.zeros(size, dtype=bool) | ||
if not dropna: | ||
na_index = mask.argmax() | ||
if mask[na_index]: | ||
# Insert na with the proper code | ||
if na_index == 0: | ||
na_code = np.intp(0) | ||
else: | ||
# error: Slice index must be an integer or None | ||
# https://github.com/python/mypy/issues/2410 | ||
na_code = codes[:na_index].argmax() + 1 # type: ignore[misc] | ||
if na_sentinel < 0: | ||
# codes can never equal na_sentinel and be >= na_code | ||
codes[codes >= na_code] += 1 | ||
else: | ||
codes[(codes >= na_code) & (codes != na_sentinel)] += 1 | ||
codes[codes == na_sentinel] = na_code | ||
# dummy value for uniques; not used since uniques_mask will be True | ||
uniques = np.insert(uniques, na_code, 0) | ||
uniques_mask[na_code] = True | ||
uniques_ea = type(self)(uniques, uniques_mask) | ||
|
||
return codes, uniques_ea | ||
|
||
@doc(ExtensionArray._values_for_argsort) | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -382,6 +382,13 @@ def _values_for_factorize(self): | |
arr[mask] = -1 | ||
return arr, -1 | ||
|
||
@classmethod | ||
def _from_factorized(cls, values, original): | ||
assert values.dtype == original._ndarray.dtype | ||
# When dropna (i.e. ignore_na) is False, can get -1 from nulls | ||
Review comment: Is there any way we could avoid this? Reply: Thanks - yes. Changed `_values_for_factorize` from -1 to None. |
||
values[values == -1] = None | ||
return original._from_backing_data(values) | ||
|
||
def __setitem__(self, key, value): | ||
value = extract_array(value, extract_numpy=True) | ||
if isinstance(value, type(self)): | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -657,9 +657,10 @@ def group_index(self) -> Index: | |
|
||
@cache_readonly | ||
def _codes_and_uniques(self) -> tuple[npt.NDArray[np.signedinteger], ArrayLike]: | ||
if self._passed_categorical: | ||
if self._dropna and self._passed_categorical: | ||
# we make a CategoricalIndex out of the cat grouper | ||
# preserving the categories / ordered attributes | ||
# preserving the categories / ordered attributes; | ||
# doesn't (yet) handle dropna=False | ||
Review comment: Is there a GH reference for the "yet"? Reply: Opened #46909; will add a reference in this comment. |
||
cat = self.grouping_vector | ||
categories = cat.categories | ||
|
||
|
Uh oh!
There was an error while loading. Please reload this page.