-
-
Notifications
You must be signed in to change notification settings - Fork 18.5k
Use new NA scalar in BooleanArray #29961
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Merged
jorisvandenbossche
merged 3 commits into
pandas-dev:master
from
jorisvandenbossche:boolean-use-NA
Dec 4, 2019
Merged
Changes from all commits
Commits
Show all changes
3 commits
Select commit
Hold shift + click to select a range
File filter
Filter by extension
Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
There are no files selected for viewing
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,10 +1,10 @@ | ||
import numbers | ||
from typing import TYPE_CHECKING, Type | ||
from typing import TYPE_CHECKING, Any, Tuple, Type | ||
import warnings | ||
|
||
import numpy as np | ||
|
||
from pandas._libs import lib | ||
from pandas._libs import lib, missing as libmissing | ||
from pandas.compat import set_function_name | ||
|
||
from pandas.core.dtypes.base import ExtensionDtype | ||
|
@@ -61,13 +61,13 @@ class BooleanDtype(ExtensionDtype): | |
@property | ||
def na_value(self) -> "Scalar": | ||
""" | ||
BooleanDtype uses :attr:`numpy.nan` as the missing NA value. | ||
BooleanDtype uses :attr:`pandas.NA` as the missing NA value. | ||
|
||
.. warning:: | ||
|
||
`na_value` may change in a future release. | ||
""" | ||
return np.nan | ||
return libmissing.NA | ||
|
||
@property | ||
def type(self) -> Type: | ||
|
@@ -223,7 +223,7 @@ class BooleanArray(ExtensionArray, ExtensionOpsMixin): | |
|
||
>>> pd.array([True, False, None], dtype="boolean") | ||
<BooleanArray> | ||
[True, False, NaN] | ||
[True, False, NA] | ||
Length: 3, dtype: boolean | ||
""" | ||
|
||
|
@@ -262,17 +262,17 @@ def _from_sequence(cls, scalars, dtype=None, copy: bool = False): | |
values, mask = coerce_to_array(scalars, copy=copy) | ||
return BooleanArray(values, mask) | ||
|
||
def _values_for_factorize(self) -> Tuple[np.ndarray, Any]: | ||
data = self._data.astype("int8") | ||
data[self._mask] = -1 | ||
return data, -1 | ||
|
||
@classmethod | ||
def _from_factorized(cls, values, original: "BooleanArray"): | ||
return cls._from_sequence(values, dtype=original.dtype) | ||
|
||
def _formatter(self, boxed=False): | ||
def fmt(x): | ||
if isna(x): | ||
return "NaN" | ||
return str(x) | ||
|
||
return fmt | ||
return str | ||
|
||
def __getitem__(self, item): | ||
if is_integer(item): | ||
|
@@ -281,25 +281,29 @@ def __getitem__(self, item): | |
return self._data[item] | ||
return type(self)(self._data[item], self._mask[item]) | ||
|
||
def _coerce_to_ndarray(self, force_bool: bool = False): | ||
def _coerce_to_ndarray(self, dtype=None, na_value: "Scalar" = libmissing.NA): | ||
""" | ||
Coerce to an ndarary of object dtype or bool dtype (if force_bool=True). | ||
|
||
Parameters | ||
---------- | ||
force_bool : bool, default False | ||
If True, return bool array or raise error if not possible (in | ||
presence of missing values) | ||
dtype : dtype, default object | ||
The numpy dtype to convert to | ||
na_value : scalar, optional | ||
Scalar missing value indicator to use in numpy array. Defaults | ||
to the native missing value indicator of this array (pd.NA). | ||
""" | ||
if force_bool: | ||
if dtype is None: | ||
dtype = object | ||
if is_bool_dtype(dtype): | ||
if not self.isna().any(): | ||
return self._data | ||
else: | ||
raise ValueError( | ||
"cannot convert to bool numpy array in presence of missing values" | ||
) | ||
data = self._data.astype(object) | ||
data[self._mask] = self._na_value | ||
data = self._data.astype(dtype) | ||
data[self._mask] = na_value | ||
return data | ||
|
||
__array_priority__ = 1000 # higher than ndarray so ops dispatch to us | ||
|
@@ -309,15 +313,8 @@ def __array__(self, dtype=None): | |
the array interface, return my values | ||
We return an object array here to preserve our scalar values | ||
""" | ||
if dtype is not None: | ||
if is_bool_dtype(dtype): | ||
return self._coerce_to_ndarray(force_bool=True) | ||
# TODO can optimize this to not go through object dtype for | ||
# numeric dtypes | ||
arr = self._coerce_to_ndarray() | ||
return arr.astype(dtype, copy=False) | ||
# by default (no dtype specified), return an object array | ||
return self._coerce_to_ndarray() | ||
return self._coerce_to_ndarray(dtype=dtype) | ||
|
||
def __arrow_array__(self, type=None): | ||
""" | ||
|
@@ -483,8 +480,17 @@ def astype(self, dtype, copy=True): | |
return IntegerArray( | ||
self._data.astype(dtype.numpy_dtype), self._mask.copy(), copy=False | ||
) | ||
# for integer, error if there are missing values | ||
if is_integer_dtype(dtype): | ||
if self.isna().any(): | ||
raise ValueError("cannot convert NA to integer") | ||
# for float dtype, ensure we use np.nan before casting (numpy cannot | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Pending the discussion in #30038. |
||
# deal with pd.NA) | ||
na_value = self._na_value | ||
if is_float_dtype(dtype): | ||
na_value = np.nan | ||
# coerce | ||
data = self._coerce_to_ndarray() | ||
data = self._coerce_to_ndarray(na_value=na_value) | ||
return astype_nansafe(data, dtype, copy=None) | ||
|
||
def value_counts(self, dropna=True): | ||
|
@@ -594,8 +600,6 @@ def logical_method(self, other): | |
|
||
@classmethod | ||
def _create_comparison_method(cls, op): | ||
op_name = op.__name__ | ||
|
||
def cmp_method(self, other): | ||
|
||
if isinstance(other, (ABCDataFrame, ABCSeries, ABCIndexClass)): | ||
|
@@ -617,21 +621,26 @@ def cmp_method(self, other): | |
if len(self) != len(other): | ||
raise ValueError("Lengths must match to compare") | ||
|
||
# numpy will show a DeprecationWarning on invalid elementwise | ||
# comparisons, this will raise in the future | ||
with warnings.catch_warnings(): | ||
warnings.filterwarnings("ignore", "elementwise", FutureWarning) | ||
with np.errstate(all="ignore"): | ||
result = op(self._data, other) | ||
|
||
# nans propagate | ||
if mask is None: | ||
mask = self._mask | ||
if other is libmissing.NA: | ||
# numpy does not handle pd.NA well as "other" scalar (it returns | ||
# a scalar False instead of an array) | ||
result = np.zeros_like(self._data) | ||
mask = np.ones_like(self._data) | ||
else: | ||
mask = self._mask | mask | ||
# numpy will show a DeprecationWarning on invalid elementwise | ||
# comparisons, this will raise in the future | ||
with warnings.catch_warnings(): | ||
warnings.filterwarnings("ignore", "elementwise", FutureWarning) | ||
with np.errstate(all="ignore"): | ||
result = op(self._data, other) | ||
|
||
# nans propagate | ||
if mask is None: | ||
mask = self._mask.copy() | ||
else: | ||
mask = self._mask | mask | ||
|
||
result[mask] = op_name == "ne" | ||
return BooleanArray(result, np.zeros(len(result), dtype=bool), copy=False) | ||
return BooleanArray(result, mask, copy=False) | ||
|
||
name = "__{name}__".format(name=op.__name__) | ||
return set_function_name(cmp_method, name, cls) | ||
|
@@ -643,7 +652,7 @@ def _reduce(self, name, skipna=True, **kwargs): | |
# coerce to a nan-aware float if needed | ||
if mask.any(): | ||
data = self._data.astype("float64") | ||
data[mask] = self._na_value | ||
data[mask] = np.nan | ||
|
||
op = getattr(nanops, "nan" + name) | ||
result = op(data, axis=0, skipna=skipna, mask=mask, **kwargs) | ||
|
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Add this suggestion to a batch that can be applied as a single commit.
This suggestion is invalid because no changes were made to the code.
Suggestions cannot be applied while the pull request is closed.
Suggestions cannot be applied while viewing a subset of changes.
Only one suggestion per line can be applied in a batch.
Add this suggestion to a batch that can be applied as a single commit.
Applying suggestions on deleted lines is not supported.
You must change the existing code in this line in order to create a valid suggestion.
Outdated suggestions cannot be applied.
This suggestion has been applied or marked resolved.
Suggestions cannot be applied from pending reviews.
Suggestions cannot be applied on multi-line comments.
Suggestions cannot be applied while the pull request is queued to merge.
Suggestion cannot be applied right now. Please check back later.
Uh oh!
There was an error while loading. Please reload this page.