Skip to content

Commit bb904cb

Browse files
jorisvandenbosscheTomAugspurger
authored andcommitted
ENH: add BooleanArray extension array (pandas-dev#29555)
1 parent 7d7f885 commit bb904cb

File tree

4 files changed

+205
-7
lines changed

4 files changed

+205
-7
lines changed

doc/source/boolean.rst

Lines changed: 83 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,83 @@
1+
.. currentmodule:: pandas
2+
3+
.. _boolean:
4+
5+
**************************
6+
Nullable Boolean Data Type
7+
**************************
8+
9+
.. versionadded:: 1.0.0
10+
11+
.. _boolean.klean:
12+
13+
Kleene Logic
14+
------------
15+
16+
:class:`arrays.BooleanArray` implements Kleene logic (sometimes called three-value logic) for
17+
logical operations like ``&`` (and), ``|`` (or) and ``^`` (exclusive-or).
18+
19+
Here's a table for ``and``.
20+
21+
========== =========== ============
22+
left value right value output value
23+
========== =========== ============
24+
True True True
25+
True False False
26+
True NA NA
27+
False False False
28+
False NA False
29+
NA NA NA
30+
========== =========== ============
31+
32+
33+
And for ``or``
34+
35+
========== =========== ============
36+
left value right value output value
37+
========== =========== ============
38+
True True True
39+
True False True
40+
True NA True
41+
False False False
42+
False NA NA
43+
NA NA NA
44+
========== =========== ============
45+
46+
And for ``xor``
47+
48+
========== =========== ============
49+
left value right value output value
50+
========== =========== ============
51+
True True False
52+
True False True
53+
True NA NA
54+
False False False
55+
False NA NA
56+
NA NA NA
57+
========== =========== ============
58+
59+
When an ``NA`` is present in an operation, the output value is ``NA`` only if
60+
the result cannot be determined solely based on the other input. For example,
61+
``True | NA`` is ``True``, because both ``True | True`` and ``True | False``
62+
are ``True``. In that case, we don't actually need to consider the value
63+
of the ``NA``.
64+
65+
On the other hand, ``True & NA`` is ``NA``. The result depends on whether
66+
the ``NA`` really is ``True`` or ``False``, since ``True & True`` is ``True``,
67+
but ``True & False`` is ``False``, so we can't determine the output.
68+
69+
70+
This differs from how ``np.nan`` behaves in logical operations. Pandas treats
71+
``np.nan`` as *always false in the output*.
72+
73+
In ``or``
74+
75+
.. ipython:: python
76+
77+
pd.Series([True, False, np.nan], dtype="object") | True
78+
pd.Series([True, False, np.nan], dtype="boolean") | True
79+
80+
In ``and``

.. ipython:: python
81+
82+
pd.Series([True, False, np.nan], dtype="object") & True
83+
pd.Series([True, False, np.nan], dtype="boolean") & True

doc/source/index.rst.template

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -73,6 +73,7 @@ See the :ref:`overview` for more detail about what's in the library.
7373
* :doc:`user_guide/missing_data`
7474
* :doc:`user_guide/categorical`
7575
* :doc:`user_guide/integer_na`
76+
* :doc:`user_guide/boolean`
7677
* :doc:`user_guide/visualization`
7778
* :doc:`user_guide/computation`
7879
* :doc:`user_guide/groupby`

pandas/core/arrays/boolean.py

Lines changed: 33 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -184,6 +184,9 @@ class BooleanArray(ExtensionArray, ExtensionOpsMixin):
184184
represented by 2 numpy arrays: a boolean array with the data and
185185
a boolean array with the mask (True indicating missing).
186186
187+
BooleanArray implements Kleene logic (sometimes called three-value
188+
logic) for logical operations. See :ref:`boolean` for more.
189+
187190
To construct a BooleanArray from generic array-like input, use
188191
:func:`pandas.array` specifying ``dtype="boolean"`` (see examples
189192
below).
@@ -560,10 +563,12 @@ def logical_method(self, other):
560563
return NotImplemented
561564

562565
other = lib.item_from_zerodim(other)
563-
mask = None
566+
omask = mask = None
567+
other_is_booleanarray = isinstance(other, BooleanArray)
564568

565-
if isinstance(other, BooleanArray):
566-
other, mask = other._data, other._mask
569+
if other_is_booleanarray:
570+
other, omask = other._data, other._mask
571+
mask = omask
567572
elif is_list_like(other):
568573
other = np.asarray(other, dtype="bool")
569574
if other.ndim > 1:
@@ -576,17 +581,38 @@ def logical_method(self, other):
576581

577582
# numpy will show a DeprecationWarning on invalid elementwise
578583
# comparisons, this will raise in the future
579-
with warnings.catch_warnings():
580-
warnings.filterwarnings("ignore", "elementwise", FutureWarning)
581-
with np.errstate(all="ignore"):
582-
result = op(self._data, other)
584+
if lib.is_scalar(other) and np.isnan(
585+
other
586+
): # TODO(NA): change to libmissing.NA:
587+
result = self._data
588+
mask = True
589+
else:
590+
with warnings.catch_warnings():
591+
warnings.filterwarnings("ignore", "elementwise", FutureWarning)
592+
with np.errstate(all="ignore"):
593+
result = op(self._data, other)
583594

584595
# nans propagate
585596
if mask is None:
586597
mask = self._mask
587598
else:
588599
mask = self._mask | mask
589600

601+
# Kleene-logic adjustments to the mask.
602+
if op.__name__ in {"or_", "ror_"}:
603+
mask[result] = False
604+
elif op.__name__ in {"and_", "rand_"}:
605+
mask[~self._data & ~self._mask] = False
606+
if other_is_booleanarray:
607+
mask[~other & ~omask] = False
608+
elif lib.is_scalar(other) and np.isnan(other): # TODO(NA): change to NA
609+
mask[:] = True
610+
# Do we ever assume that masked values are False?
611+
result[mask] = False
612+
elif op.__name__ in {"xor", "rxor"}:
613+
# Do we ever assume that masked values are False?
614+
result[mask] = False
615+
590616
return BooleanArray(result, mask)
591617

592618
name = "__{name}__".format(name=op.__name__)

pandas/tests/arrays/test_boolean.py

Lines changed: 88 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -391,13 +391,101 @@ def test_scalar(self, data, all_logical_operators):
391391

392392
def test_array(self, data, all_logical_operators):
393393
op_name = all_logical_operators
394+
if "or" in op_name:
395+
pytest.skip("confusing")
394396
other = pd.array([True] * len(data), dtype="boolean")
395397
self._compare_other(data, op_name, other)
396398
other = np.array([True] * len(data))
397399
self._compare_other(data, op_name, other)
398400
other = pd.Series([True] * len(data), dtype="boolean")
399401
self._compare_other(data, op_name, other)
400402

403+
def test_kleene_or(self):
404+
# A clear test of behavior.
405+
a = pd.array([True] * 3 + [False] * 3 + [None] * 3, dtype="boolean")
406+
b = pd.array([True, False, None] * 3, dtype="boolean")
407+
result = a | b
408+
expected = pd.array(
409+
[True, True, True, True, False, None, True, None, None], dtype="boolean"
410+
)
411+
tm.assert_extension_array_equal(result, expected)
412+
413+
result = b | a
414+
tm.assert_extension_array_equal(result, expected)
415+
416+
def test_kleene_or_scalar(self):
417+
a = pd.array([True, False, None], dtype="boolean")
418+
result = a | np.nan # TODO: pd.NA
419+
expected = pd.array([True, None, None], dtype="boolean")
420+
tm.assert_extension_array_equal(result, expected)
421+
422+
result = np.nan | a # TODO: pd.NA
423+
tm.assert_extension_array_equal(result, expected)
424+
425+
@pytest.mark.parametrize(
426+
"left,right,expected",
427+
[
428+
([True, False, None], True, [True, True, True]),
429+
([True, False, None], False, [True, False, None]),
430+
([True, False, None], np.nan, [True, None, None]),
431+
# TODO: pd.NA
432+
],
433+
)
434+
def test_kleene_or_cases(self, left, right, expected):
435+
if isinstance(left, list):
436+
left = pd.array(left, dtype="boolean")
437+
if isinstance(right, list):
438+
right = pd.array(right, dtype="boolean")
439+
expected = pd.array(expected, dtype="boolean")
440+
result = left | right
441+
tm.assert_extension_array_equal(result, expected)
442+
443+
result = right | left
444+
tm.assert_extension_array_equal(result, expected)
445+
446+
def test_kleene_and(self):
447+
# A clear test of behavior.
448+
a = pd.array([True] * 3 + [False] * 3 + [None] * 3, dtype="boolean")
449+
b = pd.array([True, False, None] * 3, dtype="boolean")
450+
result = a & b
451+
expected = pd.array(
452+
[True, False, None, False, False, False, None, False, None], dtype="boolean"
453+
)
454+
tm.assert_extension_array_equal(result, expected)
455+
456+
result = b & a
457+
tm.assert_extension_array_equal(result, expected)
458+
459+
def test_kleene_and_scalar(self):
460+
a = pd.array([True, False, None], dtype="boolean")
461+
result = a & np.nan # TODO: pd.NA
462+
expected = pd.array([None, None, None], dtype="boolean")
463+
tm.assert_extension_array_equal(result, expected)
464+
465+
result = np.nan & a # TODO: pd.na
466+
tm.assert_extension_array_equal(result, expected)
467+
468+
def test_kleene_xor(self):
469+
a = pd.array([True] * 3 + [False] * 3 + [None] * 3, dtype="boolean")
470+
b = pd.array([True, False, None] * 3, dtype="boolean")
471+
result = a ^ b
472+
expected = pd.array(
473+
[False, True, None, True, False, None, None, None, None], dtype="boolean"
474+
)
475+
tm.assert_extension_array_equal(result, expected)
476+
477+
result = b ^ a
478+
tm.assert_extension_array_equal(result, expected)
479+
480+
def test_kleene_scalar(self):
481+
a = pd.array([True, False, None], dtype="boolean")
482+
result = a ^ np.nan # TODO: pd.NA
483+
expected = pd.array([None, None, None], dtype="boolean")
484+
tm.assert_extension_array_equal(result, expected)
485+
486+
result = np.nan ^ a # TODO: pd.NA
487+
tm.assert_extension_array_equal(result, expected)
488+
401489

402490
class TestComparisonOps(BaseOpsUtil):
403491
def _compare_other(self, data, op_name, other):

0 commit comments

Comments
 (0)