ENH: add BooleanArray extension array (pandas-dev#29555)

jorisvandenbossche · TomAugspurger · commit bb904cb48241 · 2019-11-25T15:55:42.000-06:00
diff --git a/doc/source/boolean.rst b/doc/source/boolean.rst
@@ -0,0 +1,83 @@
+.. currentmodule:: pandas
+
+.. _boolean:
+
+**************************
+Nullable Boolean Data Type
+**************************
+
+.. versionadded:: 1.0.0
+
+.. _boolean.klean:
+
+Kleene Logic
+------------
+
+:class:`arrays.BooleanArray` implements Kleene logic (sometime called three-value logic) for
+logical operations like ``&`` (and), ``|`` (or) and ``^`` (exclusive-or).
+
+Here's a table for ``and``.
+
+==========  ===========  ============
+left value  right value  output value
+==========  ===========  ============
+True        True         True
+True        False        False
+True        NA           NA
+False       False        False
+False       NA           False
+NA          NA           NA
+==========  ===========  ============
+
+
+And for ``or``
+
+==========  ===========  ============
+left value  right value  output value
+==========  ===========  ============
+True        True         True
+True        False        True
+True        NA           True
+False       False        False
+False       NA           NA
+NA          NA           NA
+==========  ===========  ============
+
+And for ``xor``
+
+==========  ===========  ============
+left value  right value  output value
+==========  ===========  ============
+True        True         False
+True        False        True
+True        NA           NA
+False       False        False
+False       NA           NA
+NA          NA           NA
+==========  ===========  ============
+
+When an ``NA`` is present in an operation, the output value is ``NA`` only if
+the result cannot be determined soley based on the other input. For example,
+``True | NA`` is ``True``, because both ``True | True`` and ``True | False``
+are ``True``. In that case, we don't actually need to consider the value
+of the ``NA``.
+
+On the other hand, ``True & NA`` is ``NA``. The result depends on whether
+the ``NA`` really is ``True`` or ``False``, since ``True & True`` is ``True``,
+but ``True & False`` is ``False``, so we can't determine the output.
+
+
+This differs from how ``np.nan`` behaves in logical operations. Pandas treated
+``np.nan`` is *always false in the output*.
+
+In ``or``
+
+.. ipython:: python
+
+   pd.Series([True, False, np.nan], dtype="object") | True
+   pd.Series([True, False, np.nan], dtype="boolean") | True
+
+In ``and``
+
+   pd.Series([True, False, np.nan], dtype="object") & True
+   pd.Series([True, False, np.nan], dtype="boolean") & True
diff --git a/doc/source/index.rst.template b/doc/source/index.rst.template
@@ -73,6 +73,7 @@ See the :ref:`overview` for more detail about what's in the library.
   * :doc:`user_guide/missing_data`
   * :doc:`user_guide/categorical`
   * :doc:`user_guide/integer_na`
+  * :doc:`user_guide/boolean`
   * :doc:`user_guide/visualization`
   * :doc:`user_guide/computation`
   * :doc:`user_guide/groupby`
diff --git a/pandas/core/arrays/boolean.py b/pandas/core/arrays/boolean.py
@@ -184,6 +184,9 @@ class BooleanArray(ExtensionArray, ExtensionOpsMixin):
     represented by 2 numpy arrays: a boolean array with the data and
     a boolean array with the mask (True indicating missing).
 
+    BooleanArray implements Kleene logic (sometimes called three-value
+    logic) for logical operations. See :ref:`` for more.
+
     To construct an BooleanArray from generic array-like input, use
     :func:`pandas.array` specifying ``dtype="boolean"`` (see examples
     below).
@@ -560,10 +563,12 @@ def logical_method(self, other):
                 return NotImplemented
 
             other = lib.item_from_zerodim(other)
-            mask = None
+            omask = mask = None
+            other_is_booleanarray = isinstance(other, BooleanArray)
 
-            if isinstance(other, BooleanArray):
-                other, mask = other._data, other._mask
+            if other_is_booleanarray:
+                other, omask = other._data, other._mask
+                mask = omask
             elif is_list_like(other):
                 other = np.asarray(other, dtype="bool")
                 if other.ndim > 1:
@@ -576,17 +581,38 @@ def logical_method(self, other):
 
             # numpy will show a DeprecationWarning on invalid elementwise
             # comparisons, this will raise in the future
-            with warnings.catch_warnings():
-                warnings.filterwarnings("ignore", "elementwise", FutureWarning)
-                with np.errstate(all="ignore"):
-                    result = op(self._data, other)
+            if lib.is_scalar(other) and np.isnan(
+                other
+            ):  # TODO(NA): change to libmissing.NA:
+                result = self._data
+                mask = True
+            else:
+                with warnings.catch_warnings():
+                    warnings.filterwarnings("ignore", "elementwise", FutureWarning)
+                    with np.errstate(all="ignore"):
+                        result = op(self._data, other)
 
             # nans propagate
             if mask is None:
                 mask = self._mask
             else:
                 mask = self._mask | mask
 
+            # Kleene-logic adjustments to the mask.
+            if op.__name__ in {"or_", "ror_"}:
+                mask[result] = False
+            elif op.__name__ in {"and_", "rand_"}:
+                mask[~self._data & ~self._mask] = False
+                if other_is_booleanarray:
+                    mask[~other & ~omask] = False
+                elif lib.is_scalar(other) and np.isnan(other):  # TODO(NA): change to NA
+                    mask[:] = True
+                # Do we ever assume that masked values are False?
+                result[mask] = False
+            elif op.__name__ in {"xor", "rxor"}:
+                # Do we ever assume that masked values are False?
+                result[mask] = False
+
             return BooleanArray(result, mask)
 
         name = "__{name}__".format(name=op.__name__)
diff --git a/pandas/tests/arrays/test_boolean.py b/pandas/tests/arrays/test_boolean.py
@@ -391,13 +391,101 @@ def test_scalar(self, data, all_logical_operators):
 
     def test_array(self, data, all_logical_operators):
         op_name = all_logical_operators
+        if "or" in op_name:
+            pytest.skip("confusing")
         other = pd.array([True] * len(data), dtype="boolean")
         self._compare_other(data, op_name, other)
         other = np.array([True] * len(data))
         self._compare_other(data, op_name, other)
         other = pd.Series([True] * len(data), dtype="boolean")
         self._compare_other(data, op_name, other)
 
+    def test_kleene_or(self):
+        # A clear test of behavior.
+        a = pd.array([True] * 3 + [False] * 3 + [None] * 3, dtype="boolean")
+        b = pd.array([True, False, None] * 3, dtype="boolean")
+        result = a | b
+        expected = pd.array(
+            [True, True, True, True, False, None, True, None, None], dtype="boolean"
+        )
+        tm.assert_extension_array_equal(result, expected)
+
+        result = b | a
+        tm.assert_extension_array_equal(result, expected)
+
+    def test_kleene_or_scalar(self):
+        a = pd.array([True, False, None], dtype="boolean")
+        result = a | np.nan  # TODO: pd.NA
+        expected = pd.array([True, None, None], dtype="boolean")
+        tm.assert_extension_array_equal(result, expected)
+
+        result = np.nan | a  # TODO: pd.NA
+        tm.assert_extension_array_equal(result, expected)
+
+    @pytest.mark.parametrize(
+        "left,right,expected",
+        [
+            ([True, False, None], True, [True, True, True]),
+            ([True, False, None], False, [True, False, None]),
+            ([True, False, None], np.nan, [True, None, None]),
+            # TODO: pd.NA
+        ],
+    )
+    def test_kleene_or_cases(self, left, right, expected):
+        if isinstance(left, list):
+            left = pd.array(left, dtype="boolean")
+        if isinstance(right, list):
+            right = pd.array(right, dtype="boolean")
+        expected = pd.array(expected, dtype="boolean")
+        result = left | right
+        tm.assert_extension_array_equal(result, expected)
+
+        result = right | left
+        tm.assert_extension_array_equal(result, expected)
+
+    def test_kleene_and(self):
+        # A clear test of behavior.
+        a = pd.array([True] * 3 + [False] * 3 + [None] * 3, dtype="boolean")
+        b = pd.array([True, False, None] * 3, dtype="boolean")
+        result = a & b
+        expected = pd.array(
+            [True, False, None, False, False, False, None, False, None], dtype="boolean"
+        )
+        tm.assert_extension_array_equal(result, expected)
+
+        result = b & a
+        tm.assert_extension_array_equal(result, expected)
+
+    def test_kleene_and_scalar(self):
+        a = pd.array([True, False, None], dtype="boolean")
+        result = a & np.nan  # TODO: pd.NA
+        expected = pd.array([None, None, None], dtype="boolean")
+        tm.assert_extension_array_equal(result, expected)
+
+        result = np.nan & a  # TODO: pd.na
+        tm.assert_extension_array_equal(result, expected)
+
+    def test_kleene_xor(self):
+        a = pd.array([True] * 3 + [False] * 3 + [None] * 3, dtype="boolean")
+        b = pd.array([True, False, None] * 3, dtype="boolean")
+        result = a ^ b
+        expected = pd.array(
+            [False, True, None, True, False, None, None, None, None], dtype="boolean"
+        )
+        tm.assert_extension_array_equal(result, expected)
+
+        result = b ^ a
+        tm.assert_extension_array_equal(result, expected)
+
+    def test_kleene_scalar(self):
+        a = pd.array([True, False, None], dtype="boolean")
+        result = a ^ np.nan  # TODO: pd.NA
+        expected = pd.array([None, None, None], dtype="boolean")
+        tm.assert_extension_array_equal(result, expected)
+
+        result = np.nan ^ a  # TODO: pd.NA
+        tm.assert_extension_array_equal(result, expected)
+
 
 class TestComparisonOps(BaseOpsUtil):
     def _compare_other(self, data, op_name, other):