Updates

TomAugspurger · TomAugspurger · commit fbc4425a09bb · 2018-04-26T13:37:45.000-05:00
* indexer -> indices
* doc user-facing vs physical
* assert na_cmps
* test reindex w/ non-NA fill_value
diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py
@@ -1448,8 +1448,9 @@ def func(arr, indexer, out, fill_value=np.nan):
     return func
 
 
-def take(arr, indexer, allow_fill=False, fill_value=None):
-    """Take elements from an array.
+def take(arr, indices, allow_fill=False, fill_value=None):
+    """
+    Take elements from an array.
 
     .. versionadded:: 0.23.0
 
@@ -1458,22 +1459,23 @@ def take(arr, indexer, allow_fill=False, fill_value=None):
     arr : sequence
         Non array-likes (sequences without a dtype) are coereced
         to an ndarray.
-    indexer : sequence of integers
+    indices : sequence of integers
         Indices to be taken.
     allow_fill : bool, default False
-        How to handle negative values in `indexer`.
+        How to handle negative values in `indices`.
 
-        * False: negative values in `indexer` indicate
-          slices from the right (the default)
+        * False: negative values in `indices` indicate indexing from
+          the right (the default). This is similar to :func:`numpy.take`.
 
-        * True: negative values in `indexer` indicate
+        * True: negative values in `indices` indicate
           missing values. These values are set to `fill_value`. Any other
           other negative values raise a ``ValueError``.
 
     fill_value : any, optional
         Fill value to use for NA-indicies when `allow_fill` is True.
         This may be ``None``, in which case the default NA value for
-        the type, ``self.dtype.na_value``, is used.
+        the type is used. For ndarrays, :attr:`numpy.nan` is used. For
+        ExtensionArrays, a different value may be used.
 
     Returns
     -------
@@ -1483,17 +1485,17 @@ def take(arr, indexer, allow_fill=False, fill_value=None):
     Raises
     ------
     IndexError
-        When the indexer is out of bounds for the array.
+        When `indices` is out of bounds for the array.
     ValueError
         When the indexer contains negative values other than ``-1``
         and `allow_fill` is True.
 
     Notes
     -----
-    When `allow_fill` is False, `indexer` may be whatever dimensionality
+    When `allow_fill` is False, `indices` may be whatever dimensionality
     is accepted by NumPy for `arr`.
 
-    When `allow_fill` is True, `indexer` should be 1-D.
+    When `allow_fill` is True, `indices` should be 1-D.
 
     See Also
     --------
@@ -1524,15 +1526,15 @@ def take(arr, indexer, allow_fill=False, fill_value=None):
         arr = np.asarray(arr)
 
     # Do we require int64 or intp here?
-    indexer = np.asarray(indexer, dtype='int')
+    indices = np.asarray(indices, dtype='int')
 
     if allow_fill:
         # Pandas style, -1 means NA
-        validate_indices(indexer, len(arr))
-        result = take_1d(arr, indexer, allow_fill=True, fill_value=fill_value)
+        validate_indices(indices, len(arr))
+        result = take_1d(arr, indices, allow_fill=True, fill_value=fill_value)
     else:
         # NumPy style
-        result = arr.take(indexer)
+        result = arr.take(indices)
     return result
 
 
diff --git a/pandas/core/arrays/base.py b/pandas/core/arrays/base.py
@@ -463,45 +463,51 @@ def factorize(self, na_sentinel=-1):
     # Indexing methods
     # ------------------------------------------------------------------------
 
-    def take(self, indexer, allow_fill=False, fill_value=None):
+    def take(self, indices, allow_fill=False, fill_value=None):
         # type: (Sequence[int], bool, Optional[Any]) -> ExtensionArray
         """Take elements from an array.
 
         Parameters
         ----------
-        indexer : sequence of integers
+        indices : sequence of integers
             Indices to be taken. See Notes for how negative indicies
             are handled.
         allow_fill : bool, default False
-            How to handle negative values in `indexer`.
+            How to handle negative values in `indices`.
 
-            For False values (the default), negative values in `indexer`
+            For False values (the default), negative values in `indices`
             indiciate slices from the right.
 
-            For True values, indicies where `indexer` is ``-1`` indicate
+            For True values, positions where `indices` is ``-1`` indicate
             missing values. These values are set to `fill_value`. Any other
             other negative value should raise a ``ValueError``.
         fill_value : any, optional
             Fill value to use for NA-indicies when `allow_fill` is True.
             This may be ``None``, in which case the default NA value for
             the type, ``self.dtype.na_value``, is used.
 
+            For many ExtensionArrays, there will be two representations of
+            `fill_value`: a user-facing "boxed" scalar, and a low-level
+            physical NA value. `fill_value` should be the user-facing version,
+            and the implementation should handle translating that to the
+            physical version for processing the take if necessary.
+
         Returns
         -------
         ExtensionArray
 
         Raises
         ------
         IndexError
-            When the indexer is out of bounds for the array.
+            When the indices are out of bounds for the array.
         ValueError
-            When the indexer contains negative values other than ``-1``
+            When `indices` contains negative values other than ``-1``
             and `allow_fill` is True.
 
         Notes
         -----
         ExtensionArray.take is called by ``Series.__getitem__``, ``.loc``,
-        ``iloc``, when the indexer is a sequence of values. Additionally,
+        ``iloc``, when `indices` is a sequence of values. Additionally,
         it's called by :meth:`Series.reindex`, or any other method
         that causes realignemnt, with a `fill_value`.
 
@@ -518,14 +524,17 @@ def take(self, indexer, allow_fill=False, fill_value=None):
 
         .. code-block:: python
 
-           def take(self, indexer, allow_fill=False, fill_value=None):
+           def take(self, indices, allow_fill=False, fill_value=None):
                from pandas.core.algorithms import take
 
+               # If the ExtensionArray is backed by an ndarray, then
+               # just pass that here instead of coercing to object.
                data = self.astype(object)
+
                if allow_fill and fill_value is None:
                    fill_value = self.dtype.na_value
 
-               result = take(data, indexer, fill_value=fill_value,
+               result = take(data, indices, fill_value=fill_value,
                              allow_fill=allow_fill)
                return self._from_sequence(result)
         """
diff --git a/pandas/core/dtypes/base.py b/pandas/core/dtypes/base.py
@@ -17,7 +17,8 @@ class _DtypeOpsMixin(object):
     # class's methods can be moved to ExtensionDtype and removed.
 
     # na_value is the default NA value to use for this type. This is used in
-    # e.g. ExtensionArray.take.
+    # e.g. ExtensionArray.take. This should be the user-facing "boxed" version
+    # of the NA value, not the physical NA value for storage.
     na_value = np.nan
 
     def __eq__(self, other):
diff --git a/pandas/core/internals.py b/pandas/core/internals.py
@@ -5405,9 +5405,6 @@ def concatenate_block_managers(mgrs_indexers, axes, concat_axis, copy):
 
     for placement, join_units in concat_plan:
 
-        # The issue: we have a join unit (or maybe several) that needs to be
-        # reindexed.
-
         if len(join_units) == 1 and not join_units[0].indexers:
             b = join_units[0].block
             values = b.values
diff --git a/pandas/tests/extension/base/getitem.py b/pandas/tests/extension/base/getitem.py
@@ -127,7 +127,11 @@ def test_take(self, data, na_value, na_cmp):
         result = data.take([0, -1])
         assert result.dtype == data.dtype
         assert result[0] == data[0]
-        na_cmp(result[1], na_value)
+        assert result[1] == data[-1]
+
+        result = data.take([0, -1], allow_fill=True, fill_value=na_value)
+        assert result[0] == data[0]
+        assert na_cmp(result[1], na_value)
 
         with tm.assert_raises_regex(IndexError, "out of bounds"):
             data.take([len(data) + 1])
@@ -136,7 +140,7 @@ def test_take_empty(self, data, na_value, na_cmp):
         empty = data[:0]
 
         result = empty.take([-1], allow_fill=True)
-        na_cmp(result[0], na_value)
+        assert na_cmp(result[0], na_value)
 
         with pytest.raises(IndexError):
             empty.take([-1])
@@ -170,7 +174,6 @@ def test_take_out_of_bounds_raises(self, data, allow_fill):
         with pytest.raises(IndexError):
             arr.take(np.asarray([0, 3]), allow_fill=allow_fill)
 
-    @pytest.mark.xfail(reason="Series.take with extension array buggy for -1")
     def test_take_series(self, data):
         s = pd.Series(data)
         result = s.take([0, -1])
@@ -196,3 +199,14 @@ def test_reindex(self, data, na_value):
         expected = pd.Series(data._from_sequence([na_value, na_value]),
                              index=[n, n + 1])
         self.assert_series_equal(result, expected)
+
+    def test_reindex_non_na_fill_value(self, data_missing):
+        valid = data_missing[1]
+        na = data_missing[0]
+
+        array = data_missing._from_sequence([na, valid])
+        ser = pd.Series(array)
+        result = ser.reindex([0, 1, 2], fill_value=valid)
+        expected = pd.Series(data_missing._from_sequence([na, valid, valid]))
+
+        self.assert_series_equal(result, expected)
diff --git a/pandas/tests/extension/decimal/test_decimal.py b/pandas/tests/extension/decimal/test_decimal.py
@@ -108,7 +108,15 @@ class TestReshaping(BaseDecimal, base.BaseReshapingTests):
 
 
 class TestGetitem(BaseDecimal, base.BaseGetitemTests):
-    pass
+
+    def test_take_na_value_other_decimal(self):
+        arr = DecimalArray([decimal.Decimal('1.0'),
+                            decimal.Decimal('2.0')])
+        result = arr.take([0, -1], allow_fill=True,
+                          fill_value=decimal.Decimal('-1.0'))
+        expected = DecimalArray([decimal.Decimal('1.0'),
+                                 decimal.Decimal('-1.0')])
+        self.assert_extension_array_equal(result, expected)
 
 
 class TestMissing(BaseDecimal, base.BaseMissingTests):
diff --git a/pandas/tests/extension/json/array.py b/pandas/tests/extension/json/array.py
@@ -1,3 +1,15 @@
+"""Test extension array for storing nested data in a pandas container.
+
+The JSONArray stores lists of dictionaries. The storage mechanism is a list,
+not an ndarray.
+
+Note:
+
+We currently store lists of UserDicts (Py3 only). Pandas has a few places
+internally that specifically check for dicts, and do non-scalar things
+in that case. We *want* the dictionaries to be treated as scalars, so we
+hack around pandas by using UserDicts.
+"""
 import collections
 import itertools
 import numbers
@@ -125,12 +137,6 @@ def take(self, indexer, allow_fill=False, fill_value=None):
 
         return self._from_sequence(output)
 
-    # def astype(self, dtype, copy=True):
-    #     # NumPy has issues when all the dicts are the same length.
-    #     # np.array([UserDict(...), UserDict(...)]) fails,
-    #     # but np.array([{...}, {...}]) works, so cast.
-    #     return np.array([dict(x) for x in self], dtype=dtype, copy=copy)
-
     def copy(self, deep=False):
         return type(self)(self.data[:])