Refactor map to use common code for series and index when possible and add dict performance test

nateyoder · nateyoder · commit 99e11b4ba154 · 2017-04-22T17:12:44.000-07:00
diff --git a/asv_bench/benchmarks/series_methods.py b/asv_bench/benchmarks/series_methods.py
@@ -120,3 +120,27 @@ def setup(self):
 
     def time_series_dropna_datetime(self):
         self.s.dropna()
+
+
+class series_map_dict(object):
+    goal_time = 0.2
+
+    def setup(self):
+        map_size = 1000
+        self.s = Series(np.random.randint(0, map_size, 10000))
+        self.map_dict = {i: map_size - i for i in range(map_size)}
+
+    def time_series_map_dict(self):
+        self.s.map(self.map_dict)
+
+
+class series_map_series(object):
+    goal_time = 0.2
+
+    def setup(self):
+        map_size = 1000
+        self.s = Series(np.random.randint(0, map_size, 10000))
+        self.map_series = Series(map_size - np.arange(map_size))
+
+    def time_series_map_series(self):
+        self.s.map(self.map_series)
diff --git a/pandas/core/base.py b/pandas/core/base.py
@@ -8,10 +8,10 @@
 
 from pandas.core.dtypes.missing import isnull
 from pandas.core.dtypes.generic import ABCDataFrame, ABCSeries, ABCIndexClass
-from pandas.core.dtypes.common import is_object_dtype, is_list_like, is_scalar
+from pandas.core.dtypes.common import is_object_dtype, is_list_like, is_scalar, is_extension_type
 from pandas.util.validators import validate_bool_kwarg
 
-from pandas.core import common as com
+from pandas.core import common as com, algorithms
 import pandas.core.nanops as nanops
 import pandas._libs.lib as lib
 from pandas.compat.numpy import function as nv
@@ -933,6 +933,51 @@ def _reduce(self, op, name, axis=0, skipna=True, numeric_only=None,
                             klass=self.__class__.__name__, op=name))
         return func(**kwds)
 
+    def _map_values(self, values, arg, na_action=None):
+        if is_extension_type(self.dtype):
+            if na_action is not None:
+                raise NotImplementedError
+            map_f = lambda values, f: values.map(f)
+        else:
+            if na_action == 'ignore':
+                def map_f(values, f):
+                    return lib.map_infer_mask(values, f,
+                                              isnull(values).view(np.uint8))
+            else:
+                map_f = lib.map_infer
+
+        map_values = None
+        if isinstance(arg, dict):
+            if hasattr(arg, '__missing__'):
+                # If a dictionary subclass defines a default value method,
+                # convert arg to a lookup function (GH #15999).
+                dict_with_default = arg
+                arg = lambda x: dict_with_default[x]
+            else:
+                # Dictionary does not have a default. Thus it's safe to
+                # convert to an Index for efficiency.
+                from pandas import Index
+                idx = Index(arg.keys())
+                # Cast to dict so we can get values using lib.fast_multiget
+                #   if this is a dict subclass (GH #15999)
+                map_values = idx._get_values_from_dict(dict(arg))
+                arg = idx
+        elif isinstance(arg, ABCSeries):
+            map_values = arg.values
+            arg = arg.index
+
+        if map_values is not None:
+            # map_values is set only when arg was a dict or a Series;
+            # in both cases arg has been replaced by the lookup Index
+            indexer = arg.get_indexer(values)
+            new_values = algorithms.take_1d(map_values, indexer)
+        else:
+            # arg is a function
+            new_values = map_f(values, arg)
+
+        return new_values
+
+
     def value_counts(self, normalize=False, sort=True, ascending=False,
                      bins=None, dropna=True):
         """
diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py
@@ -2674,7 +2674,7 @@ def get_indexer_for(self, target, **kwargs):
 
         Parameters
         ----------
-        data : dict
+        data : {dict, DictWithoutMissing}
             The dictionary from which to extract the values
 
         Returns
@@ -2726,43 +2726,36 @@ def groupby(self, values):
 
         return result
 
-    def map(self, mapper):
-        """Apply mapper function to an index.
+    def map(self, arg, na_action=None):
+        """Map values of Index using input correspondence (which can be a
+        dict, Series, or function)
 
         Parameters
         ----------
-        mapper : {callable, dict, Series}
-            Function to be applied or input correspondence object.
-            dict and Series support new in 0.20.0.
+        arg : function, dict, or Series
+        na_action : {None, 'ignore'}
+            If 'ignore', propagate NA values, without passing them to the
+            mapping function
 
         Returns
         -------
-        applied : Union[Index, MultiIndex], inferred
+        applied : {Index, MultiIndex}, inferred
             The output of the mapping function applied to the index.
             If the function returns a tuple with more than one element
             a MultiIndex will be returned.
 
         """
-        from .multi import MultiIndex
-
-        if isinstance(mapper, ABCSeries):
-            indexer = mapper.index.get_indexer(self.values)
-            mapped_values = algos.take_1d(mapper.values, indexer)
-        elif isinstance(mapper, dict):
-            idx = Index(mapper.keys())
-            data = idx._get_values_from_dict(mapper)
-            indexer = idx.get_indexer(self.values)
-            mapped_values = algos.take_1d(data, indexer)
-        else:
-            mapped_values = self._arrmap(self.values, mapper)
 
+        from .multi import MultiIndex
+        new_values = super(Index, self)._map_values(
+            self.values, arg, na_action=na_action)
         attributes = self._get_attributes_dict()
-        if mapped_values.size and isinstance(mapped_values[0], tuple):
-            return MultiIndex.from_tuples(mapped_values,
+        if new_values.size and isinstance(new_values[0], tuple):
+            return MultiIndex.from_tuples(new_values,
                                           names=attributes.get('name'))
 
         attributes['copy'] = False
-        return Index(mapped_values, **attributes)
+        return Index(new_values, **attributes)
 
     def isin(self, values, level=None):
         """
diff --git a/pandas/core/series.py b/pandas/core/series.py
@@ -2105,43 +2105,13 @@ def map(self, arg, na_action=None):
         3    0
         dtype: int64
         """
-
         if is_extension_type(self.dtype):
-            values = self._values
-            if na_action is not None:
-                raise NotImplementedError
-            map_f = lambda values, f: values.map(f)
+            input_values = self._values
         else:
-            values = self.asobject
-
-            if na_action == 'ignore':
-                def map_f(values, f):
-                    return lib.map_infer_mask(values, f,
-                                              isnull(values).view(np.uint8))
-            else:
-                map_f = lib.map_infer
-
-        if isinstance(arg, dict):
-            if hasattr(arg, '__missing__'):
-                # If a dictionary subclass defines a default value method,
-                # convert arg to a lookup function (GH #15999).
-                dict_with_default = arg
-                arg = lambda x: dict_with_default[x]
-            else:
-                # Dictionary does not have a default. Thus it's safe to
-                # convert to an indexed series for efficiency.
-                arg = self._constructor(arg, index=arg.keys())
-
-        if isinstance(arg, Series):
-            # arg is a Series
-            indexer = arg.index.get_indexer(values)
-            new_values = algorithms.take_1d(arg._values, indexer)
-        else:
-            # arg is a function
-            new_values = map_f(values, arg)
-
-        return self._constructor(new_values,
-                                 index=self.index).__finalize__(self)
+            input_values = self.asobject
+        new_values = super(Series, self)._map_values(
+            input_values, arg, na_action=na_action)
+        return self._constructor(new_values, index=self.index).__finalize__(self)
 
     def _gotitem(self, key, ndim, subset=None):
         """
diff --git a/pandas/tests/indexes/test_base.py b/pandas/tests/indexes/test_base.py
@@ -4,6 +4,8 @@
 
 from datetime import datetime, timedelta
 
+from collections import defaultdict
+
 import pandas.util.testing as tm
 from pandas.core.indexes.api import Index, MultiIndex
 from pandas.tests.indexes.common import Base
@@ -860,6 +862,21 @@ def test_map_with_non_function_missing_values(self):
         mapper = {0: 'foo', 2: 2.0, -1: 'baz'}
         tm.assert_index_equal(expected, input.map(mapper))
 
+    def test_map_na_exclusion(self):
+        idx = Index([1.5, np.nan, 3, np.nan, 5])
+
+        result = idx.map(lambda x: x * 2, na_action='ignore')
+        exp = idx * 2
+        tm.assert_index_equal(result, exp)
+
+    def test_map_defaultdict(self):
+        idx = Index([1, 2, 3])
+        default_dict = defaultdict(lambda: 'blank')
+        default_dict[1] = 'stuff'
+        result = idx.map(default_dict)
+        expected = Index(['stuff', 'blank', 'blank'])
+        tm.assert_index_equal(result, expected)
+
     def test_append_multiple(self):
         index = Index(['a', 'b', 'c', 'd', 'e', 'f'])