pandas-dev · jreback · Oct 6, 2015 · Oct 6, 2015
diff --git a/asv_bench/benchmarks/series_methods.py b/asv_bench/benchmarks/series_methods.py
@@ -5,22 +5,21 @@ class series_isin_int64(object):
     goal_time = 0.2
 
     def setup(self):
-        self.s1 = Series(np.random.randn(10000))
-        self.s2 = Series(np.random.randint(1, 10, 10000))
         self.s3 = Series(np.random.randint(1, 10, 100000)).astype('int64')
+        self.s4 = Series(np.random.randint(1, 100, 10000000)).astype('int64')
         self.values = [1, 2]
-        self.s4 = self.s3.astype('object')
 
     def time_series_isin_int64(self):
         self.s3.isin(self.values)
 
+    def time_series_isin_int64_large(self):
+        self.s4.isin(self.values)
+
 
 class series_isin_object(object):
     goal_time = 0.2
 
     def setup(self):
-        self.s1 = Series(np.random.randn(10000))
-        self.s2 = Series(np.random.randint(1, 10, 10000))
         self.s3 = Series(np.random.randint(1, 10, 100000)).astype('int64')
         self.values = [1, 2]
         self.s4 = self.s3.astype('object')
@@ -71,4 +70,4 @@ def setup(self):
 
     def time_series_nsmallest2(self):
         self.s2.nsmallest(3, take_last=True)
-        self.s2.nsmallest(3, take_last=False)
+        self.s2.nsmallest(3, take_last=False)
diff --git a/ci/requirements-2.6.build b/ci/requirements-2.6.build
@@ -1,4 +1,4 @@
-numpy=1.7.0
+numpy=1.7.1
 cython=0.19.1
 dateutil=1.5
 pytz=2013b
diff --git a/ci/requirements-2.6.run b/ci/requirements-2.6.run
@@ -1,14 +1,16 @@
-numpy=1.7.0
+numpy=1.7.1
 dateutil=1.5
 pytz=2013b
 scipy=0.11.0
 xlwt=0.7.5
 xlrd=0.9.2
 statsmodels=0.4.3
+bottleneck=0.8.0
+numexpr=2.2.2
+pytables=3.0.0
 html5lib=1.0b2
 beautiful-soup=4.2.0
 psycopg2=2.5.1
-numexpr=1.4.2
 pymysql=0.6.0
 sqlalchemy=0.7.8
 xlsxwriter=0.4.6
diff --git a/ci/requirements-2.7.build b/ci/requirements-2.7.build
@@ -1,4 +1,4 @@
 dateutil=2.1
 pytz=2013b
-numpy=1.7.1
+numpy
 cython=0.19.1
diff --git a/ci/requirements-2.7.run b/ci/requirements-2.7.run
@@ -1,18 +1,18 @@
 dateutil=2.1
 pytz=2013b
-numpy=1.7.1
+numpy
 xlwt=0.7.5
-numexpr=2.2.2
-pytables=3.0.0
-matplotlib=1.3.1
+numexpr
+pytables
+matplotlib
 openpyxl=1.6.2
 xlrd=0.9.2
 sqlalchemy=0.9.6
 lxml=3.2.1
 scipy
 xlsxwriter=0.4.6
 boto=2.36.0
-bottleneck=0.8.0
+bottleneck
 psycopg2=2.5.2
 patsy
 pymysql=0.6.3

diff --git a/ci/requirements-2.7_SLOW.build b/ci/requirements-2.7_SLOW.build
@@ -1,4 +1,4 @@
 python-dateutil
 pytz
-numpy
+numpy=1.8.2
 cython
diff --git a/ci/requirements-2.7_SLOW.run b/ci/requirements-2.7_SLOW.run
@@ -1,7 +1,7 @@
 python-dateutil
 pytz
-numpy
-matplotlib
+numpy=1.8.2
+matplotlib=1.3.1
 scipy
 patsy
 statsmodels

diff --git a/doc/source/install.rst b/doc/source/install.rst
@@ -214,7 +214,7 @@ Dependencies
 ------------
 
 * `setuptools <http://pythonhosted.org/setuptools>`__
-* `NumPy <http://www.numpy.org>`__: 1.7.0 or higher
+* `NumPy <http://www.numpy.org>`__: 1.7.1 or higher
 * `python-dateutil <http://labix.org/python-dateutil>`__ 1.5 or higher
 * `pytz <http://pytz.sourceforge.net/>`__
    * Needed for time zone support

diff --git a/doc/source/whatsnew/v0.17.0.txt b/doc/source/whatsnew/v0.17.0.txt
@@ -1034,6 +1034,7 @@ Bug Fixes
 ~~~~~~~~~
 
 - Bug in incorrection computation of ``.mean()`` on ``timedelta64[ns]`` because of overflow (:issue:`9442`)
+- Bug in ``.isin`` on older versions of NumPy (:issue:`11232`)
 - Bug in ``DataFrame.to_html(index=False)`` renders unnecessary ``name`` row (:issue:`10344`)
 - Bug in ``DataFrame.to_latex()`` the ``column_format`` argument could not be passed (:issue:`9402`)
 - Bug in ``DatetimeIndex`` when localizing with ``NaT`` (:issue:`10477`)

diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py
@@ -6,6 +6,7 @@
 from warnings import warn
 import numpy as np
 
+from pandas import compat, lib, _np_version_under1p8
 import pandas.core.common as com
 import pandas.algos as algos
 import pandas.hashtable as htable
@@ -66,6 +67,54 @@ def unique(values):
     return _hashtable_algo(f, values.dtype)
 
 
+def isin(comps, values):
+    """
+    Compute the isin boolean array
+
+    Parameters
+    ----------
+    comps: array-like
+    values: array-like
+
+    Returns
+    -------
+    boolean array same length as comps
+    """
+
+    if not com.is_list_like(comps):
+        raise TypeError("only list-like objects are allowed to be passed"
+                        " to isin(), you passed a "
+                        "[{0}]".format(type(comps).__name__))
+    comps = np.asarray(comps)
+    if not com.is_list_like(values):
+        raise TypeError("only list-like objects are allowed to be passed"
+                        " to isin(), you passed a "
+                        "[{0}]".format(type(values).__name__))
+
+    # GH11232
+    # work-around for numpy < 1.8 and comparisons on py3
+    # faster for larger cases to use np.in1d
+    if (_np_version_under1p8 and compat.PY3) or len(comps) > 1000000:
+        f = lambda x, y: np.in1d(x,np.asarray(list(y)))
+    else:
+        f = lambda x, y: lib.ismember_int64(x,set(y))
+
+    # may need i8 conversion for proper membership testing
+    if com.is_datetime64_dtype(comps):
+        from pandas.tseries.tools import to_datetime
+        values = to_datetime(values)._values.view('i8')
+        comps = comps.view('i8')
+    elif com.is_timedelta64_dtype(comps):
+        from pandas.tseries.timedeltas import to_timedelta
+        values = to_timedelta(values)._values.view('i8')
+        comps = comps.view('i8')
+    elif com.is_int64_dtype(comps):
+        pass
+    else:
+        f = lambda x, y: lib.ismember(x, set(values))
+
+    return f(comps, values)
+
 def _hashtable_algo(f, dtype, return_dtype=None):
     """
     f(HashTable, type_caster) -> result

diff --git a/pandas/core/index.py b/pandas/core/index.py
@@ -14,6 +14,7 @@
 
 from pandas.compat import range, zip, lrange, lzip, u, map
 from pandas import compat
+from pandas.core import algorithms
 from pandas.core.base import PandasObject, FrozenList, FrozenNDArray, IndexOpsMixin, _shared_docs, PandasDelegate
 from pandas.util.decorators import (Appender, Substitution, cache_readonly,
                                     deprecate, deprecate_kwarg)
@@ -108,7 +109,6 @@ class Index(IndexOpsMixin, PandasObject):
     _is_numeric_dtype = False
 
     _engine_type = _index.ObjectEngine
-    _isin_type = lib.ismember
 
     def __new__(cls, data=None, dtype=None, copy=False, name=None, fastpath=False,
                 tupleize_cols=True, **kwargs):
@@ -1443,7 +1443,7 @@ def __add__(self, other):
         return Index(np.array(self) + other)
 
     def __radd__(self, other):
-        if com.is_list_like(other):
+        if is_list_like(other):
             warnings.warn("using '+' to provide set union with Indexes is deprecated, "
                           "use '|' or .union()", FutureWarning, stacklevel=2)
         return Index(other + np.array(self))
@@ -1995,10 +1995,9 @@ def isin(self, values, level=None):
         is_contained : ndarray (boolean dtype)
 
         """
-        value_set = set(values)
         if level is not None:
             self._validate_index_level(level)
-        return self._isin_type(np.array(self), value_set)
+        return algorithms.isin(np.array(self), values)
 
     def _can_reindex(self, indexer):
         """
@@ -3097,6 +3096,8 @@ def _is_dtype_compat(self, other):
                 raise TypeError("categories must match existing categories when appending")
         else:
             values = other
+            if not is_list_like(values):
+                values = [ values ]
             other = CategoricalIndex(self._create_categorical(self, other, categories=self.categories, ordered=self.ordered))
             if not other.isin(values).all():
                 raise TypeError("cannot append a non-category item to a CategoricalIndex")
@@ -3580,7 +3581,6 @@ class Int64Index(NumericIndex):
     _outer_indexer = _algos.outer_join_indexer_int64
 
     _engine_type = _index.Int64Engine
-    _isin_type = lib.ismember_int64
 
     def __new__(cls, data=None, dtype=None, copy=False, name=None, fastpath=False, **kwargs):
 

diff --git a/pandas/core/series.py b/pandas/core/series.py
@@ -42,7 +42,7 @@
 from pandas.compat import zip, u, OrderedDict, StringIO
 
 import pandas.core.ops as ops
-from pandas.core.algorithms import select_n
+from pandas.core import algorithms
 
 import pandas.core.common as com
 import pandas.core.datetools as datetools
@@ -1156,8 +1156,7 @@ def mode(self):
         modes : Series (sorted)
         """
         # TODO: Add option for bins like value_counts()
-        from pandas.core.algorithms import mode
-        return mode(self)
+        return algorithms.mode(self)
 
     @deprecate_kwarg('take_last', 'keep', mapping={True: 'last', False: 'first'})
     @Appender(base._shared_docs['drop_duplicates'] % _shared_doc_kwargs)
@@ -1812,9 +1811,8 @@ def rank(self, method='average', na_option='keep', ascending=True,
         -------
         ranks : Series
         """
-        from pandas.core.algorithms import rank
-        ranks = rank(self._values, method=method, na_option=na_option,
-                     ascending=ascending, pct=pct)
+        ranks = algorithms.rank(self._values, method=method, na_option=na_option,
+                                ascending=ascending, pct=pct)
         return self._constructor(ranks, index=self.index).__finalize__(self)
 
     @deprecate_kwarg('take_last', 'keep', mapping={True: 'last', False: 'first'})
@@ -1852,7 +1850,7 @@ def nlargest(self, n=5, keep='first'):
         >>> s = pd.Series(np.random.randn(1e6))
         >>> s.nlargest(10)  # only sorts up to the N requested
         """
-        return select_n(self, n=n, keep=keep, method='nlargest')
+        return algorithms.select_n(self, n=n, keep=keep, method='nlargest')
 
     @deprecate_kwarg('take_last', 'keep', mapping={True: 'last', False: 'first'})
     def nsmallest(self, n=5, keep='first'):
@@ -1889,7 +1887,7 @@ def nsmallest(self, n=5, keep='first'):
         >>> s = pd.Series(np.random.randn(1e6))
         >>> s.nsmallest(10)  # only sorts up to the N requested
         """
-        return select_n(self, n=n, keep=keep, method='nsmallest')
+        return algorithms.select_n(self, n=n, keep=keep, method='nsmallest')
 
     def sortlevel(self, level=0, ascending=True, sort_remaining=True):
         """
@@ -2353,29 +2351,7 @@ def isin(self, values):
         dtype: bool
 
         """
-        if not com.is_list_like(values):
-            raise TypeError("only list-like objects are allowed to be passed"
-                            " to Series.isin(), you passed a "
-                            "{0!r}".format(type(values).__name__))
-
-        # may need i8 conversion for proper membership testing
-        comps = _values_from_object(self)
-        f = lib.ismember
-        if com.is_datetime64_dtype(self):
-            from pandas.tseries.tools import to_datetime
-            values = Series(to_datetime(values))._values.view('i8')
-            comps = comps.view('i8')
-            f = lib.ismember_int64
-        elif com.is_timedelta64_dtype(self):
-            from pandas.tseries.timedeltas import to_timedelta
-            values = Series(to_timedelta(values))._values.view('i8')
-            comps = comps.view('i8')
-            f = lib.ismember_int64
-        elif is_int64_dtype(self):
-            f = lib.ismember_int64
-
-        value_set = set(values)
-        result = f(comps, value_set)
+        result = algorithms.isin(_values_from_object(self), values)
         return self._constructor(result, index=self.index).__finalize__(self)
 
     def between(self, left, right, inclusive=True):

diff --git a/pandas/tests/test_algos.py b/pandas/tests/test_algos.py
@@ -278,7 +278,69 @@ def test_timedelta64_dtype_array_returned(self):
         tm.assert_numpy_array_equal(result, expected)
         self.assertEqual(result.dtype, expected.dtype)
 
+class TestIsin(tm.TestCase):
+    _multiprocess_can_split_ = True
+
+    def test_invalid(self):
+
+        self.assertRaises(TypeError, lambda : algos.isin(1,1))
+        self.assertRaises(TypeError, lambda : algos.isin(1,[1]))
+        self.assertRaises(TypeError, lambda : algos.isin([1],1))
+
+    def test_basic(self):
+
+        result = algos.isin([1,2],[1])
+        expected = np.array([True,False])
+        tm.assert_numpy_array_equal(result, expected)
+
+        result = algos.isin(np.array([1,2]),[1])
+        expected = np.array([True,False])
+        tm.assert_numpy_array_equal(result, expected)
+
+        result = algos.isin(pd.Series([1,2]),[1])
+        expected = np.array([True,False])
+        tm.assert_numpy_array_equal(result, expected)
+
+        result = algos.isin(pd.Series([1,2]),pd.Series([1]))
+        expected = np.array([True,False])
+        tm.assert_numpy_array_equal(result, expected)
+
+        result = algos.isin(['a','b'],['a'])
+        expected = np.array([True,False])
+        tm.assert_numpy_array_equal(result, expected)
 
+        result = algos.isin(pd.Series(['a','b']),pd.Series(['a']))
+        expected = np.array([True,False])
+        tm.assert_numpy_array_equal(result, expected)
+
+        result = algos.isin(['a','b'],[1])
+        expected = np.array([False,False])
+        tm.assert_numpy_array_equal(result, expected)
+
+        arr = pd.date_range('20130101',periods=3).values
+        result = algos.isin(arr,[arr[0]])
+        expected = np.array([True,False,False])
+        tm.assert_numpy_array_equal(result, expected)
+
+        result = algos.isin(arr,arr[0:2])
+        expected = np.array([True,True,False])
+        tm.assert_numpy_array_equal(result, expected)
+
+        arr = pd.timedelta_range('1 day',periods=3).values
+        result = algos.isin(arr,[arr[0]])
+        expected = np.array([True,False,False])
+        tm.assert_numpy_array_equal(result, expected)
+
+
+
+    def test_large(self):
+
+        s = pd.date_range('20000101',periods=2000000,freq='s').values
+        result = algos.isin(s,s[0:2])
+        expected = np.zeros(len(s),dtype=bool)
+        expected[0] = True
+        expected[1] = True
+        tm.assert_numpy_array_equal(result, expected)
 
 class TestValueCounts(tm.TestCase):
     _multiprocess_can_split_ = True