Skip to content

Commit c13dc7d

Browse files
committed
Merge pull request #11252 from jreback/isin
COMPAT/PERF: lib.ismember_int64 on older numpies/cython not comparing correctly #11232
2 parents ef9a79d + 7725766 commit c13dc7d

14 files changed

+146
-58
lines changed

asv_bench/benchmarks/series_methods.py

Lines changed: 5 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -5,22 +5,21 @@ class series_isin_int64(object):
55
goal_time = 0.2
66

77
def setup(self):
8-
self.s1 = Series(np.random.randn(10000))
9-
self.s2 = Series(np.random.randint(1, 10, 10000))
108
self.s3 = Series(np.random.randint(1, 10, 100000)).astype('int64')
9+
self.s4 = Series(np.random.randint(1, 100, 10000000)).astype('int64')
1110
self.values = [1, 2]
12-
self.s4 = self.s3.astype('object')
1311

1412
def time_series_isin_int64(self):
1513
self.s3.isin(self.values)
1614

15+
def time_series_isin_int64_large(self):
16+
self.s4.isin(self.values)
17+
1718

1819
class series_isin_object(object):
1920
goal_time = 0.2
2021

2122
def setup(self):
22-
self.s1 = Series(np.random.randn(10000))
23-
self.s2 = Series(np.random.randint(1, 10, 10000))
2423
self.s3 = Series(np.random.randint(1, 10, 100000)).astype('int64')
2524
self.values = [1, 2]
2625
self.s4 = self.s3.astype('object')
@@ -71,4 +70,4 @@ def setup(self):
7170

7271
def time_series_nsmallest2(self):
7372
self.s2.nsmallest(3, take_last=True)
74-
self.s2.nsmallest(3, take_last=False)
73+
self.s2.nsmallest(3, take_last=False)

ci/requirements-2.6.build

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
numpy=1.7.0
1+
numpy=1.7.1
22
cython=0.19.1
33
dateutil=1.5
44
pytz=2013b

ci/requirements-2.6.run

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,14 +1,16 @@
1-
numpy=1.7.0
1+
numpy=1.7.1
22
dateutil=1.5
33
pytz=2013b
44
scipy=0.11.0
55
xlwt=0.7.5
66
xlrd=0.9.2
77
statsmodels=0.4.3
8+
bottleneck=0.8.0
9+
numexpr=2.2.2
10+
pytables=3.0.0
811
html5lib=1.0b2
912
beautiful-soup=4.2.0
1013
psycopg2=2.5.1
11-
numexpr=1.4.2
1214
pymysql=0.6.0
1315
sqlalchemy=0.7.8
1416
xlsxwriter=0.4.6

ci/requirements-2.7.build

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
11
dateutil=2.1
22
pytz=2013b
3-
numpy=1.7.1
3+
numpy
44
cython=0.19.1

ci/requirements-2.7.run

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1,18 +1,18 @@
11
dateutil=2.1
22
pytz=2013b
3-
numpy=1.7.1
3+
numpy
44
xlwt=0.7.5
5-
numexpr=2.2.2
6-
pytables=3.0.0
7-
matplotlib=1.3.1
5+
numexpr
6+
pytables
7+
matplotlib
88
openpyxl=1.6.2
99
xlrd=0.9.2
1010
sqlalchemy=0.9.6
1111
lxml=3.2.1
1212
scipy
1313
xlsxwriter=0.4.6
1414
boto=2.36.0
15-
bottleneck=0.8.0
15+
bottleneck
1616
psycopg2=2.5.2
1717
patsy
1818
pymysql=0.6.3

ci/requirements-2.7_SLOW.build

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
11
python-dateutil
22
pytz
3-
numpy
3+
numpy=1.8.2
44
cython

ci/requirements-2.7_SLOW.run

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
python-dateutil
22
pytz
3-
numpy
4-
matplotlib
3+
numpy=1.8.2
4+
matplotlib=1.3.1
55
scipy
66
patsy
77
statsmodels

doc/source/install.rst

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -214,7 +214,7 @@ Dependencies
214214
------------
215215

216216
* `setuptools <http://pythonhosted.org/setuptools>`__
217-
* `NumPy <http://www.numpy.org>`__: 1.7.0 or higher
217+
* `NumPy <http://www.numpy.org>`__: 1.7.1 or higher
218218
* `python-dateutil <http://labix.org/python-dateutil>`__ 1.5 or higher
219219
* `pytz <http://pytz.sourceforge.net/>`__
220220
* Needed for time zone support

doc/source/whatsnew/v0.17.0.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1034,6 +1034,7 @@ Bug Fixes
10341034
~~~~~~~~~
10351035

10361036
- Bug in incorrect computation of ``.mean()`` on ``timedelta64[ns]`` because of overflow (:issue:`9442`)
1037+
- Bug in ``.isin`` on older numpies (:issue:`11232`)
10371038
- Bug in ``DataFrame.to_html(index=False)`` renders unnecessary ``name`` row (:issue:`10344`)
10381039
- Bug in ``DataFrame.to_latex()`` the ``column_format`` argument could not be passed (:issue:`9402`)
10391040
- Bug in ``DatetimeIndex`` when localizing with ``NaT`` (:issue:`10477`)

pandas/core/algorithms.py

Lines changed: 49 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@
66
from warnings import warn
77
import numpy as np
88

9+
from pandas import compat, lib, _np_version_under1p8
910
import pandas.core.common as com
1011
import pandas.algos as algos
1112
import pandas.hashtable as htable
@@ -66,6 +67,54 @@ def unique(values):
6667
return _hashtable_algo(f, values.dtype)
6768

6869

70+
def isin(comps, values):
71+
"""
72+
Compute the isin boolean array
73+
74+
Parameters
75+
----------
76+
comps: array-like
77+
values: array-like
78+
79+
Returns
80+
-------
81+
boolean array same length as comps
82+
"""
83+
84+
if not com.is_list_like(comps):
85+
raise TypeError("only list-like objects are allowed to be passed"
86+
" to isin(), you passed a "
87+
"[{0}]".format(type(comps).__name__))
88+
comps = np.asarray(comps)
89+
if not com.is_list_like(values):
90+
raise TypeError("only list-like objects are allowed to be passed"
91+
" to isin(), you passed a "
92+
"[{0}]".format(type(values).__name__))
93+
94+
# GH11232
95+
# work-around for numpy < 1.8 and comparisons on py3
96+
# faster for larger cases to use np.in1d
97+
if (_np_version_under1p8 and compat.PY3) or len(comps) > 1000000:
98+
f = lambda x, y: np.in1d(x,np.asarray(list(y)))
99+
else:
100+
f = lambda x, y: lib.ismember_int64(x,set(y))
101+
102+
# may need i8 conversion for proper membership testing
103+
if com.is_datetime64_dtype(comps):
104+
from pandas.tseries.tools import to_datetime
105+
values = to_datetime(values)._values.view('i8')
106+
comps = comps.view('i8')
107+
elif com.is_timedelta64_dtype(comps):
108+
from pandas.tseries.timedeltas import to_timedelta
109+
values = to_timedelta(values)._values.view('i8')
110+
comps = comps.view('i8')
111+
elif com.is_int64_dtype(comps):
112+
pass
113+
else:
114+
f = lambda x, y: lib.ismember(x, set(values))
115+
116+
return f(comps, values)
117+
69118
def _hashtable_algo(f, dtype, return_dtype=None):
70119
"""
71120
f(HashTable, type_caster) -> result

pandas/core/index.py

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,7 @@
1414

1515
from pandas.compat import range, zip, lrange, lzip, u, map
1616
from pandas import compat
17+
from pandas.core import algorithms
1718
from pandas.core.base import PandasObject, FrozenList, FrozenNDArray, IndexOpsMixin, _shared_docs, PandasDelegate
1819
from pandas.util.decorators import (Appender, Substitution, cache_readonly,
1920
deprecate, deprecate_kwarg)
@@ -108,7 +109,6 @@ class Index(IndexOpsMixin, PandasObject):
108109
_is_numeric_dtype = False
109110

110111
_engine_type = _index.ObjectEngine
111-
_isin_type = lib.ismember
112112

113113
def __new__(cls, data=None, dtype=None, copy=False, name=None, fastpath=False,
114114
tupleize_cols=True, **kwargs):
@@ -1443,7 +1443,7 @@ def __add__(self, other):
14431443
return Index(np.array(self) + other)
14441444

14451445
def __radd__(self, other):
1446-
if com.is_list_like(other):
1446+
if is_list_like(other):
14471447
warnings.warn("using '+' to provide set union with Indexes is deprecated, "
14481448
"use '|' or .union()", FutureWarning, stacklevel=2)
14491449
return Index(other + np.array(self))
@@ -1995,10 +1995,9 @@ def isin(self, values, level=None):
19951995
is_contained : ndarray (boolean dtype)
19961996
19971997
"""
1998-
value_set = set(values)
19991998
if level is not None:
20001999
self._validate_index_level(level)
2001-
return self._isin_type(np.array(self), value_set)
2000+
return algorithms.isin(np.array(self), values)
20022001

20032002
def _can_reindex(self, indexer):
20042003
"""
@@ -3097,6 +3096,8 @@ def _is_dtype_compat(self, other):
30973096
raise TypeError("categories must match existing categories when appending")
30983097
else:
30993098
values = other
3099+
if not is_list_like(values):
3100+
values = [ values ]
31003101
other = CategoricalIndex(self._create_categorical(self, other, categories=self.categories, ordered=self.ordered))
31013102
if not other.isin(values).all():
31023103
raise TypeError("cannot append a non-category item to a CategoricalIndex")
@@ -3580,7 +3581,6 @@ class Int64Index(NumericIndex):
35803581
_outer_indexer = _algos.outer_join_indexer_int64
35813582

35823583
_engine_type = _index.Int64Engine
3583-
_isin_type = lib.ismember_int64
35843584

35853585
def __new__(cls, data=None, dtype=None, copy=False, name=None, fastpath=False, **kwargs):
35863586

pandas/core/series.py

Lines changed: 7 additions & 31 deletions
Original file line numberDiff line numberDiff line change
@@ -42,7 +42,7 @@
4242
from pandas.compat import zip, u, OrderedDict, StringIO
4343

4444
import pandas.core.ops as ops
45-
from pandas.core.algorithms import select_n
45+
from pandas.core import algorithms
4646

4747
import pandas.core.common as com
4848
import pandas.core.datetools as datetools
@@ -1156,8 +1156,7 @@ def mode(self):
11561156
modes : Series (sorted)
11571157
"""
11581158
# TODO: Add option for bins like value_counts()
1159-
from pandas.core.algorithms import mode
1160-
return mode(self)
1159+
return algorithms.mode(self)
11611160

11621161
@deprecate_kwarg('take_last', 'keep', mapping={True: 'last', False: 'first'})
11631162
@Appender(base._shared_docs['drop_duplicates'] % _shared_doc_kwargs)
@@ -1812,9 +1811,8 @@ def rank(self, method='average', na_option='keep', ascending=True,
18121811
-------
18131812
ranks : Series
18141813
"""
1815-
from pandas.core.algorithms import rank
1816-
ranks = rank(self._values, method=method, na_option=na_option,
1817-
ascending=ascending, pct=pct)
1814+
ranks = algorithms.rank(self._values, method=method, na_option=na_option,
1815+
ascending=ascending, pct=pct)
18181816
return self._constructor(ranks, index=self.index).__finalize__(self)
18191817

18201818
@deprecate_kwarg('take_last', 'keep', mapping={True: 'last', False: 'first'})
@@ -1852,7 +1850,7 @@ def nlargest(self, n=5, keep='first'):
18521850
>>> s = pd.Series(np.random.randn(1e6))
18531851
>>> s.nlargest(10) # only sorts up to the N requested
18541852
"""
1855-
return select_n(self, n=n, keep=keep, method='nlargest')
1853+
return algorithms.select_n(self, n=n, keep=keep, method='nlargest')
18561854

18571855
@deprecate_kwarg('take_last', 'keep', mapping={True: 'last', False: 'first'})
18581856
def nsmallest(self, n=5, keep='first'):
@@ -1889,7 +1887,7 @@ def nsmallest(self, n=5, keep='first'):
18891887
>>> s = pd.Series(np.random.randn(1e6))
18901888
>>> s.nsmallest(10) # only sorts up to the N requested
18911889
"""
1892-
return select_n(self, n=n, keep=keep, method='nsmallest')
1890+
return algorithms.select_n(self, n=n, keep=keep, method='nsmallest')
18931891

18941892
def sortlevel(self, level=0, ascending=True, sort_remaining=True):
18951893
"""
@@ -2353,29 +2351,7 @@ def isin(self, values):
23532351
dtype: bool
23542352
23552353
"""
2356-
if not com.is_list_like(values):
2357-
raise TypeError("only list-like objects are allowed to be passed"
2358-
" to Series.isin(), you passed a "
2359-
"{0!r}".format(type(values).__name__))
2360-
2361-
# may need i8 conversion for proper membership testing
2362-
comps = _values_from_object(self)
2363-
f = lib.ismember
2364-
if com.is_datetime64_dtype(self):
2365-
from pandas.tseries.tools import to_datetime
2366-
values = Series(to_datetime(values))._values.view('i8')
2367-
comps = comps.view('i8')
2368-
f = lib.ismember_int64
2369-
elif com.is_timedelta64_dtype(self):
2370-
from pandas.tseries.timedeltas import to_timedelta
2371-
values = Series(to_timedelta(values))._values.view('i8')
2372-
comps = comps.view('i8')
2373-
f = lib.ismember_int64
2374-
elif is_int64_dtype(self):
2375-
f = lib.ismember_int64
2376-
2377-
value_set = set(values)
2378-
result = f(comps, value_set)
2354+
result = algorithms.isin(_values_from_object(self), values)
23792355
return self._constructor(result, index=self.index).__finalize__(self)
23802356

23812357
def between(self, left, right, inclusive=True):

pandas/tests/test_algos.py

Lines changed: 62 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -278,7 +278,69 @@ def test_timedelta64_dtype_array_returned(self):
278278
tm.assert_numpy_array_equal(result, expected)
279279
self.assertEqual(result.dtype, expected.dtype)
280280

281+
class TestIsin(tm.TestCase):
282+
_multiprocess_can_split_ = True
283+
284+
def test_invalid(self):
285+
286+
self.assertRaises(TypeError, lambda : algos.isin(1,1))
287+
self.assertRaises(TypeError, lambda : algos.isin(1,[1]))
288+
self.assertRaises(TypeError, lambda : algos.isin([1],1))
289+
290+
def test_basic(self):
291+
292+
result = algos.isin([1,2],[1])
293+
expected = np.array([True,False])
294+
tm.assert_numpy_array_equal(result, expected)
295+
296+
result = algos.isin(np.array([1,2]),[1])
297+
expected = np.array([True,False])
298+
tm.assert_numpy_array_equal(result, expected)
299+
300+
result = algos.isin(pd.Series([1,2]),[1])
301+
expected = np.array([True,False])
302+
tm.assert_numpy_array_equal(result, expected)
303+
304+
result = algos.isin(pd.Series([1,2]),pd.Series([1]))
305+
expected = np.array([True,False])
306+
tm.assert_numpy_array_equal(result, expected)
307+
308+
result = algos.isin(['a','b'],['a'])
309+
expected = np.array([True,False])
310+
tm.assert_numpy_array_equal(result, expected)
281311

312+
result = algos.isin(pd.Series(['a','b']),pd.Series(['a']))
313+
expected = np.array([True,False])
314+
tm.assert_numpy_array_equal(result, expected)
315+
316+
result = algos.isin(['a','b'],[1])
317+
expected = np.array([False,False])
318+
tm.assert_numpy_array_equal(result, expected)
319+
320+
arr = pd.date_range('20130101',periods=3).values
321+
result = algos.isin(arr,[arr[0]])
322+
expected = np.array([True,False,False])
323+
tm.assert_numpy_array_equal(result, expected)
324+
325+
result = algos.isin(arr,arr[0:2])
326+
expected = np.array([True,True,False])
327+
tm.assert_numpy_array_equal(result, expected)
328+
329+
arr = pd.timedelta_range('1 day',periods=3).values
330+
result = algos.isin(arr,[arr[0]])
331+
expected = np.array([True,False,False])
332+
tm.assert_numpy_array_equal(result, expected)
333+
334+
335+
336+
def test_large(self):
337+
338+
s = pd.date_range('20000101',periods=2000000,freq='s').values
339+
result = algos.isin(s,s[0:2])
340+
expected = np.zeros(len(s),dtype=bool)
341+
expected[0] = True
342+
expected[1] = True
343+
tm.assert_numpy_array_equal(result, expected)
282344

283345
class TestValueCounts(tm.TestCase):
284346
_multiprocess_can_split_ = True

0 commit comments

Comments
 (0)