Skip to content

COMPAT/PERF: lib.ismember_int64 on older numpies/cython not comparing correctly #11232 #11252

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 1 commit into from
Oct 6, 2015
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
11 changes: 5 additions & 6 deletions asv_bench/benchmarks/series_methods.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,22 +5,21 @@ class series_isin_int64(object):
goal_time = 0.2

def setup(self):
self.s1 = Series(np.random.randn(10000))
self.s2 = Series(np.random.randint(1, 10, 10000))
self.s3 = Series(np.random.randint(1, 10, 100000)).astype('int64')
self.s4 = Series(np.random.randint(1, 100, 10000000)).astype('int64')
self.values = [1, 2]
self.s4 = self.s3.astype('object')

def time_series_isin_int64(self):
self.s3.isin(self.values)

def time_series_isin_int64_large(self):
self.s4.isin(self.values)


class series_isin_object(object):
goal_time = 0.2

def setup(self):
self.s1 = Series(np.random.randn(10000))
self.s2 = Series(np.random.randint(1, 10, 10000))
self.s3 = Series(np.random.randint(1, 10, 100000)).astype('int64')
self.values = [1, 2]
self.s4 = self.s3.astype('object')
Expand Down Expand Up @@ -71,4 +70,4 @@ def setup(self):

def time_series_nsmallest2(self):
self.s2.nsmallest(3, take_last=True)
self.s2.nsmallest(3, take_last=False)
self.s2.nsmallest(3, take_last=False)
2 changes: 1 addition & 1 deletion ci/requirements-2.6.build
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
numpy=1.7.0
numpy=1.7.1
cython=0.19.1
dateutil=1.5
pytz=2013b
6 changes: 4 additions & 2 deletions ci/requirements-2.6.run
Original file line number Diff line number Diff line change
@@ -1,14 +1,16 @@
numpy=1.7.0
numpy=1.7.1
dateutil=1.5
pytz=2013b
scipy=0.11.0
xlwt=0.7.5
xlrd=0.9.2
statsmodels=0.4.3
bottleneck=0.8.0
numexpr=2.2.2
pytables=3.0.0
html5lib=1.0b2
beautiful-soup=4.2.0
psycopg2=2.5.1
numexpr=1.4.2
pymysql=0.6.0
sqlalchemy=0.7.8
xlsxwriter=0.4.6
2 changes: 1 addition & 1 deletion ci/requirements-2.7.build
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
dateutil=2.1
pytz=2013b
numpy=1.7.1
numpy
cython=0.19.1
10 changes: 5 additions & 5 deletions ci/requirements-2.7.run
Original file line number Diff line number Diff line change
@@ -1,18 +1,18 @@
dateutil=2.1
pytz=2013b
numpy=1.7.1
numpy
xlwt=0.7.5
numexpr=2.2.2
pytables=3.0.0
matplotlib=1.3.1
numexpr
pytables
matplotlib
openpyxl=1.6.2
xlrd=0.9.2
sqlalchemy=0.9.6
lxml=3.2.1
scipy
xlsxwriter=0.4.6
boto=2.36.0
bottleneck=0.8.0
bottleneck
psycopg2=2.5.2
patsy
pymysql=0.6.3
Expand Down
2 changes: 1 addition & 1 deletion ci/requirements-2.7_SLOW.build
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
python-dateutil
pytz
numpy
numpy=1.8.2
cython
4 changes: 2 additions & 2 deletions ci/requirements-2.7_SLOW.run
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
python-dateutil
pytz
numpy
matplotlib
numpy=1.8.2
matplotlib=1.3.1
scipy
patsy
statsmodels
Expand Down
2 changes: 1 addition & 1 deletion doc/source/install.rst
Original file line number Diff line number Diff line change
Expand Up @@ -214,7 +214,7 @@ Dependencies
------------

* `setuptools <http://pythonhosted.org/setuptools>`__
* `NumPy <http://www.numpy.org>`__: 1.7.0 or higher
* `NumPy <http://www.numpy.org>`__: 1.7.1 or higher
* `python-dateutil <http://labix.org/python-dateutil>`__ 1.5 or higher
* `pytz <http://pytz.sourceforge.net/>`__
* Needed for time zone support
Expand Down
1 change: 1 addition & 0 deletions doc/source/whatsnew/v0.17.0.txt
Original file line number Diff line number Diff line change
Expand Up @@ -1034,6 +1034,7 @@ Bug Fixes
~~~~~~~~~

- Bug in incorrect computation of ``.mean()`` on ``timedelta64[ns]`` because of overflow (:issue:`9442`)
- Bug in ``.isin`` on older numpies (:issue:`11232`)
- Bug in ``DataFrame.to_html(index=False)`` renders unnecessary ``name`` row (:issue:`10344`)
- Bug in ``DataFrame.to_latex()`` the ``column_format`` argument could not be passed (:issue:`9402`)
- Bug in ``DatetimeIndex`` when localizing with ``NaT`` (:issue:`10477`)
Expand Down
49 changes: 49 additions & 0 deletions pandas/core/algorithms.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
from warnings import warn
import numpy as np

from pandas import compat, lib, _np_version_under1p8
import pandas.core.common as com
import pandas.algos as algos
import pandas.hashtable as htable
Expand Down Expand Up @@ -66,6 +67,54 @@ def unique(values):
return _hashtable_algo(f, values.dtype)


def isin(comps, values):
    """
    Test each element of ``comps`` for membership in ``values``.

    Parameters
    ----------
    comps: array-like
        The elements to test; converted with ``np.asarray``.
    values: array-like
        The collection to test membership against.

    Returns
    -------
    boolean array same length as comps

    Raises
    ------
    TypeError
        If ``comps`` or ``values`` is not list-like.
    """

    def _not_list_like(obj):
        # both arguments share the same error message, differing only in
        # the reported type name
        return TypeError("only list-like objects are allowed to be passed"
                         " to isin(), you passed a "
                         "[{0}]".format(type(obj).__name__))

    if not com.is_list_like(comps):
        raise _not_list_like(comps)
    comps = np.asarray(comps)
    if not com.is_list_like(values):
        raise _not_list_like(values)

    # GH11232
    # work-around for numpy < 1.8 and comparisons on py3
    # faster for larger cases to use np.in1d
    use_in1d = (_np_version_under1p8 and compat.PY3) or len(comps) > 1000000

    # may need i8 conversion for proper membership testing
    if com.is_datetime64_dtype(comps):
        from pandas.tseries.tools import to_datetime
        values = to_datetime(values)._values.view('i8')
        comps = comps.view('i8')
    elif com.is_timedelta64_dtype(comps):
        from pandas.tseries.timedeltas import to_timedelta
        values = to_timedelta(values)._values.view('i8')
        comps = comps.view('i8')
    elif not com.is_int64_dtype(comps):
        # anything that is not (or cannot be viewed as) int64 always uses
        # the generic membership routine, regardless of size
        return lib.ismember(comps, set(values))

    # from here on comps is int64 (natively or as an i8 view)
    if use_in1d:
        return np.in1d(comps, np.asarray(list(values)))
    return lib.ismember_int64(comps, set(values))

def _hashtable_algo(f, dtype, return_dtype=None):
"""
f(HashTable, type_caster) -> result
Expand Down
10 changes: 5 additions & 5 deletions pandas/core/index.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@

from pandas.compat import range, zip, lrange, lzip, u, map
from pandas import compat
from pandas.core import algorithms
from pandas.core.base import PandasObject, FrozenList, FrozenNDArray, IndexOpsMixin, _shared_docs, PandasDelegate
from pandas.util.decorators import (Appender, Substitution, cache_readonly,
deprecate, deprecate_kwarg)
Expand Down Expand Up @@ -108,7 +109,6 @@ class Index(IndexOpsMixin, PandasObject):
_is_numeric_dtype = False

_engine_type = _index.ObjectEngine
_isin_type = lib.ismember

def __new__(cls, data=None, dtype=None, copy=False, name=None, fastpath=False,
tupleize_cols=True, **kwargs):
Expand Down Expand Up @@ -1443,7 +1443,7 @@ def __add__(self, other):
return Index(np.array(self) + other)

def __radd__(self, other):
if com.is_list_like(other):
if is_list_like(other):
warnings.warn("using '+' to provide set union with Indexes is deprecated, "
"use '|' or .union()", FutureWarning, stacklevel=2)
return Index(other + np.array(self))
Expand Down Expand Up @@ -1995,10 +1995,9 @@ def isin(self, values, level=None):
is_contained : ndarray (boolean dtype)

"""
value_set = set(values)
if level is not None:
self._validate_index_level(level)
return self._isin_type(np.array(self), value_set)
return algorithms.isin(np.array(self), values)

def _can_reindex(self, indexer):
"""
Expand Down Expand Up @@ -3097,6 +3096,8 @@ def _is_dtype_compat(self, other):
raise TypeError("categories must match existing categories when appending")
else:
values = other
if not is_list_like(values):
values = [ values ]
other = CategoricalIndex(self._create_categorical(self, other, categories=self.categories, ordered=self.ordered))
if not other.isin(values).all():
raise TypeError("cannot append a non-category item to a CategoricalIndex")
Expand Down Expand Up @@ -3580,7 +3581,6 @@ class Int64Index(NumericIndex):
_outer_indexer = _algos.outer_join_indexer_int64

_engine_type = _index.Int64Engine
_isin_type = lib.ismember_int64

def __new__(cls, data=None, dtype=None, copy=False, name=None, fastpath=False, **kwargs):

Expand Down
38 changes: 7 additions & 31 deletions pandas/core/series.py
Original file line number Diff line number Diff line change
Expand Up @@ -42,7 +42,7 @@
from pandas.compat import zip, u, OrderedDict, StringIO

import pandas.core.ops as ops
from pandas.core.algorithms import select_n
from pandas.core import algorithms

import pandas.core.common as com
import pandas.core.datetools as datetools
Expand Down Expand Up @@ -1156,8 +1156,7 @@ def mode(self):
modes : Series (sorted)
"""
# TODO: Add option for bins like value_counts()
from pandas.core.algorithms import mode
return mode(self)
return algorithms.mode(self)

@deprecate_kwarg('take_last', 'keep', mapping={True: 'last', False: 'first'})
@Appender(base._shared_docs['drop_duplicates'] % _shared_doc_kwargs)
Expand Down Expand Up @@ -1812,9 +1811,8 @@ def rank(self, method='average', na_option='keep', ascending=True,
-------
ranks : Series
"""
from pandas.core.algorithms import rank
ranks = rank(self._values, method=method, na_option=na_option,
ascending=ascending, pct=pct)
ranks = algorithms.rank(self._values, method=method, na_option=na_option,
ascending=ascending, pct=pct)
return self._constructor(ranks, index=self.index).__finalize__(self)

@deprecate_kwarg('take_last', 'keep', mapping={True: 'last', False: 'first'})
Expand Down Expand Up @@ -1852,7 +1850,7 @@ def nlargest(self, n=5, keep='first'):
>>> s = pd.Series(np.random.randn(1e6))
>>> s.nlargest(10) # only sorts up to the N requested
"""
return select_n(self, n=n, keep=keep, method='nlargest')
return algorithms.select_n(self, n=n, keep=keep, method='nlargest')

@deprecate_kwarg('take_last', 'keep', mapping={True: 'last', False: 'first'})
def nsmallest(self, n=5, keep='first'):
Expand Down Expand Up @@ -1889,7 +1887,7 @@ def nsmallest(self, n=5, keep='first'):
>>> s = pd.Series(np.random.randn(1e6))
>>> s.nsmallest(10) # only sorts up to the N requested
"""
return select_n(self, n=n, keep=keep, method='nsmallest')
return algorithms.select_n(self, n=n, keep=keep, method='nsmallest')

def sortlevel(self, level=0, ascending=True, sort_remaining=True):
"""
Expand Down Expand Up @@ -2353,29 +2351,7 @@ def isin(self, values):
dtype: bool

"""
if not com.is_list_like(values):
raise TypeError("only list-like objects are allowed to be passed"
" to Series.isin(), you passed a "
"{0!r}".format(type(values).__name__))

# may need i8 conversion for proper membership testing
comps = _values_from_object(self)
f = lib.ismember
if com.is_datetime64_dtype(self):
from pandas.tseries.tools import to_datetime
values = Series(to_datetime(values))._values.view('i8')
comps = comps.view('i8')
f = lib.ismember_int64
elif com.is_timedelta64_dtype(self):
from pandas.tseries.timedeltas import to_timedelta
values = Series(to_timedelta(values))._values.view('i8')
comps = comps.view('i8')
f = lib.ismember_int64
elif is_int64_dtype(self):
f = lib.ismember_int64

value_set = set(values)
result = f(comps, value_set)
result = algorithms.isin(_values_from_object(self), values)
return self._constructor(result, index=self.index).__finalize__(self)

def between(self, left, right, inclusive=True):
Expand Down
62 changes: 62 additions & 0 deletions pandas/tests/test_algos.py
Original file line number Diff line number Diff line change
Expand Up @@ -278,7 +278,69 @@ def test_timedelta64_dtype_array_returned(self):
tm.assert_numpy_array_equal(result, expected)
self.assertEqual(result.dtype, expected.dtype)

class TestIsin(tm.TestCase):
    """Tests for ``algos.isin``: argument validation and membership results."""
    _multiprocess_can_split_ = True

    def _assert_isin(self, comps, values, expected):
        # run isin and compare against the expected boolean mask
        result = algos.isin(comps, values)
        tm.assert_numpy_array_equal(result, np.array(expected))

    def test_invalid(self):
        # a scalar in either position must be rejected
        for comps, values in [(1, 1), (1, [1]), ([1], 1)]:
            self.assertRaises(TypeError, algos.isin, comps, values)

    def test_basic(self):
        # integer comps given as list, ndarray and Series
        self._assert_isin([1, 2], [1], [True, False])
        self._assert_isin(np.array([1, 2]), [1], [True, False])
        self._assert_isin(pd.Series([1, 2]), [1], [True, False])
        self._assert_isin(pd.Series([1, 2]), pd.Series([1]), [True, False])

        # string comps, including a values list of a different type
        self._assert_isin(['a', 'b'], ['a'], [True, False])
        self._assert_isin(pd.Series(['a', 'b']), pd.Series(['a']),
                          [True, False])
        self._assert_isin(['a', 'b'], [1], [False, False])

        # datetime64 comps
        dates = pd.date_range('20130101', periods=3).values
        self._assert_isin(dates, [dates[0]], [True, False, False])
        self._assert_isin(dates, dates[0:2], [True, True, False])

        # timedelta64 comps
        deltas = pd.timedelta_range('1 day', periods=3).values
        self._assert_isin(deltas, [deltas[0]], [True, False, False])

    def test_large(self):
        # more than 1,000,000 elements
        stamps = pd.date_range('20000101', periods=2000000, freq='s').values
        result = algos.isin(stamps, stamps[0:2])

        expected = np.zeros(len(stamps), dtype=bool)
        expected[:2] = True
        tm.assert_numpy_array_equal(result, expected)

class TestValueCounts(tm.TestCase):
_multiprocess_can_split_ = True
Expand Down
Loading