Skip to content

Categoricals hash consistently #15143

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Closed
wants to merge 2 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion doc/source/whatsnew/v0.20.0.txt
Original file line number Diff line number Diff line change
Expand Up @@ -309,6 +309,7 @@ Bug Fixes
- Bug in ``pd.read_csv()`` in which the ``dialect`` parameter was not being verified before processing (:issue:`14898`)
- Bug in ``pd.read_fwf`` where the skiprows parameter was not being respected during column width inference (:issue:`11256`)
- Bug in ``pd.read_csv()`` in which missing data was being improperly handled with ``usecols`` (:issue:`6710`)
- Bug in ``pd.tools.hashing.hash_pandas_object()`` in which hashing of categoricals depended on the ordering of categories, instead of just their values. (:issue:`15143`)

- Bug in ``DataFrame.loc`` with indexing a ``MultiIndex`` with a ``Series`` indexer (:issue:`14730`)

Expand Down Expand Up @@ -369,4 +370,4 @@ Bug Fixes
- Bug in ``Series`` constructor when both ``copy=True`` and ``dtype`` arguments are provided (:issue:`15125`)
- Bug in ``pd.read_csv()`` for the C engine where ``usecols`` were being indexed incorrectly with ``parse_dates`` (:issue:`14792`)

- Bug in ``Series.dt.round`` inconsistent behaviour on NAT's with different arguments (:issue:`14940`)
- Bug in ``Series.dt.round`` inconsistent behaviour on NAT's with different arguments (:issue:`14940`)
85 changes: 51 additions & 34 deletions pandas/tools/hashing.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,15 +4,16 @@

import numpy as np
from pandas import _hash, Series, factorize, Categorical, Index
from pandas.lib import infer_dtype
from pandas.lib import is_bool_array
from pandas.types.generic import ABCIndexClass, ABCSeries, ABCDataFrame
from pandas.types.common import is_categorical_dtype

# 16 byte long hashing key
_default_hash_key = '0123456789123456'


def hash_pandas_object(obj, index=True, encoding='utf8', hash_key=None):
def hash_pandas_object(obj, index=True, encoding='utf8', hash_key=None,
categorize=True):
"""
Return a data hash of the Index/Series/DataFrame

Expand All @@ -25,6 +26,11 @@ def hash_pandas_object(obj, index=True, encoding='utf8', hash_key=None):
encoding : string, default 'utf8'
encoding for data & key when strings
hash_key : string key to encode, default to _default_hash_key
categorize : bool, default True
Whether to first categorize object arrays before hashing. This is more
efficient when the array contains duplicate values.

.. versionadded:: 0.20.0

Returns
-------
Expand All @@ -39,36 +45,49 @@ def adder(h, hashed_to_add):
return np.add(h, hashed_to_add, h)

if isinstance(obj, ABCIndexClass):
h = hash_array(obj.values, encoding, hash_key).astype('uint64')
h = hash_array(obj.values, encoding, hash_key,
categorize).astype('uint64')
h = Series(h, index=obj, dtype='uint64')
elif isinstance(obj, ABCSeries):
h = hash_array(obj.values, encoding, hash_key).astype('uint64')
h = hash_array(obj.values, encoding, hash_key,
categorize).astype('uint64')
if index:
h = adder(h, hash_pandas_object(obj.index,
index=False,
encoding=encoding,
hash_key=hash_key).values)
hash_key=hash_key,
categorize=categorize).values)
h = Series(h, index=obj.index, dtype='uint64')
elif isinstance(obj, ABCDataFrame):
cols = obj.iteritems()
first_series = next(cols)[1]
h = hash_array(first_series.values, encoding,
hash_key).astype('uint64')
hash_key, categorize).astype('uint64')
for _, col in cols:
h = adder(h, hash_array(col.values, encoding, hash_key))
h = adder(h, hash_array(col.values, encoding, hash_key,
categorize))
if index:
h = adder(h, hash_pandas_object(obj.index,
index=False,
encoding=encoding,
hash_key=hash_key).values)
hash_key=hash_key,
categorize=categorize).values)

h = Series(h, index=obj.index, dtype='uint64')
else:
raise TypeError("Unexpected type for hashing %s" % type(obj))
return h


def hash_array(vals, encoding='utf8', hash_key=None):
def _hash_categorical(c, encoding, hash_key):
    """
    Hash a Categorical by value rather than by code.

    The categories themselves are hashed (without re-categorizing, since
    they are already unique), and each code is then mapped to the hash of
    its category. This makes the result independent of category ordering.

    Parameters
    ----------
    c : Categorical
    encoding : string
        encoding for data & key when strings
    hash_key : string key to encode

    Returns
    -------
    uint64 values, one hash per element of ``c``
    """
    hashed_categories = hash_array(c.categories.values, encoding, hash_key,
                                   categorize=False)
    hashed_categories = hashed_categories.astype(np.uint64, copy=False)
    return c.rename_categories(hashed_categories).astype(np.uint64)


def hash_array(vals, encoding='utf8', hash_key=None, categorize=True):
"""
Given a 1d array, return an array of deterministic integers.

Expand All @@ -80,53 +99,51 @@ def hash_array(vals, encoding='utf8', hash_key=None):
encoding : string, default 'utf8'
encoding for data & key when strings
hash_key : string key to encode, default to _default_hash_key
categorize : bool, default True
Whether to first categorize object arrays before hashing. This is more
efficient when the array contains duplicate values.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

add a versionadded tag here (0.20.0)

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think also add categorize to hash_pandas_object for consistency as well?

.. versionadded:: 0.20.0

Returns
-------
1d uint64 numpy array of hash values, same length as the vals

"""

    # work with categoricals as ints. (This check is above the complex
    # check so that we don't ask numpy if categorical is a subdtype of
    # complex, as it will choke.)
if hash_key is None:
hash_key = _default_hash_key

    # For categoricals, we hash the categories, then remap the codes to the
    # hash values. (This check is above the complex check so that we don't ask
    # numpy if categorical is a subdtype of complex, as it will choke.)
if is_categorical_dtype(vals.dtype):
vals = vals.codes
return _hash_categorical(vals, encoding, hash_key)

# we'll be working with everything as 64-bit values, so handle this
# 128-bit value early
if np.issubdtype(vals.dtype, np.complex128):
return hash_array(vals.real) + 23 * hash_array(vals.imag)

# MAIN LOGIC:
inferred = infer_dtype(vals)

# First, turn whatever array this is into unsigned 64-bit ints, if we can
# manage it.
if inferred == 'boolean':
if is_bool_array(vals):
vals = vals.astype('u8')

if (np.issubdtype(vals.dtype, np.datetime64) or
np.issubdtype(vals.dtype, np.timedelta64) or
np.issubdtype(vals.dtype, np.number)) and vals.dtype.itemsize <= 8:

elif (np.issubdtype(vals.dtype, np.datetime64) or
np.issubdtype(vals.dtype, np.timedelta64) or
np.issubdtype(vals.dtype, np.number)) and vals.dtype.itemsize <= 8:
vals = vals.view('u{}'.format(vals.dtype.itemsize)).astype('u8')
else:

# its MUCH faster to categorize object dtypes, then hash and rename
codes, categories = factorize(vals, sort=False)
categories = Index(categories)
c = Series(Categorical(codes, categories,
ordered=False, fastpath=True))
vals = _hash.hash_object_array(categories.values,
hash_key,
encoding)

# rename & extract
vals = c.cat.rename_categories(Index(vals)).astype(np.uint64).values
# With repeated values, its MUCH faster to categorize object dtypes,
# then hash and rename categories. We allow skipping the categorization
# when the values are known/likely to be unique.
if categorize:
codes, categories = factorize(vals, sort=False)
cat = Categorical(codes, Index(categories),
ordered=False, fastpath=True)
return _hash_categorical(cat, encoding, hash_key)
else:
vals = _hash.hash_object_array(vals, hash_key, encoding)

# Then, redistribute these 64-bit ints within the space of 64-bit ints
vals ^= vals >> 30
Expand Down
16 changes: 16 additions & 0 deletions pandas/tools/tests/test_hashing.py
Original file line number Diff line number Diff line change
Expand Up @@ -90,6 +90,22 @@ def test_hash_pandas_empty_object(self):
# these are by-definition the same with
# or w/o the index as the data is empty

def test_categorical_consistency(self):
    # Check that categoricals hash consistently with their values, not
    # their codes: reordering the categories must not change the hash.
    # Exercised for string, integer and datetime-valued data.
    for s1 in [Series(['a', 'b', 'c', 'd']),
               Series([1000, 2000, 3000, 4000]),
               Series(pd.date_range(0, periods=4))]:
        # s2: same values as s1, but stored as a categorical
        s2 = s1.astype('category').cat.set_categories(s1)
        # s3: same categorical values with the categories reversed
        s3 = s2.cat.set_categories(list(reversed(s1)))
        for categorize in [True, False]:
            # These should all hash identically
            h1 = hash_pandas_object(s1, categorize=categorize)
            h2 = hash_pandas_object(s2, categorize=categorize)
            h3 = hash_pandas_object(s3, categorize=categorize)
            tm.assert_series_equal(h1, h2)
            tm.assert_series_equal(h1, h3)

def test_errors(self):

for obj in [pd.Timestamp('20130101'), tm.makePanel()]:
Expand Down