Skip to content

Categoricals hash consistently #15143

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Closed
wants to merge 2 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion doc/source/whatsnew/v0.20.0.txt
Original file line number Diff line number Diff line change
Expand Up @@ -309,6 +309,7 @@ Bug Fixes
- Bug in ``pd.read_csv()`` in which the ``dialect`` parameter was not being verified before processing (:issue:`14898`)
- Bug in ``pd.read_fwf`` where the skiprows parameter was not being respected during column width inference (:issue:`11256`)
- Bug in ``pd.read_csv()`` in which missing data was being improperly handled with ``usecols`` (:issue:`6710`)
- Bug in ``pd.tools.hashing.hash_pandas_object()`` in which hashing of categoricals depended on the ordering of categories, instead of just their values. (:issue:`15143`)

- Bug in ``DataFrame.loc`` with indexing a ``MultiIndex`` with a ``Series`` indexer (:issue:`14730`)

Expand Down Expand Up @@ -369,4 +370,4 @@ Bug Fixes
- Bug in ``Series`` constructor when both ``copy=True`` and ``dtype`` arguments are provided (:issue:`15125`)
- Bug in ``pd.read_csv()`` for the C engine where ``usecols`` were being indexed incorrectly with ``parse_dates`` (:issue:`14792`)

- Bug in ``Series.dt.round`` inconsistent behaviour on NAT's with different arguments (:issue:`14940`)
- Bug in ``Series.dt.round`` inconsistent behaviour on NAT's with different arguments (:issue:`14940`)
85 changes: 51 additions & 34 deletions pandas/tools/hashing.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,15 +4,16 @@

import numpy as np
from pandas import _hash, Series, factorize, Categorical, Index
from pandas.lib import infer_dtype
from pandas.lib import is_bool_array
from pandas.types.generic import ABCIndexClass, ABCSeries, ABCDataFrame
from pandas.types.common import is_categorical_dtype

# 16 byte long hashing key
_default_hash_key = '0123456789123456'


def hash_pandas_object(obj, index=True, encoding='utf8', hash_key=None):
def hash_pandas_object(obj, index=True, encoding='utf8', hash_key=None,
categorize=True):
"""
Return a data hash of the Index/Series/DataFrame

Expand All @@ -25,6 +26,11 @@ def hash_pandas_object(obj, index=True, encoding='utf8', hash_key=None):
encoding : string, default 'utf8'
encoding for data & key when strings
hash_key : string key to encode, default to _default_hash_key
categorize : bool, default True
Whether to first categorize object arrays before hashing. This is more
efficient when the array contains duplicate values.

.. versionadded:: 0.20.0

Returns
-------
Expand All @@ -39,36 +45,49 @@ def adder(h, hashed_to_add):
return np.add(h, hashed_to_add, h)

if isinstance(obj, ABCIndexClass):
h = hash_array(obj.values, encoding, hash_key).astype('uint64')
h = hash_array(obj.values, encoding, hash_key,
categorize).astype('uint64')
h = Series(h, index=obj, dtype='uint64')
elif isinstance(obj, ABCSeries):
h = hash_array(obj.values, encoding, hash_key).astype('uint64')
h = hash_array(obj.values, encoding, hash_key,
categorize).astype('uint64')
if index:
h = adder(h, hash_pandas_object(obj.index,
index=False,
encoding=encoding,
hash_key=hash_key).values)
hash_key=hash_key,
categorize=categorize).values)
h = Series(h, index=obj.index, dtype='uint64')
elif isinstance(obj, ABCDataFrame):
cols = obj.iteritems()
first_series = next(cols)[1]
h = hash_array(first_series.values, encoding,
hash_key).astype('uint64')
hash_key, categorize).astype('uint64')
for _, col in cols:
h = adder(h, hash_array(col.values, encoding, hash_key))
h = adder(h, hash_array(col.values, encoding, hash_key,
categorize))
if index:
h = adder(h, hash_pandas_object(obj.index,
index=False,
encoding=encoding,
hash_key=hash_key).values)
hash_key=hash_key,
categorize=categorize).values)

h = Series(h, index=obj.index, dtype='uint64')
else:
raise TypeError("Unexpected type for hashing %s" % type(obj))
return h


def hash_array(vals, encoding='utf8', hash_key=None):
def _hash_categorical(c, encoding, hash_key):
    """
    Hash a Categorical by value rather than by code.

    The categories themselves are hashed (without re-categorizing, since
    they are already unique), and each code is then mapped to the hash of
    its category. This makes the result independent of category ordering.

    Parameters
    ----------
    c : Categorical
    encoding : string
        encoding for data & key when strings
    hash_key : string key to encode

    Returns
    -------
    uint64 values, one hash per element of ``c``
    """
    hashed_categories = hash_array(c.categories.values, encoding, hash_key,
                                   categorize=False)
    hashed_categories = hashed_categories.astype(np.uint64, copy=False)
    return c.rename_categories(hashed_categories).astype(np.uint64)


def hash_array(vals, encoding='utf8', hash_key=None, categorize=True):
"""
Given a 1d array, return an array of deterministic integers.

Expand All @@ -80,53 +99,51 @@ def hash_array(vals, encoding='utf8', hash_key=None):
encoding : string, default 'utf8'
encoding for data & key when strings
hash_key : string key to encode, default to _default_hash_key
categorize : bool, default True
Whether to first categorize object arrays before hashing. This is more
efficient when the array contains duplicate values.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

add a versionadded tag here (0.20.0)

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think also add categorize to hash_pandas_object for consistency as well?

.. versionadded:: 0.20.0

Returns
-------
1d uint64 numpy array of hash values, same length as the vals

"""

    # work with categoricals as ints. (This check is above the complex
    # check so that we don't ask numpy if categorical is a subdtype of
    # complex, as it will choke.)
if hash_key is None:
hash_key = _default_hash_key

    # For categoricals, we hash the categories, then remap the codes to the
    # hash values. (This check is above the complex check so that we don't ask
    # numpy if categorical is a subdtype of complex, as it will choke.)
if is_categorical_dtype(vals.dtype):
vals = vals.codes
return _hash_categorical(vals, encoding, hash_key)

# we'll be working with everything as 64-bit values, so handle this
# 128-bit value early
if np.issubdtype(vals.dtype, np.complex128):
return hash_array(vals.real) + 23 * hash_array(vals.imag)

# MAIN LOGIC:
inferred = infer_dtype(vals)

# First, turn whatever array this is into unsigned 64-bit ints, if we can
# manage it.
if inferred == 'boolean':
if is_bool_array(vals):
vals = vals.astype('u8')

if (np.issubdtype(vals.dtype, np.datetime64) or
np.issubdtype(vals.dtype, np.timedelta64) or
np.issubdtype(vals.dtype, np.number)) and vals.dtype.itemsize <= 8:

elif (np.issubdtype(vals.dtype, np.datetime64) or
np.issubdtype(vals.dtype, np.timedelta64) or
np.issubdtype(vals.dtype, np.number)) and vals.dtype.itemsize <= 8:
vals = vals.view('u{}'.format(vals.dtype.itemsize)).astype('u8')
else:

# its MUCH faster to categorize object dtypes, then hash and rename
codes, categories = factorize(vals, sort=False)
categories = Index(categories)
c = Series(Categorical(codes, categories,
ordered=False, fastpath=True))
vals = _hash.hash_object_array(categories.values,
hash_key,
encoding)

# rename & extract
vals = c.cat.rename_categories(Index(vals)).astype(np.uint64).values
# With repeated values, its MUCH faster to categorize object dtypes,
# then hash and rename categories. We allow skipping the categorization
# when the values are known/likely to be unique.
if categorize:
codes, categories = factorize(vals, sort=False)
cat = Categorical(codes, Index(categories),
ordered=False, fastpath=True)
return _hash_categorical(cat, encoding, hash_key)
else:
vals = _hash.hash_object_array(vals, hash_key, encoding)

# Then, redistribute these 64-bit ints within the space of 64-bit ints
vals ^= vals >> 30
Expand Down
16 changes: 16 additions & 0 deletions pandas/tools/tests/test_hashing.py
Original file line number Diff line number Diff line change
Expand Up @@ -90,6 +90,22 @@ def test_hash_pandas_empty_object(self):
# these are by-definition the same with
# or w/o the index as the data is empty

def test_categorical_consistency(self):
    # Check that categoricals hash consistently with their values, not
    # their codes: reordering the categories must not change the hash.
    # Exercised for string, integer and datetime-valued data.
    for s1 in [Series(['a', 'b', 'c', 'd']),
               Series([1000, 2000, 3000, 4000]),
               Series(pd.date_range(0, periods=4))]:
        # s2: same values as s1, but stored as a categorical
        s2 = s1.astype('category').cat.set_categories(s1)
        # s3: same categorical values with the categories reversed
        s3 = s2.cat.set_categories(list(reversed(s1)))
        for categorize in [True, False]:
            # These should all hash identically
            h1 = hash_pandas_object(s1, categorize=categorize)
            h2 = hash_pandas_object(s2, categorize=categorize)
            h3 = hash_pandas_object(s3, categorize=categorize)
            tm.assert_series_equal(h1, h2)
            tm.assert_series_equal(h1, h3)

def test_errors(self):

for obj in [pd.Timestamp('20130101'), tm.makePanel()]:
Expand Down