Categoricals hash consistently #15143


Closed
wants to merge 2 commits
Changes from 1 commit
3 changes: 2 additions & 1 deletion doc/source/whatsnew/v0.20.0.txt
@@ -309,6 +309,7 @@ Bug Fixes
- Bug in ``pd.read_csv()`` in which the ``dialect`` parameter was not being verified before processing (:issue:`14898`)
- Bug in ``pd.read_fwf`` where the skiprows parameter was not being respected during column width inference (:issue:`11256`)
- Bug in ``pd.read_csv()`` in which missing data was being improperly handled with ``usecols`` (:issue:`6710`)
- Bug in ``pandas.tools.hashing.hash_pandas_object`` in which hashing of categoricals depended on the ordering of categories, instead of just their values.
Contributor: put a () as this is a function call, and pandas -> pd

Contributor: add this PR number as the issue number


- Bug in ``DataFrame.loc`` with indexing a ``MultiIndex`` with a ``Series`` indexer (:issue:`14730`)

@@ -369,4 +370,4 @@ Bug Fixes
- Bug in ``Series`` constructor when both ``copy=True`` and ``dtype`` arguments are provided (:issue:`15125`)
- Bug in ``pd.read_csv()`` for the C engine where ``usecols`` were being indexed incorrectly with ``parse_dates`` (:issue:`14792`)

- Bug in ``Series.dt.round`` inconsistent behaviour on NAT's with different arguments (:issue:`14940`)
- Bug in ``Series.dt.round`` inconsistent behaviour on NAT's with different arguments (:issue:`14940`)
57 changes: 30 additions & 27 deletions pandas/tools/hashing.py
@@ -4,7 +4,7 @@

import numpy as np
from pandas import _hash, Series, factorize, Categorical, Index
from pandas.lib import infer_dtype
from pandas.lib import is_bool_array
from pandas.types.generic import ABCIndexClass, ABCSeries, ABCDataFrame
from pandas.types.common import is_categorical_dtype

@@ -68,7 +68,7 @@ def adder(h, hashed_to_add):
        return h


def hash_array(vals, encoding='utf8', hash_key=None):
def hash_array(vals, encoding='utf8', hash_key=None, categorize=True):
"""
Given a 1d array, return an array of deterministic integers.

@@ -80,53 +80,56 @@ def hash_array(vals, encoding='utf8', hash_key=None):
    encoding : string, default 'utf8'
        encoding for data & key when strings
    hash_key : string key to encode, default to _default_hash_key
    categorize : bool, default True
        Whether to first categorize object arrays before hashing. This is more
        efficient when the array contains duplicate values.

Contributor: add a versionadded tag here (0.20.0)

Contributor: I think also add categorize to hash_pandas_object for consistency as well?
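(A hypothetical sketch of the intent; hash_pandas_object does not take the keyword in this commit, and the wrapper name below is made up:)

import pandas as pd
from pandas.tools.hashing import hash_array

def hash_series_values(s, encoding='utf8', hash_key=None, categorize=True):
    # Hypothetical wrapper: forwards `categorize` to hash_array the way a
    # `categorize` keyword on hash_pandas_object itself would.
    return pd.Series(hash_array(s.values, encoding, hash_key,
                                categorize=categorize),
                     index=s.index, dtype='uint64')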

    Returns
    -------
    1d uint64 numpy array of hash values, same length as the vals

    """

    # work with categoricals as ints. (This check is above the complex
    # check so that we don't ask numpy if categorical is a subdtype of
    # complex, as it will choke.)
    if hash_key is None:
        hash_key = _default_hash_key

    # For categoricals, we hash the categories, then remap the codes to the
    # hash values. (This check is above the complex check so that we don't ask
    # numpy if categorical is a subdtype of complex, as it will choke.)
    if is_categorical_dtype(vals.dtype):
        vals = vals.codes
        cat_hashed = hash_array(vals.categories.values, encoding, hash_key,
Contributor: do this like below instead, rather than adding a keyword (where we do basically the same thing)

Contributor Author: I'm not sure I follow - we need to recurse through in case the values aren't objects. So we need to be able to handle vals.categories.values of any dtype. However, we don't want to categorize the categories again, since we already know they're unique. This seemed to me the simplest and cleanest way of implementing this.

Contributor: maybe show what the problem is; hash_array can hash values JUST like we do below. This code is too complicated.

Contributor Author:
Can you explain where you mean "like we do below"? I'm assuming you mean lines 125-130, where we handle object dtype.

For categoricals, what we want to do is:

  • Get a hash for the categories. This should match what hash_pandas_object(series, index=False) would return for the un-categorized data. Meaning hash_pandas_object(object_series) == hash_pandas_object(object_series.astype('category')).
  • Remap the category hashes based on the codes.

vals.categories.values can be of any dtype, so we need to recurse through hash_array again to get the hashes of the categories. However, we also know that the categories are already unique, so we don't want to call factorize again. As such, we set categorize=False to skip that. We can't just call hash_object_array, as the categories may not be objects. And we need to do the remapping, so we can't just set vals = something and fall through like we did before.

I don't think adding this extra keyword overly complicates things, and do think this is the simplest way to do this. I may not be understanding what you're trying to suggest here - perhaps if you could explain a bit better I might get it.
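Concretely, the invariant described above (a sketch, assuming this branch):

import pandas as pd
from pandas.tools.hashing import hash_pandas_object

s = pd.Series(['a', 'b', 'c', 'a'])
cat = s.astype('category')
# Reordering the categories changes the codes but not the values,
# so the hashes must not change either.
reordered = cat.cat.set_categories(['c', 'b', 'a'])

assert hash_pandas_object(s, index=False).equals(
    hash_pandas_object(cat, index=False))
assert hash_pandas_object(cat, index=False).equals(
    hash_pandas_object(reordered, index=False))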

Contributor: I looked again. This is reasonable. Ideally we don't want to repeat the factorize-hash-remap logic, as right now this code looks very similar to what you have. Maybe pull this out to a common helper function?

I get that the problem is that hash_object_array will only de-duplicate object types, when of course we could have any dtype. Though in practice it is generally not a big deal to simply hash non-objects even with very many duplicates (as simply finding the duplicates can be somewhat expensive). In my tests, only for object does this make a huge difference.

That said, I'm not averse to allowing a caller to do this (as dask may have more information than pandas, e.g. the cardinality of the set, or even the uniques already).

Contributor Author:

> I get that the problem is that hash_object_array will only de-duplicate object types, when of course we could have any dtype. Though in practice it is generally not a big deal to simply hash non-objects even with very many duplicates (as simply finding the duplicates can be somewhat expensive). In my tests, only for object does this make a huge difference.

Not quite. The categorize parameter only indicates whether to categorize object arrays before hashing. Other arrays are treated the same as before. The reason for this is that when we hash the categories attribute on a categorical (and then map the codes to the hashes of the categories), we already know that it is unique, so we don't need to re-categorize before hashing (which would happen for object categoricals, but not for others). This might be more clear now that the _hash_categorical helper is pulled out.
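Roughly, the pulled-out helper looks like this (a sketch mirroring the inline code in this commit; the final form may differ):

import numpy as np
from pandas import Series

def _hash_categorical(cat, encoding, hash_key):
    # Hash the categories (already unique, hence categorize=False), then
    # remap each code to the hash of its category.
    cat_hashed = hash_array(cat.categories.values, encoding, hash_key,
                            categorize=False).astype(np.uint64, copy=False)
    c = Series(cat)
    return c.cat.rename_categories(cat_hashed).values.astype(np.uint64)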

                                categorize=False).astype(np.uint64, copy=False)
        # Since `cat_hashed` is already distributed in the space of uint64s,
        # we can just return after remapping the codes here
        c = Series(vals)
        return c.cat.rename_categories(cat_hashed).values.astype(np.uint64)

    # we'll be working with everything as 64-bit values, so handle this
    # 128-bit value early
    if np.issubdtype(vals.dtype, np.complex128):
        return hash_array(vals.real) + 23 * hash_array(vals.imag)

    # MAIN LOGIC:
    inferred = infer_dtype(vals)

    # First, turn whatever array this is into unsigned 64-bit ints, if we can
    # manage it.
    if inferred == 'boolean':
    if is_bool_array(vals):
        vals = vals.astype('u8')

    if (np.issubdtype(vals.dtype, np.datetime64) or
            np.issubdtype(vals.dtype, np.timedelta64) or
            np.issubdtype(vals.dtype, np.number)) and vals.dtype.itemsize <= 8:

    elif (np.issubdtype(vals.dtype, np.datetime64) or
          np.issubdtype(vals.dtype, np.timedelta64) or
          np.issubdtype(vals.dtype, np.number)) and vals.dtype.itemsize <= 8:
        vals = vals.view('u{}'.format(vals.dtype.itemsize)).astype('u8')
    else:

        # it's MUCH faster to categorize object dtypes, then hash and rename
        codes, categories = factorize(vals, sort=False)
        categories = Index(categories)
        c = Series(Categorical(codes, categories,
                               ordered=False, fastpath=True))
        vals = _hash.hash_object_array(categories.values,
                                       hash_key,
                                       encoding)

        # rename & extract
        vals = c.cat.rename_categories(Index(vals)).astype(np.uint64).values
        # With repeated values, it's MUCH faster to categorize object dtypes,
        # then hash and rename categories. We allow skipping the categorization
        # when the values are known/likely to be unique.
        if categorize:
            codes, categories = factorize(vals, sort=False)
            c = Series(Categorical(codes, Index(categories),
                                   ordered=False, fastpath=True))
            vals = _hash.hash_object_array(categories, hash_key, encoding)
            # rename & extract
            vals = c.cat.rename_categories(vals).values.astype(np.uint64)
        else:
            vals = _hash.hash_object_array(vals, hash_key, encoding)

    # Then, redistribute these 64-bit ints within the space of 64-bit ints
    vals ^= vals >> 30
14 changes: 14 additions & 0 deletions pandas/tools/tests/test_hashing.py
@@ -90,6 +90,20 @@ def test_hash_pandas_empty_object(self):
        # these are by-definition the same with
        # or w/o the index as the data is empty

    def test_categorical_consistency(self):
        # Check that categoricals hash consistently with their values,
        # not their codes. This should work for categoricals of any dtype.
        for data in [['a', 'b', 'c', 'd'], [1000, 2000, 3000, 4000]]:
Contributor: can you test with datetimes as well

            s1 = Series(data)
            s2 = s1.astype('category').cat.set_categories(data)
            s3 = s2.cat.set_categories(list(reversed(data)))
            # These should all hash identically
            h1 = hash_pandas_object(s1)
            h2 = hash_pandas_object(s2)
            h3 = hash_pandas_object(s3)
            tm.assert_series_equal(h1, h2)
            tm.assert_series_equal(h1, h3)
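A datetime case could look like this (a sketch responding to the review request above; the method name is made up, it is not part of this commit, and it assumes the module's existing imports):

    def test_categorical_consistency_datetimes(self):
        # Same invariant as above, for datetime64 data.
        data = list(pd.date_range('2017-01-01', periods=4))
        s1 = Series(data)
        s2 = s1.astype('category').cat.set_categories(data)
        s3 = s2.cat.set_categories(list(reversed(data)))
        tm.assert_series_equal(hash_pandas_object(s1), hash_pandas_object(s2))
        tm.assert_series_equal(hash_pandas_object(s1), hash_pandas_object(s3))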

    def test_errors(self):

        for obj in [pd.Timestamp('20130101'), tm.makePanel()]: