Improve categorical concat speed by ~20x #10597

Merged: merged 1 commit on Jul 17, 2015
1 change: 1 addition & 0 deletions doc/source/whatsnew/v0.17.0.txt
@@ -276,6 +276,7 @@ Performance Improvements
- 8x improvement in ``timedelta64`` and ``datetime64`` ops (:issue:`6755`)
- Significantly improved performance of indexing ``MultiIndex`` with slicers (:issue:`10287`)
- Improved performance of ``Series.isin`` for datetimelike/integer Series (:issue:`10287`)
- 20x improvement in ``concat`` of Categoricals when categories are identical (:issue:`10587`)

.. _whatsnew_0170.bug_fixes:

55 changes: 30 additions & 25 deletions pandas/core/categorical.py
@@ -1715,18 +1715,20 @@ def _convert_to_list_like(list_like):
return [list_like]

def _concat_compat(to_concat, axis=0):
"""
provide concatenation of an object/categorical array of arrays each of which is a single dtype
"""Concatenate an object/categorical array of arrays, each of which is a
single dtype

Parameters
----------
to_concat : array of arrays
axis : axis to provide concatenation
in the current impl this is always 0, e.g. we only have 1-d categoricals
axis : int
Axis to provide concatenation in the current implementation this is
always 0, e.g. we only have 1D categoricals

Returns
-------
a single array, preserving the combined dtypes
Categorical
A single array, preserving the combined dtypes
"""

Member: maybe add assert axis == 0?
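A minimal sketch of that suggested guard (hypothetical; the merged diff does not add it):

def _concat_compat(to_concat, axis=0):
    # suggested guard from review: the current implementation only
    # handles 1-d categoricals, so axis must be 0
    assert axis == 0, "categorical _concat_compat only supports axis=0"
    ...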

def convert_categorical(x):
@@ -1735,31 +1737,34 @@ def convert_categorical(x):
return x.get_values()
return x.ravel()

typs = get_dtype_kinds(to_concat)
if not len(typs-set(['object','category'])):

# we only can deal with object & category types
pass

else:

if get_dtype_kinds(to_concat) - set(['object', 'category']):
Contributor: here

Member Author: that's only if there's stuff besides object and category ... if I have only object and category then the else can definitely be hit.

Member Author: your comment just below that says

    we could have object blocks and categoricals here

Member Author: @jreback here's an example of how to hit that code path:

In [1]: s = pd.Series(list('aabbcd')*1000000)

In [2]: s2 = pd.Series(list('aabbcd')*1000000).astype('category')

In [3]: pd.concat([s,s2])

Contributor: no, I get that; my point is the len check is not good enough.

Member Author: why not? we have

  1. everything is a categorical, because of the filtering above when building up categoricals
  2. all the categories are equivalent, via categories.is_dtype_equal(first_set_of_categories)

what else is there to check?

Contributor: ok, back at computer.

I think the issue is what to do with an object that has the same or fewer categories (then ok), but what if it has more categories (uniques)?

So I would do this.

have all matching categories (exactly), with no object in a single path
else just push everything to object.
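A simplified, self-contained sketch of that dispatch (a hypothetical helper, not pandas API; assumes Categorical.from_codes is available):

import numpy as np
import pandas as pd

def concat_categoricals(parts):
    # hypothetical helper illustrating the proposed dispatch
    cats = [p for p in parts if isinstance(p, pd.Categorical)]
    same_cats = (cats and len(cats) == len(parts) and
                 all(c.categories.equals(cats[0].categories) for c in cats))
    if same_cats:
        # single fast path: all inputs categorical, categories match exactly,
        # so the integer codes can be concatenated directly
        codes = np.concatenate([c.codes for c in cats])
        return pd.Categorical.from_codes(codes, cats[0].categories)
    # otherwise push everything to object and let pandas re-infer
    values = np.concatenate([np.asarray(p, dtype=object) for p in parts])
    return pd.Categorical(values)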

Contributor: nvm, I realized that you are doing exactly what I was saying :) duh.

go ahead.

Member Author: so I think you're pointing out something that I'm not addressing:

In [4]: s = pd.Series(list('aabbcd'))

In [5]: s2 = pd.Series(list('aabb')).astype('category')

In [6]: pd.concat([s,s2])
Out[6]:
0      a
1      a
2      b
3      b
4    NaN
5    NaN
0      a
1      a
2      b
3      b
dtype: category
Categories (2, object): [a, b]

However, that was run on current master and is a separate issue from the performance of concatenation.

Member Author: I think I can fix this by simply not passing the second argument in the else branch.
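A self-contained illustration of that fix (a sketch, not the merged code): passing the first categorical's categories coerces unseen values to NaN, while omitting the argument lets the constructor re-infer them:

import numpy as np
import pandas as pd

s = pd.Series(list('aabbcd'), dtype=object)
s2 = pd.Series(list('aabb')).astype('category')
parts = [np.asarray(s, dtype=object), np.asarray(s2, dtype=object)]

# reusing the first categorical's categories coerces 'c' and 'd' to NaN
bad = pd.Categorical(np.concatenate(parts), categories=['a', 'b'])

# omitting categories lets the constructor re-infer them from the data
good = pd.Categorical(np.concatenate(parts))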

# convert to object type and perform a regular concat
from pandas.core.common import _concat_compat
return _concat_compat([ np.array(x,copy=False).astype('object') for x in to_concat ],axis=0)
return _concat_compat([np.array(x, copy=False, dtype=object)
for x in to_concat], axis=0)

# we could have object blocks and categorical's here
# if we only have a single cateogoricals then combine everything
# we could have object blocks and categoricals here
# if we only have a single categoricals then combine everything
# else its a non-compat categorical
categoricals = [ x for x in to_concat if is_categorical_dtype(x.dtype) ]
objects = [ x for x in to_concat if is_object_dtype(x.dtype) ]
categoricals = [x for x in to_concat if is_categorical_dtype(x.dtype)]

# validate the categories
categories = None
for x in categoricals:
if categories is None:
categories = x.categories
if not categories.equals(x.categories):
categories = categoricals[0]
rawcats = categories.categories
for x in categoricals[1:]:
if not categories.is_dtype_equal(x):
raise ValueError("incompatible categories in categorical concat")

# concat them
return Categorical(np.concatenate([ convert_categorical(x) for x in to_concat ],axis=0), categories=categories)
# we've already checked that all categoricals are the same, so if their
# length is equal to the input then we have all the same categories
if len(categoricals) == len(to_concat):
# concating numeric types is much faster than concating object types
Contributor: I don't think you can ever hit the else. The cats have to be exactly the same by definition (otherwise you are raising). So the path where you cast everything to object (then regular concat) should be there.

Member Author: you can definitely hit the else, e.g. to_concat[0] is a categorical and to_concat[1] is an object array.

Contributor: that would be hit above (2 different dtypes).

Member Author: how so? if get_dtype_kinds(to_concat) == set(['object', 'category']) then it isn't handled above, because the set difference will be empty.

Member Author: that logic only handles the case of having other dtypes besides object and category.

Contributor: the check on len bothers me; that makes sense if everything is a categorical and identical, but by definition that would be true, as you just checked dtypes.

Member Author: not if you're concatenating an object dtype with a categorical dtype.

# and fastpath takes a shorter path through the constructor
return Categorical(np.concatenate([x.codes for x in to_concat], axis=0),
rawcats,
ordered=categoricals[0].ordered,
fastpath=True)
else:
concatted = np.concatenate(list(map(convert_categorical, to_concat)),
axis=0)
return Categorical(concatted, rawcats)
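For intuition, a rough sketch of why the fast path wins (illustrative only, not from the PR; the fastpath keyword mirrors the 2015-era constructor used in the diff above): with identical categories, concatenation works on small integer codes instead of an object array:

import numpy as np
import pandas as pd

a = pd.Categorical(list('aabbcd') * 1000000)
b = pd.Categorical(list('aabbcd') * 1000000)

# fast path: int8 codes concatenate as a cheap buffer copy
codes = np.concatenate([a.codes, b.codes])
fast = pd.Categorical(codes, categories=a.categories, fastpath=True)

# slow path: materializing object arrays touches every Python object
slow = pd.Categorical(np.concatenate([np.asarray(a, dtype=object),
                                      np.asarray(b, dtype=object)]))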
6 changes: 5 additions & 1 deletion pandas/core/internals.py
@@ -4388,7 +4388,11 @@ def is_null(self):
# Usually it's enough to check but a small fraction of values to see if
# a block is NOT null, chunks should help in such cases. 1000 value
# was chosen rather arbitrarily.
values_flat = self.block.values.ravel()
values = self.block.values
if self.block.is_categorical:
values_flat = values.categories
else:
values_flat = values.ravel()
total_len = values_flat.shape[0]
chunk_len = max(total_len // 40, 1000)
for i in range(0, total_len, chunk_len):
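The intuition behind this change, as a hedged sketch: ravel() on a categorical block materializes the full data, while categories is usually a tiny Index, so the chunked null scan above has far less to look at:

import numpy as np
import pandas as pd

c = pd.Categorical(list('ab') * 1000000)
print(len(np.asarray(c)))    # 2000000 values to scan after ravel()
print(len(c.categories))     # 2 values to scan via categories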
34 changes: 16 additions & 18 deletions pandas/lib.pyx
@@ -21,6 +21,7 @@ from cpython cimport (PyDict_New, PyDict_GetItem, PyDict_SetItem,
PyTuple_SetItem,
PyTuple_New,
PyObject_SetAttrString,
PyObject_RichCompareBool,
PyBytes_GET_SIZE,
PyUnicode_GET_SIZE)

@@ -372,19 +373,19 @@ def isnullobj2d_old(ndarray[object, ndim=2] arr):
result[i, j] = 1
return result.view(np.bool_)

def list_to_object_array(list obj):

@cython.wraparound(False)
@cython.boundscheck(False)
cpdef ndarray[object] list_to_object_array(list obj):
'''
Convert list to object ndarray. Seriously can't believe I had to write this
function
'''
cdef:
Py_ssize_t i, n
ndarray[object] arr

n = len(obj)
arr = np.empty(n, dtype=object)
Py_ssize_t i, n = len(obj)
ndarray[object] arr = np.empty(n, dtype=object)

for i from 0 <= i < n:
for i in range(n):
arr[i] = obj[i]

return arr
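Usage is unchanged by the cpdef conversion; a quick check (assuming the 2015-era pandas.lib module path):

import pandas.lib as lib

arr = lib.list_to_object_array([1, 'a', None])
print(arr.dtype)   # object, same contents as the input list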
@@ -732,28 +733,25 @@ def scalar_compare(ndarray[object] values, object val, object op):

return result.view(bool)


@cython.wraparound(False)
@cython.boundscheck(False)
def array_equivalent_object(ndarray[object] left, ndarray[object] right):
cpdef bint array_equivalent_object(object[:] left, object[:] right):
""" perform an element by element comparion on 1-d object arrays
taking into account nan positions """
cdef Py_ssize_t i, n
cdef object x, y
cdef:
Py_ssize_t i, n = left.shape[0]
object x, y

n = len(left)
for i from 0 <= i < n:
for i in range(n):
x = left[i]
y = right[i]

# we are either not equal or both nan
# I think None == None will be true here
if cpython.PyObject_RichCompareBool(x, y, cpython.Py_EQ):
continue
elif _checknull(x) and _checknull(y):
continue
else:
if not (PyObject_RichCompareBool(x, y, cpython.Py_EQ) or
_checknull(x) and _checknull(y)):
return False

return True


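A usage sketch of the tightened comparison (again assuming the 2015-era pandas.lib path): nulls compare equal by position, everything else by ==:

import numpy as np
import pandas.lib as lib

left = np.array(['a', np.nan, None], dtype=object)
right = np.array(['a', np.nan, None], dtype=object)
print(lib.array_equivalent_object(left, right))   # True: null positions match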
16 changes: 16 additions & 0 deletions vb_suite/categoricals.py
@@ -0,0 +1,16 @@
from vbench.benchmark import Benchmark
from datetime import datetime

common_setup = """from pandas_vb_common import *
"""

#----------------------------------------------------------------------
# Series constructors

setup = common_setup + """
s = pd.Series(list('aabbcd') * 1000000).astype('category')
"""

concat_categorical = \
Benchmark("concat([s, s])", setup=setup, name='concat_categorical',
start_date=datetime(year=2015, month=7, day=15))
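To try the benchmark without vbench, a rough equivalent with the standard-library timeit (numbers will vary by machine):

import timeit

setup = ("import pandas as pd; from pandas import concat; "
         "s = pd.Series(list('aabbcd') * 1000000).astype('category')")
print(timeit.timeit("concat([s, s])", setup=setup, number=10))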