Stop concat from attempting to sort mismatched columns by default

brycepg · brycepg · commit c859aab26ee8 · 2018-04-04T22:28:28.000-06:00
Preserve column order upon concatenation to obey the principle of least astonishment. The old behavior can be re-enabled through a new boolean keyword, sort, added to concat and DataFrame.append; it defaults to False. Closes #4588
diff --git a/doc/source/whatsnew/v0.23.0.txt b/doc/source/whatsnew/v0.23.0.txt
@@ -1160,6 +1160,7 @@ Reshaping
 - Bug in :meth:`DataFrame.astype` where column metadata is lost when converting to categorical or a dictionary of dtypes (:issue:`19920`)
 - Bug in :func:`cut` and :func:`qcut` where timezone information was dropped (:issue:`19872`)
 - Bug in :class:`Series` constructor with a ``dtype=str``, previously raised in some cases (:issue:`19853`)
+- Stop :func:`concat` and :meth:`DataFrame.append` from sorting columns by default. Use ``sort=True`` to retain the old behavior (:issue:`4588`)
 
 Other
 ^^^^^
diff --git a/pandas/_libs/lib.pyx b/pandas/_libs/lib.pyx
@@ -157,7 +157,7 @@ def fast_unique_multiple(list arrays):
 
 @cython.wraparound(False)
 @cython.boundscheck(False)
-def fast_unique_multiple_list(list lists):
+def fast_unique_multiple_list(list lists, bint sort=True):
     cdef:
         list buf
         Py_ssize_t k = len(lists)
@@ -174,10 +174,11 @@ def fast_unique_multiple_list(list lists):
             if val not in table:
                 table[val] = stub
                 uniques.append(val)
-    try:
-        uniques.sort()
-    except Exception:
-        pass
+    if sort:
+        try:
+            uniques.sort()
+        except Exception:
+            pass
 
     return uniques
 
diff --git a/pandas/core/frame.py b/pandas/core/frame.py
@@ -5982,7 +5982,8 @@ def infer(x):
     # ----------------------------------------------------------------------
     # Merging / joining methods
 
-    def append(self, other, ignore_index=False, verify_integrity=False):
+    def append(self, other, ignore_index=False,
+               verify_integrity=False, sort=False):
         """
         Append rows of `other` to the end of this frame, returning a new
         object. Columns not in this frame are added as new columns.
@@ -5995,6 +5996,8 @@ def append(self, other, ignore_index=False, verify_integrity=False):
             If True, do not use the index labels.
         verify_integrity : boolean, default False
             If True, raise ValueError on creating index with duplicates.
+        sort : boolean, default False
+            Sort columns if `other` doesn't have the same columns as `self`
 
         Returns
         -------
@@ -6103,7 +6106,8 @@ def append(self, other, ignore_index=False, verify_integrity=False):
         else:
             to_concat = [self, other]
         return concat(to_concat, ignore_index=ignore_index,
-                      verify_integrity=verify_integrity)
+                      verify_integrity=verify_integrity,
+                      sort=sort)
 
     def join(self, other, on=None, how='left', lsuffix='', rsuffix='',
              sort=False):
diff --git a/pandas/core/indexes/api.py b/pandas/core/indexes/api.py
@@ -31,17 +31,17 @@
            '_all_indexes_same']
 
 
-def _get_objs_combined_axis(objs, intersect=False, axis=0):
+def _get_objs_combined_axis(objs, intersect=False, axis=0, sort=True):
     # Extract combined index: return intersection or union (depending on the
     # value of "intersect") of indexes on given axis, or None if all objects
     # lack indexes (e.g. they are numpy arrays)
     obs_idxes = [obj._get_axis(axis) for obj in objs
                  if hasattr(obj, '_get_axis')]
     if obs_idxes:
-        return _get_combined_index(obs_idxes, intersect=intersect)
+        return _get_combined_index(obs_idxes, intersect=intersect, sort=sort)
 
 
-def _get_combined_index(indexes, intersect=False):
+def _get_combined_index(indexes, intersect=False, sort=True):
     # TODO: handle index names!
     indexes = com._get_distinct_objs(indexes)
     if len(indexes) == 0:
@@ -53,11 +53,11 @@ def _get_combined_index(indexes, intersect=False):
         for other in indexes[1:]:
             index = index.intersection(other)
         return index
-    union = _union_indexes(indexes)
+    union = _union_indexes(indexes, sort=sort)
     return _ensure_index(union)
 
 
-def _union_indexes(indexes):
+def _union_indexes(indexes, sort=True):
     if len(indexes) == 0:
         raise AssertionError('Must have at least 1 Index to union')
     if len(indexes) == 1:
@@ -74,7 +74,8 @@ def conv(i):
                 i = i.tolist()
             return i
 
-        return Index(lib.fast_unique_multiple_list([conv(i) for i in inds]))
+        return Index(
+            lib.fast_unique_multiple_list([conv(i) for i in inds], sort=sort))
 
     if kind == 'special':
         result = indexes[0]
diff --git a/pandas/core/reshape/concat.py b/pandas/core/reshape/concat.py
@@ -20,7 +20,7 @@
 
 def concat(objs, axis=0, join='outer', join_axes=None, ignore_index=False,
            keys=None, levels=None, names=None, verify_integrity=False,
-           copy=True):
+           sort=False, copy=True):
     """
     Concatenate pandas objects along a particular axis with optional set logic
     along the other axes.
@@ -60,6 +60,8 @@ def concat(objs, axis=0, join='outer', join_axes=None, ignore_index=False,
     verify_integrity : boolean, default False
         Check whether the new concatenated axis contains duplicates. This can
         be very expensive relative to the actual data concatenation
+    sort : boolean, default False
+        Sort columns if the passed objects do not all have the same columns
     copy : boolean, default True
         If False, do not copy data unnecessarily
 
@@ -209,7 +211,7 @@ def concat(objs, axis=0, join='outer', join_axes=None, ignore_index=False,
                        ignore_index=ignore_index, join=join,
                        keys=keys, levels=levels, names=names,
                        verify_integrity=verify_integrity,
-                       copy=copy)
+                       copy=copy, sort=sort)
     return op.get_result()
 
 
@@ -220,7 +222,8 @@ class _Concatenator(object):
 
     def __init__(self, objs, axis=0, join='outer', join_axes=None,
                  keys=None, levels=None, names=None,
-                 ignore_index=False, verify_integrity=False, copy=True):
+                 ignore_index=False, verify_integrity=False, copy=True,
+                 sort=False):
         if isinstance(objs, (NDFrame, compat.string_types)):
             raise TypeError('first argument must be an iterable of pandas '
                             'objects, you passed an object of type '
@@ -355,6 +358,7 @@ def __init__(self, objs, axis=0, join='outer', join_axes=None,
         self.keys = keys
         self.names = names or getattr(keys, 'names', None)
         self.levels = levels
+        self.sort = sort
 
         self.ignore_index = ignore_index
         self.verify_integrity = verify_integrity
@@ -447,7 +451,8 @@ def _get_comb_axis(self, i):
         data_axis = self.objs[0]._get_block_manager_axis(i)
         try:
             return _get_objs_combined_axis(self.objs, axis=data_axis,
-                                           intersect=self.intersect)
+                                           intersect=self.intersect,
+                                           sort=self.sort)
         except IndexError:
             types = [type(x).__name__ for x in self.objs]
             raise TypeError("Cannot concatenate list of {types}"
diff --git a/pandas/tests/reshape/test_concat.py b/pandas/tests/reshape/test_concat.py
@@ -852,8 +852,9 @@ def test_append_dtype_coerce(self):
                                    dt.datetime(2013, 1, 2, 0, 0),
                                    dt.datetime(2013, 1, 3, 0, 0),
                                    dt.datetime(2013, 1, 4, 0, 0)],
-                                  name='start_time')], axis=1)
-        result = df1.append(df2, ignore_index=True)
+                                  name='start_time')],
+                          axis=1, sort=True)
+        result = df1.append(df2, ignore_index=True, sort=True)
         assert_frame_equal(result, expected)
 
     def test_append_missing_column_proper_upcast(self):
@@ -1011,7 +1012,8 @@ def test_concat_ignore_index(self):
         frame1.index = Index(["x", "y", "z"])
         frame2.index = Index(["x", "y", "q"])
 
-        v1 = concat([frame1, frame2], axis=1, ignore_index=True)
+        v1 = concat([frame1, frame2], axis=1,
+                    ignore_index=True, sort=True)
 
         nan = np.nan
         expected = DataFrame([[nan, nan, nan, 4.3],
@@ -1463,7 +1465,7 @@ def test_concat_series_axis1(self):
         # must reindex, #2603
         s = Series(randn(3), index=['c', 'a', 'b'], name='A')
         s2 = Series(randn(4), index=['d', 'a', 'b', 'c'], name='B')
-        result = concat([s, s2], axis=1)
+        result = concat([s, s2], axis=1, sort=True)
         expected = DataFrame({'A': s, 'B': s2})
         assert_frame_equal(result, expected)
 
@@ -2070,8 +2072,6 @@ def test_concat_order(self):
                 for i in range(100)]
         result = pd.concat(dfs).columns
         expected = dfs[0].columns
-        if PY2:
-            expected = expected.sort_values()
         tm.assert_index_equal(result, expected)
 
     def test_concat_datetime_timezone(self):
@@ -2155,3 +2155,24 @@ def test_concat_empty_and_non_empty_series_regression():
     expected = s1
     result = pd.concat([s1, s2])
     tm.assert_series_equal(result, expected)
+
+
+def test_concat_preserve_column_order_differing_columns():
+    # GH 4588 regression test
+    # for new columns in concat
+    dfa = pd.DataFrame(columns=['C', 'A'], data=[[1, 2]])
+    dfb = pd.DataFrame(columns=['C', 'Z'], data=[[5, 6]])
+    result = pd.concat([dfa, dfb])
+    assert result.columns.tolist() == ['C', 'A', 'Z']
+
+
+def test_concat_preserve_column_order_uneven_data():
+    # GH 4588 regression test
+    # add to column, concat with uneven data
+    df = pd.DataFrame()
+    df['b'] = [1, 2, 3]
+    df['c'] = [1, 2, 3]
+    df['a'] = [1, 2, 3]
+    df2 = pd.DataFrame({'a':[4, 5]})
+    df3 = pd.concat([df, df2])
+    assert df3.columns.tolist() == ['b', 'c', 'a']