Skip to content

Commit 913723b

Browse files
committed
Stop concat from attempting to sort mismatched columns by default
Preserve column order upon concatenation to obey the principle of least astonishment. Allow the old behavior to be enabled through a new boolean switch on concat and DataFrame.append, sort, which is disabled by default. Closes #4588
1 parent 6d610a4 commit 913723b

File tree

6 files changed

+57
-24
lines changed

6 files changed

+57
-24
lines changed

doc/source/whatsnew/v0.23.0.txt

+1
Original file line numberDiff line numberDiff line change
@@ -1160,6 +1160,7 @@ Reshaping
11601160
- Bug in :meth:`DataFrame.astype` where column metadata is lost when converting to categorical or a dictionary of dtypes (:issue:`19920`)
11611161
- Bug in :func:`cut` and :func:`qcut` where timezone information was dropped (:issue:`19872`)
11621162
- Bug in :class:`Series` constructor with a ``dtype=str``, previously raised in some cases (:issue:`19853`)
1163+
- Stop :func:`concat` and :meth:`DataFrame.append` from sorting mismatched columns by default. Pass ``sort=True`` to retain the old behavior (:issue:`4588`)
11631164

11641165
Other
11651166
^^^^^

pandas/_libs/lib.pyx

+6-5
Original file line numberDiff line numberDiff line change
@@ -157,7 +157,7 @@ def fast_unique_multiple(list arrays):
157157

158158
@cython.wraparound(False)
159159
@cython.boundscheck(False)
160-
def fast_unique_multiple_list(list lists):
160+
def fast_unique_multiple_list(list lists, bint sort=True):
161161
cdef:
162162
list buf
163163
Py_ssize_t k = len(lists)
@@ -174,10 +174,11 @@ def fast_unique_multiple_list(list lists):
174174
if val not in table:
175175
table[val] = stub
176176
uniques.append(val)
177-
try:
178-
uniques.sort()
179-
except Exception:
180-
pass
177+
if sort:
178+
try:
179+
uniques.sort()
180+
except Exception:
181+
pass
181182

182183
return uniques
183184

pandas/core/frame.py

+6-2
Original file line numberDiff line numberDiff line change
@@ -5982,7 +5982,8 @@ def infer(x):
59825982
# ----------------------------------------------------------------------
59835983
# Merging / joining methods
59845984

5985-
def append(self, other, ignore_index=False, verify_integrity=False):
5985+
def append(self, other, ignore_index=False,
5986+
verify_integrity=False, sort=False):
59865987
"""
59875988
Append rows of `other` to the end of this frame, returning a new
59885989
object. Columns not in this frame are added as new columns.
@@ -5995,6 +5996,8 @@ def append(self, other, ignore_index=False, verify_integrity=False):
59955996
If True, do not use the index labels.
59965997
verify_integrity : boolean, default False
59975998
If True, raise ValueError on creating index with duplicates.
5999+
sort : boolean, default False
6000+
Sort the resulting columns if the columns of `other` do not match those of this frame.
59986001
59996002
Returns
60006003
-------
@@ -6103,7 +6106,8 @@ def append(self, other, ignore_index=False, verify_integrity=False):
61036106
else:
61046107
to_concat = [self, other]
61056108
return concat(to_concat, ignore_index=ignore_index,
6106-
verify_integrity=verify_integrity)
6109+
verify_integrity=verify_integrity,
6110+
sort=sort)
61076111

61086112
def join(self, other, on=None, how='left', lsuffix='', rsuffix='',
61096113
sort=False):

pandas/core/indexes/api.py

+7-6
Original file line numberDiff line numberDiff line change
@@ -31,17 +31,17 @@
3131
'_all_indexes_same']
3232

3333

34-
def _get_objs_combined_axis(objs, intersect=False, axis=0):
34+
def _get_objs_combined_axis(objs, intersect=False, axis=0, sort=True):
3535
# Extract combined index: return intersection or union (depending on the
3636
# value of "intersect") of indexes on given axis, or None if all objects
3737
# lack indexes (e.g. they are numpy arrays)
3838
obs_idxes = [obj._get_axis(axis) for obj in objs
3939
if hasattr(obj, '_get_axis')]
4040
if obs_idxes:
41-
return _get_combined_index(obs_idxes, intersect=intersect)
41+
return _get_combined_index(obs_idxes, intersect=intersect, sort=sort)
4242

4343

44-
def _get_combined_index(indexes, intersect=False):
44+
def _get_combined_index(indexes, intersect=False, sort=True):
4545
# TODO: handle index names!
4646
indexes = com._get_distinct_objs(indexes)
4747
if len(indexes) == 0:
@@ -53,11 +53,11 @@ def _get_combined_index(indexes, intersect=False):
5353
for other in indexes[1:]:
5454
index = index.intersection(other)
5555
return index
56-
union = _union_indexes(indexes)
56+
union = _union_indexes(indexes, sort=sort)
5757
return _ensure_index(union)
5858

5959

60-
def _union_indexes(indexes):
60+
def _union_indexes(indexes, sort=True):
6161
if len(indexes) == 0:
6262
raise AssertionError('Must have at least 1 Index to union')
6363
if len(indexes) == 1:
@@ -74,7 +74,8 @@ def conv(i):
7474
i = i.tolist()
7575
return i
7676

77-
return Index(lib.fast_unique_multiple_list([conv(i) for i in inds]))
77+
return Index(
78+
lib.fast_unique_multiple_list([conv(i) for i in inds], sort=sort))
7879

7980
if kind == 'special':
8081
result = indexes[0]

pandas/core/reshape/concat.py

+9-4
Original file line numberDiff line numberDiff line change
@@ -20,7 +20,7 @@
2020

2121
def concat(objs, axis=0, join='outer', join_axes=None, ignore_index=False,
2222
keys=None, levels=None, names=None, verify_integrity=False,
23-
copy=True):
23+
sort=False, copy=True):
2424
"""
2525
Concatenate pandas objects along a particular axis with optional set logic
2626
along the other axes.
@@ -60,6 +60,8 @@ def concat(objs, axis=0, join='outer', join_axes=None, ignore_index=False,
6060
verify_integrity : boolean, default False
6161
Check whether the new concatenated axis contains duplicates. This can
6262
be very expensive relative to the actual data concatenation
63+
sort : boolean, default False
64+
Sort the non-concatenation axis (the columns when ``axis=0``) if it is not already aligned across the passed objects
6365
copy : boolean, default True
6466
If False, do not copy data unnecessarily
6567
@@ -209,7 +211,7 @@ def concat(objs, axis=0, join='outer', join_axes=None, ignore_index=False,
209211
ignore_index=ignore_index, join=join,
210212
keys=keys, levels=levels, names=names,
211213
verify_integrity=verify_integrity,
212-
copy=copy)
214+
copy=copy, sort=sort)
213215
return op.get_result()
214216

215217

@@ -220,7 +222,8 @@ class _Concatenator(object):
220222

221223
def __init__(self, objs, axis=0, join='outer', join_axes=None,
222224
keys=None, levels=None, names=None,
223-
ignore_index=False, verify_integrity=False, copy=True):
225+
ignore_index=False, verify_integrity=False, copy=True,
226+
sort=False):
224227
if isinstance(objs, (NDFrame, compat.string_types)):
225228
raise TypeError('first argument must be an iterable of pandas '
226229
'objects, you passed an object of type '
@@ -355,6 +358,7 @@ def __init__(self, objs, axis=0, join='outer', join_axes=None,
355358
self.keys = keys
356359
self.names = names or getattr(keys, 'names', None)
357360
self.levels = levels
361+
self.sort = sort
358362

359363
self.ignore_index = ignore_index
360364
self.verify_integrity = verify_integrity
@@ -447,7 +451,8 @@ def _get_comb_axis(self, i):
447451
data_axis = self.objs[0]._get_block_manager_axis(i)
448452
try:
449453
return _get_objs_combined_axis(self.objs, axis=data_axis,
450-
intersect=self.intersect)
454+
intersect=self.intersect,
455+
sort=self.sort)
451456
except IndexError:
452457
types = [type(x).__name__ for x in self.objs]
453458
raise TypeError("Cannot concatenate list of {types}"

pandas/tests/reshape/test_concat.py

+28-7
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,7 @@
55
from numpy.random import randn
66

77
from datetime import datetime
8-
from pandas.compat import StringIO, iteritems, PY2
8+
from pandas.compat import StringIO, iteritems
99
import pandas as pd
1010
from pandas import (DataFrame, concat,
1111
read_csv, isna, Series, date_range,
@@ -852,8 +852,9 @@ def test_append_dtype_coerce(self):
852852
dt.datetime(2013, 1, 2, 0, 0),
853853
dt.datetime(2013, 1, 3, 0, 0),
854854
dt.datetime(2013, 1, 4, 0, 0)],
855-
name='start_time')], axis=1)
856-
result = df1.append(df2, ignore_index=True)
855+
name='start_time')],
856+
axis=1, sort=True)
857+
result = df1.append(df2, ignore_index=True, sort=True)
857858
assert_frame_equal(result, expected)
858859

859860
def test_append_missing_column_proper_upcast(self):
@@ -1011,7 +1012,8 @@ def test_concat_ignore_index(self):
10111012
frame1.index = Index(["x", "y", "z"])
10121013
frame2.index = Index(["x", "y", "q"])
10131014

1014-
v1 = concat([frame1, frame2], axis=1, ignore_index=True)
1015+
v1 = concat([frame1, frame2], axis=1,
1016+
ignore_index=True, sort=True)
10151017

10161018
nan = np.nan
10171019
expected = DataFrame([[nan, nan, nan, 4.3],
@@ -1463,7 +1465,7 @@ def test_concat_series_axis1(self):
14631465
# must reindex, #2603
14641466
s = Series(randn(3), index=['c', 'a', 'b'], name='A')
14651467
s2 = Series(randn(4), index=['d', 'a', 'b', 'c'], name='B')
1466-
result = concat([s, s2], axis=1)
1468+
result = concat([s, s2], axis=1, sort=True)
14671469
expected = DataFrame({'A': s, 'B': s2})
14681470
assert_frame_equal(result, expected)
14691471

@@ -2070,8 +2072,6 @@ def test_concat_order(self):
20702072
for i in range(100)]
20712073
result = pd.concat(dfs).columns
20722074
expected = dfs[0].columns
2073-
if PY2:
2074-
expected = expected.sort_values()
20752075
tm.assert_index_equal(result, expected)
20762076

20772077
def test_concat_datetime_timezone(self):
@@ -2155,3 +2155,24 @@ def test_concat_empty_and_non_empty_series_regression():
21552155
expected = s1
21562156
result = pd.concat([s1, s2])
21572157
tm.assert_series_equal(result, expected)
2158+
2159+
2160+
def test_concat_preserve_column_order_differing_columns():
2161+
# GH 4588 regression test
2162+
# for new columns in concat
2163+
dfa = pd.DataFrame(columns=['C', 'A'], data=[[1, 2]])
2164+
dfb = pd.DataFrame(columns=['C', 'Z'], data=[[5, 6]])
2165+
result = pd.concat([dfa, dfb])
2166+
assert result.columns.tolist() == ['C', 'A', 'Z']
2167+
2168+
2169+
def test_concat_preserve_column_order_uneven_data():
2170+
# GH 4588 regression test
2171+
# add to column, concat with uneven data
2172+
df = pd.DataFrame()
2173+
df['b'] = [1, 2, 3]
2174+
df['c'] = [1, 2, 3]
2175+
df['a'] = [1, 2, 3]
2176+
df2 = pd.DataFrame({'a': [4, 5]})
2177+
df3 = pd.concat([df, df2])
2178+
assert df3.columns.tolist() == ['b', 'c', 'a']

0 commit comments

Comments
 (0)