Skip to content

Commit f8484a3

Browse files
committed
Stop concat from attempting to sort mismatched columns by default
Preserve column order upon concatenation to obey the principle of least astonishment. The old sorting behavior can be re-enabled through a new boolean switch, mismatch_sort, added to concat and DataFrame.append; it is disabled by default. Closes #4588
1 parent 6d610a4 commit f8484a3

File tree

5 files changed

+42
-21
lines changed

5 files changed

+42
-21
lines changed

pandas/_libs/lib.pyx

+6-5
Original file line numberDiff line numberDiff line change
@@ -157,7 +157,7 @@ def fast_unique_multiple(list arrays):
157157

158158
@cython.wraparound(False)
159159
@cython.boundscheck(False)
160-
def fast_unique_multiple_list(list lists):
160+
def fast_unique_multiple_list(list lists, bint sort = True):
161161
cdef:
162162
list buf
163163
Py_ssize_t k = len(lists)
@@ -174,10 +174,11 @@ def fast_unique_multiple_list(list lists):
174174
if val not in table:
175175
table[val] = stub
176176
uniques.append(val)
177-
try:
178-
uniques.sort()
179-
except Exception:
180-
pass
177+
if sort:
178+
try:
179+
uniques.sort()
180+
except Exception:
181+
pass
181182

182183
return uniques
183184

pandas/core/frame.py

+6-2
Original file line numberDiff line numberDiff line change
@@ -5982,7 +5982,8 @@ def infer(x):
59825982
# ----------------------------------------------------------------------
59835983
# Merging / joining methods
59845984

5985-
def append(self, other, ignore_index=False, verify_integrity=False):
5985+
def append(self, other, ignore_index=False,
5986+
verify_integrity=False, mismatch_sort=False):
59865987
"""
59875988
Append rows of `other` to the end of this frame, returning a new
59885989
object. Columns not in this frame are added as new columns.
@@ -5995,6 +5996,8 @@ def append(self, other, ignore_index=False, verify_integrity=False):
59955996
If True, do not use the index labels.
59965997
verify_integrity : boolean, default False
59975998
If True, raise ValueError on creating index with duplicates.
5999+
mismatch_sort : boolean, default False
6000+
If True, sort the columns when `other` does not have the same columns as this frame.
59986001
59996002
Returns
60006003
-------
@@ -6103,7 +6106,8 @@ def append(self, other, ignore_index=False, verify_integrity=False):
61036106
else:
61046107
to_concat = [self, other]
61056108
return concat(to_concat, ignore_index=ignore_index,
6106-
verify_integrity=verify_integrity)
6109+
verify_integrity=verify_integrity,
6110+
mismatch_sort=mismatch_sort)
61076111

61086112
def join(self, other, on=None, how='left', lsuffix='', rsuffix='',
61096113
sort=False):

pandas/core/indexes/api.py

+7-6
Original file line numberDiff line numberDiff line change
@@ -31,17 +31,17 @@
3131
'_all_indexes_same']
3232

3333

34-
def _get_objs_combined_axis(objs, intersect=False, axis=0):
34+
def _get_objs_combined_axis(objs, intersect=False, axis=0, sort=True):
3535
# Extract combined index: return intersection or union (depending on the
3636
# value of "intersect") of indexes on given axis, or None if all objects
3737
# lack indexes (e.g. they are numpy arrays)
3838
obs_idxes = [obj._get_axis(axis) for obj in objs
3939
if hasattr(obj, '_get_axis')]
4040
if obs_idxes:
41-
return _get_combined_index(obs_idxes, intersect=intersect)
41+
return _get_combined_index(obs_idxes, intersect=intersect, sort=sort)
4242

4343

44-
def _get_combined_index(indexes, intersect=False):
44+
def _get_combined_index(indexes, intersect=False, sort=True):
4545
# TODO: handle index names!
4646
indexes = com._get_distinct_objs(indexes)
4747
if len(indexes) == 0:
@@ -53,11 +53,11 @@ def _get_combined_index(indexes, intersect=False):
5353
for other in indexes[1:]:
5454
index = index.intersection(other)
5555
return index
56-
union = _union_indexes(indexes)
56+
union = _union_indexes(indexes, sort=sort)
5757
return _ensure_index(union)
5858

5959

60-
def _union_indexes(indexes):
60+
def _union_indexes(indexes, sort=True):
6161
if len(indexes) == 0:
6262
raise AssertionError('Must have at least 1 Index to union')
6363
if len(indexes) == 1:
@@ -74,7 +74,8 @@ def conv(i):
7474
i = i.tolist()
7575
return i
7676

77-
return Index(lib.fast_unique_multiple_list([conv(i) for i in inds]))
77+
return Index(
78+
lib.fast_unique_multiple_list([conv(i) for i in inds], sort=sort))
7879

7980
if kind == 'special':
8081
result = indexes[0]

pandas/core/reshape/concat.py

+9-4
Original file line numberDiff line numberDiff line change
@@ -20,7 +20,7 @@
2020

2121
def concat(objs, axis=0, join='outer', join_axes=None, ignore_index=False,
2222
keys=None, levels=None, names=None, verify_integrity=False,
23-
copy=True):
23+
copy=True, mismatch_sort=False):
2424
"""
2525
Concatenate pandas objects along a particular axis with optional set logic
2626
along the other axes.
@@ -62,6 +62,8 @@ def concat(objs, axis=0, join='outer', join_axes=None, ignore_index=False,
6262
be very expensive relative to the actual data concatenation
6363
copy : boolean, default True
6464
If False, do not copy data unnecessarily
65+
mismatch_sort : boolean, default False
66+
If True, sort the columns when the passed objects do not all have the same columns
6567
6668
Returns
6769
-------
@@ -209,7 +211,7 @@ def concat(objs, axis=0, join='outer', join_axes=None, ignore_index=False,
209211
ignore_index=ignore_index, join=join,
210212
keys=keys, levels=levels, names=names,
211213
verify_integrity=verify_integrity,
212-
copy=copy)
214+
copy=copy, sort=mismatch_sort)
213215
return op.get_result()
214216

215217

@@ -220,7 +222,8 @@ class _Concatenator(object):
220222

221223
def __init__(self, objs, axis=0, join='outer', join_axes=None,
222224
keys=None, levels=None, names=None,
223-
ignore_index=False, verify_integrity=False, copy=True):
225+
ignore_index=False, verify_integrity=False, copy=True,
226+
sort=False):
224227
if isinstance(objs, (NDFrame, compat.string_types)):
225228
raise TypeError('first argument must be an iterable of pandas '
226229
'objects, you passed an object of type '
@@ -355,6 +358,7 @@ def __init__(self, objs, axis=0, join='outer', join_axes=None,
355358
self.keys = keys
356359
self.names = names or getattr(keys, 'names', None)
357360
self.levels = levels
361+
self.sort = sort
358362

359363
self.ignore_index = ignore_index
360364
self.verify_integrity = verify_integrity
@@ -447,7 +451,8 @@ def _get_comb_axis(self, i):
447451
data_axis = self.objs[0]._get_block_manager_axis(i)
448452
try:
449453
return _get_objs_combined_axis(self.objs, axis=data_axis,
450-
intersect=self.intersect)
454+
intersect=self.intersect,
455+
sort=self.sort)
451456
except IndexError:
452457
types = [type(x).__name__ for x in self.objs]
453458
raise TypeError("Cannot concatenate list of {types}"

pandas/tests/reshape/test_concat.py

+14-4
Original file line numberDiff line numberDiff line change
@@ -852,8 +852,9 @@ def test_append_dtype_coerce(self):
852852
dt.datetime(2013, 1, 2, 0, 0),
853853
dt.datetime(2013, 1, 3, 0, 0),
854854
dt.datetime(2013, 1, 4, 0, 0)],
855-
name='start_time')], axis=1)
856-
result = df1.append(df2, ignore_index=True)
855+
name='start_time')],
856+
axis=1, mismatch_sort=True)
857+
result = df1.append(df2, ignore_index=True, mismatch_sort=True)
857858
assert_frame_equal(result, expected)
858859

859860
def test_append_missing_column_proper_upcast(self):
@@ -1011,7 +1012,8 @@ def test_concat_ignore_index(self):
10111012
frame1.index = Index(["x", "y", "z"])
10121013
frame2.index = Index(["x", "y", "q"])
10131014

1014-
v1 = concat([frame1, frame2], axis=1, ignore_index=True)
1015+
v1 = concat([frame1, frame2], axis=1,
1016+
ignore_index=True, mismatch_sort=True)
10151017

10161018
nan = np.nan
10171019
expected = DataFrame([[nan, nan, nan, 4.3],
@@ -1463,7 +1465,7 @@ def test_concat_series_axis1(self):
14631465
# must reindex, #2603
14641466
s = Series(randn(3), index=['c', 'a', 'b'], name='A')
14651467
s2 = Series(randn(4), index=['d', 'a', 'b', 'c'], name='B')
1466-
result = concat([s, s2], axis=1)
1468+
result = concat([s, s2], axis=1, mismatch_sort=True)
14671469
expected = DataFrame({'A': s, 'B': s2})
14681470
assert_frame_equal(result, expected)
14691471

@@ -2155,3 +2157,11 @@ def test_concat_empty_and_non_empty_series_regression():
21552157
expected = s1
21562158
result = pd.concat([s1, s2])
21572159
tm.assert_series_equal(result, expected)
2160+
2161+
2162+
def test_concat_preserve_column_order_differing_columns():
2163+
# GH 4588 regression test
2164+
dfa = pd.DataFrame(columns=['C', 'A'], data=[[1, 2]])
2165+
dfb = pd.DataFrame(columns=['C', 'Z'], data=[[5, 6]])
2166+
result = pd.concat([dfa, dfb])
2167+
assert result.columns.tolist() == ['C', 'A', 'Z']

0 commit comments

Comments
 (0)