Skip to content

Allow merging on object / non-object column #21681

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 10 commits into from
Jan 3, 2019
9 changes: 8 additions & 1 deletion pandas/core/reshape/merge.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,7 @@
is_bool_dtype,
is_list_like,
is_datetimelike,
is_object_dtype,
_ensure_int64,
_ensure_float64,
_ensure_object,
Expand Down Expand Up @@ -946,11 +947,14 @@ def _maybe_coerce_merge_keys(self):
"you should use pd.concat".format(lk_dtype=lk.dtype,
rk_dtype=rk.dtype))

coerce_to_object = False
if is_object_dtype(lk) or is_object_dtype(rk):
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Is this actually necessary here? If this condition is true, the code would hit the else clause anyway.

Then we can just do the conversions in the else clause (e.g. the code around line 1011).

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Without this check, we hit the following branch:

    956             elif ((is_numeric_dtype(lk) and not is_bool_dtype(lk))
    957                     and not is_numeric_dtype(rk)):
--> 958                 raise ValueError(msg)

and raise the ValueError.

coerce_to_object = True
# if we are numeric, then allow differing
# kinds to proceed, eg. int64 and int8, int and float
# further if we are object, but we infer to
# the same, then proceed
if is_numeric_dtype(lk) and is_numeric_dtype(rk):
elif is_numeric_dtype(lk) and is_numeric_dtype(rk):
if lk.dtype.kind == rk.dtype.kind:
pass

Expand Down Expand Up @@ -1001,6 +1005,9 @@ def _maybe_coerce_merge_keys(self):
# columns, and end up trying to merge
# incompatible dtypes. See GH 16900.
else:
coerce_to_object = True

if coerce_to_object:
if name in self.left.columns:
typ = lk.categories.dtype if lk_is_cat else object
self.left = self.left.assign(
Expand Down
32 changes: 13 additions & 19 deletions pandas/tests/reshape/merge/test_merge.py
Original file line number Diff line number Diff line change
Expand Up @@ -1434,22 +1434,22 @@ def test_different(self, right_vals):
# GH 9780
# We allow merging on object and categorical cols and cast
# categorical cols to object
if (is_categorical_dtype(right['A'].dtype) or
is_object_dtype(right['A'].dtype)):
result = pd.merge(left, right, on='A')
assert is_object_dtype(result.A.dtype)
# if (is_categorical_dtype(right['A'].dtype) or
# is_object_dtype(right['A'].dtype)):
result = pd.merge(left, right, on='A')
assert is_object_dtype(result.A.dtype)

# GH 9780
# We raise for merging on object col and int/float col and
# merging on categorical col and int/float col
else:
msg = ("You are trying to merge on "
"{lk_dtype} and {rk_dtype} columns. "
"If you wish to proceed you should use "
"pd.concat".format(lk_dtype=left['A'].dtype,
rk_dtype=right['A'].dtype))
with tm.assert_raises_regex(ValueError, msg):
pd.merge(left, right, on='A')
# else:
# msg = ("You are trying to merge on "
# "{lk_dtype} and {rk_dtype} columns. "
# "If you wish to proceed you should use "
# "pd.concat".format(lk_dtype=left['A'].dtype,
# rk_dtype=right['A'].dtype))
# with tm.assert_raises_regex(ValueError, msg):
# pd.merge(left, right, on='A')

@pytest.mark.parametrize('d1', [np.int64, np.int32,
np.int16, np.int8, np.uint8])
Expand Down Expand Up @@ -1548,19 +1548,13 @@ def test_merge_incompat_infer_boolean_object(self):
assert_frame_equal(result, expected)

@pytest.mark.parametrize('df1_vals, df2_vals', [
([0, 1, 2], ["0", "1", "2"]),
([0.0, 1.0, 2.0], ["0", "1", "2"]),
([0, 1, 2], [u"0", u"1", u"2"]),
(pd.date_range('1/1/2011', periods=2, freq='D'), ['2011-01-01',
'2011-01-02']),
(pd.date_range('1/1/2011', periods=2, freq='D'), [0, 1]),
(pd.date_range('1/1/2011', periods=2, freq='D'), [0.0, 1.0]),
(pd.date_range('20130101', periods=3),
pd.date_range('20130101', periods=3, tz='US/Eastern')),
([0, 1, 2], Series(['a', 'b', 'a']).astype('category')),
([0.0, 1.0, 2.0], Series(['a', 'b', 'a']).astype('category')),
# TODO ([0, 1], pd.Series([False, True], dtype=bool)),
([0, 1], pd.Series([False, True], dtype=object))
# TODO ([0, 1], pd.Series([False, True]))
])
def test_merge_incompat_dtypes(self, df1_vals, df2_vals):
# GH 9780, GH 15800
Expand Down