-
-
Notifications
You must be signed in to change notification settings - Fork 18.6k
BUG: Check for duplicate names columns and index in crosstab #28474
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
a748ccd
c5430ab
5762bb6
989bd5a
672bd4b
1b67514
4f2fa86
fbff182
6071db5
3d9c632
d27233f
478d8dc
1828a36
3e9333d
fe6f7fd
9f4a00a
788ac2b
354a180
d938a96
d60308c
b759073
cf63d4c
94e8175
c6ccab1
142ade0
710ab63
cad2dea
f2741db
f0261e9
2733c17
f1b19a6
d475edd
85e010b
558493f
3ed74d9
e5c6cd9
d16b840
32aa475
820945e
dc1d5c5
0b002da
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -5,6 +5,7 @@ | |
List, | ||
Optional, | ||
Sequence, | ||
Set, | ||
Tuple, | ||
Union, | ||
cast, | ||
|
@@ -19,6 +20,7 @@ | |
from pandas.core.dtypes.common import is_integer_dtype, is_list_like, is_scalar | ||
from pandas.core.dtypes.generic import ABCDataFrame, ABCSeries | ||
|
||
import pandas.core.algorithms as algos | ||
import pandas.core.common as com | ||
from pandas.core.frame import _shared_docs | ||
from pandas.core.groupby import Grouper | ||
|
@@ -580,29 +582,39 @@ def crosstab( | |
b 0 1 0 | ||
c 0 0 0 | ||
""" | ||
if values is None and aggfunc is not None: | ||
raise ValueError("aggfunc cannot be used without values.") | ||
|
||
if values is not None and aggfunc is None: | ||
raise ValueError("values cannot be used without an aggfunc.") | ||
|
||
index = com.maybe_make_list(index) | ||
columns = com.maybe_make_list(columns) | ||
|
||
rownames = _get_names(index, rownames, prefix="row") | ||
colnames = _get_names(columns, colnames, prefix="col") | ||
|
||
common_idx = None | ||
pass_objs = [x for x in index + columns if isinstance(x, (ABCSeries, ABCDataFrame))] | ||
if pass_objs: | ||
common_idx = get_objs_combined_axis(pass_objs, intersect=True, sort=False) | ||
|
||
data: Dict = {} | ||
data.update(zip(rownames, index)) | ||
data.update(zip(colnames, columns)) | ||
|
||
if values is None and aggfunc is not None: | ||
raise ValueError("aggfunc cannot be used without values.") | ||
rownames = _get_names(index, rownames, prefix="row") | ||
colnames = _get_names(columns, colnames, prefix="col") | ||
|
||
if values is not None and aggfunc is None: | ||
raise ValueError("values cannot be used without an aggfunc.") | ||
# We create our own mapping of row and columns names | ||
cuchoi marked this conversation as resolved.
Show resolved
Hide resolved
|
||
# to prevent issues with duplicate columns/row names. GH Issue: #22529 | ||
shared_col_row_names = set(rownames).intersection(set(colnames)) | ||
jreback marked this conversation as resolved.
Show resolved
Hide resolved
|
||
row_names_mapper, unique_row_names = _build_names_mapper( | ||
rownames, shared_col_row_names, "row" | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. why don't we hae the _build_names take rownames, colnams and 'row/col' ? i think a bit easier to understand (do the intersection inside), OR maybe just return all 4 elements (row_names_mapper, unique_row_names, col_names_mapper, uniques). either way, whatever is simpler. |
||
) | ||
col_names_mapper, unique_col_names = _build_names_mapper( | ||
colnames, shared_col_row_names, "col" | ||
) | ||
|
||
from pandas import DataFrame | ||
|
||
data = { | ||
**dict(zip(unique_row_names, index)), | ||
**dict(zip(unique_col_names, columns)), | ||
} | ||
df = DataFrame(data, index=common_idx) | ||
original_df_cols = df.columns | ||
|
||
|
@@ -615,8 +627,8 @@ def crosstab( | |
|
||
table = df.pivot_table( | ||
["__dummy__"], | ||
index=rownames, | ||
columns=colnames, | ||
index=unique_row_names, | ||
columns=unique_col_names, | ||
margins=margins, | ||
margins_name=margins_name, | ||
dropna=dropna, | ||
|
@@ -635,6 +647,9 @@ def crosstab( | |
table, normalize=normalize, margins=margins, margins_name=margins_name | ||
) | ||
|
||
table = table.rename_axis(index=row_names_mapper, axis=0) | ||
table = table.rename_axis(columns=col_names_mapper, axis=1) | ||
|
||
return table | ||
|
||
|
||
|
@@ -733,3 +748,57 @@ def _get_names(arrs, names, prefix: str = "row"): | |
names = list(names) | ||
|
||
return names | ||
|
||
|
||
def _build_names_mapper( | ||
names: List[str], shared_col_row_names: Set[str], suffix: str | ||
) -> Tuple[Dict[str, str], List[str]]: | ||
""" | ||
Given a list of row or column names, creates a mapper of unique names to | ||
column/row names. | ||
|
||
Parameters | ||
---------- | ||
names : list | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. instead of creating uniques names, can we simply map input names -> position (which by definition are unique); still returning a dict and renaming is all ok. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. We can. My only issue with that if there are any errors when creating the DataFrame (line 618) or calling df.pivot_table (line 628) then the user would see unrecognizable column names for those errors. |
||
Names to be deduplicated. | ||
shared_col_row_names : set or list | ||
Values used both in rows and columns, so need additional deduplication. | ||
suffix : str | ||
Suffix to deduplicate values in shared_col_row_names | ||
|
||
Returns | ||
------- | ||
names_mapper: dict | ||
The keys are the unique names and the values are the original names. | ||
unique_names: list | ||
Unique names in the same order that names came in | ||
""" | ||
jreback marked this conversation as resolved.
Show resolved
Hide resolved
|
||
keys, counts = algos.value_counts_arraylike(names, dropna=False) | ||
names_count = dict(zip(keys, counts)) | ||
|
||
names_mapper = {} | ||
unique_names: List[str] = [] | ||
# We reverse the names so the numbers are in the order given by the user | ||
for name in reversed(names): | ||
jreback marked this conversation as resolved.
Show resolved
Hide resolved
|
||
mapped_name = name | ||
name_count = names_count[name] | ||
|
||
# Adds a number if name is duplicated within columns/rows | ||
if name_count > 1: | ||
mapped_name = "{mapped_name}_{name_count}".format( | ||
mapped_name=mapped_name, name_count=name_count | ||
) | ||
jreback marked this conversation as resolved.
Show resolved
Hide resolved
|
||
names_count[name] -= 1 | ||
|
||
# Add suffix name if column is shared between column and rows | ||
if name in shared_col_row_names: | ||
mapped_name = "{mapped_name}_{suffix}".format( | ||
mapped_name=mapped_name, suffix=suffix | ||
) | ||
|
||
names_mapper[mapped_name] = name | ||
|
||
# Creates a list of the new names in the original order | ||
unique_names.insert(0, mapped_name) | ||
|
||
return names_mapper, unique_names |
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -533,17 +533,51 @@ def test_crosstab_with_numpy_size(self): | |
) | ||
tm.assert_frame_equal(result, expected) | ||
|
||
def test_crosstab_dup_index_names(self): | ||
# GH 13279 | ||
s = Series(range(3), name="foo") | ||
def test_crosstab_duplicated_row_and_col_names(self): | ||
# We test that duplicated row or column names do not produce issues | ||
# GH Issue: #22529, GH#13279 | ||
|
||
result = crosstab(s, s) | ||
expected_index = Index(range(3), name="foo") | ||
expected = DataFrame( | ||
np.eye(3, dtype=np.int64), index=expected_index, columns=expected_index | ||
# Same name in both rows and columns | ||
s1 = Series(range(3), name="foo") | ||
s2 = s1 + 1 | ||
expected = crosstab(s1, s2.rename("bar")).rename_axis( | ||
columns={"bar": "foo"}, axis=1 | ||
) | ||
result = crosstab(s1, s2) | ||
tm.assert_frame_equal(result, expected) | ||
assert result.index.names == ["foo"] | ||
assert result.columns.names == ["foo"] | ||
|
||
# Row names duplicated | ||
s1 = Series(range(3), name="foo") | ||
s2 = s1 + 1 | ||
s3 = Series(range(3), name="bar_col") | ||
|
||
expected = crosstab([s1, s2.rename("bar")], s3).rename_axis( | ||
index={"bar": "foo"}, axis=0 | ||
) | ||
result = crosstab([s1, s2], s3) | ||
|
||
tm.assert_frame_equal(result, expected) | ||
|
||
assert result.index.names == ["foo", "foo"] | ||
assert result.columns.names == ["bar_col"] | ||
|
||
# Column names duplicated | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. can you split this into a separate test (and rename both appropriately) There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Do you want "duplicated column names" as a separate test? In that case, I think it makes more sense to do three tests:
Makes sense? The other alternative is just renaming the test to There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Renamed the test. Let me know if you prefer it split into 3. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. yes prefer this to be split |
||
s1 = Series(range(3), name="foo") | ||
s2 = s1 + 1 | ||
s3 = Series(range(3), name="bar_row") | ||
|
||
expected = crosstab(s3, [s1, s2.rename("bar")]).rename_axis( | ||
columns={"bar": "foo"}, axis=1 | ||
) | ||
result = crosstab(s3, [s1, s2]) | ||
|
||
tm.assert_frame_equal(result, expected) | ||
|
||
assert result.index.names == ["bar_row"] | ||
assert result.columns.names == ["foo", "foo"] | ||
|
||
@pytest.mark.parametrize("names", [["a", ("b", "c")], [("a", "b"), "c"]]) | ||
def test_crosstab_tuple_name(self, names): | ||
s1 = Series(range(3), name=names[0]) | ||
|
Uh oh!
There was an error while loading. Please reload this page.