-
-
Notifications
You must be signed in to change notification settings - Fork 18.5k
ENH: MultiIndex.from_frame #23141
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
ENH: MultiIndex.from_frame #23141
Changes from 17 commits
79bdecb
fa82618
64b45d6
64c7bb1
3ee676c
fd266f5
4bc8f5b
9d92b70
45595ad
3530cd3
1c22791
cf78780
64c2750
ede030b
190c341
e0df632
78ff5c2
0252db9
d98c8a9
8a1906e
08c120f
8353c3f
9df3c11
6d4915e
b5df7b2
ab3259c
cf95261
63051d7
a75a4a5
8d23df9
c8d696d
7cf82d1
1a282e5
b3c6a90
c760359
bb69314
9e11180
96c6af3
a5236bf
c78f364
14bfea8
6960804
11c5947
904644a
30fe0df
ec60563
8fc6609
9b906c6
e416122
4ef9ec4
4240a1e
9159b2d
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -2,6 +2,7 @@ | |
# pylint: disable=E1101,E1103,W0232 | ||
import datetime | ||
import warnings | ||
from collections import OrderedDict | ||
from sys import getsizeof | ||
|
||
import numpy as np | ||
|
@@ -1189,11 +1190,15 @@ def to_frame(self, index=True, name=None): | |
else: | ||
idx_names = self.names | ||
|
||
result = DataFrame({(name or level): | ||
self._get_level_values(level) | ||
for name, level in | ||
zip(idx_names, range(len(self.levels)))}, | ||
copy=False) | ||
result = DataFrame( | ||
ms7463 marked this conversation as resolved.
Show resolved
Hide resolved
|
||
OrderedDict([ | ||
((name or level), self._get_level_values(level)) | ||
for name, level in zip(idx_names, range(len(self.levels))) | ||
]), | ||
copy=False | ||
) | ||
|
||
|
||
if index: | ||
result.index = self | ||
return result | ||
|
@@ -1294,6 +1299,7 @@ def from_arrays(cls, arrays, sortorder=None, names=None): | |
MultiIndex.from_tuples : Convert list of tuples to MultiIndex | ||
MultiIndex.from_product : Make a MultiIndex from cartesian product | ||
of iterables | ||
MultiIndex.from_frame : Make a MultiIndex from a DataFrame. | ||
""" | ||
if not is_list_like(arrays): | ||
raise TypeError("Input must be a list / sequence of array-likes.") | ||
|
@@ -1343,6 +1349,7 @@ def from_tuples(cls, tuples, sortorder=None, names=None): | |
MultiIndex.from_arrays : Convert list of arrays to MultiIndex | ||
MultiIndex.from_product : Make a MultiIndex from cartesian product | ||
of iterables | ||
MultiIndex.from_frame : Make a MultiIndex from a DataFrame. | ||
""" | ||
if not is_list_like(tuples): | ||
raise TypeError('Input must be a list / sequence of tuple-likes.') | ||
|
@@ -1399,6 +1406,7 @@ def from_product(cls, iterables, sortorder=None, names=None): | |
-------- | ||
MultiIndex.from_arrays : Convert list of arrays to MultiIndex | ||
MultiIndex.from_tuples : Convert list of tuples to MultiIndex | ||
MultiIndex.from_frame : Make a MultiIndex from a DataFrame. | ||
""" | ||
from pandas.core.arrays.categorical import _factorize_from_iterables | ||
from pandas.core.reshape.util import cartesian_product | ||
|
@@ -1412,6 +1420,77 @@ def from_product(cls, iterables, sortorder=None, names=None): | |
labels = cartesian_product(labels) | ||
return MultiIndex(levels, labels, sortorder=sortorder, names=names) | ||
|
||
@classmethod | ||
def from_frame(cls, df, squeeze=True, names=None): | ||
""" | ||
Make a MultiIndex from a DataFrame. | ||
|
||
TomAugspurger marked this conversation as resolved.
Show resolved
Hide resolved
|
||
Parameters | ||
---------- | ||
df : pd.DataFrame | ||
DataFrame to be converted to MultiIndex. | ||
squeeze : bool, default True | ||
ms7463 marked this conversation as resolved.
Show resolved
Hide resolved
|
||
If df is a single column, squeeze MultiIndex to be a regular Index. | ||
names : list / sequence / callable, optional | ||
ms7463 marked this conversation as resolved.
Show resolved
Hide resolved
|
||
If no names provided, use column names, or tuple of column names if | ||
ms7463 marked this conversation as resolved.
Show resolved
Hide resolved
|
||
the columns are a MultiIndex. If sequence, overwrite names with the | ||
given sequence. If callable, pass each column name or tuples of | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Not really sure of the difference between these; can you show the rationale for all of these options? There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. The callable option was mostly for cases where the frame used to construct the MultiIndex (mi) is itself multi-indexed on the columns. Example below:
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. This would be an uncommon occurrence. Would it make more sense to just not provide the callable option? There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Let's remove the callable option for now. |
||
names to the callable. | ||
|
||
Returns | ||
------- | ||
MultiIndex or Index | ||
The MultiIndex representation of the given DataFrame. Returns an | ||
Index if the DataFrame is single column and squeeze is True. | ||
|
||
Examples | ||
-------- | ||
>>> df = pd.DataFrame([[0, 'happy'], [0, 'jolly'], [1, 'happy'], | ||
... [1, 'jolly'], [2, 'joy'], [2, 'joy']], | ||
... columns=['number', 'mood']) | ||
>>> df | ||
number mood | ||
0 0 happy | ||
1 0 jolly | ||
2 1 happy | ||
3 1 jolly | ||
4 2 joy | ||
5 2 joy | ||
>>> pd.MultiIndex.from_frame(df) | ||
ms7463 marked this conversation as resolved.
Show resolved
Hide resolved
|
||
MultiIndex(levels=[[0, 1, 2], ['happy', 'jolly', 'joy']], | ||
labels=[[0, 0, 1, 1, 2, 2], [0, 1, 0, 1, 2, 2]], | ||
ms7463 marked this conversation as resolved.
Show resolved
Hide resolved
|
||
names=['number', 'mood']) | ||
|
||
See Also | ||
-------- | ||
MultiIndex.from_arrays : Convert list of arrays to MultiIndex | ||
MultiIndex.from_tuples : Convert list of tuples to MultiIndex | ||
MultiIndex.from_product : Make a MultiIndex from cartesian product | ||
of iterables | ||
""" | ||
from pandas import DataFrame | ||
ms7463 marked this conversation as resolved.
Show resolved
Hide resolved
|
||
if not isinstance(df, DataFrame): | ||
raise TypeError("Input must be a DataFrame") | ||
|
||
# Get MultiIndex names | ||
if names is None: | ||
names = list(df) | ||
ms7463 marked this conversation as resolved.
Show resolved
Hide resolved
|
||
else: | ||
if callable(names): | ||
names = [names(x) for x in list(df)] | ||
else: | ||
if not is_list_like(names): | ||
raise TypeError("'names' must be a list / sequence " | ||
"of column names, or a callable.") | ||
|
||
if len(names) != len(list(df)): | ||
raise ValueError("'names' should have same length as " | ||
"number of columns in df.") | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I suggested in my previous comment that all checks on
sufficient?! There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Yup, removed the redundant code. Thanks. |
||
|
||
# This way will preserve dtype of columns | ||
mi = cls.from_arrays([df[x] for x in df], names=names) | ||
return mi.squeeze() if squeeze else mi | ||
|
||
def _sort_levels_monotonic(self): | ||
""" | ||
.. versionadded:: 0.20.0 | ||
|
@@ -1474,6 +1553,30 @@ def _sort_levels_monotonic(self): | |
names=self.names, sortorder=self.sortorder, | ||
verify_integrity=False) | ||
|
||
def squeeze(self): | ||
""" | ||
Squeeze a single level MultiIndex to be a regular Index instance. | ||
|
||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. I don't think we need to make this a public method. There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Changed squeeze -> _squeeze. There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. I am rethinking whether this should be public; see #22866. |
||
Returns | ||
------- | ||
Index or MultiIndex | ||
Returns Index equivalent of single level MultiIndex. Returns | ||
copy of MultiIndex if multilevel. | ||
|
||
Examples | ||
-------- | ||
>>> mi = pd.MultiIndex.from_tuples([('a',), ('b',), ('c',)]) | ||
>>> mi | ||
MultiIndex(levels=[['a', 'b', 'c']], | ||
labels=[[0, 1, 2]]) | ||
>>> mi.squeeze() | ||
ms7463 marked this conversation as resolved.
Show resolved
Hide resolved
|
||
Index(['a', 'b', 'c'], dtype='object') | ||
""" | ||
if len(self.levels) == 1: | ||
return self.levels[0][self.labels[0]] | ||
else: | ||
return self.copy() | ||
|
||
def remove_unused_levels(self): | ||
""" | ||
Create a new MultiIndex from the current that removes | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -472,3 +472,77 @@ def test_from_tuples_with_tuple_label(): | |
idx = pd.MultiIndex.from_tuples([(2, 1), (4, (1, 2))], names=('a', 'b')) | ||
result = pd.DataFrame([2, 3], columns=['c'], index=idx) | ||
tm.assert_frame_equal(expected, result) | ||
|
||
ms7463 marked this conversation as resolved.
Show resolved
Hide resolved
|
||
|
||
def test_from_frame(): | ||
df = pd.DataFrame([['a', 'a'], ['a', 'b'], ['b', 'a'], ['b', 'b']], | ||
columns=['L1', 'L2']) | ||
expected = pd.MultiIndex.from_tuples([('a', 'a'), ('a', 'b'), | ||
('b', 'a'), ('b', 'b')], | ||
names=['L1', 'L2']) | ||
result = pd.MultiIndex.from_frame(df) | ||
tm.assert_index_equal(expected, result) | ||
|
||
|
||
@pytest.mark.parametrize('squeeze,input_type,expected', [ | ||
(True, 'multi', pd.MultiIndex.from_tuples([('a', 'a'), ('a', 'b'), | ||
('b', 'a'), ('b', 'b')], | ||
names=['L1', 'L2'])), | ||
(True, 'single', pd.Index(['a', 'a', 'b', 'b'], name='L1')), | ||
(False, 'multi', pd.MultiIndex.from_tuples([('a', 'a'), ('a', 'b'), | ||
('b', 'a'), ('b', 'b')], | ||
names=['L1', 'L2'])), | ||
(False, 'single', pd.MultiIndex.from_tuples([('a',), ('a',), | ||
('b',), ('b',)], | ||
names=['L1'])) | ||
]) | ||
def test_from_frame_squeeze(squeeze, input_type, expected): | ||
if input_type == 'multi': | ||
df = pd.DataFrame([['a', 'a'], ['a', 'b'], ['b', 'a'], ['b', 'b']], | ||
columns=['L1', 'L2']) | ||
elif input_type == 'single': | ||
df = pd.DataFrame([['a'], ['a'], ['b'], ['b']], columns=['L1']) | ||
|
||
result = pd.MultiIndex.from_frame(df, squeeze=squeeze) | ||
tm.assert_index_equal(expected, result) | ||
ms7463 marked this conversation as resolved.
Show resolved
Hide resolved
|
||
|
||
|
||
def test_from_frame_non_frame(): | ||
ms7463 marked this conversation as resolved.
Show resolved
Hide resolved
|
||
with tm.assert_raises_regex(TypeError, 'Input must be a DataFrame'): | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This should be There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. fixed, thanks. |
||
pd.MultiIndex.from_frame([1, 2, 3, 4]) | ||
ms7463 marked this conversation as resolved.
Show resolved
Hide resolved
|
||
|
||
|
||
def test_from_frame_dtype_fidelity(): | ||
df = pd.DataFrame({ | ||
'dates': pd.date_range('19910905', periods=6), | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Can you also test dates with timezones? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Done, in both to_frame and from_frame. |
||
'a': [1, 1, 1, 2, 2, 2], | ||
'b': pd.Categorical(['a', 'a', 'b', 'b', 'c', 'c'], ordered=True), | ||
'c': ['x', 'x', 'y', 'z', 'x', 'y'] | ||
}) | ||
original_dtypes = df.dtypes.to_dict() | ||
ms7463 marked this conversation as resolved.
Show resolved
Hide resolved
|
||
mi = pd.MultiIndex.from_frame(df) | ||
mi_dtypes = {name: mi.levels[i].dtype for i, name in enumerate(mi.names)} | ||
assert original_dtypes == mi_dtypes | ||
|
||
|
||
ms7463 marked this conversation as resolved.
Show resolved
Hide resolved
|
||
def test_from_frame_names_as_list(): | ||
df = pd.DataFrame([['a', 'a'], ['a', 'b'], ['b', 'a'], ['b', 'b']], | ||
columns=['L1', 'L2']) | ||
mi = pd.MultiIndex.from_frame(df, names=['a', 'b']) | ||
assert mi.names == ['a', 'b'] | ||
|
||
|
||
def test_from_frame_names_as_callable(): | ||
df = pd.DataFrame([['a', 'a'], ['a', 'b'], ['b', 'a'], ['b', 'b']], | ||
columns=pd.MultiIndex.from_tuples([('L1', 'x'), | ||
('L2', 'y')])) | ||
mi = pd.MultiIndex.from_frame(df, names=lambda x: '_'.join(x)) | ||
assert mi.names == ['L1_x', 'L2_y'] | ||
|
||
|
||
def test_from_frame_names_bad_input(): | ||
df = pd.DataFrame([['a', 'a'], ['a', 'b'], ['b', 'a'], ['b', 'b']], | ||
columns=['L1', 'L2']) | ||
with tm.assert_raises_regex(TypeError, "names' must be a list / sequence " | ||
"of column names, or a callable."): | ||
pd.MultiIndex.from_frame(df, names='bad') |
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -82,6 +82,20 @@ def test_to_frame(): | |
tm.assert_frame_equal(result, expected) | ||
|
||
|
||
def test_to_frame_dtype_fidelity(): | ||
ms7463 marked this conversation as resolved.
Show resolved
Hide resolved
|
||
mi = pd.MultiIndex.from_arrays([ | ||
pd.date_range('19910905', periods=6), | ||
[1, 1, 1, 2, 2, 2], | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. is this a repeated test of the above, if so, then not necessary here. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This test was at the suggestion of @TomAugspurger |
||
pd.Categorical(['a', 'a', 'b', 'b', 'c', 'c'], ordered=True), | ||
['x', 'x', 'y', 'z', 'x', 'y'] | ||
], names=['dates', 'a', 'b', 'c']) | ||
original_dtypes = {name: mi.levels[i].dtype | ||
ms7463 marked this conversation as resolved.
Show resolved
Hide resolved
|
||
for i, name in enumerate(mi.names)} | ||
df = mi.to_frame() | ||
df_dtypes = df.dtypes.to_dict() | ||
assert original_dtypes == df_dtypes | ||
|
||
|
||
def test_to_hierarchical(): | ||
index = MultiIndex.from_tuples([(1, 'one'), (1, 'two'), (2, 'one'), ( | ||
2, 'two')]) | ||
|
@@ -169,3 +183,20 @@ def test_to_series_with_arguments(idx): | |
assert s.values is not idx.values | ||
assert s.index is not idx | ||
assert s.name != idx.name | ||
|
||
|
||
def test_squeeze_single_level(): | ||
mi = pd.MultiIndex.from_tuples([('a',), ('a',), ('b',), ('b',)], | ||
names=['L1']) | ||
expected = pd.Index(['a', 'a', 'b', 'b'], name='L1') | ||
result = mi.squeeze() | ||
tm.assert_index_equal(expected, result) | ||
|
||
|
||
def test_squeeze_multi_level(): | ||
mi = pd.MultiIndex.from_tuples([('a', 'a'), ('a', 'b'), ('b', 'a'), | ||
ms7463 marked this conversation as resolved.
Show resolved
Hide resolved
|
||
('b', 'b')], | ||
names=['L1', 'L2']) | ||
expected = mi.copy() | ||
result = mi.squeeze() | ||
tm.assert_index_equal(expected, result) |
Uh oh!
There was an error while loading. Please reload this page.