-
-
Notifications
You must be signed in to change notification settings - Fork 18.5k
BUG: DataFrame fail to construct when data is list and columns is nested list for MI #32202
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from 37 commits
7e461a1
1314059
8bcb313
24c3ede
dea38f2
cd9e7ac
e5e912b
e609188
c8ee822
07ffde2
b3f3da0
2f2054c
9176389
ed02384
d3b0c50
a5e0d10
6073ed7
e8f6d67
559b5d6
a452816
3fd6743
2428edb
fe18e50
a5d159b
7516964
86bd699
30a70a7
a6cb139
ed6dc4a
f058d2c
2852579
493ac33
3ecd6b8
851a3e1
9860985
ffc6561
a028c33
5af0f8e
9eda16a
35e92ea
44af738
d0a10e4
1700e5d
8a036e4
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -3,11 +3,13 @@ | |
constructors before passing them to a BlockManager. | ||
""" | ||
from collections import abc | ||
from typing import Dict, List, Optional, Tuple, Union | ||
|
||
import numpy as np | ||
import numpy.ma as ma | ||
|
||
from pandas._libs import lib | ||
from pandas._typing import Axis, Dtype, Scalar | ||
|
||
from pandas.core.dtypes.cast import ( | ||
construct_1d_arraylike_from_scalar, | ||
|
@@ -514,29 +516,32 @@ def to_arrays(data, columns, coerce_float=False, dtype=None): | |
return _list_to_arrays(data, columns, coerce_float=coerce_float, dtype=dtype) | ||
|
||
|
||
def _list_to_arrays(data, columns, coerce_float=False, dtype=None): | ||
def _list_to_arrays( | ||
data, columns, coerce_float=False, dtype=None | ||
) -> Tuple[List[Scalar], Union[Index, List[Axis]]]: | ||
if len(data) > 0 and isinstance(data[0], tuple): | ||
content = list(lib.to_object_array_tuples(data).T) | ||
else: | ||
# list of lists | ||
content = list(lib.to_object_array(data).T) | ||
# gh-26429 do not raise user-facing AssertionError | ||
try: | ||
result = _convert_object_array( | ||
content, columns, dtype=dtype, coerce_float=coerce_float | ||
) | ||
columns = _validate_or_indexify_columns(content, columns) | ||
result = _convert_object_array(content, dtype=dtype, coerce_float=coerce_float) | ||
except AssertionError as e: | ||
raise ValueError(e) from e | ||
return result | ||
return result, columns | ||
|
||
|
||
def _list_of_series_to_arrays(data, columns, coerce_float=False, dtype=None): | ||
def _list_of_series_to_arrays( | ||
data, columns, coerce_float=False, dtype=None | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. can any of the args be typed? coerce_float should be easy There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. added types for all input argument |
||
) -> Tuple[List[Scalar], Union[Index, List[Axis]]]: | ||
if columns is None: | ||
# We know pass_data is non-empty because data[0] is a Series | ||
pass_data = [x for x in data if isinstance(x, (ABCSeries, ABCDataFrame))] | ||
columns = get_objs_combined_axis(pass_data, sort=False) | ||
|
||
indexer_cache = {} | ||
indexer_cache: Dict[int, Scalar] = {} | ||
|
||
aligned_values = [] | ||
for s in data: | ||
|
@@ -556,14 +561,16 @@ def _list_of_series_to_arrays(data, columns, coerce_float=False, dtype=None): | |
|
||
if values.dtype == np.object_: | ||
content = list(values.T) | ||
return _convert_object_array( | ||
content, columns, dtype=dtype, coerce_float=coerce_float | ||
) | ||
columns = _validate_or_indexify_columns(content, columns) | ||
content = _convert_object_array(content, dtype=dtype, coerce_float=coerce_float) | ||
return content, columns | ||
else: | ||
return values.T, columns | ||
|
||
|
||
def _list_of_dict_to_arrays(data, columns, coerce_float=False, dtype=None): | ||
def _list_of_dict_to_arrays( | ||
data, columns, coerce_float=False, dtype=None | ||
) -> Tuple[List[Scalar], Union[Index, List[Axis]]]: | ||
""" | ||
Convert list of dicts to numpy arrays | ||
|
||
|
@@ -595,22 +602,82 @@ def _list_of_dict_to_arrays(data, columns, coerce_float=False, dtype=None): | |
data = [(type(d) is dict) and d or dict(d) for d in data] | ||
|
||
content = list(lib.dicts_to_array(data, list(columns)).T) | ||
return _convert_object_array( | ||
content, columns, dtype=dtype, coerce_float=coerce_float | ||
) | ||
columns = _validate_or_indexify_columns(content, columns) | ||
content = _convert_object_array(content, dtype=dtype, coerce_float=coerce_float) | ||
return content, columns | ||
|
||
|
||
def _convert_object_array(content, columns, coerce_float=False, dtype=None): | ||
def _validate_or_indexify_columns( | ||
content: List, columns: Union[Index, List, None] | ||
) -> Union[Index, List[Axis]]: | ||
"""If columns is None, make numbers as column names; If not None, validate if | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. newline after There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. changed! |
||
columns are valid in length. | ||
|
||
Parameters | ||
---------- | ||
content: list of processed data records | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. what does "processed data records" mean here? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. sorry for misleading, just because this is not original input data (input data being transformed a bit), so i put that word. I will just use |
||
columns: Iterable or None | ||
|
||
Returns | ||
------- | ||
columns: If columns is Iterable, return as is; If columns is None, assign | ||
positional column index value as columns. | ||
|
||
Raises | ||
------ | ||
1. AssertionError: when columns is not a list of lists, and the length of columns | ||
is not equal to the length of content. | ||
2. ValueError: when columns is a list of lists, but the sub-lists differ in length. | ||
3. ValueError: when columns is a list of lists, but the length of a sub-list is not equal to the | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. can we say what kinds of exceptions are raised in these cases? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. changed! |
||
length of content | ||
""" | ||
if columns is None: | ||
columns = ibase.default_index(len(content)) | ||
else: | ||
if len(columns) != len(content): # pragma: no cover | ||
|
||
# Flag columns that are passed as a list of lists (nested list for MultiIndex) | ||
is_mi_list = isinstance(columns, list) and all( | ||
jreback marked this conversation as resolved.
Show resolved
Hide resolved
|
||
isinstance(col, list) for col in columns | ||
) | ||
|
||
if not is_mi_list and len(columns) != len(content): # pragma: no cover | ||
# caller's responsibility to check for this... | ||
raise AssertionError( | ||
f"{len(columns)} columns passed, passed data had " | ||
f"{len(content)} columns" | ||
) | ||
elif is_mi_list: | ||
|
||
# check if nested list column, length of each sub-list should be equal | ||
if len({len(col) for col in columns}) > 1: | ||
raise ValueError( | ||
"Length of columns passed for MultiIndex columns is different" | ||
) | ||
|
||
# if columns is not empty and the length of a sub-list is not equal to the length of content | ||
elif columns and len(columns[0]) != len(content): | ||
raise ValueError( | ||
f"{len(columns[0])} columns passed, passed data had " | ||
f"{len(content)} columns" | ||
) | ||
return columns | ||
|
||
|
||
def _convert_object_array( | ||
content: List[Scalar], coerce_float: bool = False, dtype: Optional[Dtype] = None | ||
) -> List[Scalar]: | ||
"""Internal function to convert object array. | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. newline after """ There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. changed! |
||
|
||
Parameters | ||
---------- | ||
content: list of processed data records | ||
coerce_float: bool, to coerce floats or not, default is False | ||
dtype: np.dtype, default is None | ||
|
||
Returns | ||
------- | ||
arrays: list of converted arrays | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. does "arrays" here mean There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. you mean the first |
||
""" | ||
# provide soft conversion of object dtypes | ||
def convert(arr): | ||
if dtype != object and dtype != np.object: | ||
|
@@ -620,7 +687,7 @@ def convert(arr): | |
|
||
arrays = [convert(arr) for arr in content] | ||
|
||
return arrays, columns | ||
return arrays | ||
|
||
|
||
# --------------------------------------------------------------------- | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -1063,6 +1063,32 @@ def test_constructor_list_of_lists(self): | |
result = DataFrame(data) | ||
tm.assert_frame_equal(result, expected) | ||
|
||
def test_constructor_list_like_data_nested_list_column(self): | ||
# GH 32173 | ||
arrays = [list("abcd"), list("cdef")] | ||
result = pd.DataFrame([[1, 2, 3, 4], [4, 5, 6, 7]], columns=arrays) | ||
|
||
mi = MultiIndex.from_arrays(arrays) | ||
expected = pd.DataFrame([[1, 2, 3, 4], [4, 5, 6, 7]], columns=mi) | ||
|
||
tm.assert_frame_equal(result, expected) | ||
|
||
def test_constructor_wrong_length_nested_list_column(self): | ||
# GH 32173 | ||
arrays = [list("abc"), list("cde")] | ||
|
||
msg = "3 columns passed, passed data had 4" | ||
with pytest.raises(ValueError, match=msg): | ||
DataFrame([[1, 2, 3, 4], [4, 5, 6, 7]], columns=arrays) | ||
|
||
def test_constructor_inequal_length_nested_list_column(self): | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. inequal -> unequal There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. corrected |
||
# GH 32173 | ||
arrays = [list("abcd"), list("cde")] | ||
|
||
msg = "Length of columns passed for MultiIndex columns is different" | ||
with pytest.raises(ValueError, match=msg): | ||
DataFrame([[1, 2, 3, 4], [4, 5, 6, 7]], columns=arrays) | ||
|
||
def test_constructor_sequence_like(self): | ||
# GH 3783 | ||
# collections.Sequence like | ||
|
Uh oh!
There was an error while loading. Please reload this page.