-
-
Notifications
You must be signed in to change notification settings - Fork 18.5k
ENH: Fix by
in DataFrame.plot.hist and DataFrame.plot.box
#28373
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from 174 commits
7e461a1
1314059
8bcb313
e36592c
b2f45a6
8b6e00a
dc0c2ec
d803938
33dd762
d59d642
66eb06c
ea267ad
8095224
31decc1
e4bdbd0
4033159
57a3bdf
8060223
d666334
3216d59
45f4b7f
2b0785b
d79dba3
eca597b
321fbd2
a7b9ae5
d2d13fd
5abedb6
d73115a
d7998bb
1bbf7ea
a279f45
2b793ea
04de066
4adc324
d0103a4
f94dbb4
0415cb0
525200b
1ab4310
c005880
a1fabc5
70453f1
b6579a5
e99f3dc
99d6d67
8e2fcf6
d02f4ac
947189c
6b5203d
27d0d21
48ff521
f39d948
5d1705c
90471aa
46a8031
57a96e6
29127f0
61bb97f
62fb9e6
638174b
02de005
5adb25d
7051432
adbde9f
5dfad18
abd10f3
c20d81a
07112c0
fb0b87c
a6a8e57
7f77f48
c09bb19
a120d27
f87afee
82711ee
60f7298
071488b
f2a0210
bb07e15
867094a
b0f06b2
111e89c
6472053
83ec868
d6c8566
6a0ac8d
49d0791
2bfbe78
c5d7518
03356ce
7abc47d
db832b4
be99a97
9ae5987
10c2ad1
ce8cfd4
627cc02
4bfbf03
ee8972d
12ff785
163f920
0839be2
142ee53
ef65137
d793703
2710cf2
f76d2cb
a5ecbd7
439be51
5fd420e
4b4832f
7425dff
8ab4b90
b06e454
79294ed
9523bb9
cd59370
050ba95
add406f
bb22c53
25214e6
aaa5c95
77e46f4
9de9c61
af68d2e
b75015a
b90303d
898fa9b
2ac32f5
f7bcdb7
aeb32e5
dc17959
4aee3e0
4eb466f
5160224
e2de0d3
b2b33ac
1199a93
c4a5842
6556414
826f277
891dc55
4c4a158
ea7e5b1
006588e
4f0a1dc
e1579e2
f2c141f
5f96abd
06483af
08f534d
f1c3a1f
e6e96d3
52e47f1
bc2f282
a43d3bb
ceeb3c5
4fea841
3ea2603
9f48139
b1094e3
97bde59
444a964
b66dad0
982f562
c76ad67
2c1aa33
6896546
3c54302
2d20178
a169dfd
d0b56ff
dec313c
143f286
283286f
f2a0736
f1aeee0
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -9,6 +9,7 @@ | |
from matplotlib.artist import Artist | ||
import numpy as np | ||
|
||
from pandas._typing import IndexLabel | ||
from pandas.errors import AbstractMethodError | ||
from pandas.util._decorators import cache_readonly | ||
|
||
|
@@ -38,10 +39,12 @@ | |
) | ||
|
||
import pandas.core.common as com | ||
from pandas.core.frame import DataFrame | ||
|
||
from pandas.io.formats.printing import pprint_thing | ||
from pandas.plotting._matplotlib.compat import mpl_ge_3_0_0 | ||
from pandas.plotting._matplotlib.converter import register_pandas_matplotlib_converters | ||
from pandas.plotting._matplotlib.groupby import reconstruct_data_with_by | ||
from pandas.plotting._matplotlib.style import get_standard_colors | ||
from pandas.plotting._matplotlib.timeseries import ( | ||
decorate_axes, | ||
|
@@ -99,7 +102,7 @@ def __init__( | |
self, | ||
data, | ||
kind=None, | ||
by=None, | ||
by: IndexLabel | None = None, | ||
subplots=False, | ||
sharex=None, | ||
sharey=False, | ||
|
@@ -124,13 +127,42 @@ def __init__( | |
table=False, | ||
layout=None, | ||
include_bool=False, | ||
column: IndexLabel | None = None, | ||
**kwds, | ||
): | ||
|
||
import matplotlib.pyplot as plt | ||
|
||
self.data = data | ||
self.by = by | ||
|
||
# if users assign an empty list or tuple, treat them as None | ||
# then no group-by will be conducted. | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. why is by allowed to be an empty list/tuple? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. you are right. I initially thought that is the behaviour of current I also changed the inline comment above this line and align with the change, and also add tests to reflect the changes. |
||
if by in ([], ()): | ||
by = None | ||
self.by = com.maybe_make_list(by) | ||
|
||
# Assign the rest of columns into self.columns if by is explicitly defined | ||
# while column is not, only need `columns` in hist/box plot when it's DF | ||
# TODO: Might deprecate `column` argument in future PR (#28373) | ||
if isinstance(data, DataFrame): | ||
if column: | ||
self.columns = com.maybe_make_list(column) | ||
else: | ||
if self.by is None: | ||
self.columns = [ | ||
col for col in data.columns if is_numeric_dtype(data[col]) | ||
] | ||
else: | ||
self.columns = [ | ||
col | ||
for col in data.columns | ||
if col not in self.by and is_numeric_dtype(data[col]) | ||
] | ||
|
||
# For `hist` plot, need to get grouped original data before `self.data` is | ||
# updated later | ||
if self.by is not None and self._kind == "hist": | ||
self._grouped = data.groupby(self.by) | ||
|
||
self.kind = kind | ||
|
||
|
@@ -139,7 +171,9 @@ def __init__( | |
self.subplots = subplots | ||
|
||
if sharex is None: | ||
if ax is None: | ||
|
||
jreback marked this conversation as resolved.
Show resolved
Hide resolved
|
||
# if by is defined, subplots are used and sharex should be False | ||
if ax is None and by is None: | ||
self.sharex = True | ||
else: | ||
# if we get an axis, the users should do the visibility | ||
|
@@ -273,8 +307,15 @@ def _iter_data(self, data=None, keep_index=False, fillna=None): | |
|
||
@property | ||
def nseries(self) -> int: | ||
|
||
# When `by` is explicitly assigned, grouped data size will be defined, and | ||
# this will determine number of subplots to have, aka `self.nseries` | ||
if self.data.ndim == 1: | ||
return 1 | ||
elif self.by is not None and self._kind == "hist": | ||
return len(self._grouped) | ||
elif self.by is not None and self._kind == "box": | ||
return len(self.columns) | ||
else: | ||
return self.data.shape[1] | ||
|
||
|
@@ -420,6 +461,14 @@ def _compute_plot_data(self): | |
if label is None and data.name is None: | ||
label = "None" | ||
data = data.to_frame(name=label) | ||
elif self._kind in ("hist", "box"): | ||
cols = self.columns if self.by is None else self.columns + self.by | ||
data = data.loc[:, cols] | ||
|
||
# GH15079 reconstruct data if by is defined | ||
if self.by is not None: | ||
self.subplots = True | ||
data = reconstruct_data_with_by(self.data, by=self.by, cols=self.columns) | ||
|
||
# GH16953, _convert is needed as fallback, for ``Series`` | ||
# with ``dtype == object`` | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,132 @@ | ||
from __future__ import annotations | ||
|
||
import numpy as np | ||
|
||
from pandas._typing import ( | ||
Dict, | ||
IndexLabel, | ||
) | ||
|
||
from pandas.core.dtypes.missing import remove_na_arraylike | ||
|
||
from pandas import ( | ||
DataFrame, | ||
MultiIndex, | ||
Series, | ||
concat, | ||
) | ||
|
||
|
||
def create_iter_data_given_by( | ||
data: DataFrame, kind: str = "hist" | ||
) -> Dict[str, DataFrame | Series]: | ||
""" | ||
Create data for iteration given `by` is assigned or not, and it is only | ||
used in both hist and boxplot. | ||
|
||
If `by` is assigned, return a dictionary of DataFrames in which the key of | ||
dictionary is the values in groups. | ||
If `by` is not assigned, return input as is, and this preserves current | ||
status of iter_data. | ||
|
||
Parameters | ||
---------- | ||
data : reformatted grouped data from `_compute_plot_data` method. | ||
kind : str, plot kind. This function is only used for `hist` and `box` plots. | ||
|
||
Returns | ||
------- | ||
iter_data : DataFrame or Dictionary of DataFrames | ||
|
||
Examples | ||
-------- | ||
If `by` is assigned: | ||
|
||
>>> import numpy as np | ||
>>> tuples = [('h1', 'a'), ('h1', 'b'), ('h2', 'a'), ('h2', 'b')] | ||
>>> mi = MultiIndex.from_tuples(tuples) | ||
>>> value = [[1, 3, np.nan, np.nan], | ||
... [3, 4, np.nan, np.nan], [np.nan, np.nan, 5, 6]] | ||
>>> data = DataFrame(value, columns=mi) | ||
>>> create_iter_data_given_by(data) | ||
{'h1': DataFrame({'a': [1, 3, np.nan], 'b': [3, 4, np.nan]}), | ||
'h2': DataFrame({'a': [np.nan, np.nan, 5], 'b': [np.nan, np.nan, 6]})} | ||
""" | ||
|
||
# For `hist` plot, before transformation, the values in level 0 are values | ||
# in groups and subplot titles, and later used for column subselection and | ||
# iteration; For `box` plot, values in level 1 are column names to show, | ||
# and are used for iteration and as subplots titles. | ||
if kind == "hist": | ||
level = 0 | ||
elif kind == "box": | ||
level = 1 | ||
datapythonista marked this conversation as resolved.
Show resolved
Hide resolved
|
||
else: | ||
raise ValueError( | ||
f"create_iter_data_given_by can only be used with " | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. is this hit in a test? this looks internal yes? or is it user facing There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. yes, this is just an internal function, and only used in There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. The point of the comment was if we should add a test, to make sure the exception is raised as expected. Probably not needed, but feel free to add. But better don't remove this. If this is ever called for the wrong plot, better to have this clear message, that one about level not being defined. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I agree! Will change later and maybe add a small test for this also |
||
f"kind 'hist' and 'box' plots, but used with '{kind}'" | ||
) | ||
|
||
# Select sub-columns based on the value of level of MI, and if `by` is | ||
# assigned, data must be a MI DataFrame | ||
assert isinstance(data.columns, MultiIndex) | ||
return { | ||
col: data.loc[:, data.columns.get_level_values(level) == col] | ||
for col in data.columns.levels[level] | ||
} | ||
|
||
|
||
def reconstruct_data_with_by( | ||
data: DataFrame, by: IndexLabel, cols: IndexLabel | ||
) -> DataFrame: | ||
""" | ||
Internal function to group data, and reassign multiindex column names onto the | ||
result in order to let grouped data be used in _compute_plot_data method. | ||
|
||
Parameters | ||
---------- | ||
data : Original DataFrame to plot | ||
by : grouped `by` parameter selected by users | ||
cols : columns of data set (excluding columns used in `by`) | ||
|
||
Returns | ||
------- | ||
Output is the reconstructed DataFrame with MultiIndex columns. The first level | ||
of MI is unique values of groups, and second level of MI is the columns | ||
selected by users. | ||
|
||
Examples | ||
-------- | ||
>>> d = {'h': ['h1', 'h1', 'h2'], 'a': [1, 3, 5], 'b': [3, 4, 6]} | ||
>>> df = DataFrame(d) | ||
>>> reconstruct_data_with_by(df, by='h', cols=['a', 'b']) | ||
h1 h2 | ||
a b a b | ||
0 1 3 NaN NaN | ||
1 3 4 NaN NaN | ||
2 NaN NaN 5 6 | ||
""" | ||
grouped = data.groupby(by) | ||
|
||
data_list = [] | ||
for key, group in grouped: | ||
columns = MultiIndex.from_product([[key], cols]) | ||
sub_group = group[cols] | ||
sub_group.columns = columns | ||
data_list.append(sub_group) | ||
|
||
data = concat(data_list, axis=1) | ||
return data | ||
|
||
|
||
def reformat_hist_y_given_by( | ||
y: Series | np.ndarray, by: IndexLabel | None | ||
) -> Series | np.ndarray: | ||
"""Internal function to reformat y given `by` is applied or not for hist plot. | ||
|
||
If by is None, input y is 1-d with NaN removed; and if by is not None, groupby | ||
will take place and input y is multi-dimensional array. | ||
""" | ||
if by is not None and len(y.shape) > 1: | ||
return np.array([remove_na_arraylike(col) for col in y.T]).T | ||
return remove_na_arraylike(y) |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
was by allowed before?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
we do allow
by
, but don't do anything on that. i will also change toversionchanged