Skip to content

ENH: Fix by in DataFrame.plot.hist and DataFrame.plot.box #28373

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 178 commits into from
Jul 12, 2021
Merged
Show file tree
Hide file tree
Changes from 71 commits
Commits
Show all changes
178 commits
Select commit Hold shift + click to select a range
7e461a1
remove \n from docstring
charlesdong1991 Dec 3, 2018
1314059
fix conflicts
charlesdong1991 Jan 19, 2019
8bcb313
Merge remote-tracking branch 'upstream/master'
charlesdong1991 Jul 30, 2019
e36592c
Merge remote-tracking branch 'upstream/master' into fix_by_plot
charlesdong1991 Sep 10, 2019
b2f45a6
fix by in hist
charlesdong1991 Sep 10, 2019
8b6e00a
make plot work
charlesdong1991 Sep 10, 2019
dc0c2ec
add _group_plot function
charlesdong1991 Sep 10, 2019
d803938
check function
charlesdong1991 Sep 10, 2019
33dd762
reformat
charlesdong1991 Sep 10, 2019
d59d642
put import up
charlesdong1991 Sep 10, 2019
66eb06c
add comments
charlesdong1991 Sep 10, 2019
ea267ad
Mimic group plot
charlesdong1991 Sep 10, 2019
8095224
fix import failure
charlesdong1991 Sep 10, 2019
31decc1
reformat
charlesdong1991 Sep 10, 2019
e4bdbd0
fix test
charlesdong1991 Sep 10, 2019
4033159
hacky fix
charlesdong1991 Sep 10, 2019
57a3bdf
fix isrot
charlesdong1991 Sep 10, 2019
8060223
fix tests
charlesdong1991 Sep 11, 2019
d666334
fix import failure
charlesdong1991 Sep 11, 2019
3216d59
fix import error
charlesdong1991 Sep 11, 2019
45f4b7f
Update imports
charlesdong1991 Sep 11, 2019
2b0785b
test imports
charlesdong1991 Sep 11, 2019
d79dba3
new change
charlesdong1991 Sep 12, 2019
eca597b
fix conflict
charlesdong1991 Sep 12, 2019
321fbd2
restore removed line
charlesdong1991 Sep 12, 2019
a7b9ae5
Remove unused line
charlesdong1991 Sep 12, 2019
d2d13fd
Disruptive change
charlesdong1991 Sep 12, 2019
5abedb6
should work this time
charlesdong1991 Sep 13, 2019
d73115a
Add in-code comments
charlesdong1991 Sep 13, 2019
d7998bb
remove print
charlesdong1991 Sep 13, 2019
1bbf7ea
reformat
charlesdong1991 Sep 13, 2019
a279f45
Dropna
charlesdong1991 Sep 13, 2019
2b793ea
Add isna for multi column
charlesdong1991 Sep 14, 2019
04de066
try to remove warning
charlesdong1991 Sep 15, 2019
4adc324
test if removing pd works
charlesdong1991 Sep 16, 2019
d0103a4
revert changes
charlesdong1991 Sep 16, 2019
f94dbb4
try if warning gone
charlesdong1991 Sep 16, 2019
0415cb0
try again
charlesdong1991 Sep 17, 2019
525200b
merge master and fix conflict
charlesdong1991 Dec 23, 2019
1ab4310
merge master and fix conflict
charlesdong1991 Jan 4, 2020
c005880
fix conflict and merge master
charlesdong1991 Jan 4, 2020
a1fabc5
Fix linting error
charlesdong1991 Jan 4, 2020
70453f1
Add test
charlesdong1991 Jan 4, 2020
b6579a5
remove unused code
charlesdong1991 Jan 4, 2020
e99f3dc
add test and make code more robust
charlesdong1991 Jan 4, 2020
99d6d67
remove comment
charlesdong1991 Jan 4, 2020
8e2fcf6
clean the code
charlesdong1991 Jan 4, 2020
d02f4ac
simplify code
charlesdong1991 Jan 4, 2020
947189c
simplify code
charlesdong1991 Jan 4, 2020
6b5203d
fix linting
charlesdong1991 Jan 4, 2020
27d0d21
Add doc for hist
charlesdong1991 Jan 4, 2020
48ff521
revert change
charlesdong1991 Jan 4, 2020
f39d948
fix warning
charlesdong1991 Jan 4, 2020
5d1705c
isort
charlesdong1991 Jan 4, 2020
90471aa
Merge remote-tracking branch 'upstream/master' into fix_by_plot
charlesdong1991 Jan 11, 2020
46a8031
simplify code
charlesdong1991 Jan 11, 2020
57a96e6
simpler python
charlesdong1991 Jan 11, 2020
29127f0
remove unused
charlesdong1991 Jan 11, 2020
61bb97f
restore blank lines
charlesdong1991 Jan 11, 2020
62fb9e6
Add extensive tests
charlesdong1991 Jan 11, 2020
638174b
fix seed
charlesdong1991 Jan 11, 2020
02de005
Merge remote-tracking branch 'upstream/master' into fix_by_plot
charlesdong1991 Jan 15, 2020
5adb25d
code change based on reviews
charlesdong1991 Jan 15, 2020
7051432
fix linting
charlesdong1991 Jan 15, 2020
adbde9f
update doc
charlesdong1991 Jan 15, 2020
5dfad18
merge master and fix conflict
charlesdong1991 Feb 10, 2020
abd10f3
code change based on reviews
charlesdong1991 Feb 10, 2020
c20d81a
fixup
charlesdong1991 Feb 10, 2020
07112c0
fixup
charlesdong1991 Feb 10, 2020
fb0b87c
code change on reviews
charlesdong1991 Feb 11, 2020
a6a8e57
fix isort
charlesdong1991 Feb 11, 2020
7f77f48
short code
charlesdong1991 Feb 12, 2020
c09bb19
Merge remote-tracking branch 'upstream/master' into fix_by_plot
charlesdong1991 Feb 19, 2020
a120d27
simpler python
charlesdong1991 Feb 23, 2020
f87afee
add inline comment
charlesdong1991 Feb 23, 2020
82711ee
simplier pandas
charlesdong1991 Feb 23, 2020
60f7298
code change on JR review
charlesdong1991 Feb 23, 2020
071488b
fix linting
charlesdong1991 Feb 23, 2020
f2a0210
rebase and fix conflict
charlesdong1991 Feb 27, 2020
bb07e15
Merge remote-tracking branch 'upstream/master' into fix_by_plot
charlesdong1991 Mar 8, 2020
867094a
code change on reviews
charlesdong1991 Mar 8, 2020
b0f06b2
Add docstring
charlesdong1991 Mar 8, 2020
111e89c
fix typo
charlesdong1991 Mar 15, 2020
6472053
Merge remote-tracking branch 'upstream/master' into fix_by_plot
charlesdong1991 Mar 15, 2020
83ec868
remove blank
charlesdong1991 Mar 15, 2020
d6c8566
use more meaningful example
charlesdong1991 Mar 15, 2020
6a0ac8d
keep as is
charlesdong1991 Mar 15, 2020
49d0791
remove less useful comment
charlesdong1991 Mar 15, 2020
2bfbe78
change figsize
charlesdong1991 Mar 15, 2020
c5d7518
clean iter_data
charlesdong1991 Apr 4, 2020
03356ce
remove unused docs
charlesdong1991 Apr 4, 2020
7abc47d
cleaner pandas
charlesdong1991 Apr 4, 2020
db832b4
cleaner
charlesdong1991 Apr 4, 2020
be99a97
Merge remote-tracking branch 'upstream/master' into fix_by_plot
charlesdong1991 Apr 4, 2020
9ae5987
fixup
charlesdong1991 Apr 4, 2020
10c2ad1
rename
charlesdong1991 Apr 4, 2020
ce8cfd4
code change on reviews
charlesdong1991 Apr 7, 2020
627cc02
fixup
charlesdong1991 Apr 7, 2020
4bfbf03
rebase and resolve conflicts
charlesdong1991 Apr 10, 2020
ee8972d
linting
charlesdong1991 Apr 10, 2020
12ff785
Merge remote-tracking branch 'upstream/master' into fix_by_plot
charlesdong1991 Apr 10, 2020
163f920
Merge remote-tracking branch 'upstream/master' into fix_by_plot
charlesdong1991 Apr 15, 2020
0839be2
annotation
charlesdong1991 May 1, 2020
142ee53
annotation
charlesdong1991 May 1, 2020
ef65137
Merge remote-tracking branch 'upstream/master' into fix_by_plot
charlesdong1991 May 1, 2020
d793703
reverse change on annotation
charlesdong1991 May 5, 2020
2710cf2
fixup
charlesdong1991 May 5, 2020
f76d2cb
remove
charlesdong1991 May 5, 2020
a5ecbd7
add missing annoatation
charlesdong1991 May 11, 2020
439be51
Merge remote-tracking branch 'upstream/master' into fix_by_plot
charlesdong1991 May 11, 2020
5fd420e
Merge remote-tracking branch 'upstream/master' into fix_by_plot
charlesdong1991 May 11, 2020
4b4832f
Merge remote-tracking branch 'upstream/master' into fix_by_plot
charlesdong1991 May 18, 2020
7425dff
code change on WA review
charlesdong1991 May 21, 2020
8ab4b90
Merge remote-tracking branch 'upstream/master' into fix_by_plot
charlesdong1991 Jun 17, 2020
b06e454
solve mypy
charlesdong1991 Jun 19, 2020
79294ed
fix typo
charlesdong1991 Jun 19, 2020
9523bb9
Merge remote-tracking branch 'upstream/master' into fix_by_plot
charlesdong1991 Jun 19, 2020
cd59370
Merge remote-tracking branch 'upstream/master' into fix_by_plot
charlesdong1991 Jun 20, 2020
050ba95
Merge remote-tracking branch 'upstream/master' into fix_by_plot
charlesdong1991 Jun 20, 2020
add406f
code change on reviews
charlesdong1991 Jun 23, 2020
bb22c53
fix linting
charlesdong1991 Jun 23, 2020
25214e6
rename
charlesdong1991 Jun 27, 2020
aaa5c95
Merge remote-tracking branch 'upstream/master' into fix_by_plot
charlesdong1991 Jun 27, 2020
77e46f4
modulize reformat_y for hist
charlesdong1991 Jun 27, 2020
9de9c61
better annotation
charlesdong1991 Jun 27, 2020
af68d2e
improve annotation
charlesdong1991 Jun 27, 2020
b75015a
fix linting
charlesdong1991 Jun 27, 2020
b90303d
improve docstring
charlesdong1991 Jun 27, 2020
898fa9b
rebase
charlesdong1991 May 21, 2021
2ac32f5
remove added test file
charlesdong1991 May 21, 2021
f7bcdb7
revert change
charlesdong1991 May 21, 2021
aeb32e5
rebase
charlesdong1991 May 21, 2021
dc17959
fixup
charlesdong1991 May 21, 2021
4aee3e0
black
charlesdong1991 May 21, 2021
4eb466f
fixup
charlesdong1991 May 21, 2021
5160224
fix mypy
charlesdong1991 May 22, 2021
e2de0d3
fix mypy
charlesdong1991 May 22, 2021
b2b33ac
fix mypy
charlesdong1991 May 22, 2021
1199a93
fix mypy
charlesdong1991 May 22, 2021
c4a5842
fix mypy
charlesdong1991 May 22, 2021
6556414
fix mypy
charlesdong1991 May 22, 2021
826f277
fix flake8
charlesdong1991 May 22, 2021
891dc55
add by support for boxplot
charlesdong1991 May 22, 2021
4c4a158
doc
charlesdong1991 May 22, 2021
ea7e5b1
Add tests
charlesdong1991 May 22, 2021
006588e
flake8
charlesdong1991 May 22, 2021
4f0a1dc
move file
charlesdong1991 May 22, 2021
e1579e2
pprint label
charlesdong1991 May 24, 2021
f2c141f
Merge remote-tracking branch 'upstream/master' into fix_by_plot
charlesdong1991 May 27, 2021
5f96abd
rebase and fix conflict
charlesdong1991 May 31, 2021
06483af
Merge remote-tracking branch 'upstream/master' into fix_by_plot
charlesdong1991 Jun 1, 2021
08f534d
rebase fix conflicts
charlesdong1991 Jun 3, 2021
f1c3a1f
Resolve conflicts and adopt all feedbacks from Marc
charlesdong1991 Jun 29, 2021
e6e96d3
parametrize tests
charlesdong1991 Jun 29, 2021
52e47f1
Fix test
charlesdong1991 Jun 29, 2021
bc2f282
Code changes based on Marc reviews
charlesdong1991 Jun 30, 2021
a43d3bb
Merge remote-tracking branch 'upstream/master' into fix_by_plot
charlesdong1991 Jun 30, 2021
ceeb3c5
update doc
charlesdong1991 Jun 30, 2021
4fea841
version change
charlesdong1991 Jun 30, 2021
3ea2603
Use self.by
charlesdong1991 Jun 30, 2021
9f48139
better code
charlesdong1991 Jun 30, 2021
b1094e3
better inline comment
charlesdong1991 Jun 30, 2021
97bde59
code changes based on Marc reviews
charlesdong1991 Jun 30, 2021
444a964
minor fix
charlesdong1991 Jun 30, 2021
b66dad0
mypy
charlesdong1991 Jun 30, 2021
982f562
add future annotation
charlesdong1991 Jun 30, 2021
c76ad67
fix pre commit
charlesdong1991 Jun 30, 2021
2c1aa33
minor experimental fix
charlesdong1991 Jun 30, 2021
6896546
better doc string
charlesdong1991 Jun 30, 2021
3c54302
fixup doc fail
charlesdong1991 Jul 1, 2021
2d20178
code change on Macro reviews
charlesdong1991 Jul 1, 2021
a169dfd
Add more tests
charlesdong1991 Jul 1, 2021
d0b56ff
fixup
charlesdong1991 Jul 1, 2021
dec313c
code change on reviews
charlesdong1991 Jul 10, 2021
143f286
changes based on Jeff review
charlesdong1991 Jul 12, 2021
283286f
doc
charlesdong1991 Jul 12, 2021
f2a0736
Merge remote-tracking branch 'upstream/master' into fix_by_plot
charlesdong1991 Jul 12, 2021
f1aeee0
fix flake8
charlesdong1991 Jul 12, 2021
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions doc/source/whatsnew/v1.1.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -190,6 +190,8 @@ I/O
Plotting
^^^^^^^^

-
- Implement ``by`` argument for :meth:`DataFrame.plot.hist` (:issue:`15079`)
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

move to other enhancements section

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

moved

- :func:`.plot` for line/bar now accepts color by dictonary (:issue:`8193`).
-

Expand Down
10 changes: 10 additions & 0 deletions pandas/plotting/_core.py
Original file line number Diff line number Diff line change
Expand Up @@ -1209,6 +1209,16 @@ def hist(self, by=None, bins=10, **kwargs):
... columns = ['one'])
>>> df['two'] = df['one'] + np.random.randint(1, 7, 6000)
>>> ax = df.plot.hist(bins=12, alpha=0.5)

If `by` is defined, a grouped hist plot is generated:
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

add a versionadded tag 1.1

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

yeah, added above in parameters section!

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

is context on what by can be (str, list of str?)

Copy link
Member Author

@charlesdong1991 charlesdong1991 Feb 23, 2020

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

it is already defined lines above in Parameters section, this is Example section, but I added some words to address it, and add one more example to demonstrate different scenarios


.. plot::
:context: close-figs

>>> np.random.seed(159753)
>>> df = pd.DataFrame(np.random.randn(30, 2), columns=['A', 'B'])
>>> df['C'] = np.random.choice(['a', 'b', 'c'], 30)
>>> ax = df.plot.hist(column=['A', 'B'], by=['C'], figsize=(8, 10))
"""
return self(kind="hist", by=by, bins=bins, **kwargs)

Expand Down
46 changes: 39 additions & 7 deletions pandas/plotting/_matplotlib/core.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,9 @@
)
from pandas.core.dtypes.missing import isna, notna

from pandas import MultiIndex
import pandas.core.common as com
from pandas.core.reshape.concat import concat
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

can include with the from pandas import

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

changed!


from pandas.io.formats.printing import pprint_thing
from pandas.plotting._matplotlib.compat import _mpl_ge_3_0_0
Expand Down Expand Up @@ -102,13 +104,15 @@ def __init__(
table=False,
layout=None,
include_bool=False,
column=None,
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Can you annotate this? I think should just be Label

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

added

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Missing a push? I don't see the change here

**kwds,
):

import matplotlib.pyplot as plt

self.data = data
self.by = by
self.column = [column] if not isinstance(column, list) else column
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Minor but I think this should be called self.columns since we are dealing with a collection

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

i agree, but here the reason i chose column was for consistency reason, now it is aligned with argument name we already have df.hist() where it uses column

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I agree with @WillAyd here. I see your point. But column can be a single column or None. This is always a list, so self.columns is better.

When column is None, this will be [None], while I guess should be []. Can you have a look and see if anything should be changed?


self.kind = kind

Expand All @@ -117,7 +121,9 @@ def __init__(
self.subplots = subplots

if sharex is None:
if ax is None:

# if by is defined, subplots are used and sharex should be False
if ax is None and by is None:
self.sharex = True
else:
# if we get an axis, the users should do the visibility
Expand Down Expand Up @@ -240,18 +246,30 @@ def _iter_data(self, data=None, keep_index=False, fillna=None):
if fillna is not None:
data = data.fillna(fillna)

for col, values in data.items():
if keep_index is True:
yield col, values
else:
yield col, values.values
if self.by is None:
for col, values in data.items():
if keep_index is True:
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Same comment as @datapythonista below

yield col, values
else:
yield col, values.values
else:
cols = data.columns.get_level_values(0).unique()
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

So we only plot the first level of a multi index then? Is this limitation documented anywhere?

Copy link
Member Author

@charlesdong1991 charlesdong1991 Feb 11, 2020

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

emm, I would not call it a limitation, this branch will not be reached even dataset is a MI dataframe while by is not defined (None). The reason to have it is to separate the groups caused by by argument, and produce iterated data for hist to plot, all levels in MI will be plotted, could check it in the tests.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

OK I think I understand. Doesn't this require by and the first level of the columns to be the same though?

I think this can be more cleary written if you groupby the by argument. If you do that you can then put the for col, val in data.items() loop into a function and just apply that function along the groupby object

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@WillAyd thanks very much for your comment, appreciate a lot!

this data here in _iter_data has been grouped, could reference to the patch i add here in compute_plot_data:

# GH15079 restructure data if by is defined
        if self.by is not None:
            self.subplots = True
            grouped = data.groupby(self.by)

            data_list = []
            for key, group in grouped:
                columns = MultiIndex.from_product([[key], self.column])
                sub_group = group[self.column]
                sub_group.columns = columns
                data_list.append(sub_group)

            data = concat(data_list, axis=1)

And here, in _iter_data, we need to sub-select columns based on the first level of MI, i simplified this block in _iter_data again to avoid repeatedly use the for loop, and add comment to clarify it, feel free to give another look! thanks!


for col in cols:
data_values = data.loc[:, data.columns.get_level_values(0) == col]
if keep_index is True:
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
if keep_index is True:
if keep_index:

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

changed!

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This looks still outstanding?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

oops, shame on me, i thought i removed :(

yield col, data_values
else:
yield col, data_values.values

@property
def nseries(self):
if self.data.ndim == 1:
return 1
else:
elif self.by is None:
return self.data.shape[1]
else:
return len(set(self.data.columns.get_level_values(0)))

def draw(self):
self.plt.draw_if_interactive()
Expand Down Expand Up @@ -378,6 +396,20 @@ def _compute_plot_data(self):
label = "None"
data = data.to_frame(name=label)

# GH15079 restructure data if by is defined
if self.by is not None:
self.subplots = True
grouped = data.groupby(self.by)

data_list = []
for key, group in grouped:
columns = MultiIndex.from_product([[key], self.column])
sub_group = group[self.column]
sub_group.columns = columns
data_list.append(sub_group)

data = concat(data_list, axis=1)
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I know in all the other transformations inside this function it's done inline, and you're being consistent. But this is somehow complex, I think it should be in a separate function, so it's clear what's the input, what's the output, we can add a small docstring explaining and with an example. And most importantly, it'll be easier to test properly.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

yeah, okay, I moved it out as an internal function to transform the data. However, i found it hard to test, so I added a couple inline comments and docstrings, please let me know if there is a good way to test this function out! @datapythonista


# GH16953, _convert is needed as fallback, for ``Series``
# with ``dtype == object``
data = data._convert(datetime=True, timedelta=True)
Expand Down
65 changes: 53 additions & 12 deletions pandas/plotting/_matplotlib/hist.py
Original file line number Diff line number Diff line change
@@ -1,10 +1,13 @@
from typing import Union

import numpy as np

from pandas.core.dtypes.common import is_integer, is_list_like
from pandas.core.dtypes.generic import ABCDataFrame, ABCIndexClass
from pandas.core.dtypes.missing import isna, remove_na_arraylike

import pandas.core.common as com
from pandas.core.series import Series

from pandas.io.formats.printing import pprint_thing
from pandas.plotting._matplotlib.core import LinePlot, MPLPlot
Expand All @@ -21,22 +24,38 @@ def __init__(self, data, bins=10, bottom=0, **kwargs):
MPLPlot.__init__(self, data, **kwargs)

def _args_adjust(self):

# calculate bin number separately in different subplots
# where subplots are created based on by argument
if is_integer(self.bins):
# create common bin edge
values = self.data._convert(datetime=True)._get_numeric_data()
values = np.ravel(values)
values = values[~isna(values)]

_, self.bins = np.histogram(
values,
bins=self.bins,
range=self.kwds.get("range", None),
weights=self.kwds.get("weights", None),
)
if self.by is None:
self.bins = self._caculcate_bins(self.data)

else:
grouped = self.data.groupby(self.by)[self.column]
bins_list = []
for key, group in grouped:
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

use a list-comprehension

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

indeed!

bins_list.append(self._caculcate_bins(group))
self.bins = bins_list

if is_list_like(self.bottom):
self.bottom = np.array(self.bottom)

def _caculcate_bins(self, data: ABCDataFrame) -> np.array:
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

There is a typo, caculcate...

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

thanks! corrected!

"""Calculate bins given data"""

values = data._convert(datetime=True)._get_numeric_data()
values = np.ravel(values)
values = values[~isna(values)]

hist, bins = np.histogram(
values,
bins=self.bins,
range=self.kwds.get("range", None),
weights=self.kwds.get("weights", None),
)
return bins

@classmethod
def _plot(
cls,
Expand All @@ -51,7 +70,6 @@ def _plot(
):
if column_num == 0:
cls._initialize_stacker(ax, stacking_id, len(bins) - 1)
y = y[~isna(y)]

base = np.zeros(len(bins) - 1)
bottom = bottom + cls._get_stacked_values(ax, stacking_id, base, kwds["label"])
Expand All @@ -77,9 +95,32 @@ def _make_plot(self):
kwds["style"] = style

kwds = self._make_plot_keywords(kwds, y)

# the bins is multi-dimension array now and each plot need only 1-d and
# when by is applied, label should be columns that are grouped
if self.by is not None:
kwds["bins"] = kwds["bins"][i]
kwds["label"] = self.column
kwds.pop("color")

y = self._reformat_y(y)
artists = self._plot(ax, y, column_num=i, stacking_id=stacking_id, **kwds)

# when by is applied, show title for subplots to know which group it is
if self.by is not None:
ax.set_title(pprint_thing(label))

self._add_legend_handle(artists[0], label, index=i)

def _reformat_y(self, y: Union[Series, np.array]) -> Union[Series, np.array]:
"""Internal function to reformat y given `by` is applied or not."""
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Can you clarify this? I think known from the function name that it is internal to reformat y, but curious what the existence of by should change?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

sure, main difference is the dimension of y will be different, if by is None, y is 1-d array, while if by is not None, which means group by will happen, and y here is multi-dimension array.

i will change to a more detailed description for this.

if self.by is not None and len(y.shape) > 1:
notna = [col[~isna(col)] for col in y.T]
y = np.array(np.array(notna).T)
else:
y = y[~isna(y)]
return y

def _make_plot_keywords(self, kwds, y):
"""merge BoxPlot/KdePlot properties to passed kwds"""
# y is required for KdePlot
Expand Down
97 changes: 97 additions & 0 deletions pandas/tests/plotting/test_frame.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@

from datetime import date, datetime
import itertools
import re
import string
import warnings

Expand All @@ -25,6 +26,15 @@
import pandas.plotting as plotting


@pytest.fixture(scope="module")
def test_hist_df():
np.random.seed(0)
df = DataFrame(np.random.randn(30, 2), columns=["A", "B"])
df["C"] = np.random.choice(["a", "b", "c"], 30)
df["D"] = np.random.choice(["a", "b", "c"], 30)
return df


@td.skip_if_no_mpl
class TestDataFramePlots(TestPlotBase):
def setup_method(self, method):
Expand Down Expand Up @@ -3256,6 +3266,93 @@ def test_subplots_sharex_false(self):
tm.assert_numpy_array_equal(axs[0].get_xticks(), expected_ax1)
tm.assert_numpy_array_equal(axs[1].get_xticks(), expected_ax2)

@pytest.mark.parametrize("by", ["C", ["C", "D"]])
@pytest.mark.parametrize("column", ["A", ["A", "B"]])
def test_hist_plot_by_argument(self, by, column, test_hist_df):
# GH 15079
_check_plot_works(test_hist_df.plot.hist, column=column, by=by)

@pytest.mark.slow
@pytest.mark.parametrize(
"by, column, layout, axes_num",
[
(["C"], "A", (2, 2), 3),
("C", "A", (2, 2), 3),
(["C"], ["A"], (1, 3), 3),
("C", ["A", "B"], (3, 1), 3),
(["C", "D"], "A", (9, 1), 9),
(["C", "D"], "A", (3, 3), 9),
(["C", "D"], ["A"], (5, 2), 9),
(["C", "D"], ["A", "B"], (9, 1), 9),
(["C", "D"], ["A", "B"], (5, 2), 9),
],
)
def test_hist_plot_layout_with_by(self, by, column, layout, axes_num, test_hist_df):
# GH 15079
# _check_plot_works adds an ax so catch warning. see GH #13188
with tm.assert_produces_warning(UserWarning):
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

What warning is this throwing?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

_check_plot_works adds an ax so catch warning. see GH #13188, and this is the same as we have in test_hist_method

axes = _check_plot_works(
test_hist_df.plot.hist, column=column, by=by, layout=layout
)
self._check_axes_shape(axes, axes_num=axes_num, layout=layout)

def test_hist_plot_invalid_layout_with_by_raises(self, test_hist_df):
# GH 15079, test if error is raised when invalid layout is given

# layout too small for all 3 plots
msg = "larger than required size"
with pytest.raises(ValueError, match=msg):
test_hist_df.plot.hist(column=["A", "B"], by="C", layout=(1, 1))

# invalid format for layout
msg = re.escape("Layout must be a tuple of (rows, columns)")
with pytest.raises(ValueError, match=msg):
test_hist_df.plot.hist(column=["A", "B"], by="C", layout=(1,))

msg = "At least one dimension of layout must be positive"
with pytest.raises(ValueError, match=msg):
test_hist_df.plot.hist(column=["A", "B"], by="C", layout=(-1, -1))

@pytest.mark.slow
def test_axis_share_x_with_by(self, test_hist_df):
# GH 15079
ax1, ax2, ax3 = test_hist_df.plot.hist(column="A", by="C", sharex=True)

# share x
assert ax1._shared_x_axes.joined(ax1, ax2)
assert ax2._shared_x_axes.joined(ax1, ax2)
assert ax3._shared_x_axes.joined(ax1, ax3)
assert ax3._shared_x_axes.joined(ax2, ax3)

# don't share y
assert not ax1._shared_y_axes.joined(ax1, ax2)
assert not ax2._shared_y_axes.joined(ax1, ax2)
assert not ax3._shared_y_axes.joined(ax1, ax3)
assert not ax3._shared_y_axes.joined(ax2, ax3)

@pytest.mark.slow
def test_axis_share_y_with_by(self, test_hist_df):
# GH 15079
ax1, ax2, ax3 = test_hist_df.plot.hist(column="A", by="C", sharey=True)

# share y
assert ax1._shared_y_axes.joined(ax1, ax2)
assert ax2._shared_y_axes.joined(ax1, ax2)
assert ax3._shared_y_axes.joined(ax1, ax3)
assert ax3._shared_y_axes.joined(ax2, ax3)

# don't share x
assert not ax1._shared_x_axes.joined(ax1, ax2)
assert not ax2._shared_x_axes.joined(ax1, ax2)
assert not ax3._shared_x_axes.joined(ax1, ax3)
assert not ax3._shared_x_axes.joined(ax2, ax3)

@pytest.mark.parametrize("figsize", [(12, 8), (20, 10)])
def test_figure_shape_hist_with_by(self, figsize, test_hist_df):
# GH 15079
axes = test_hist_df.plot.hist(column="A", by="C", figsize=figsize)
self._check_axes_shape(axes, axes_num=3, figsize=figsize)

def test_plot_no_rows(self):
# GH 27758
df = pd.DataFrame(columns=["foo"], dtype=int)
Expand Down