pandas-dev · jreback · Jul 12, 2021 · Dec 3, 2018 · Jan 19, 2019 · Jul 30, 2019
diff --git a/doc/source/whatsnew/v1.1.0.rst b/doc/source/whatsnew/v1.1.0.rst
@@ -190,6 +190,8 @@ I/O
 Plotting
 ^^^^^^^^
 
+-
+- :meth:`DataFrame.plot.hist` now accepts a ``by`` argument to draw a separate histogram for each group (:issue:`15079`)
 - :func:`.plot` for line/bar now accepts color by dictonary (:issue:`8193`).
 -
 

diff --git a/pandas/plotting/_core.py b/pandas/plotting/_core.py
@@ -1209,6 +1209,16 @@ def hist(self, by=None, bins=10, **kwargs):
             ...     columns = ['one'])
             >>> df['two'] = df['one'] + np.random.randint(1, 7, 6000)
             >>> ax = df.plot.hist(bins=12, alpha=0.5)
+
+        If `by` is given, a separate histogram is drawn for each group:
+
+        .. plot::
+            :context: close-figs
+
+            >>> np.random.seed(159753)
+            >>> df = pd.DataFrame(np.random.randn(30, 2), columns=['A', 'B'])
+            >>> df['C'] = np.random.choice(['a', 'b', 'c'], 30)
+            >>> ax = df.plot.hist(column=['A', 'B'], by=['C'], figsize=(8, 10))
         """
         return self(kind="hist", by=by, bins=bins, **kwargs)
 

diff --git a/pandas/plotting/_matplotlib/core.py b/pandas/plotting/_matplotlib/core.py
@@ -23,7 +23,9 @@
 )
 from pandas.core.dtypes.missing import isna, notna
 
+from pandas import MultiIndex
 import pandas.core.common as com
+from pandas.core.reshape.concat import concat
 
 from pandas.io.formats.printing import pprint_thing
 from pandas.plotting._matplotlib.compat import _mpl_ge_3_0_0
@@ -102,13 +104,15 @@ def __init__(
         table=False,
         layout=None,
         include_bool=False,
+        column=None,
         **kwds,
     ):
 
         import matplotlib.pyplot as plt
 
         self.data = data
         self.by = by
+        self.column = [column] if not isinstance(column, list) else column
 
         self.kind = kind
 
@@ -117,7 +121,9 @@ def __init__(
         self.subplots = subplots
 
         if sharex is None:
-            if ax is None:
+
+            # if by is defined, subplots are used and sharex should be False
+            if ax is None and by is None:
                 self.sharex = True
             else:
                 # if we get an axis, the users should do the visibility
@@ -240,18 +246,30 @@ def _iter_data(self, data=None, keep_index=False, fillna=None):
         if fillna is not None:
             data = data.fillna(fillna)
 
-        for col, values in data.items():
-            if keep_index is True:
-                yield col, values
-            else:
-                yield col, values.values
+        if self.by is None:
+            for col, values in data.items():
+                if keep_index is True:
+                    yield col, values
+                else:
+                    yield col, values.values
+        else:
+            cols = data.columns.get_level_values(0).unique()
+
+            for col in cols:
+                data_values = data.loc[:, data.columns.get_level_values(0) == col]
+                if keep_index is True:
+                    yield col, data_values
+                else:
+                    yield col, data_values.values
 
     @property
     def nseries(self):
         if self.data.ndim == 1:
             return 1
-        else:
+        elif self.by is None:
             return self.data.shape[1]
+        else:
+            return len(set(self.data.columns.get_level_values(0)))
 
     def draw(self):
         self.plt.draw_if_interactive()
@@ -378,6 +396,20 @@ def _compute_plot_data(self):
                 label = "None"
             data = data.to_frame(name=label)
 
+        # GH15079 restructure data if by is defined
+        if self.by is not None:
+            self.subplots = True
+            grouped = data.groupby(self.by)
+
+            data_list = []
+            for key, group in grouped:
+                columns = MultiIndex.from_product([[key], self.column])
+                sub_group = group[self.column]
+                sub_group.columns = columns
+                data_list.append(sub_group)
+
+            data = concat(data_list, axis=1)
+
         # GH16953, _convert is needed as fallback, for ``Series``
         # with ``dtype == object``
         data = data._convert(datetime=True, timedelta=True)

diff --git a/pandas/plotting/_matplotlib/hist.py b/pandas/plotting/_matplotlib/hist.py
@@ -1,10 +1,13 @@
+from typing import Union
+
 import numpy as np
 
 from pandas.core.dtypes.common import is_integer, is_list_like
 from pandas.core.dtypes.generic import ABCDataFrame, ABCIndexClass
 from pandas.core.dtypes.missing import isna, remove_na_arraylike
 
 import pandas.core.common as com
+from pandas.core.series import Series
 
 from pandas.io.formats.printing import pprint_thing
 from pandas.plotting._matplotlib.core import LinePlot, MPLPlot
@@ -21,22 +24,38 @@ def __init__(self, data, bins=10, bottom=0, **kwargs):
         MPLPlot.__init__(self, data, **kwargs)
 
     def _args_adjust(self):
+
+        # compute the bin edges once for all data, or, when `by` is given,
+        # separately for each group (each group is drawn in its own subplot)
         if is_integer(self.bins):
-            # create common bin edge
-            values = self.data._convert(datetime=True)._get_numeric_data()
-            values = np.ravel(values)
-            values = values[~isna(values)]
-
-            _, self.bins = np.histogram(
-                values,
-                bins=self.bins,
-                range=self.kwds.get("range", None),
-                weights=self.kwds.get("weights", None),
-            )
+            if self.by is None:
+                self.bins = self._caculcate_bins(self.data)
+
+            else:
+                grouped = self.data.groupby(self.by)[self.column]
+                bins_list = []
+                for key, group in grouped:
+                    bins_list.append(self._caculcate_bins(group))
+                self.bins = bins_list
 
         if is_list_like(self.bottom):
             self.bottom = np.array(self.bottom)
 
+    def _caculcate_bins(self, data: ABCDataFrame) -> np.array:
+        """Return the histogram bin edges for the numeric, non-NA values of data."""
+
+        values = data._convert(datetime=True)._get_numeric_data()
+        values = np.ravel(values)  # flatten all columns into one 1-D array
+        values = values[~isna(values)]
+
+        hist, bins = np.histogram(
+            values,
+            bins=self.bins,
+            range=self.kwds.get("range", None),
+            weights=self.kwds.get("weights", None),
+        )
+        return bins  # only the bin edges are needed; the counts are discarded
+
     @classmethod
     def _plot(
         cls,
@@ -51,7 +70,6 @@ def _plot(
     ):
         if column_num == 0:
             cls._initialize_stacker(ax, stacking_id, len(bins) - 1)
-        y = y[~isna(y)]
 
         base = np.zeros(len(bins) - 1)
         bottom = bottom + cls._get_stacked_values(ax, stacking_id, base, kwds["label"])
@@ -77,9 +95,32 @@ def _make_plot(self):
                 kwds["style"] = style
 
             kwds = self._make_plot_keywords(kwds, y)
+
+            # with `by`, _args_adjust stores one bin-edge array per group; use the i-th
+            # and label the artists with the list of plotted columns
+            if self.by is not None:
+                kwds["bins"] = kwds["bins"][i]
+                kwds["label"] = self.column
+                kwds.pop("color")
+
+            y = self._reformat_y(y)
             artists = self._plot(ax, y, column_num=i, stacking_id=stacking_id, **kwds)
+
+            # when by is applied, show title for subplots to know which group it is
+            if self.by is not None:
+                ax.set_title(pprint_thing(label))
+
             self._add_legend_handle(artists[0], label, index=i)
 
+    def _reformat_y(self, y: Union[Series, np.array]) -> Union[Series, np.array]:
+        """Drop NA values from y, column-wise when `by` is set and y is not 1-D."""
+        if self.by is not None and len(y.shape) > 1:
+            notna = [col[~isna(col)] for col in y.T]  # NA dropped per column
+            y = np.array(np.array(notna).T)  # NOTE(review): ragged if NA counts differ
+        else:
+            y = y[~isna(y)]
+        return y
+
     def _make_plot_keywords(self, kwds, y):
         """merge BoxPlot/KdePlot properties to passed kwds"""
         # y is required for KdePlot

diff --git a/pandas/tests/plotting/test_frame.py b/pandas/tests/plotting/test_frame.py
@@ -4,6 +4,7 @@
 
 from datetime import date, datetime
 import itertools
+import re
 import string
 import warnings
 
@@ -25,6 +26,15 @@
 import pandas.plotting as plotting
 
 
+@pytest.fixture(scope="module")
+def test_hist_df():
+    np.random.seed(0)  # fixed seed so the random data is reproducible
+    df = DataFrame(np.random.randn(30, 2), columns=["A", "B"])
+    df["C"] = np.random.choice(["a", "b", "c"], 30)  # grouping keys for `by`
+    df["D"] = np.random.choice(["a", "b", "c"], 30)
+    return df
+
+
 @td.skip_if_no_mpl
 class TestDataFramePlots(TestPlotBase):
     def setup_method(self, method):
@@ -3256,6 +3266,93 @@ def test_subplots_sharex_false(self):
         tm.assert_numpy_array_equal(axs[0].get_xticks(), expected_ax1)
         tm.assert_numpy_array_equal(axs[1].get_xticks(), expected_ax2)
 
+    @pytest.mark.parametrize("by", ["C", ["C", "D"]])
+    @pytest.mark.parametrize("column", ["A", ["A", "B"]])
+    def test_hist_plot_by_argument(self, by, column, test_hist_df):
+        # GH 15079: `by` and `column` each accept a single label or a list
+        _check_plot_works(test_hist_df.plot.hist, column=column, by=by)
+
+    @pytest.mark.slow
+    @pytest.mark.parametrize(
+        "by, column, layout, axes_num",
+        [
+            (["C"], "A", (2, 2), 3),
+            ("C", "A", (2, 2), 3),
+            (["C"], ["A"], (1, 3), 3),
+            ("C", ["A", "B"], (3, 1), 3),
+            (["C", "D"], "A", (9, 1), 9),
+            (["C", "D"], "A", (3, 3), 9),
+            (["C", "D"], ["A"], (5, 2), 9),
+            (["C", "D"], ["A", "B"], (9, 1), 9),
+            (["C", "D"], ["A", "B"], (5, 2), 9),
+        ],
+    )
+    def test_hist_plot_layout_with_by(self, by, column, layout, axes_num, test_hist_df):
+        # GH 15079: an explicit layout is applied to the per-group subplots
+        # _check_plot_works adds an ax so catch warning. see GH #13188
+        with tm.assert_produces_warning(UserWarning):
+            axes = _check_plot_works(
+                test_hist_df.plot.hist, column=column, by=by, layout=layout
+            )
+        self._check_axes_shape(axes, axes_num=axes_num, layout=layout)
+
+    def test_hist_plot_invalid_layout_with_by_raises(self, test_hist_df):
+        # GH 15079: each kind of invalid layout raises ValueError when `by` is used
+
+        # layout too small for all 3 plots
+        msg = "larger than required size"
+        with pytest.raises(ValueError, match=msg):
+            test_hist_df.plot.hist(column=["A", "B"], by="C", layout=(1, 1))
+
+        # layout is not a 2-tuple
+        msg = re.escape("Layout must be a tuple of (rows, columns)")
+        with pytest.raises(ValueError, match=msg):
+            test_hist_df.plot.hist(column=["A", "B"], by="C", layout=(1,))
+
+        msg = "At least one dimension of layout must be positive"
+        with pytest.raises(ValueError, match=msg):
+            test_hist_df.plot.hist(column=["A", "B"], by="C", layout=(-1, -1))
+
+    @pytest.mark.slow
+    def test_axis_share_x_with_by(self, test_hist_df):
+        # GH 15079: sharex=True joins only the x axes of the three group subplots
+        ax1, ax2, ax3 = test_hist_df.plot.hist(column="A", by="C", sharex=True)
+
+        # share x
+        assert ax1._shared_x_axes.joined(ax1, ax2)
+        assert ax2._shared_x_axes.joined(ax1, ax2)
+        assert ax3._shared_x_axes.joined(ax1, ax3)
+        assert ax3._shared_x_axes.joined(ax2, ax3)
+
+        # don't share y
+        assert not ax1._shared_y_axes.joined(ax1, ax2)
+        assert not ax2._shared_y_axes.joined(ax1, ax2)
+        assert not ax3._shared_y_axes.joined(ax1, ax3)
+        assert not ax3._shared_y_axes.joined(ax2, ax3)
+
+    @pytest.mark.slow
+    def test_axis_share_y_with_by(self, test_hist_df):
+        # GH 15079: sharey=True joins only the y axes of the three group subplots
+        ax1, ax2, ax3 = test_hist_df.plot.hist(column="A", by="C", sharey=True)
+
+        # share y
+        assert ax1._shared_y_axes.joined(ax1, ax2)
+        assert ax2._shared_y_axes.joined(ax1, ax2)
+        assert ax3._shared_y_axes.joined(ax1, ax3)
+        assert ax3._shared_y_axes.joined(ax2, ax3)
+
+        # don't share x
+        assert not ax1._shared_x_axes.joined(ax1, ax2)
+        assert not ax2._shared_x_axes.joined(ax1, ax2)
+        assert not ax3._shared_x_axes.joined(ax1, ax3)
+        assert not ax3._shared_x_axes.joined(ax2, ax3)
+
+    @pytest.mark.parametrize("figsize", [(12, 8), (20, 10)])
+    def test_figure_shape_hist_with_by(self, figsize, test_hist_df):
+        # GH 15079: the requested figsize is checked on the three group subplots
+        axes = test_hist_df.plot.hist(column="A", by="C", figsize=figsize)
+        self._check_axes_shape(axes, axes_num=3, figsize=figsize)
+
     def test_plot_no_rows(self):
         # GH 27758
         df = pd.DataFrame(columns=["foo"], dtype=int)