Skip to content

Add HTML repr for groupby dataframe and series #34926

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Closed
wants to merge 35 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
35 commits
Select commit Hold shift + click to select a range
5e1cb8c
Add html repr for groupby dataframe and series
JulianWgs Jun 22, 2020
9c3df8a
Sort imports with isort in groupby.py
JulianWgs Jun 22, 2020
4a3911a
Improve variable naming
JulianWgs Jul 25, 2020
46f5353
Add display.max_groups to config
JulianWgs Jul 25, 2020
c840c29
Add group display truncation for too many groups
JulianWgs Jul 25, 2020
139bdc6
Implement faster and more scalable list variant
JulianWgs Jul 25, 2020
1020be9
Black config_init
JulianWgs Jul 25, 2020
2e4a6ee
Fix bug which displayed too few rows
JulianWgs Jul 25, 2020
ea2f151
Add test for groupby representation
JulianWgs Jul 26, 2020
2443b80
Delete trailing whitespace in comment
JulianWgs Jul 26, 2020
913afb0
Add test cases for truncated rows and groups
JulianWgs Jul 26, 2020
7efc505
Merge remote-tracking branch 'upstream/master'
JulianWgs Aug 24, 2020
d85fc63
Skip test if lxml is not installed
JulianWgs Sep 3, 2020
778d90d
Move html repr function to io/formats/format.py
JulianWgs Sep 5, 2020
9736007
Add doc string and return type annotation
JulianWgs Sep 5, 2020
7f1937c
Add type annotations for input arg
JulianWgs Sep 5, 2020
388f35d
Fix linting errors
JulianWgs Sep 5, 2020
228e659
Move import to the correct location
JulianWgs Sep 5, 2020
dee1220
Remove pandas type annotations
JulianWgs Oct 3, 2020
57b8bf3
Merge remote-tracking branch 'upstream/master'
JulianWgs Oct 26, 2020
2c5c394
Remove inconsistent use of pd namespace in tests
JulianWgs Oct 28, 2020
669c047
Fix typo and capitalize pandas objs correctly
JulianWgs Oct 30, 2020
8a75299
Change docstring to comment in groupby repr test
JulianWgs Oct 30, 2020
b36177d
Add additional explanation in groupby_repr test
JulianWgs Oct 30, 2020
edff21d
Test more rows in groupby repr when truncated
JulianWgs Nov 11, 2020
580d09b
Test more groups in groupby repr when truncated
JulianWgs Nov 11, 2020
0c948e1
Refactor groups repr html
JulianWgs Jan 3, 2021
e41ff00
Merge remote-tracking branch 'upstream/master'
JulianWgs Jan 3, 2021
ae8721d
Add whatsnew entry for group-by HTML representation
JulianWgs Jan 3, 2021
1c92ed8
Fix test case name
JulianWgs Jan 3, 2021
b92d61f
Rename groupby objects
JulianWgs Jan 3, 2021
579998a
Add case for single and tuple groupby key
JulianWgs Jan 3, 2021
8d8b260
Merge remote-tracking branch 'upstream/master'
JulianWgs Jan 3, 2021
5865cfb
Merge branch 'master' into master
JulianWgs Jul 4, 2021
7a11be8
Move whats new to 1.4.0 release
JulianWgs Jul 4, 2021
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions doc/source/whatsnew/v1.4.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,7 @@ enhancement2

Other enhancements
^^^^^^^^^^^^^^^^^^
- Added HTML representation for grouped DataFrame and Series (:issue:`34926`)
-
-

Expand Down
9 changes: 9 additions & 0 deletions pandas/core/config_init.py
Original file line number Diff line number Diff line change
Expand Up @@ -110,6 +110,12 @@ def use_numba_cb(key):
correct auto-detection.
"""

# Help text for the "max_groups" option; it is passed as the doc argument of
# the cf.register_option("max_groups", ...) call later in this module.
pc_max_groups_doc = """
: int
If max_groups is exceeded, switch to truncate groupby view. 'None' value
means unlimited.
"""

pc_min_rows_doc = """
: int
The numbers of rows to show in a truncated view (when `max_rows` is
Expand Down Expand Up @@ -355,6 +361,9 @@ def is_terminal() -> bool:
validator=is_instance_factory((int, type(None))),
)
cf.register_option("max_rows", 60, pc_max_rows_doc, validator=is_nonnegative_int)
# Number of groups (default 10) above which the groupby HTML repr truncates
# its output; the repr reads it back as "display.max_groups".
# NOTE(review): presumably registered under the "display" config prefix, whose
# enclosing statement is not visible here - confirm.
cf.register_option(
"max_groups", 10, pc_max_groups_doc, validator=is_nonnegative_int
)
cf.register_option(
"min_rows",
10,
Expand Down
5 changes: 5 additions & 0 deletions pandas/core/groupby/groupby.py
Original file line number Diff line number Diff line change
Expand Up @@ -113,6 +113,8 @@ class providing the base-class of operations.
if TYPE_CHECKING:
from typing import Literal

from pandas.io.formats.format import repr_html_groupby

_common_see_also = """
See Also
--------
Expand Down Expand Up @@ -601,6 +603,9 @@ def __repr__(self) -> str:
# TODO: Better repr for GroupBy object
return object.__repr__(self)

def _repr_html_(self) -> str:
    """
    Return the HTML representation of this GroupBy object.

    The rendering itself is delegated to ``repr_html_groupby``.
    """
    # NOTE(review): the formatting code was placed in
    # pandas/io/formats/format.py at the reviewer's request. A dedicated
    # GroupbyFormatter and a ``.to_string()`` counterpart were floated as
    # possible follow-ups (the latter possibly via a separate issue).
    html_repr = repr_html_groupby(self)
    return html_repr

@final
@property
def groups(self) -> dict[Hashable, np.ndarray]:
Expand Down
39 changes: 39 additions & 0 deletions pandas/io/formats/format.py
Original file line number Diff line number Diff line change
Expand Up @@ -2086,3 +2086,42 @@ def buffer_put_lines(buf: IO[str], lines: list[str]) -> None:
if any(isinstance(x, str) for x in lines):
lines = [str(x) for x in lines]
buf.write("\n".join(lines))


def repr_html_groupby(group_obj) -> str:
    """
    Create an HTML representation for a grouped DataFrame or Series.

    Every displayed group is rendered as a heading carrying the group key,
    followed by the HTML table of that group. When there are more groups than
    ``display.max_groups``, only the leading and trailing groups are rendered,
    separated by a ``...`` heading. The ``display.max_rows`` budget is split
    evenly between the displayed groups, with at least one row per group.

    Parameters
    ----------
    group_obj : DataFrameGroupBy or SeriesGroupBy
        Object to make HTML representation of.

    Returns
    -------
    str
        HTML representation of the input object. An empty string is returned
        when the object has no groups.
    """
    # Function-scope import so the helper does not depend on a module-level
    # import that may not exist in this file.
    from html import escape

    # NOTE(review): maintainers asked for this to eventually reuse the
    # DataFrameFormatter/DataFrameRenderer machinery (subclassing as
    # appropriate); pandas/io/formats/latex.py was named as an example.

    group_names = list(group_obj.groups)
    n_groups = len(group_names)

    # ``None`` is documented as "unlimited", so it never triggers truncation.
    max_groups = get_option("display.max_groups")
    truncated = max_groups is not None and max_groups < n_groups
    n_shown = max_groups if truncated else n_groups

    max_rows = get_option("display.max_rows")
    if max_rows is not None:
        # ``max(1, n_shown)`` avoids dividing by zero when there are no groups
        # or when ``display.max_groups`` is 0.
        max_rows = max(1, max_rows // max(1, n_shown))

    if truncated:
        n_head = (max_groups + 1) // 2
        n_tail = max_groups - n_head
        head_names = group_names[:n_head]
        # Explicit start index: ``group_names[-0:]`` would be the whole list.
        tail_names = group_names[n_groups - n_tail :]
    else:
        head_names, tail_names = group_names, []

    def _render_group(group_name) -> str:
        # Render one group as "<H3>key</H3>" followed by its HTML table.
        # The key is passed exactly as it appears in ``groups``; wrapping a
        # scalar key in a tuple makes the lookup fail for single-key groupbys.
        group = group_obj.get_group(group_name)
        if not hasattr(group, "to_html"):
            # Series groups have no ``to_html``; render them as a frame.
            group = group.to_frame()
        # Group keys are user data: escape them before embedding in markup.
        title = escape(str(group_name), quote=False)
        return f"<H3>Group Key: {title}</H3>\n{group.to_html(max_rows=max_rows)}"

    sections = [_render_group(name) for name in head_names]
    if truncated:
        # The ellipsis sits exactly between the head and the tail groups.
        sections.append("<H3>...</H3>")
        sections.extend(_render_group(name) for name in tail_names)
    return "\n".join(sections)
40 changes: 40 additions & 0 deletions pandas/tests/groupby/test_groupby.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@

from pandas.compat import IS64
from pandas.errors import PerformanceWarning
import pandas.util._test_decorators as td

import pandas as pd
from pandas import (
Expand Down Expand Up @@ -2226,6 +2227,45 @@ def test_groups_repr_truncates(max_seq_items, expected):
assert result == expected


@td.skip_if_no("lxml")
@pytest.mark.parametrize(
    "n_groups,n_rows,check_n_groups,check_n_rows",
    [
        (10, 60, 5, 3),  # All groups and all rows in the groups are shown
        (25, 100, 5, 2),  # Not all groups are shown
        (4, 400, 2, 7),  # Not all rows are shown in the groups
        (20, 400, 5, 3),  # Not all groups and not all rows in the groups are shown
    ],
)
def test_groupby_repr(n_groups, n_rows, check_n_groups, check_n_rows):
    # GH 34926
    # Round-trip check: parse the HTML repr back into a frame and compare the
    # first/last rows of the first/last groups against the source data.
    frame_data = {
        "A": range(n_rows),
        "B": range(0, n_rows * 2, 2),
        "C": list(range(n_groups)) * (n_rows // n_groups),
    }
    df = DataFrame(frame_data)
    gb = df.groupby("C")

    parsed_tables = pd.read_html(StringIO(gb._repr_html_()), index_col=0)
    df_from_html = pd.concat(parsed_tables)

    # Drop "..." rows and convert index and data to int
    is_data_row = df_from_html.index != "..."
    df_from_html = df_from_html[is_data_row].astype(int)
    df_from_html.index = df_from_html.index.astype(int)

    # Only the first and last "check_n_groups" groups are inspected
    all_groups = list(gb)
    edge_groups = all_groups[:check_n_groups] + all_groups[-check_n_groups:]
    for _, df_group in edge_groups:
        # Only the first and last "check_n_rows" rows of each group are inspected
        top_rows = df_group.iloc[:check_n_rows]
        bottom_rows = df_group.iloc[-check_n_rows:]
        edge_rows = pd.concat([top_rows, bottom_rows])
        for index, row in edge_rows.iterrows():
            tm.assert_series_equal(row, df_from_html.loc[index])


def test_group_on_two_row_multiindex_returns_one_tuple_key():
# GH 18451
df = DataFrame([{"a": 1, "b": 2, "c": 99}, {"a": 1, "b": 2, "c": 88}])
Expand Down