Skip to content

ENH: Implement DataFrame.value_counts #31247

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 44 commits into from
Feb 26, 2020
Merged
Show file tree
Hide file tree
Changes from 16 commits
Commits
Show all changes
44 commits
Select commit Hold shift + click to select a range
0830e36
Add value_counts tests
Jan 23, 2020
d946e93
Update docs
Jan 23, 2020
7d9306d
Start implementing value_counts
Jan 23, 2020
2e58db4
Set MultiIndex name
Jan 23, 2020
25d7f2f
Format
Jan 23, 2020
aa96c98
Sort imports
Jan 23, 2020
aef75ae
Remove typing for now
Jan 23, 2020
acb81cc
Simplify test a little
Jan 23, 2020
786de34
Remove single col example for now
Jan 23, 2020
7eba59a
Update error for bins
Jan 23, 2020
60554e9
Update pandas/core/frame.py
dsaxton Jan 24, 2020
4c4e858
Update pandas/core/frame.py
dsaxton Jan 24, 2020
07f0e76
Import Label type
Jan 24, 2020
d055b5c
Merge remote-tracking branch 'upstream/master' into df-val-counts
Jan 24, 2020
957a8ec
Make Sequence optional
Jan 24, 2020
4fee5e0
Fix docstring
Jan 24, 2020
b8f4126
Clean docstring
Jan 26, 2020
310c688
Update to comments
Jan 26, 2020
a266021
Add to Series See Also
Jan 26, 2020
d738bf7
Update tests and add back tolist
Jan 26, 2020
2618220
Don't import pytest
Jan 26, 2020
98e7e5b
Merge branch 'master' into df-val-counts
Jan 27, 2020
1ab2aeb
Merge branch 'master' into df-val-counts
Jan 27, 2020
a97347f
Merge branch 'master' into df-val-counts
Jan 31, 2020
e12117e
Merge branch 'master' into df-val-counts
Feb 6, 2020
9e75083
Merge branch 'master' into df-val-counts
Feb 8, 2020
0d46697
Add to basics.rst
Feb 9, 2020
81991a1
Move to Notes
Feb 9, 2020
d618677
Merge branch 'master' into df-val-counts
Feb 9, 2020
85bc213
Rename to avoid doc error
Feb 9, 2020
425ef73
Merge branch 'master' into df-val-counts
Feb 13, 2020
b03978c
Merge branch 'master' into df-val-counts
Feb 14, 2020
de043d9
Merge branch 'master' into df-val-counts
Feb 15, 2020
2f0f46d
Merge branch 'master' into df-val-counts
Feb 15, 2020
5544716
Merge branch 'master' into df-val-counts
Feb 16, 2020
12898ad
Merge branch 'master' into df-val-counts
Feb 17, 2020
d743ac2
Merge remote-tracking branch 'upstream/master' into df-val-counts
dsaxton Feb 20, 2020
f7c3abe
Merge remote-tracking branch 'upstream/master' into df-val-counts
dsaxton Feb 21, 2020
c297143
Merge remote-tracking branch 'upstream/master' into df-val-counts
dsaxton Feb 23, 2020
47683ad
Add See Also
dsaxton Feb 23, 2020
e60de83
Move tests
dsaxton Feb 23, 2020
3903a4d
versionadded
dsaxton Feb 23, 2020
9ee6e0e
Merge remote-tracking branch 'upstream/master' into df-val-counts
dsaxton Feb 23, 2020
de40484
Merge remote-tracking branch 'upstream/master' into df-val-counts
dsaxton Feb 25, 2020
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions doc/source/reference/frame.rst
Original file line number Diff line number Diff line change
Expand Up @@ -169,6 +169,7 @@ Computations / descriptive stats
DataFrame.std
DataFrame.var
DataFrame.nunique
DataFrame.value_counts

Reindexing / selection / label manipulation
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Expand Down
1 change: 1 addition & 0 deletions doc/source/whatsnew/v1.1.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -54,6 +54,7 @@ Other API changes

- :meth:`Series.describe` will now show distribution percentiles for ``datetime`` dtypes, statistics ``first`` and ``last``
will now be ``min`` and ``max`` to match with numeric dtypes in :meth:`DataFrame.describe` (:issue:`30164`)
- Added :meth:`DataFrame.value_counts` (:issue:`5377`)
-

Backwards incompatible API changes
Expand Down
118 changes: 116 additions & 2 deletions pandas/core/frame.py
Original file line number Diff line number Diff line change
Expand Up @@ -39,7 +39,7 @@
from pandas._config import get_option

from pandas._libs import algos as libalgos, lib, properties
from pandas._typing import Axes, Axis, Dtype, FilePathOrBuffer, Level, Renamer
from pandas._typing import Axes, Axis, Dtype, FilePathOrBuffer, Label, Level, Renamer
from pandas.compat import PY37
from pandas.compat._optional import import_optional_dependency
from pandas.compat.numpy import function as nv
Expand Down Expand Up @@ -108,7 +108,7 @@
from pandas.core.indexes import base as ibase
from pandas.core.indexes.api import Index, ensure_index, ensure_index_from_sequences
from pandas.core.indexes.datetimes import DatetimeIndex
from pandas.core.indexes.multi import maybe_droplevels
from pandas.core.indexes.multi import MultiIndex, maybe_droplevels
from pandas.core.indexes.period import PeriodIndex
from pandas.core.indexing import check_bool_indexer, convert_to_index_sliceable
from pandas.core.internals import BlockManager
Expand Down Expand Up @@ -5070,6 +5070,120 @@ def sort_index(
else:
return self._constructor(new_data).__finalize__(self)

def value_counts(
    self,
    subset: Optional[Sequence[Label]] = None,
    normalize: bool = False,
    sort: bool = True,
    ascending: bool = False,
    bins: Optional[int] = None,
    dropna: bool = True,
):
    """
    Return a Series containing counts of unique rows in the DataFrame.

    .. versionadded:: 1.1.0

    Parameters
    ----------
    subset : list-like, optional
        Columns to use when counting unique combinations. All columns are
        used when omitted.
    normalize : bool, default False
        Return proportions rather than frequencies.
    sort : bool, default True
        Sort by frequencies.
    ascending : bool, default False
        Sort in ascending order.
    bins : int, optional
        This parameter is not yet supported and must be set to None (the
        default value). It exists to ensure compatibility with
        `Series.value_counts`, where it groups values into half-open bins
        rather than counting them (a convenience for ``pd.cut`` that only
        works with single-column numeric data).
    dropna : bool, default True
        This parameter is not yet supported and must be set to True (the
        default value). It exists to ensure compatibility with
        `Series.value_counts`. When True, counts of rows containing NA
        values are not included.

    Returns
    -------
    Series

    Raises
    ------
    NotImplementedError
        If `dropna` is False or `bins` is not None.

    See Also
    --------
    Series.value_counts: Equivalent method on Series.

    Notes
    -----
    The returned Series will have a MultiIndex with one level per input
    column. By default, rows that contain any NA values are omitted from
    the result. By default, the resulting Series will be in descending
    order so that the first element is the most frequently-occurring row.

    Examples
    --------
    >>> df = pd.DataFrame({'num_legs': [2, 4, 4, 6],
    ...                    'num_wings': [2, 0, 0, 0]},
    ...                   index=['falcon', 'dog', 'cat', 'ant'])
    >>> df
            num_legs  num_wings
    falcon         2          2
    dog            4          0
    cat            4          0
    ant            6          0

    >>> df.value_counts()
    num_legs  num_wings
    4         0            2
    6         0            1
    2         2            1
    dtype: int64

    >>> df.value_counts(sort=False)
    num_legs  num_wings
    2         2            1
    4         0            2
    6         0            1
    dtype: int64

    >>> df.value_counts(ascending=True)
    num_legs  num_wings
    2         2            1
    6         0            1
    4         0            2
    dtype: int64

    >>> df.value_counts(normalize=True)
    num_legs  num_wings
    4         0            0.50
    6         0            0.25
    2         2            0.25
    dtype: float64
    """
    if subset is None:
        subset = self.columns.tolist()

    # Some features not supported yet.
    # NOTE(review): a reviewer suggested removing `bins` and `dropna` until
    # they are implemented; they are kept here so the signature is unchanged.
    if not dropna:
        raise NotImplementedError(
            "`dropna=False` not yet supported for DataFrames."
        )

    if bins is not None:
        raise NotImplementedError(
            "`bins` parameter not yet supported for DataFrames."
        )

    # One group per distinct combination of the selected columns.
    counts = self.groupby(subset).size()

    if sort:
        counts = counts.sort_values(ascending=ascending)
    if normalize:
        # Out-of-place division: the integer counts become float proportions,
        # so avoid an in-place operation that would have to change dtype.
        counts = counts / counts.sum()

    # Force MultiIndex for single column
    if len(subset) == 1:
        counts.index = MultiIndex.from_arrays(
            [counts.index], names=[counts.index.name]
        )

    return counts

def nlargest(self, n, columns, keep="first") -> "DataFrame":
"""
Return the first `n` rows ordered by `columns` in descending order.
Expand Down
123 changes: 123 additions & 0 deletions pandas/tests/frame/test_value_counts.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,123 @@
import numpy as np
import pytest

import pandas as pd
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

move to the methods/ subdir

import pandas._testing as tm


def test_data_frame_value_counts_unsorted():
    """``sort=False`` returns the counts without ordering them by frequency."""
    frame = pd.DataFrame(
        {"num_legs": [2, 4, 4, 6], "num_wings": [2, 0, 0, 0]},
        index=["falcon", "dog", "cat", "ant"],
    )
    expected_index = pd.MultiIndex.from_arrays(
        [(2, 4, 6), (2, 0, 0)], names=["num_legs", "num_wings"]
    )
    expected = pd.Series(data=[1, 2, 1], index=expected_index)

    result = frame.value_counts(sort=False)

    tm.assert_series_equal(result, expected)


def test_data_frame_value_counts_ascending():
    """``ascending=True`` puts the most frequent row last."""
    frame = pd.DataFrame(
        {"num_legs": [2, 4, 4, 6], "num_wings": [2, 0, 0, 0]},
        index=["falcon", "dog", "cat", "ant"],
    )
    expected_index = pd.MultiIndex.from_arrays(
        [(2, 6, 4), (2, 0, 0)], names=["num_legs", "num_wings"]
    )
    expected = pd.Series(data=[1, 1, 2], index=expected_index)

    result = frame.value_counts(ascending=True)

    tm.assert_series_equal(result, expected)


def test_data_frame_value_counts_default():
    """With default arguments the most frequent row comes first."""
    frame = pd.DataFrame(
        {"num_legs": [2, 4, 4, 6], "num_wings": [2, 0, 0, 0]},
        index=["falcon", "dog", "cat", "ant"],
    )
    expected_index = pd.MultiIndex.from_arrays(
        [(4, 6, 2), (0, 0, 2)], names=["num_legs", "num_wings"]
    )
    expected = pd.Series(data=[2, 1, 1], index=expected_index)

    result = frame.value_counts()

    tm.assert_series_equal(result, expected)


def test_data_frame_value_counts_normalize():
    """``normalize=True`` yields proportions instead of raw counts."""
    frame = pd.DataFrame(
        {"num_legs": [2, 4, 4, 6], "num_wings": [2, 0, 0, 0]},
        index=["falcon", "dog", "cat", "ant"],
    )
    expected_index = pd.MultiIndex.from_arrays(
        [(4, 6, 2), (0, 0, 2)], names=["num_legs", "num_wings"]
    )
    expected = pd.Series(data=[0.5, 0.25, 0.25], index=expected_index)

    result = frame.value_counts(normalize=True)

    tm.assert_series_equal(result, expected)


def test_data_frame_value_counts_dropna_not_supported_yet():
    """Passing ``dropna=False`` must raise NotImplementedError."""
    frame = pd.DataFrame(
        {"num_legs": [2, 4, 4, 6], "num_wings": [2, 0, 0, 0]},
        index=["falcon", "dog", "cat", "ant"],
    )
    expect_unsupported = pytest.raises(
        NotImplementedError, match="not yet supported"
    )

    with expect_unsupported:
        frame.value_counts(dropna=False)


def test_data_frame_value_counts_bins_not_supported():
    """Passing any ``bins`` value must raise NotImplementedError."""
    frame = pd.DataFrame(
        {"num_legs": [2, 4, 4, 6], "num_wings": [2, 0, 0, 0]},
        index=["falcon", "dog", "cat", "ant"],
    )
    expect_unsupported = pytest.raises(
        NotImplementedError, match="not yet supported"
    )

    with expect_unsupported:
        frame.value_counts(bins=2)


def test_data_frame_value_counts_single_col_default():
    """A single-column frame still produces a (one-level) MultiIndex."""
    frame = pd.DataFrame({"num_legs": [2, 4, 4, 6]})
    expected_index = pd.MultiIndex.from_arrays([[4, 6, 2]], names=["num_legs"])
    expected = pd.Series(data=[2, 1, 1], index=expected_index)

    result = frame.value_counts()

    tm.assert_series_equal(result, expected)


def test_data_frame_value_counts_empty():
    """A frame with no columns gives an empty int64 Series."""
    expected = pd.Series([], dtype=np.int64)

    empty_frame = pd.DataFrame()
    result = empty_frame.value_counts()

    tm.assert_series_equal(result, expected)


def test_data_frame_value_counts_empty_normalize():
    """A frame with no columns gives an empty float64 Series when normalizing."""
    expected = pd.Series([], dtype=np.float64)

    empty_frame = pd.DataFrame()
    result = empty_frame.value_counts(normalize=True)

    tm.assert_series_equal(result, expected)