-
-
Notifications
You must be signed in to change notification settings - Fork 18.5k
ENH: Implement DataFrame.value_counts #31247
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from 16 commits
0830e36
d946e93
7d9306d
2e58db4
25d7f2f
aa96c98
aef75ae
acb81cc
786de34
7eba59a
60554e9
4c4e858
07f0e76
d055b5c
957a8ec
4fee5e0
b8f4126
310c688
a266021
d738bf7
2618220
98e7e5b
1ab2aeb
a97347f
e12117e
9e75083
0d46697
81991a1
d618677
85bc213
425ef73
b03978c
de043d9
2f0f46d
5544716
12898ad
d743ac2
f7c3abe
c297143
47683ad
e60de83
3903a4d
9ee6e0e
de40484
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -39,7 +39,7 @@ | |
from pandas._config import get_option | ||
|
||
from pandas._libs import algos as libalgos, lib, properties | ||
from pandas._typing import Axes, Axis, Dtype, FilePathOrBuffer, Level, Renamer | ||
from pandas._typing import Axes, Axis, Dtype, FilePathOrBuffer, Label, Level, Renamer | ||
from pandas.compat import PY37 | ||
from pandas.compat._optional import import_optional_dependency | ||
from pandas.compat.numpy import function as nv | ||
|
@@ -108,7 +108,7 @@ | |
from pandas.core.indexes import base as ibase | ||
from pandas.core.indexes.api import Index, ensure_index, ensure_index_from_sequences | ||
from pandas.core.indexes.datetimes import DatetimeIndex | ||
from pandas.core.indexes.multi import maybe_droplevels | ||
from pandas.core.indexes.multi import MultiIndex, maybe_droplevels | ||
from pandas.core.indexes.period import PeriodIndex | ||
from pandas.core.indexing import check_bool_indexer, convert_to_index_sliceable | ||
from pandas.core.internals import BlockManager | ||
|
@@ -5070,6 +5070,120 @@ def sort_index( | |
else: | ||
return self._constructor(new_data).__finalize__(self) | ||
|
||
def value_counts(
    self,
    subset: Optional[Sequence[Label]] = None,
    normalize: bool = False,
    sort: bool = True,
    ascending: bool = False,
    bins: Optional[int] = None,
    dropna: bool = True,
):
    """
    Return a Series containing counts of unique rows in the DataFrame.

    .. versionadded:: 1.1.0

    Parameters
    ----------
    subset : list-like, optional
        Columns to use when counting unique combinations. Defaults to all
        columns of the DataFrame.
    normalize : bool, default False
        Return proportions rather than frequencies.
    sort : bool, default True
        Sort by frequencies.
    ascending : bool, default False
        Sort in ascending order. Only used when `sort` is True.
    bins : int, optional
        This parameter is not yet supported and must be set to None (the
        default value). It exists to ensure compatibility with
        `Series.value_counts`.
    dropna : bool, default True
        This parameter is not yet supported and must be set to True (the
        default value). It exists to ensure compatibility with
        `Series.value_counts`.

    Returns
    -------
    Series

    Raises
    ------
    NotImplementedError
        If `dropna` is False or `bins` is not None.

    See Also
    --------
    Series.value_counts: Equivalent method on Series.

    Notes
    -----
    The returned Series will have a MultiIndex with one level per input
    column; this also holds when a single column is counted. By default,
    rows that contain any NA values are omitted from the result. By
    default, the resulting Series will be in descending order so that the
    first element is the most frequently-occurring row.

    Examples
    --------
    >>> df = pd.DataFrame({'num_legs': [2, 4, 4, 6],
    ...                    'num_wings': [2, 0, 0, 0]},
    ...                   index=['falcon', 'dog', 'cat', 'ant'])
    >>> df
            num_legs  num_wings
    falcon         2          2
    dog            4          0
    cat            4          0
    ant            6          0

    >>> df.value_counts()
    num_legs  num_wings
    4         0            2
    6         0            1
    2         2            1
    dtype: int64

    >>> df.value_counts(sort=False)
    num_legs  num_wings
    2         2            1
    4         0            2
    6         0            1
    dtype: int64

    >>> df.value_counts(ascending=True)
    num_legs  num_wings
    2         2            1
    6         0            1
    4         0            2
    dtype: int64

    >>> df.value_counts(normalize=True)
    num_legs  num_wings
    4         0            0.50
    6         0            0.25
    2         2            0.25
    dtype: float64
    """
    # Some features are not supported yet; reject them before doing any work.
    # NOTE(review): reviewer feedback suggested dropping these two parameters
    # until they are implemented. They are kept so the signature stays
    # aligned with `Series.value_counts`.
    if not dropna:
        raise NotImplementedError(
            "`dropna=False` not yet supported for DataFrames."
        )
    if bins is not None:
        raise NotImplementedError(
            "`bins` parameter not yet supported for DataFrames."
        )

    if subset is None:
        subset = self.columns.tolist()
    elif not isinstance(subset, str):
        # Materialise any list-like (tuple, Index, generator, ...) so that
        # groupby sees several column keys rather than one tuple-valued key
        # and so that ``len(subset)`` below is always valid. A bare string is
        # passed through unchanged, as before.
        subset = list(subset)

    counts = self.groupby(subset).size()

    if sort:
        counts = counts.sort_values(ascending=ascending)
    if normalize:
        counts = counts / counts.sum()

    # groupby on a single key yields a flat Index; wrap it so the result
    # carries a MultiIndex for one column just as it does for several.
    if len(subset) == 1:
        counts.index = MultiIndex.from_arrays(
            [counts.index], names=[counts.index.name]
        )

    return counts
|
||
def nlargest(self, n, columns, keep="first") -> "DataFrame": | ||
""" | ||
Return the first `n` rows ordered by `columns` in descending order. | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,123 @@ | ||
import numpy as np | ||
import pytest | ||
|
||
import pandas as pd | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Move this test file to the methods/ subdir |
||
import pandas._testing as tm | ||
|
||
|
||
def test_data_frame_value_counts_unsorted(): | ||
df = pd.DataFrame( | ||
{"num_legs": [2, 4, 4, 6], "num_wings": [2, 0, 0, 0]}, | ||
index=["falcon", "dog", "cat", "ant"], | ||
) | ||
|
||
result = df.value_counts(sort=False) | ||
expected = pd.Series( | ||
data=[1, 2, 1], | ||
index=pd.MultiIndex.from_arrays( | ||
[(2, 4, 6), (2, 0, 0)], names=["num_legs", "num_wings"] | ||
), | ||
) | ||
|
||
tm.assert_series_equal(result, expected) | ||
|
||
|
||
def test_data_frame_value_counts_ascending(): | ||
df = pd.DataFrame( | ||
{"num_legs": [2, 4, 4, 6], "num_wings": [2, 0, 0, 0]}, | ||
index=["falcon", "dog", "cat", "ant"], | ||
) | ||
|
||
result = df.value_counts(ascending=True) | ||
expected = pd.Series( | ||
data=[1, 1, 2], | ||
index=pd.MultiIndex.from_arrays( | ||
[(2, 6, 4), (2, 0, 0)], names=["num_legs", "num_wings"] | ||
), | ||
) | ||
|
||
tm.assert_series_equal(result, expected) | ||
|
||
|
||
def test_data_frame_value_counts_default(): | ||
df = pd.DataFrame( | ||
{"num_legs": [2, 4, 4, 6], "num_wings": [2, 0, 0, 0]}, | ||
index=["falcon", "dog", "cat", "ant"], | ||
) | ||
|
||
result = df.value_counts() | ||
expected = pd.Series( | ||
data=[2, 1, 1], | ||
index=pd.MultiIndex.from_arrays( | ||
[(4, 6, 2), (0, 0, 2)], names=["num_legs", "num_wings"] | ||
), | ||
) | ||
|
||
tm.assert_series_equal(result, expected) | ||
|
||
|
||
def test_data_frame_value_counts_normalize(): | ||
df = pd.DataFrame( | ||
{"num_legs": [2, 4, 4, 6], "num_wings": [2, 0, 0, 0]}, | ||
index=["falcon", "dog", "cat", "ant"], | ||
) | ||
|
||
result = df.value_counts(normalize=True) | ||
expected = pd.Series( | ||
data=[0.5, 0.25, 0.25], | ||
index=pd.MultiIndex.from_arrays( | ||
[(4, 6, 2), (0, 0, 2)], names=["num_legs", "num_wings"] | ||
), | ||
) | ||
|
||
tm.assert_series_equal(result, expected) | ||
|
||
|
||
def test_data_frame_value_counts_dropna_not_supported_yet():
    """Passing ``dropna=False`` raises ``NotImplementedError``."""
    frame = pd.DataFrame(
        data={"num_legs": [2, 4, 4, 6], "num_wings": [2, 0, 0, 0]},
        index=["falcon", "dog", "cat", "ant"],
    )

    expect_unsupported = pytest.raises(
        NotImplementedError, match="not yet supported"
    )
    with expect_unsupported:
        frame.value_counts(dropna=False)
|
||
|
||
def test_data_frame_value_counts_bins_not_supported():
    """Passing a ``bins`` value raises ``NotImplementedError``."""
    frame = pd.DataFrame(
        data={"num_legs": [2, 4, 4, 6], "num_wings": [2, 0, 0, 0]},
        index=["falcon", "dog", "cat", "ant"],
    )

    expect_unsupported = pytest.raises(
        NotImplementedError, match="not yet supported"
    )
    with expect_unsupported:
        frame.value_counts(bins=2)
|
||
|
||
def test_data_frame_value_counts_single_col_default(): | ||
df = pd.DataFrame({"num_legs": [2, 4, 4, 6]}) | ||
|
||
result = df.value_counts() | ||
expected = pd.Series( | ||
data=[2, 1, 1], | ||
index=pd.MultiIndex.from_arrays([[4, 6, 2]], names=["num_legs"]), | ||
) | ||
|
||
tm.assert_series_equal(result, expected) | ||
|
||
|
||
def test_data_frame_value_counts_empty(): | ||
df_no_cols = pd.DataFrame() | ||
|
||
result = df_no_cols.value_counts() | ||
expected = pd.Series([], dtype=np.int64) | ||
|
||
tm.assert_series_equal(result, expected) | ||
|
||
|
||
def test_data_frame_value_counts_empty_normalize(): | ||
df_no_cols = pd.DataFrame() | ||
|
||
result = df_no_cols.value_counts(normalize=True) | ||
expected = pd.Series([], dtype=np.float64) | ||
|
||
tm.assert_series_equal(result, expected) |
Uh oh!
There was an error while loading. Please reload this page.