Skip to content

WIP: Add value_counts() to DataFrame #5381

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Closed
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
49 changes: 49 additions & 0 deletions pandas/core/frame.py
Original file line number Diff line number Diff line change
Expand Up @@ -4297,6 +4297,55 @@ def mode(self, axis=0, numeric_only=False):
f = lambda s: s.mode()
return data.apply(f, axis=axis)

def value_counts(self, axis=0, normalize=False, sort=True,
ascending=False, bins=None, numeric_only=False):
"""
Returns DataFrame containing counts of unique values. The resulting
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Can you put the first sentence on a separate line (i.e. move "The resulting" to the next line)? When following the numpy docstring standard exactly, there should even be a blank line after the first sentence. This will ensure that the summary in the API docs (http://pandas.pydata.org/pandas-docs/dev/api.html) is limited to that one sentence.

DataFrame will be in descending order so that the first element is the
most frequently-occurring element among *all* columns. Excludes NA
values. Maintains order along axis (i.e., column/row)

Parameters
----------
axis : {0, 1, 'index', 'columns'} (default 0)
0/'index' : get value_counts by column
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Can you add `-` or `*` at the beginning of these two lines (as if they were list items)? Then in HTML they will be rendered as a list of two options, instead of running together on one line (as is e.g. the case here: http://pandas.pydata.org/pandas-docs/dev/generated/pandas.DataFrame.mode.html)

1/'columns' : get value_counts by row
normalize: boolean, default False
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

space between 'normalize' and ':'

If True then the Series returned will contain the relative
frequencies of the unique values.
sort : boolean, default True
Sort by sum of counts across columns (if False, DataFrame will be
sorted by union of all the unique values found)
ascending : boolean, default False
Sort in ascending order
bins : integer or sequence of scalars, optional
Rather than count values, group them into half-open bins, a
convenience for pd.cut, only works with numeric data. If integer,
then creates bins based upon overall max and overall min. If
passed, assumes numeric_only.
numeric_only : bool, default False
only apply to numeric columns.

Returns
-------
counts : DataFrame
"""
data = self if not numeric_only else self._get_numeric_data()
from pandas.tools.tile import _generate_bins
if bins is not None and not com._is_sequence(bins):
max_val = self.max().max()
min_val = self.min().min()
bins = _generate_bins(bins=bins, min_val=min_val, max_val=max_val)

f = lambda s: s.value_counts(normalize=normalize, bins=bins)
res = data.apply(f, axis=axis)

if sort:
order = res.sum(1).order(ascending=ascending).index
res = res.reindex(order)

return res

def quantile(self, q=0.5, axis=0, numeric_only=True):
"""
Return values at the given quantile over requested axis, a la
Expand Down
33 changes: 33 additions & 0 deletions pandas/tests/test_frame.py
Original file line number Diff line number Diff line change
Expand Up @@ -11253,6 +11253,39 @@ def test_count(self):
expected = Series(0, index=[])
assert_series_equal(result, expected)

def test_value_counts(self):
    # Numeric frame: every value occurs exactly once within its column.
    numeric_df = DataFrame({"A": [0, 5, 8, 10, 13],
                            "B": [4, 16, 2, 30, 10]})
    expected = DataFrame({"A": pd.Series([1, 1, 1, 1, 1],
                                         index=[0, 5, 8, 10, 13]),
                          "B": pd.Series([1, 1, 1, 1, 1],
                                         index=[4, 16, 2, 30, 10])})
    expected = expected.reindex([10, 30, 16, 13, 8, 5, 4, 2, 0])
    assert_frame_equal(numeric_df.value_counts(), expected)

    # String frame: 'a' occurs three times in column A.
    string_df = DataFrame({"A": ['a', 'a', 'a', 'c', 'd', 'e'],
                           "B": ['e', 'c', 'd', 'x', 'y', 'a']})
    actual = string_df.value_counts()
    expected = DataFrame({"A": Series([3, 1, 1, 1], index=['a', 'e', 'd',
                                                           'c']),
                          "B": Series([1, 1, 1, 1, 1, 1],
                                      index=['e', 'c', 'd', 'x', 'y',
                                             'a'])})
    expected = expected.ix[expected.sum(1).order(ascending=False).index]
    assert_frame_equal(actual, expected)

    # finally, with bins

    # levels = Index(['(-0.03, 3]', '(3, 6]', '(6, 9]', '(9, 12]',
    #                 '(12, 15]', '(15, 18]', '(18, 21]', '(21, 24]',
    #                 '(24, 27]', '(27, 30]'], dtype=object)
    bins = [-0.03, 3, 6, 9, 12, 15, 18, 21, 24, 27, 30]
    # The edges cover 0..30, i.e. the numeric frame; binning must be
    # exercised on it rather than on the string frame built above.
    actual = numeric_df.value_counts(bins=bins)
    expected = DataFrame({
        "A": pd.cut(numeric_df["A"], bins=bins).value_counts(),
        "B": pd.cut(numeric_df["B"], bins=bins).value_counts()
    })
    expected = expected.ix[expected.sum(1).order(ascending=False).index]
    assert_frame_equal(actual, expected)

def test_sum(self):
self._check_stat_op('sum', np.sum, has_numeric_only=True)

Expand Down
89 changes: 57 additions & 32 deletions pandas/tools/tile.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,60 @@

import numpy as np

def _generate_bins(x=None, bins=None, min_val=None, max_val=None, right=True):
    """
    Generate bin edges for cut.

    Either ``x`` (an array-like) or both ``min_val`` and ``max_val`` must be
    passed. If ``min_val``/``max_val`` are passed, ``x`` is ignored.

    Parameters
    ----------
    x : array-like, optional
        Data whose (NaN-ignoring) min and max define the range to bin when
        ``bins`` is a count and no explicit range is given.
    bins : int or sequence of scalars
        Number of equal-width bins, or the explicit bin edges, which must
        not decrease.
    min_val, max_val : scalar, optional
        Explicit range to bin; must be passed together.
    right : bool, default True
        When ``bins`` is a count, selects which outer edge is widened by
        0.1% of the range: the first edge if True, the last edge if False.

    Returns
    -------
    bins : ndarray
        The bin edges.

    Raises
    ------
    ValueError
        If ``bins`` is None, is a scalar smaller than 1, or is a decreasing
        sequence; if only one of ``min_val``/``max_val`` is given; if
        neither a range nor ``x`` is given; or if ``x`` is empty.
    """
    if bins is None:
        raise ValueError("bins cannot be None.")

    # Explicit exceptions rather than ``assert`` so that validation is not
    # stripped when Python runs with -O.
    have_range = min_val is not None or max_val is not None
    if have_range:
        if min_val is None or max_val is None:
            raise ValueError("Must pass *both* min_val and max_val")
    elif x is None:
        raise ValueError("Must pass either min/max vals or array-like")

    if np.iterable(bins):
        bins = np.asarray(bins)
        if (np.diff(bins) < 0).any():
            raise ValueError('bins must increase monotonically.')
        return bins

    # NOTE: this binning code is changed a bit from histogram for var(x) == 0
    if np.isscalar(bins) and bins < 1:
        raise ValueError("`bins` should be a positive integer.")

    if have_range:
        # an explicit range takes precedence over x
        mn, mx = min_val, max_val
    else:
        try:  # for array-like
            sz = x.size
        except AttributeError:
            x = np.asarray(x)
            sz = x.size
        if sz == 0:
            # Can't determine a range from an empty array.
            raise ValueError('Cannot cut empty array')
        mn = nanops.nanmin(x) + 0.0
        mx = nanops.nanmax(x) + 0.0

    if mn == mx:  # adjust end points before binning
        # Widen by 0.1% of the magnitude. abs() keeps mn < mx for negative
        # values, and the fixed fallback keeps the range non-empty at zero.
        pad = .001 * abs(mn) if mn != 0 else .001
        mn -= pad
        mx += pad
        bins = np.linspace(mn, mx, bins + 1, endpoint=True)
    else:  # adjust end points after binning
        bins = np.linspace(mn, mx, bins + 1, endpoint=True)
        adj = (mx - mn) * 0.001  # 0.1% of the range
        if right:
            bins[0] -= adj
        else:
            bins[-1] += adj
    return bins


def cut(x, bins, right=True, labels=None, retbins=False, precision=3,
include_lowest=False):
Expand Down Expand Up @@ -75,39 +129,10 @@ def cut(x, bins, right=True, labels=None, retbins=False, precision=3,
>>> pd.cut(np.ones(5), 4, labels=False)
array([1, 1, 1, 1, 1], dtype=int64)
"""
# NOTE: this binning code is changed a bit from histogram for var(x) == 0
if not np.iterable(bins):
if np.isscalar(bins) and bins < 1:
raise ValueError("`bins` should be a positive integer.")
try: # for array-like
sz = x.size
except AttributeError:
x = np.asarray(x)
sz = x.size
if sz == 0:
raise ValueError('Cannot cut empty array')
# handle empty arrays. Can't determine range, so use 0-1.
# rng = (0, 1)
else:
rng = (nanops.nanmin(x), nanops.nanmax(x))
mn, mx = [mi + 0.0 for mi in rng]
if x is None:
raise TypeError("Must pass array-like as first argument, not None")

if mn == mx: # adjust end points before binning
mn -= .001 * mn
mx += .001 * mx
bins = np.linspace(mn, mx, bins + 1, endpoint=True)
else: # adjust end points after binning
bins = np.linspace(mn, mx, bins + 1, endpoint=True)
adj = (mx - mn) * 0.001 # 0.1% of the range
if right:
bins[0] -= adj
else:
bins[-1] += adj

else:
bins = np.asarray(bins)
if (np.diff(bins) < 0).any():
raise ValueError('bins must increase monotonically.')
bins = _generate_bins(x, bins, right=right)

return _bins_to_cuts(x, bins, right=right, labels=labels,retbins=retbins, precision=precision,
include_lowest=include_lowest)
Expand Down