Skip to content

Commit f5e9aeb

Browse files
committed
include nan count when dropna=False
1 parent 0159cba commit f5e9aeb

File tree

3 files changed

+53
-12
lines changed

3 files changed

+53
-12
lines changed

pandas/core/algorithms.py

Lines changed: 12 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -693,12 +693,16 @@ def value_counts(
693693
ascending : bool, default False
694694
Sort in ascending order
695695
normalize: bool, default False
696-
If True then compute a relative histogram
697-
bins : integer, optional
698-
Rather than count values, group them into half-open bins,
699-
convenience for pd.cut, only works with numeric data
696+
If True, then compute a relative histogram that outputs the
697+
proportion of each value.
698+
bins : integer or iterable of numeric, optional
699+
Rather than count values, group them into half-open bins.
700+
Only works with numeric data.
701+
If int, interpreted as number of bins and will use pd.cut.
702+
If iterable of numeric, will use provided numbers as bin endpoints.
700703
dropna : bool, default True
701-
Don't include counts of NaN
704+
Don't include counts of NaN.
705+
If False and NaNs are present, NaN will be a key in the output.
702706
703707
Returns
704708
-------
@@ -717,18 +721,17 @@ def value_counts(
717721
except TypeError as err:
718722
raise TypeError("bins argument only works with numeric data.") from err
719723

720-
# count, remove nulls (from the index), and but the bins
724+
# count (nulls stay in the index unless dropna=True), and use the bins
721725
result = ii.value_counts(dropna=dropna)
722-
result = result[result.index.notna()]
723726
result.index = result.index.astype("interval")
724727
result = result.sort_index()
725728

726729
# if we are dropna and we have NO values
727730
if dropna and (result._values == 0).all():
728731
result = result.iloc[0:0]
729732

730-
# normalizing is by len of all (regardless of dropna)
731-
counts = np.array([len(ii)])
733+
# normalizing is by len of what gets included in the bins
734+
counts = result._values
732735

733736
else:
734737

pandas/core/base.py

Lines changed: 13 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1180,11 +1180,14 @@ def value_counts(
11801180
Sort by frequencies.
11811181
ascending : bool, default False
11821182
Sort in ascending order.
1183-
bins : int, optional
1184-
Rather than count values, group them into half-open bins,
1185-
a convenience for ``pd.cut``, only works with numeric data.
1183+
bins : integer or iterable of numeric, optional
1184+
Rather than count individual values, group them into half-open bins.
1185+
Only works with numeric data.
1186+
If int, interpreted as number of bins and will use `pd.cut`.
1187+
If iterable of numeric, will use provided numbers as bin endpoints.
11861188
dropna : bool, default True
11871189
Don't include counts of NaN.
1190+
If False and NaNs are present, NaN will be a key in the output.
11881191
11891192
Returns
11901193
-------
@@ -1230,6 +1233,13 @@ def value_counts(
12301233
(3.0, 4.0] 1
12311234
dtype: int64
12321235
1236+
Bins can also be an iterable of numbers. These numbers are treated
1237+
as endpoints for the intervals.
1238+
>>> s.value_counts(bins=[0, 2, 4, 9])
1239+
(2.0, 4.0] 3
1240+
(-0.001, 2.0] 2
1241+
(4.0, 9.0] 0
1242+
dtype: int64
12331243
**dropna**
12341244
12351245
With `dropna` set to `False` we can also see NaN index values.

pandas/tests/base/test_value_counts.py

Lines changed: 28 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -191,6 +191,34 @@ def test_value_counts_bins(index_or_series):
191191
assert s.nunique() == 0
192192

193193

194+
def test_value_counts_bins_nas():
195+
# GH25970, handle normalizing bins with NA's properly
196+
# First test that NA's are included appropriately
197+
rand_data = np.append(
198+
np.random.randint(1, 5, 50), [np.nan] * np.random.randint(1, 20)
199+
)
200+
s = Series(rand_data)
201+
assert s.value_counts(dropna=False).index.hasnans
202+
assert not s.value_counts(dropna=True).index.hasnans
203+
assert s.value_counts(dropna=False, bins=3).index.hasnans
204+
assert not s.value_counts(dropna=True, bins=3).index.hasnans
205+
assert s.value_counts(dropna=False, bins=[0, 1, 3, 6]).index.hasnans
206+
assert not s.value_counts(dropna=True, bins=[0, 1, 3, 6]).index.hasnans
207+
208+
# then verify specific example
209+
s2 = Series([1, 2, 2, 3, 3, 3, np.nan, np.nan, 4, 5])
210+
intervals = IntervalIndex.from_breaks([0.995, 2.333, 3.667, 5.0])
211+
expected_dropna = Series([0.375, 0.375, 0.25], intervals.take([1, 0, 2]))
212+
expected_keepna_vals = np.array([0.3, 0.3, 0.2, 0.2])
213+
tm.assert_series_equal(
214+
s2.value_counts(dropna=True, normalize=True, bins=3), expected_dropna
215+
)
216+
tm.assert_numpy_array_equal(
217+
s2.value_counts(dropna=False, normalize=True, bins=3).values,
218+
expected_keepna_vals,
219+
)
220+
221+
194222
def test_value_counts_datetime64(index_or_series):
195223
klass = index_or_series
196224

0 commit comments

Comments
 (0)