include nan count when dropna=False

DataInformer · DataInformer · commit f5e9aeb08a60 · 2020-06-27T15:57:09.000-04:00
diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py
@@ -693,12 +693,16 @@ def value_counts(
     ascending : bool, default False
         Sort in ascending order
     normalize: bool, default False
-        If True then compute a relative histogram
-    bins : integer, optional
-        Rather than count values, group them into half-open bins,
-        convenience for pd.cut, only works with numeric data
+        If True, then compute a relative histogram that outputs the
+        proportion of each value.
+    bins : int or iterable of numeric, optional
+        Rather than count values, group them into half-open bins.
+        Only works with numeric data.
+        If int, interpreted as number of bins and will use pd.cut.
+        If iterable of numeric, will use provided numbers as bin endpoints.
     dropna : bool, default True
-        Don't include counts of NaN
+        Don't include counts of NaN.
+        If False and NaNs are present, NaN will be a key in the output.
 
     Returns
     -------
@@ -717,18 +721,17 @@ def value_counts(
         except TypeError as err:
             raise TypeError("bins argument only works with numeric data.") from err
 
-        # count, remove nulls (from the index), and but the bins
+        # count (nulls stay in the index unless dropna), and use the bins
         result = ii.value_counts(dropna=dropna)
-        result = result[result.index.notna()]
         result.index = result.index.astype("interval")
         result = result.sort_index()
 
         # if we are dropna and we have NO values
         if dropna and (result._values == 0).all():
             result = result.iloc[0:0]
 
-        # normalizing is by len of all (regardless of dropna)
-        counts = np.array([len(ii)])
+        # normalizing is by len of what gets included in the bins
+        counts = result._values
 
     else:
 
diff --git a/pandas/core/base.py b/pandas/core/base.py
@@ -1180,11 +1180,14 @@ def value_counts(
             Sort by frequencies.
         ascending : bool, default False
             Sort in ascending order.
-        bins : int, optional
-            Rather than count values, group them into half-open bins,
-            a convenience for ``pd.cut``, only works with numeric data.
+        bins : int or iterable of numeric, optional
+            Rather than count individual values, group them into half-open bins.
+            Only works with numeric data.
+            If int, interpreted as number of bins and will use `pd.cut`.
+            If iterable of numeric, will use provided numbers as bin endpoints.
         dropna : bool, default True
             Don't include counts of NaN.
+            If False and NaNs are present, NaN will be a key in the output.
 
         Returns
         -------
@@ -1230,6 +1233,15 @@ def value_counts(
         (3.0, 4.0]      1
         dtype: int64
 
+        Bins can also be an iterable of numbers. These numbers are treated
+        as endpoints for the intervals.
+
+        >>> s.value_counts(bins=[0, 2, 4, 9])
+        (2.0, 4.0]       3
+        (-0.001, 2.0]    2
+        (4.0, 9.0]       0
+        dtype: int64
+
         **dropna**
 
         With `dropna` set to `False` we can also see NaN index values.
diff --git a/pandas/tests/base/test_value_counts.py b/pandas/tests/base/test_value_counts.py
@@ -191,6 +191,34 @@ def test_value_counts_bins(index_or_series):
     assert s.nunique() == 0
 
 
+def test_value_counts_bins_nas():
+    # GH25970, handle normalizing bins with NA's properly
+    # First test that NA's are included appropriately
+    rand_data = np.append(
+        np.random.randint(1, 5, 50), [np.nan] * np.random.randint(1, 20)
+    )
+    s = Series(rand_data)
+    assert s.value_counts(dropna=False).index.hasnans
+    assert not s.value_counts(dropna=True).index.hasnans
+    assert s.value_counts(dropna=False, bins=3).index.hasnans
+    assert not s.value_counts(dropna=True, bins=3).index.hasnans
+    assert s.value_counts(dropna=False, bins=[0, 1, 3, 6]).index.hasnans
+    assert not s.value_counts(dropna=True, bins=[0, 1, 3, 6]).index.hasnans
+
+    # then verify specific example
+    s2 = Series([1, 2, 2, 3, 3, 3, np.nan, np.nan, 4, 5])
+    intervals = IntervalIndex.from_breaks([0.995, 2.333, 3.667, 5.0])
+    expected_dropna = Series([0.375, 0.375, 0.25], intervals.take([1, 0, 2]))
+    expected_keepna_vals = np.array([0.3, 0.3, 0.2, 0.2])
+    tm.assert_series_equal(
+        s2.value_counts(dropna=True, normalize=True, bins=3), expected_dropna
+    )
+    tm.assert_numpy_array_equal(
+        s2.value_counts(dropna=False, normalize=True, bins=3).values,
+        expected_keepna_vals,
+    )
+
+
 def test_value_counts_datetime64(index_or_series):
     klass = index_or_series