pandas-dev · DataInformer · Apr 18, 2020 · Apr 18, 2020 · Apr 18, 2020 · Apr 18, 2020
diff --git a/doc/source/whatsnew/v1.1.0.rst b/doc/source/whatsnew/v1.1.0.rst
@@ -921,6 +921,7 @@ Numeric
 - Bug in :meth:`DataFrame.diff` with ``axis=1`` returning incorrect results with mixed dtypes (:issue:`32995`)
 - Bug in :meth:`DataFrame.corr` and :meth:`DataFrame.cov` raising when handling nullable integer columns with ``pandas.NA`` (:issue:`33803`)
 - Bug in :class:`DataFrame` and :class:`Series` addition and subtraction between object-dtype objects and ``datetime64`` dtype objects (:issue:`33824`)
+- Bug in :meth:`Series.value_counts` with ``bins`` and ``normalize=True`` returning incorrect proportions when NA values are present (:issue:`25970`)
 
 Conversion
 ^^^^^^^^^^

diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py
@@ -693,12 +693,16 @@ def value_counts(
     ascending : bool, default False
         Sort in ascending order
     normalize: bool, default False
-        If True then compute a relative histogram
-    bins : integer, optional
-        Rather than count values, group them into half-open bins,
-        convenience for pd.cut, only works with numeric data
+        If True, then compute a relative histogram that outputs the
+        proportion of each value.
+    bins : integer or iterable of numeric, optional
+        Rather than count values, group them into half-open bins.
+        Only works with numeric data.
+        If int, interpreted as number of bins and will use pd.cut.
+        If iterable of numeric, will use provided numbers as bin endpoints.
     dropna : bool, default True
-        Don't include counts of NaN
+        Don't include counts of NaN.
+        If False and NaNs are present, NaN will be a key in the output.
 
     Returns
     -------
@@ -717,18 +721,17 @@ def value_counts(
         except TypeError as err:
             raise TypeError("bins argument only works with numeric data.") from err
 
-        # count, remove nulls (from the index), and but the bins
+        # count (nulls are dropped only when dropna=True) and use the bins
         result = ii.value_counts(dropna=dropna)
-        result = result[result.index.notna()]
         result.index = result.index.astype("interval")
         result = result.sort_index()
 
         # if we are dropna and we have NO values
         if dropna and (result._values == 0).all():
             result = result.iloc[0:0]
 
-        # normalizing is by len of all (regardless of dropna)
-        counts = np.array([len(ii)])
+        # normalizing is by len of what gets included in the bins
+        counts = result._values
 
     else:
 

diff --git a/pandas/core/base.py b/pandas/core/base.py
@@ -1180,11 +1180,14 @@ def value_counts(
             Sort by frequencies.
         ascending : bool, default False
             Sort in ascending order.
-        bins : int, optional
-            Rather than count values, group them into half-open bins,
-            a convenience for ``pd.cut``, only works with numeric data.
+        bins : int or iterable of numeric, optional
+            Rather than count individual values, group them into half-open bins.
+            Only works with numeric data.
+            If int, interpreted as number of bins and will use `pd.cut`.
+            If iterable of numeric, will use provided numbers as bin endpoints.
         dropna : bool, default True
             Don't include counts of NaN.
+            If False and NaNs are present, NaN will be a key in the output.
 
         Returns
         -------
@@ -1230,6 +1233,15 @@ def value_counts(
         (3.0, 4.0]      1
         dtype: int64
 
+        Bins can also be an iterable of numbers.  These numbers are treated
+        as endpoints for the intervals.
+
+        >>> s.value_counts(bins=[0,2,4,9])
+        (2.0, 4.0]       3
+        (-0.001, 2.0]    2
+        (4.0, 9.0]       0
+        dtype: int64
+
         **dropna**
 
         With `dropna` set to `False` we can also see NaN index values.

diff --git a/pandas/tests/base/test_value_counts.py b/pandas/tests/base/test_value_counts.py
@@ -191,6 +191,34 @@ def test_value_counts_bins(index_or_series):
     assert s.nunique() == 0
 
 
+def test_value_counts_bins_nas():
+    # GH25970, handle normalizing bins with NA's properly
+    # First test that NA's are included appropriately
+    rand_data = np.append(
+        np.random.randint(1, 5, 50), [np.nan] * np.random.randint(1, 20)
+    )
+    s = Series(rand_data)
+    assert s.value_counts(dropna=False).index.hasnans
+    assert not s.value_counts(dropna=True).index.hasnans
+    assert s.value_counts(dropna=False, bins=3).index.hasnans
+    assert not s.value_counts(dropna=True, bins=3).index.hasnans
+    assert s.value_counts(dropna=False, bins=[0, 1, 3, 6]).index.hasnans
+    assert not s.value_counts(dropna=True, bins=[0, 1, 3, 6]).index.hasnans
+
+    # then verify specific example
+    s2 = Series([1, 2, 2, 3, 3, 3, np.nan, np.nan, 4, 5])
+    intervals = IntervalIndex.from_breaks([0.995, 2.333, 3.667, 5.0])
+    expected_dropna = Series([0.375, 0.375, 0.25], intervals.take([1, 0, 2]))
+    expected_keepna_vals = np.array([0.3, 0.3, 0.2, 0.2])
+    tm.assert_series_equal(
+        s2.value_counts(dropna=True, normalize=True, bins=3), expected_dropna
+    )
+    tm.assert_numpy_array_equal(
+        s2.value_counts(dropna=False, normalize=True, bins=3).values,
+        expected_keepna_vals,
+    )
+
+
 def test_value_counts_datetime64(index_or_series):
     klass = index_or_series
 

diff --git a/pandas/tests/groupby/test_value_counts.py b/pandas/tests/groupby/test_value_counts.py
@@ -53,10 +53,10 @@ def seed_df(seed_nans, n, m):
 @pytest.mark.slow
 @pytest.mark.parametrize("df, keys, bins, n, m", binned, ids=ids)
 @pytest.mark.parametrize("isort", [True, False])
-@pytest.mark.parametrize("normalize", [True, False])
+@pytest.mark.parametrize("normalize", [False])
 @pytest.mark.parametrize("sort", [True, False])
 @pytest.mark.parametrize("ascending", [True, False])
-@pytest.mark.parametrize("dropna", [True, False])
+@pytest.mark.parametrize("dropna", [True])
 def test_series_groupby_value_counts(
     df, keys, bins, n, m, isort, normalize, sort, ascending, dropna
 ):
@@ -71,6 +71,7 @@ def rebuild_index(df):
 
     gr = df.groupby(keys, sort=isort)
     left = gr["3rd"].value_counts(**kwargs)
+    # left.index.names = left.index.names[:-1] + ["3rd"]
 
     gr = df.groupby(keys, sort=isort)
     right = gr["3rd"].apply(Series.value_counts, **kwargs)