pandas-dev · mroeschke · Oct 4, 2022 · Sep 27, 2022 · Sep 27, 2022 · Sep 28, 2022
diff --git a/doc/source/whatsnew/v1.5.1.rst b/doc/source/whatsnew/v1.5.1.rst
@@ -78,6 +78,7 @@ Fixed regressions
 - Fixed regression causing an ``AttributeError`` during warning emitted if the provided table name in :meth:`DataFrame.to_sql` and the table name actually used in the database do not match (:issue:`48733`)
 - Fixed :meth:`.DataFrameGroupBy.size` not returning a Series when ``axis=1`` (:issue:`48738`)
 - Fixed Regression in :meth:`DataFrameGroupBy.apply` when user defined function is called on an empty dataframe (:issue:`47985`)
+- Fixed regression in :meth:`Series.groupby` and :meth:`DataFrame.groupby` with ``dropna=False`` when the grouper contains null values and either has a nullable data type (e.g. :class:`Int64`) or is a PyArrow-backed string array (:issue:`48794`)
 
 .. ---------------------------------------------------------------------------
 

diff --git a/pandas/core/arrays/arrow/array.py b/pandas/core/arrays/arrow/array.py
@@ -619,8 +619,8 @@ def factorize(
                 na_mask = indices.values == -1
                 na_index = na_mask.argmax()
                 if na_mask[na_index]:
-                    uniques = uniques.insert(na_index, self.dtype.na_value)
-                    na_code = 0 if na_index == 0 else indices[:na_index].argmax() + 1
+                    na_code = 0 if na_index == 0 else indices[:na_index].max() + 1
+                    uniques = uniques.insert(na_code, self.dtype.na_value)
                     indices[indices >= na_code] += 1
                     indices[indices == -1] = na_code
         else:

diff --git a/pandas/core/arrays/masked.py b/pandas/core/arrays/masked.py
@@ -913,7 +913,7 @@ def factorize(
             else:
                 # mypy error: Slice index must be an integer or None
                 # https://github.com/python/mypy/issues/2410
-                na_code = codes[:na_index].argmax() + 1  # type: ignore[misc]
+                na_code = codes[:na_index].max() + 1  # type: ignore[misc]
             codes[codes >= na_code] += 1
             codes[codes == -1] = na_code
             # dummy value for uniques; not used since uniques_mask will be True

diff --git a/pandas/tests/groupby/test_groupby_dropna.py b/pandas/tests/groupby/test_groupby_dropna.py
@@ -393,75 +393,86 @@ def test_groupby_drop_nan_with_multi_index():
     tm.assert_frame_equal(result, expected)
 
 
+# Test every length-4 sequence of three values (e.g. 1, 2, and NA). String labels
+# stand in for the values so one parametrization covers all dtypes; "z" is always NA.
+@pytest.mark.parametrize("sequence0", ["x", "y", "z"])
+@pytest.mark.parametrize("sequence1", ["x", "y", "z"])
+@pytest.mark.parametrize("sequence2", ["x", "y", "z"])
+@pytest.mark.parametrize("sequence3", ["x", "y", "z"])
 @pytest.mark.parametrize(
-    "values, dtype",
+    "uniques, dtype",
     [
-        ([2, np.nan, 1, 2], None),
-        ([2, np.nan, 1, 2], "UInt8"),
-        ([2, np.nan, 1, 2], "Int8"),
-        ([2, np.nan, 1, 2], "UInt16"),
-        ([2, np.nan, 1, 2], "Int16"),
-        ([2, np.nan, 1, 2], "UInt32"),
-        ([2, np.nan, 1, 2], "Int32"),
-        ([2, np.nan, 1, 2], "UInt64"),
-        ([2, np.nan, 1, 2], "Int64"),
-        ([2, np.nan, 1, 2], "Float32"),
-        ([2, np.nan, 1, 2], "Int64"),
-        ([2, np.nan, 1, 2], "Float64"),
+        ({"x": 1, "y": 2, "z": np.nan}, None),
+        ({"x": 1, "y": 2, "z": pd.NA}, "UInt8"),
+        ({"x": 1, "y": 2, "z": pd.NA}, "Int8"),
+        ({"x": 1, "y": 2, "z": pd.NA}, "UInt16"),
+        ({"x": 1, "y": 2, "z": pd.NA}, "Int16"),
+        ({"x": 1, "y": 2, "z": pd.NA}, "UInt32"),
+        ({"x": 1, "y": 2, "z": pd.NA}, "Int32"),
+        ({"x": 1, "y": 2, "z": pd.NA}, "UInt64"),
+        ({"x": 1, "y": 2, "z": pd.NA}, "Int64"),
+        ({"x": 1, "y": 2, "z": pd.NA}, "Float32"),
+        ({"x": 1, "y": 2, "z": pd.NA}, "Int64"),
+        ({"x": 1, "y": 2, "z": pd.NA}, "Float64"),
+        ({"x": "x", "y": "y", "z": None}, "category"),
+        ({"x": "x", "y": "y", "z": pd.NA}, "string"),
         pytest.param(
-            ["y", None, "x", "y"],
-            "category",
-            marks=pytest.mark.xfail(
-                reason="dropna=False not correct for categorical, GH#48645"
-            ),
-        ),
-        (["y", pd.NA, "x", "y"], "string"),
-        pytest.param(
-            ["y", pd.NA, "x", "y"],
+            {"x": "x", "y": "y", "z": pd.NA},
             "string[pyarrow]",
             marks=pytest.mark.skipif(
                 pa_version_under1p01, reason="pyarrow is not installed"
             ),
         ),
         (
-            ["2016-01-01", np.datetime64("NaT"), "2017-01-01", "2016-01-01"],
+            {"x": "2016-01-01", "y": "2017-01-01", "z": np.datetime64("NaT")},
             "datetime64[ns]",
         ),
         (
-            [
-                pd.Period("2012-02-01", freq="D"),
-                pd.NaT,
-                pd.Period("2012-01-01", freq="D"),
-                pd.Period("2012-02-01", freq="D"),
-            ],
+            {
+                "x": pd.Period("2012-01-01", freq="D"),
+                "y": pd.Period("2012-02-01", freq="D"),
+                "z": pd.NaT,
+            },
             None,
         ),
-        (pd.arrays.SparseArray([2, np.nan, 1, 2]), None),
     ],
 )
-@pytest.mark.parametrize("test_series", [True, False])
-def test_no_sort_keep_na(values, dtype, test_series):
-    # GH#46584
-    key = pd.Series(values, dtype=dtype)
-    df = pd.DataFrame({"key": key, "a": [1, 2, 3, 4]})
+def test_no_sort_keep_na(
+    request, sequence0, sequence1, sequence2, sequence3, uniques, dtype
+):
+    # GH#46584, GH#48794
+    sequence = "".join([sequence0, sequence1, sequence2, sequence3])
+    if dtype == "category" and "z" in sequence:
+        # Only xfail when nulls are present
+        msg = "dropna=False not correct for categorical, GH#48645"
+        request.node.add_marker(pytest.mark.xfail(reason=msg))
+    if dtype == "datetime64[ns]" and sequence == "zzzz":
+        msg = "Cannot construct datetime of all nulls"
+        request.node.add_marker(pytest.mark.xfail(reason=msg))
+    weights = {"x": 1, "y": 2, "z": 3}
+
+    key = pd.Series([uniques[label] for label in sequence], dtype=dtype)
+    df = pd.DataFrame({"key": key, "a": [weights[label] for label in sequence]})
     gb = df.groupby("key", dropna=False, sort=False)
-    if test_series:
-        gb = gb["a"]
-
-    warn = None
-    if isinstance(values, pd.arrays.SparseArray):
-        warn = FutureWarning
-    msg = "passing a SparseArray to pd.Index will store that array directly"
-    with tm.assert_produces_warning(warn, match=msg):
-        result = gb.sum()
-        expected = pd.DataFrame({"a": [5, 2, 3]}, index=key[:-1].rename("key"))
 
-    if test_series:
-        expected = expected["a"]
-    if expected.index.is_categorical():
-        # TODO: Slicing reorders categories?
-        expected.index = expected.index.reorder_categories(["y", "x"])
-    tm.assert_equal(result, expected)
+    result = gb.sum()
+    # Manually compute the groupby sum, use the labels "x", "y", and "z" to avoid
+    # issues with hashing np.nan
+    summed = {}
+    for label in sequence:
+        summed[label] = summed.get(label, 0) + weights[label]
+
+    if dtype == "category":
+        index = pd.CategoricalIndex(
+            [uniques[label] for label in summed],
+            # Get the nonnull categories in the order they appear ignoring duplicates
+            list({uniques[k]: 0 for k in sequence if not pd.isnull(uniques[k])}),
+            name="key",
+        )
+    else:
+        index = pd.Index([uniques[label] for label in summed], dtype=dtype, name="key")
+    expected = pd.Series(summed.values(), index=index, name="a", dtype=None).to_frame()
+    tm.assert_frame_equal(result, expected)
 
 
 @pytest.mark.parametrize("test_series", [True, False])