Refactored _get_cythonized_result

rhshadrach · rhshadrach · commit f2bb7a952f92 · 2020-05-27T17:20:32.000-04:00
diff --git a/pandas/_libs/groupby.pyx b/pandas/_libs/groupby.pyx
@@ -718,7 +718,6 @@ def group_quantile(floating[:, :] out,
                    int64_t[:] counts,
                    floating[:, :] values,
                    const int64_t[:] labels,
-                   Py_ssize_t min_count,
                    const uint8_t[:, :] mask,
                    float64_t q,
                    object interpolation):
diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py
@@ -2059,6 +2059,7 @@ def post_processor(vals: np.ndarray, inference: Optional[Type]) -> np.ndarray:
             return self._get_cythonized_result(
                 "group_quantile",
                 aggregate=True,
+                needs_counts=True,
                 needs_values=True,
                 needs_mask=True,
                 cython_dtype=np.dtype(np.float64),
@@ -2072,6 +2073,7 @@ def post_processor(vals: np.ndarray, inference: Optional[Type]) -> np.ndarray:
                 self._get_cythonized_result(
                     "group_quantile",
                     aggregate=True,
+                    needs_counts=True,
                     needs_values=True,
                     needs_mask=True,
                     cython_dtype=np.dtype(np.float64),
@@ -2348,9 +2350,10 @@ def _get_cythonized_result(
         how: str,
         cython_dtype: np.dtype,
         aggregate: bool = False,
+        needs_counts: bool = False,
         needs_values: bool = False,
+        min_count: Optional[int] = None,
         needs_mask: bool = False,
-        needs_ngroups: bool = False,
         result_is_index: bool = False,
         pre_processing=None,
         post_processing=None,
@@ -2367,14 +2370,16 @@ def _get_cythonized_result(
         aggregate : bool, default False
             Whether the result should be aggregated to match the number of
             groups
+        needs_counts : bool, default False
+            Whether the counts should be a part of the Cython call signature
         needs_values : bool, default False
             Whether the values should be a part of the Cython call
             signature
+        min_count : int, default None
+            When not None, passed positionally to the Cython call after labels
         needs_mask : bool, default False
             Whether boolean mask needs to be part of the Cython call
             signature
-        needs_ngroups : bool, default False
-            Whether number of groups is part of the Cython call signature
         result_is_index : bool, default False
             Whether the result of the Cython operation is an index of
             values to be retrieved, instead of the actual values themselves
@@ -2414,74 +2419,63 @@ def _get_cythonized_result(
         labels, _, ngroups = grouper.group_info
         output: Dict[base.OutputKey, np.ndarray] = {}
         base_func = getattr(libgroupby, how)
+        inferences = None
 
-        if how == "group_quantile":
-            values = self._obj_with_exclusions._values
-            result_sz = ngroups if aggregate else len(values)
+        values = self._obj_with_exclusions._values
+        result_sz = ngroups if aggregate else len(values)
+        if self._obj_with_exclusions.ndim == 1:
+            width = 1
+        else:
+            width = len(self._obj_with_exclusions.columns)
+        result = np.zeros((result_sz, width), dtype=cython_dtype)
+        func = partial(base_func, result)
 
-            vals, inferences = pre_processing(values)
-            if self._obj_with_exclusions.ndim == 1:
-                width = 1
-                vals = np.reshape(vals, (-1, 1))
-            else:
-                width = len(self._obj_with_exclusions.columns)
-            result = np.zeros((result_sz, width), dtype=cython_dtype)
+        if needs_counts:
             counts = np.zeros(self.ngroups, dtype=np.int64)
-            mask = isna(vals).view(np.uint8)
-
-            func = partial(base_func, result, counts, vals, labels, -1, mask)
-            func(**kwargs)  # Call func to modify indexer values in place
-            result = post_processing(result, inferences)
+            func = partial(func, counts)
 
+        if needs_values:
+            vals = values
+            if pre_processing:
+                vals, inferences = pre_processing(vals)
             if self._obj_with_exclusions.ndim == 1:
-                key = base.OutputKey(label=self._obj_with_exclusions.name, position=0)
-                output[key] = result[:, 0]
-            else:
-                for idx, name in enumerate(self._obj_with_exclusions.columns):
-                    key = base.OutputKey(label=name, position=idx)
-                    output[key] = result[:, idx]
+                vals = np.reshape(vals, (-1, 1))
+            func = partial(func, vals)
 
-            if aggregate:
-                return self._wrap_aggregated_output(output)
-            else:
-                return self._wrap_transformed_output(output)
+        # Groupby always needs labels
+        func = partial(func, labels)
 
-        for idx, obj in enumerate(self._iterate_slices()):
-            name = obj.name
-            values = obj._values
+        if min_count is not None:
+            func = partial(func, min_count)
 
-            if aggregate:
-                result_sz = ngroups
+        if needs_mask:
+            if self._obj_with_exclusions.ndim == 1:
+                # If needs_values is True, don't need to reshape again
+                if needs_values:
+                    mask = isna(vals).view(np.uint8)
+                else:
+                    mask = isna(np.reshape(values, (-1, 1))).view(np.uint8)
             else:
-                result_sz = len(values)
-
-            result = np.zeros(result_sz, dtype=cython_dtype)
-            func = partial(base_func, result, labels)
-            inferences = None
-
-            if needs_values:
-                vals = values
-                if pre_processing:
-                    vals, inferences = pre_processing(vals)
-                func = partial(func, vals)
-
-            if needs_mask:
                 mask = isna(values).view(np.uint8)
-                func = partial(func, mask)
-
-            if needs_ngroups:
-                func = partial(func, ngroups)
+            func = partial(func, mask)
 
-            func(**kwargs)  # Call func to modify indexer values in place
+        func(**kwargs)  # Call func to modify indexer values in place
 
-            if result_is_index:
-                result = algorithms.take_nd(values, result)
+        # TODO: Probably not correct: result is 2-D here but was 1-D per column
+        if result_is_index:
+            result = algorithms.take_nd(values, result)
 
-            if post_processing:
-                result = post_processing(result, inferences)
+        if post_processing:
+            result = post_processing(result, inferences)
 
-            key = base.OutputKey(label=name, position=idx)
-            output[key] = result
+        # TODO: Perhaps there is a better way to get result into output
+        if self._obj_with_exclusions.ndim == 1:
+            key = base.OutputKey(label=self._obj_with_exclusions.name, position=0)
+            output[key] = result[:, 0]
+        else:
+            for idx, name in enumerate(self._obj_with_exclusions.columns):
+                key = base.OutputKey(label=name, position=idx)
+                output[key] = result[:, idx]
 
         if aggregate:
             return self._wrap_aggregated_output(output)