pandas-dev · mroeschke · Oct 22, 2023 · Sep 12, 2023 · Sep 12, 2023 · Sep 14, 2023
diff --git a/pandas/core/_numba/extensions.py b/pandas/core/_numba/extensions.py
@@ -13,10 +13,8 @@
 import operator
 
 import numba
-from numba.core import (
-    cgutils,
-    types,
-)
+from numba import types
+from numba.core import cgutils
 from numba.core.datamodel import models
 from numba.core.extending import (
     NativeValue,
@@ -40,7 +38,7 @@
 
 
 # TODO: Range index support
-# (not passing an index to series constructor doesn't work)
+# (this currently lowers OK, but does not round-trip)
 class IndexType(types.Type):
     """
     The type class for Index objects.
@@ -149,6 +147,7 @@ def typer(data, hashmap=None):
 @register_model(IndexType)
 class IndexModel(models.StructModel):
     def __init__(self, dmm, fe_type) -> None:
+        # We don't want the numpy string scalar type in our hashmap
         members = [
             ("data", fe_type.as_array),
             # This is an attempt to emulate our hashtable code with a numba
@@ -240,6 +239,25 @@ def index_impl(data):
     return context.compile_internal(builder, index_impl, sig, args)
 
 
+# Helper to convert the unicodecharseq (numpy string scalar) into a unicode_type
+# (regular string)
+
+
+def maybe_cast_str(x):
+    # Dummy function that numba can overload
+    pass
+
+
+@overload(maybe_cast_str)
+def maybe_cast_str_impl(x):
+    """Converts numba UnicodeCharSeq (numpy string scalar) -> unicode type (string).
+    Is a no-op for other types."""
+    if isinstance(x, types.UnicodeCharSeq):
+        return lambda x: str(x)
+    else:
+        return lambda x: x
+
+
 @unbox(IndexType)
 def unbox_index(typ, obj, c):
     """
@@ -426,8 +444,12 @@ def series_binop_impl(series1, value):
 series_reductions = [
     ("sum", np.sum),
     ("mean", np.mean),
-    ("std", np.std),
-    ("var", np.var),
+    # Disabled due to discrepancies between numba std. dev
+    # and pandas std. dev (no way to specify dof)
+    # ("std", np.std),
+    # ("var", np.var),
+    ("min", np.min),
+    ("max", np.max),
 ]
 for reduction, reduction_method in series_reductions:
     generate_series_reduction(reduction, reduction_method)

diff --git a/pandas/core/apply.py b/pandas/core/apply.py
@@ -1075,6 +1075,14 @@ def apply_series_generator(self) -> tuple[ResType, Index]:
         return results, res_index
 
     def apply_series_numba(self):
+        if self.engine_kwargs.get("parallel", False):
+            raise NotImplementedError(
+                "Parallel apply is not supported when raw=False and engine='numba'"
+            )
+        if not self.obj.index.is_unique or not self.columns.is_unique:
+            raise NotImplementedError(
+                "The index/columns must be unique when raw=False and engine='numba'"
+            )
         results = self.apply_with_numba()
         return results, self.result_index
 
@@ -1128,6 +1136,7 @@ def generate_numba_apply_func(
         # This isn't an entrypoint since we don't want users
         # using Series/DF in numba code outside of apply
         from pandas.core._numba.extensions import SeriesType  # noqa: F401
+        from pandas.core._numba.extensions import maybe_cast_str
 
         numba = import_optional_dependency("numba")
 
@@ -1138,7 +1147,9 @@ def numba_func(values, col_names, df_index):
             results = {}
             for j in range(values.shape[1]):
                 # Create the series
-                ser = Series(values[:, j], index=df_index, name=str(col_names[j]))
+                ser = Series(
+                    values[:, j], index=df_index, name=maybe_cast_str(col_names[j])
+                )
                 results[j] = jitted_udf(ser)
             return results
 
@@ -1148,23 +1159,40 @@ def apply_with_numba(self) -> dict[int, Any]:
         nb_func = self.generate_numba_apply_func(
             cast(Callable, self.func), **self.engine_kwargs
         )
-        orig_values = self.columns.to_numpy()
-        fixed_cols = False
-        if orig_values.dtype == object:
-            if not lib.is_string_array(orig_values):
+        # Since numpy/numba doesn't support object array of strings well
+        # we'll do a sketchy thing where if index._data is object
+        # we convert to string and directly set index._data to that,
+        # setting it back after we call the function
+        fixed_obj_colnames = False
+        orig_cols = self.columns.to_numpy()
+        if self.columns._data.dtype == object:
+            if not lib.is_string_array(orig_cols):
                 raise ValueError(
                     "The numba engine only supports "
                     "using string or numeric column names"
                 )
-            col_names_values = orig_values.astype("U")
-            # Remember to set this back!
-            self.columns._data = col_names_values
-            fixed_cols = True
+            # Remember to set this back!!!
+            self.columns._data = orig_cols.astype("U")
+            fixed_obj_colnames = True
+
+        fixed_obj_index = False
+        orig_index = self.index.to_numpy()
+        if self.obj.index._data.dtype == object:
+            if not lib.is_string_array(orig_index):
+                raise ValueError(
+                    "The numba engine only supports "
+                    "using string or numeric index values"
+                )
+            # Remember to set this back!!!
+            self.obj.index._data = orig_index.astype("U")
+            fixed_obj_index = True
         df_index = self.obj.index
 
         res = dict(nb_func(self.values, self.columns, df_index))
-        if fixed_cols:
-            self.columns._data = orig_values
+        if fixed_obj_colnames:
+            self.columns._data = orig_cols
+        if fixed_obj_index:
+            self.obj.index._data = orig_index
         return res
 
     @property
@@ -1260,6 +1288,7 @@ def generate_numba_apply_func(
         # using Series/DF in numba code outside of apply
         from pandas import Series
         from pandas.core._numba.extensions import SeriesType  # noqa: F401
+        from pandas.core._numba.extensions import maybe_cast_str
 
         numba = import_optional_dependency("numba")
 
@@ -1271,7 +1300,11 @@ def numba_func(values, col_names_index, index):
             for i in range(values.shape[0]):
                 # Create the series
                 # TODO: values corrupted without the copy
-                ser = Series(values[i].copy(), index=col_names_index, name=index[i])
+                ser = Series(
+                    values[i].copy(),
+                    index=col_names_index,
+                    name=maybe_cast_str(index[i]),
+                )
                 results[i] = jitted_udf(ser)
 
             return results
@@ -1287,24 +1320,39 @@ def apply_with_numba(self) -> dict[int, Any]:
         # we'll do a sketchy thing where if index._data is object
         # we convert to string and directly set index._data to that,
         # setting it back after we call the function
-        fixed_obj_dtype = False
-        orig_data = self.columns.to_numpy()
+        fixed_obj_colnames = False
+        orig_cols = self.columns.to_numpy()
         if self.columns._data.dtype == object:
-            if not lib.is_string_array(orig_data):
+            if not lib.is_string_array(orig_cols):
                 raise ValueError(
                     "The numba engine only supports "
                     "using string or numeric column names"
                 )
             # Remember to set this back!!!
-            self.columns._data = orig_data.astype("U")
-            fixed_obj_dtype = True
+            self.columns._data = orig_cols.astype("U")
+            fixed_obj_colnames = True
+
+        fixed_obj_index = False
+        orig_index = self.index.to_numpy()
+        if self.obj.index._data.dtype == object:
+            if not lib.is_string_array(orig_index):
+                raise ValueError(
+                    "The numba engine only supports "
+                    "using string or numeric index values"
+                )
+            # Remember to set this back!!!
+            self.obj.index._data = orig_index.astype("U")
+            fixed_obj_index = True
 
         # Convert from numba dict to regular dict
         # Our isinstance checks in the df constructor don't pass for numbas typed dict
         res = dict(nb_func(self.values, self.columns, self.obj.index))
 
-        if fixed_obj_dtype:
-            self.columns._data = orig_data
+        if fixed_obj_colnames:
+            self.columns._data = orig_cols
+
+        if fixed_obj_index:
+            self.obj.index._data = orig_index
 
         return res
 

diff --git a/pandas/tests/apply/conftest.py b/pandas/tests/apply/conftest.py
@@ -16,3 +16,15 @@ def int_frame_const_col():
         columns=["A", "B", "C"],
     )
     return df
+
+
+@pytest.fixture(params=["python", "numba"])
+def engine(request):
+    if request.param == "numba":
+        pytest.importorskip("numba")
+    return request.param
+
+
+@pytest.fixture(params=[0, 1])
+def apply_axis(request):
+    return request.param
diff --git a/pandas/tests/apply/test_frame_apply.py b/pandas/tests/apply/test_frame_apply.py
@@ -18,13 +18,6 @@
 from pandas.tests.frame.common import zip_frames
 
 
-@pytest.fixture(params=["python", "numba"])
-def engine(request):
-    if request.param == "numba":
-        pytest.importorskip("numba")
-    return request.param
-
-
 def test_apply(float_frame, engine, request):
     if engine == "numba":
         mark = pytest.mark.xfail(reason="numba engine not supporting numpy ufunc yet")
@@ -102,7 +95,7 @@ def test_apply_mixed_datetimelike():
 
 
 @pytest.mark.parametrize("func", [np.sqrt, np.mean])
-def test_apply_empty(func, engine=engine):
+def test_apply_empty(func, engine):
     # empty
     empty_frame = DataFrame()
 
@@ -983,15 +976,17 @@ def test_result_type_shorter_list(int_frame_const_col):
     tm.assert_frame_equal(result, expected)
 
 
-def test_result_type_broadcast(int_frame_const_col, request):
+def test_result_type_broadcast(int_frame_const_col, request, engine):
     # result_type should be consistent no matter which
     # path we take in the code
     if engine == "numba":
         mark = pytest.mark.xfail(reason="numba engine doesn't support list return")
         request.node.add_marker(mark)
     df = int_frame_const_col
     # broadcast result
-    result = df.apply(lambda x: [1, 2, 3], axis=1, result_type="broadcast")
+    result = df.apply(
+        lambda x: [1, 2, 3], axis=1, result_type="broadcast", engine=engine
+    )
     expected = df.copy()
     tm.assert_frame_equal(result, expected)
 
@@ -1550,8 +1545,13 @@ def sum_div2(s):
     tm.assert_frame_equal(result, expected)
 
 
-def test_apply_getitem_axis_1(engine):
+def test_apply_getitem_axis_1(engine, request):
     # GH 13427
+    if engine == "numba":
+        mark = pytest.mark.xfail(
+            reason="numba engine not supporting duplicate index values"
+        )
+        request.node.add_marker(mark)
     df = DataFrame({"a": [0, 1, 2], "b": [1, 2, 3]})
     result = df[["a", "a"]].apply(
         lambda x: x.iloc[0] + x.iloc[1], axis=1, engine=engine

diff --git a/pandas/tests/apply/test_numba.py b/pandas/tests/apply/test_numba.py
@@ -0,0 +1,64 @@
+import numpy as np
+import pytest
+
+from pandas import DataFrame
+import pandas._testing as tm
+
+
+def test_numba_vs_python_noop(float_frame, apply_axis):
+    func = lambda x: x
+    result = float_frame.apply(func, engine="numba", axis=apply_axis)
+    expected = float_frame.apply(func, engine="python", axis=apply_axis)
+    tm.assert_frame_equal(result, expected)
+
+
+def test_numba_vs_python_indexing(float_frame):
+    row_func = lambda x: x["A"]
+    result = float_frame.apply(row_func, engine="numba", axis=1)
+    expected = float_frame.apply(row_func, engine="python", axis=1)
+    tm.assert_series_equal(result, expected)
+
+    row_func = lambda x: x["ZqgszYBfuL"]  # This is a label in the index
+    result = float_frame.apply(row_func, engine="numba", axis=0)
+    expected = float_frame.apply(row_func, engine="python", axis=0)
+    tm.assert_series_equal(result, expected)
+
+
+@pytest.mark.parametrize(
+    "reduction",
+    [lambda x: x.mean(), lambda x: x.min(), lambda x: x.max(), lambda x: x.sum()],
+)
+def test_numba_vs_python_reductions(float_frame, reduction, apply_axis):
+    result = float_frame.apply(reduction, engine="numba", axis=apply_axis)
+    expected = float_frame.apply(reduction, engine="python", axis=apply_axis)
+    tm.assert_series_equal(result, expected)
+
+
+@pytest.mark.parametrize("colnames", [[1, 2, 3], [1.0, 2.0, 3.0]])
+def test_numba_numeric_colnames(colnames):
+    # Check that numeric column names lower properly and can be indexed on
+    df = DataFrame(np.array([[1, 2, 3], [4, 5, 6], [7, 8, 9]]), columns=colnames)
+    first_col = colnames[0]
+    f = lambda x: x[first_col]  # Get the first column
+    result = df.apply(f, engine="numba", axis=1)
+    expected = df.apply(f, engine="python", axis=1)
+    tm.assert_series_equal(result, expected)
+
+
+def test_numba_parallel_unsupported(float_frame):
+    f = lambda x: x
+    with pytest.raises(
+        NotImplementedError,
+        match="Parallel apply is not supported when raw=False and engine='numba'",
+    ):
+        float_frame.apply(f, engine="numba", engine_kwargs={"parallel": True})
+
+
+def test_numba_nonunique_unsupported(apply_axis):
+    # Duplicate row labels must be rejected before any numba compilation.
+    # "parallel" is deliberately left unset: apply_series_numba checks it
+    # first, so enabling it would raise the parallel error instead.
+    df = DataFrame({"a": [1, 2]}, index=["a", "a"])
+    msg = "The index/columns must be unique when raw=False and engine='numba'"
+    with pytest.raises(NotImplementedError, match=msg):
+        df.apply(lambda x: x, engine="numba", axis=apply_axis)