pandas-dev · jreback · May 19, 2020 · Mar 17, 2020 · Mar 17, 2020 · Mar 17, 2020
diff --git a/asv_bench/benchmarks/arithmetic.py b/asv_bench/benchmarks/arithmetic.py
@@ -97,6 +97,54 @@ def time_frame_op_with_series_axis0(self, opname):
         getattr(self.df, opname)(self.ser, axis=0)
 
 
+class FrameWithFrameWide:
+    """Benchmark DataFrame-with-DataFrame ops on wide, mixed-dtype frames."""
+
+    params = [
+        [
+            operator.add,
+            operator.sub,
+            operator.mul,
+            operator.truediv,
+            operator.floordiv,
+            operator.pow,
+            operator.mod,
+            operator.eq,
+            operator.ne,
+            operator.gt,
+            operator.ge,
+            operator.lt,
+            operator.le,
+        ]
+    ]
+    param_names = ["op"]
+
+    def setup(self, op):
+        # we choose dtypes so as to make the blocks
+        #  a) not perfectly match between right and left
+        #  b) appreciably bigger than single columns
+        arr = np.random.randn(10 ** 6).reshape(500, 2000).astype(np.float64)
+        df = pd.DataFrame(arr)
+        df[1000] = df[1000].astype(np.float32)
+        df.iloc[:, 1000:] = df.iloc[:, 1000:].astype(np.float32)
+
+        # TODO: GH#33198 the setting here shouldn't need two steps
+        df2 = pd.DataFrame(arr)
+        df2[1000] = df2[1000].astype(np.int64)
+        df2.iloc[:, 500:1500] = df2.iloc[:, 500:1500].astype(np.int64)
+
+        self.left = df
+        self.right = df2  # must differ from left so the blocks do not line up
+
+    def time_op_different_blocks(self, op):
+        # blocks (and dtypes) are not aligned
+        op(self.left, self.right)
+
+    def time_op_same_blocks(self, op):
+        # blocks (and dtypes) are aligned
+        op(self.left, self.left)
+
+
 class Ops:
 
     params = [[True, False], ["default", 1]]

diff --git a/doc/source/whatsnew/v1.1.0.rst b/doc/source/whatsnew/v1.1.0.rst
@@ -275,7 +275,8 @@ Performance improvements
   :meth:`DataFrame.sparse.from_spmatrix` constructor (:issue:`32821`,
   :issue:`32825`,  :issue:`32826`, :issue:`32856`, :issue:`32858`).
 - Performance improvement in :meth:`Series.sum` for nullable (integer and boolean) dtypes (:issue:`30982`).
-
+- Performance improvement in arithmetic operations between two :class:`DataFrame` objects (:issue:`32779`).
+-
 
 .. ---------------------------------------------------------------------------
 

diff --git a/pandas/core/arrays/datetimelike.py b/pandas/core/arrays/datetimelike.py
@@ -1296,19 +1296,20 @@ def _addsub_object_array(self, other: np.ndarray, op):
         result : same class as self
         """
         assert op in [operator.add, operator.sub]
-        if len(other) == 1:
+        if len(other) == 1 and self.ndim == other.ndim == 1:
+            # If both 1D then broadcasting is unambiguous
             return op(self, other[0])
 
         warnings.warn(
-            "Adding/subtracting array of DateOffsets to "
+            "Adding/subtracting object-dtype array to "
             f"{type(self).__name__} not vectorized",
             PerformanceWarning,
         )
 
         # Caller is responsible for broadcasting if necessary
         assert self.shape == other.shape, (self.shape, other.shape)
 
-        res_values = op(self.astype("O"), np.array(other))
+        res_values = op(self.astype("O"), np.asarray(other))
         result = array(res_values.ravel())
         result = extract_array(result, extract_numpy=True).reshape(self.shape)
         return result

diff --git a/pandas/core/ops/__init__.py b/pandas/core/ops/__init__.py
@@ -4,7 +4,7 @@
 This is not a public API.
 """
 import operator
-from typing import TYPE_CHECKING, Optional, Set
+from typing import TYPE_CHECKING, List, Optional, Set, Tuple
 
 import numpy as np
 
@@ -57,6 +57,7 @@
 
 if TYPE_CHECKING:
     from pandas import DataFrame  # noqa:F401
+    from pandas.core.internals.blocks import Block  # noqa: F401
 
 # -----------------------------------------------------------------------------
 # constants
@@ -293,6 +294,85 @@ def fill_binop(left, right, fill_value):
 # Dispatch logic
 
 
+def operate_blockwise(left, right, array_op):
+    assert right._indexed_same(left)
+
+    def get_same_shape_values(
+        lblk: "Block", rblk: "Block", left_ea: bool, right_ea: bool
+    ) -> Tuple[ArrayLike, ArrayLike]:
+        """
+        Slice lblk.values to align with rblk.  Squeeze if we have EAs.
+        """
+        lvals = lblk.values
+        rvals = rblk.values
+
+        # TODO(EA2D): with 2D EAs only this first clause would be needed
+        if not (left_ea or right_ea):
+            lvals = lvals[rblk.mgr_locs.indexer, :]
+            assert lvals.shape == rvals.shape, (lvals.shape, rvals.shape)
+        elif left_ea and right_ea:
+            assert lvals.shape == rvals.shape, (lvals.shape, rvals.shape)
+        elif right_ea:
+            # lvals are 2D, rvals are 1D
+            lvals = lvals[rblk.mgr_locs.indexer, :]
+            assert lvals.shape[0] == 1, lvals.shape
+            lvals = lvals[0, :]
+        else:
+            # lvals are 1D, rvals are 2D
+            assert rvals.shape[0] == 1, rvals.shape
+            rvals = rvals[0, :]
+
+        return lvals, rvals
+
+    res_blks: List["Block"] = []
+    rmgr = right._data
+    for n, blk in enumerate(left._data.blocks):
+        locs = blk.mgr_locs
+        blk_vals = blk.values
+
+        left_ea = not isinstance(blk_vals, np.ndarray)
+
+        rblks = rmgr._slice_take_blocks_ax0(locs.indexer)
+
+        if left_ea:
+            assert len(locs) == 1, locs
+            assert len(rblks) == 1, rblks
+            assert rblks[0].shape[0] == 1, rblks[0].shape
+
+        for k, rblk in enumerate(rblks):
+            right_ea = not isinstance(rblk.values, np.ndarray)
+
+            lvals, rvals = get_same_shape_values(blk, rblk, left_ea, right_ea)
+
+            res_values = array_op(lvals, rvals)
+            nbs = rblk._split_op_result(res_values)
+
+            if right_ea or left_ea:
+                assert len(nbs) == 1
+            else:
+                assert res_values.shape == lvals.shape, (res_values.shape, lvals.shape)
+
+            for nb in nbs:
+                # Reset mgr_locs to correspond to our original DataFrame
+                nblocs = locs.as_array[nb.mgr_locs.indexer]
+                nb.mgr_locs = nblocs
+                # Assertions are disabled for performance, but should hold:
+                #  assert len(nblocs) == nb.shape[0], (len(nblocs), nb.shape)
+                #  assert all(x in locs.as_array for x in nb.mgr_locs.as_array)
+
+            res_blks.extend(nbs)
+
+    # Assertions are disabled for performance, but should hold:
+    #  slocs = {y for nb in res_blks for y in nb.mgr_locs.as_array}
+    #  nlocs = sum(len(nb.mgr_locs.as_array) for nb in res_blks)
+    #  assert nlocs == len(left.columns), (nlocs, len(left.columns))
+    #  assert len(slocs) == nlocs, (len(slocs), nlocs)
+    #  assert slocs == set(range(nlocs)), slocs
+
+    new_mgr = type(rmgr)(res_blks, axes=rmgr.axes, do_integrity_check=False)
+    return new_mgr
+
+
 def dispatch_to_series(left, right, func, str_rep=None, axis=None):
     """
     Evaluate the frame operation func(left, right) by evaluating
@@ -325,8 +405,9 @@ def dispatch_to_series(left, right, func, str_rep=None, axis=None):
     elif isinstance(right, ABCDataFrame):
         assert right._indexed_same(left)
 
-        def column_op(a, b):
-            return {i: func(a.iloc[:, i], b.iloc[:, i]) for i in range(len(a.columns))}
+        array_op = get_array_op(func, str_rep=str_rep)
+        bm = operate_blockwise(left, right, array_op)
+        return type(left)(bm)
 
     elif isinstance(right, ABCSeries) and axis == "columns":
         # We only get here if called via _combine_series_frame,

diff --git a/pandas/core/ops/array_ops.py b/pandas/core/ops/array_ops.py
@@ -127,7 +127,7 @@ def masked_arith_op(x: np.ndarray, y, op):
     return result
 
 
-def define_na_arithmetic_op(op, str_rep: str):
+def define_na_arithmetic_op(op, str_rep: Optional[str]):
     def na_op(x, y):
         return na_arithmetic_op(x, y, op, str_rep)
 

diff --git a/pandas/tests/arithmetic/common.py b/pandas/tests/arithmetic/common.py
@@ -70,7 +70,14 @@ def assert_invalid_comparison(left, right, box):
     result = right != left
     tm.assert_equal(result, ~expected)
 
-    msg = "Invalid comparison between|Cannot compare type|not supported between"
+    msg = "|".join(
+        [
+            "Invalid comparison between",
+            "Cannot compare type",
+            "not supported between",
+            "invalid type promotion",
+        ]
+    )
     with pytest.raises(TypeError, match=msg):
         left < right
     with pytest.raises(TypeError, match=msg):

diff --git a/pandas/tests/arithmetic/test_datetime64.py b/pandas/tests/arithmetic/test_datetime64.py
@@ -964,7 +964,9 @@ def test_dt64arr_sub_dt64object_array(self, box_with_array, tz_naive_fixture):
         obj = tm.box_expected(dti, box_with_array)
         expected = tm.box_expected(expected, box_with_array)
 
-        warn = PerformanceWarning if box_with_array is not pd.DataFrame else None
+        warn = None
+        if box_with_array is not pd.DataFrame or tz_naive_fixture is None:
+            warn = PerformanceWarning
         with tm.assert_produces_warning(warn):
             result = obj - obj.astype(object)
         tm.assert_equal(result, expected)
@@ -1388,8 +1390,7 @@ def test_dt64arr_add_mixed_offset_array(self, box_with_array):
         s = DatetimeIndex([Timestamp("2000-1-1"), Timestamp("2000-2-1")])
         s = tm.box_expected(s, box_with_array)
 
-        warn = None if box_with_array is pd.DataFrame else PerformanceWarning
-        with tm.assert_produces_warning(warn):
+        with tm.assert_produces_warning(PerformanceWarning):
             other = pd.Index([pd.offsets.DateOffset(years=1), pd.offsets.MonthEnd()])
             other = tm.box_expected(other, box_with_array)
             result = s + other

diff --git a/pandas/tests/frame/test_arithmetic.py b/pandas/tests/frame/test_arithmetic.py
@@ -49,9 +49,11 @@ def check(df, df2):
                 )
                 tm.assert_frame_equal(result, expected)
 
-                msg = re.escape(
-                    "Invalid comparison between dtype=datetime64[ns] and ndarray"
-                )
+                msgs = [
+                    r"Invalid comparison between dtype=datetime64\[ns\] and ndarray",
+                    "invalid type promotion",
+                ]
+                msg = "|".join(msgs)
                 with pytest.raises(TypeError, match=msg):
                     x >= y
                 with pytest.raises(TypeError, match=msg):