Support channels_last format in portable upsample kernels (pytorch#9526)

GregoryComer · GregoryComer · commit e2f9e3b9b02f · 2025-03-22T23:38:12.000-07:00
Summary:

Support channels_last input format in the portable CPU upsample_bilinear2d and upsample_nearest2d kernels. This is useful for resize-in-model patterns when the user wants to pass inputs in channels_last format. It also (theoretically) allows for more effective auto-vectorization along the channels dim when there is a large number of channels.

I considered generalizing the kernel to handle arbitrary dim order, but having a specialized channels last version allows for traversing the output in contiguous order. I could add a separate, arbitrarily-strided variant, but we can take that as a follow-up if needed.

To accomplish this, this PR makes the following changes:
- Update `check_upsample_2d_common_args` to relax the dim order restriction. It now allows for both default and channels_last dim order and verifies that the output dim order matches the input.
- In the upsample kernels (bilinear and nearest), split out NCHW and NHWC variants. The NHWC variant interchanges the loop order so as to maintain contiguous output accesses.
- Add test coverage to ensure ATen numerical parity.

Differential Revision: D71690379
diff --git a/kernels/portable/cpu/op_upsample_bilinear2d.cpp b/kernels/portable/cpu/op_upsample_bilinear2d.cpp
@@ -20,7 +20,7 @@ using executorch::aten::SizesType;
 
 namespace {
 template <typename CTYPE>
-void upsample_bilinear2d_kernel_impl(
+void upsample_bilinear2d_kernel_impl_nchw(
     const Tensor& in,
     bool align_corners,
     const float scale_h,
@@ -86,6 +86,75 @@ void upsample_bilinear2d_kernel_impl(
     }
   }
 }
+
+// Bilinear upsample for channels_last (NHWC) tensors. Output is written in
+// contiguous N, H, W, C order; input elements are located via in.strides().
+// Dim indices follow the logical NCHW sizes (0=N, 1=C, 2=H, 3=W).
+template <typename CTYPE>
+void upsample_bilinear2d_kernel_impl_nhwc(
+    const Tensor& in,
+    bool align_corners,
+    const float scale_h,
+    const float scale_w,
+    Tensor& out) {
+  const auto in_data = in.const_data_ptr<CTYPE>();
+  auto out_data = out.mutable_data_ptr<CTYPE>();
+
+  for (const auto n : c10::irange(out.size(0))) {
+    // Offset of the current batch element in the input; without it every
+    // output batch would be interpolated from batch 0.
+    const auto batch_offset = n * in.strides()[0];
+
+    for (const auto h : c10::irange(out.size(2))) {
+      // Source rows and their interpolation weights for this output row.
+      int64_t in_h1, in_h2;
+      float weight_h, inv_weight_h;
+      compute_source_index_and_lambda(
+          in_h1,
+          in_h2,
+          weight_h,
+          inv_weight_h,
+          scale_h,
+          h,
+          in.sizes()[2],
+          out.sizes()[2],
+          align_corners);
+      const auto row1 = batch_offset + in_h1 * in.strides()[2];
+      const auto row2 = batch_offset + in_h2 * in.strides()[2];
+
+      for (const auto w : c10::irange(out.size(3))) {
+        // Source columns and their interpolation weights.
+        int64_t in_w1, in_w2;
+        float weight_w, inv_weight_w;
+        compute_source_index_and_lambda(
+            in_w1,
+            in_w2,
+            weight_w,
+            inv_weight_w,
+            scale_w,
+            w,
+            in.sizes()[3],
+            out.sizes()[3],
+            align_corners);
+        const auto col1 = in_w1 * in.strides()[3];
+        const auto col2 = in_w2 * in.strides()[3];
+
+        for (const auto c : c10::irange(out.size(1))) {
+          const auto ch = c * in.strides()[1];
+          const auto top_left = in_data[row1 + col1 + ch];
+          const auto top_right = in_data[row1 + col2 + ch];
+          const auto bottom_left = in_data[row2 + col1 + ch];
+          const auto bottom_right = in_data[row2 + col2 + ch];
+
+          const auto top = top_left * weight_w + top_right * inv_weight_w;
+          const auto bottom =
+              bottom_left * weight_w + bottom_right * inv_weight_w;
+          *out_data++ = top * weight_h + bottom * inv_weight_h;
+        }
+      }
+    }
+  }
+}
 } // namespace
 
 // Signatures are auto-generated, so disable pass-by-value lint.
@@ -101,7 +170,7 @@ Tensor& upsample_bilinear2d_vec_out(
   // Preconditions (checked in check_..._args):
   //  In and out tensors have same dtype.
   //  In and out tensors are rank 4 and have same dim[0] and dim[1].
-  //  In and out tensors are default dim order (NCHW).
+  //  In and out tensors are NHWC or NCHW dim order.
   ET_KERNEL_CHECK(
       ctx,
       check_upsample_bilinear2d_args(
@@ -124,11 +193,24 @@ Tensor& upsample_bilinear2d_vec_out(
   const auto kernel_scale_w = area_pixel_compute_scale<double>(
       in.sizes()[3], out.sizes()[3], align_corners, scale_w);
 
-  ET_SWITCH_REALHBF16_TYPES(
-      in.scalar_type(), ctx, "upsample_bilinear2d.out", CTYPE, [&]() {
-        upsample_bilinear2d_kernel_impl<CTYPE>(
-            in, align_corners, kernel_scale_h, kernel_scale_w, out);
-      });
+  if (executorch::runtime::tensor_is_default_dim_order(in)) {
+    ET_SWITCH_REALHBF16_TYPES(
+        in.scalar_type(), ctx, "upsample_bilinear2d.out", CTYPE, [&]() {
+          upsample_bilinear2d_kernel_impl_nchw<CTYPE>(
+              in, align_corners, kernel_scale_h, kernel_scale_w, out);
+        });
+  } else if (executorch::runtime::tensor_is_channels_last_dim_order(in)) {
+    ET_SWITCH_REALHBF16_TYPES(
+        in.scalar_type(), ctx, "upsample_bilinear2d.out", CTYPE, [&]() {
+          upsample_bilinear2d_kernel_impl_nhwc<CTYPE>(
+              in, align_corners, kernel_scale_h, kernel_scale_w, out);
+        });
+  } else {
+    // Shouldn't be reachable because of args checks, but just in case.
+    ET_LOG(Error, "Unsupported dim order");
+    ctx.fail(Error::InvalidArgument);
+    return out;
+  }
 
   return out;
 }
diff --git a/kernels/portable/cpu/op_upsample_nearest2d.cpp b/kernels/portable/cpu/op_upsample_nearest2d.cpp
@@ -19,7 +19,7 @@ using executorch::aten::SizesType;
 
 namespace {
 template <typename CTYPE>
-void upsample_nearest2d_kernel_impl(
+void upsample_nearest2d_kernel_impl_nchw(
     const Tensor& in,
     const float scale_h,
     const float scale_w,
@@ -46,6 +46,33 @@ void upsample_nearest2d_kernel_impl(
     }
   }
 }
+
+// Nearest-neighbor upsample for channels_last (NHWC) input and output.
+template <typename CTYPE>
+void upsample_nearest2d_kernel_impl_nhwc(
+    const Tensor& in,
+    const float scale_h,
+    const float scale_w,
+    Tensor& out) {
+  const auto in_data = in.const_data_ptr<CTYPE>();
+  auto out_data = out.mutable_data_ptr<CTYPE>();
+  // Output is written sequentially (n, h, w, c); input is read via strides.
+  for (auto n = 0; n < out.size(0); n++) {
+    // Base of batch n; without this every batch would re-read batch 0.
+    const auto in_n = in_data + n * in.strides()[0];
+    for (auto h = 0; h < out.size(2); h++) {
+      const auto in_h =
+          nearest_neighbor_compute_source_index(scale_h, h, in.sizes()[2]);
+      for (auto w = 0; w < out.size(3); w++) {
+        const auto in_w =
+            nearest_neighbor_compute_source_index(scale_w, w, in.sizes()[3]);
+        const auto px = in_n + in_h * in.strides()[2] + in_w * in.strides()[3];
+        for (auto c = 0; c < out.size(1); c++) {
+          *out_data++ = px[c * in.strides()[1]];
+        }
+      }
+    }
+  }
+}
 } // namespace
 
 Tensor& upsample_nearest2d_vec_out(
@@ -79,11 +106,24 @@ Tensor& upsample_nearest2d_vec_out(
   const auto kernel_scale_w = area_pixel_compute_scale<double>(
       in.sizes()[3], out.sizes()[3], false, scale_w);
 
-  ET_SWITCH_REALHBF16_TYPES(
-      in.scalar_type(), ctx, "upsample_nearest2d.out", CTYPE, [&]() {
-        upsample_nearest2d_kernel_impl<CTYPE>(
-            in, kernel_scale_h, kernel_scale_w, out);
-      });
+  if (tensor_is_default_dim_order(in)) {
+    ET_SWITCH_REALHBF16_TYPES(
+        in.scalar_type(), ctx, "upsample_nearest2d.out", CTYPE, [&]() {
+          upsample_nearest2d_kernel_impl_nchw<CTYPE>(
+              in, kernel_scale_h, kernel_scale_w, out);
+        });
+  } else if (executorch::runtime::tensor_is_channels_last_dim_order(in)) {
+    ET_SWITCH_REALHBF16_TYPES(
+        in.scalar_type(), ctx, "upsample_nearest2d.out", CTYPE, [&]() {
+          upsample_nearest2d_kernel_impl_nhwc<CTYPE>(
+              in, kernel_scale_h, kernel_scale_w, out);
+        });
+  } else {
+    // Shouldn't be reachable because of args checks, but just in case.
+    ET_LOG(Error, "Unsupported dim order");
+    ctx.fail(Error::InvalidArgument);
+    return out;
+  }
 
   return out;
 }
diff --git a/kernels/portable/cpu/util/upsample_util.cpp b/kernels/portable/cpu/util/upsample_util.cpp
@@ -18,10 +18,11 @@ bool check_upsample_2d_common_args(
     const executorch::aten::OptionalArrayRef<double>& scale_factors,
     Tensor& out) {
   ET_LOG_AND_RETURN_IF_FALSE(tensors_have_same_dtype(in, out));
+  ET_LOG_AND_RETURN_IF_FALSE(tensors_have_same_dim_order(in, out));
   ET_LOG_AND_RETURN_IF_FALSE(in.dim() == 4);
   ET_LOG_AND_RETURN_IF_FALSE(out.dim() == 4);
-  ET_LOG_AND_RETURN_IF_FALSE(tensor_is_default_dim_order(in));
-  ET_LOG_AND_RETURN_IF_FALSE(tensor_is_default_dim_order(out));
+  ET_LOG_AND_RETURN_IF_FALSE(tensor_is_default_or_channels_last_dim_order(in));
+  ET_LOG_AND_RETURN_IF_FALSE(tensor_is_default_or_channels_last_dim_order(out));
   ET_LOG_AND_RETURN_IF_FALSE(
       output_size.has_value() ^ scale_factors.has_value());
   if (scale_factors.has_value()) {
diff --git a/kernels/portable/test/op_upsample_bilinear2d_test.py b/kernels/portable/test/op_upsample_bilinear2d_test.py
@@ -63,6 +63,26 @@ def test_upsample_bilinear2d_aten_parity_f32(self):
                 input, scale_factors=(out_h / h, out_w / w), align_corners=align_corners
             )
 
+    def test_upsample_bilinear2d_aten_parity_f32_channels_last(self):
+        N = [1, 2]  # batch sizes
+        C = [1, 3]  # channel counts
+        H = [1, 3, 50, 1001]  # input heights
+        W = [1, 2, 62, 1237]  # input widths
+        OUT_H = [5, 21]  # output heights
+        OUT_W = [7, 31]  # output widths
+        ALIGN_CORNERS = [True, False]
+
+        for n, c, h, w, out_h, out_w, align_corners in itertools.product(
+            N, C, H, W, OUT_H, OUT_W, ALIGN_CORNERS
+        ):  # every combination of the lists above
+            input = torch.randn(n, c, h, w).to(memory_format=torch.channels_last)
+            self.run_upsample_test(  # output shape given explicitly
+                input, output_size=(out_h, out_w), align_corners=align_corners
+            )
+            self.run_upsample_test(  # output shape given via scale factors
+                input, scale_factors=(out_h / h, out_w / w), align_corners=align_corners
+            )
+
     def test_upsample_bilinear2d_aten_parity_u8(self):
         N = [1, 2]
         C = [1, 3]
@@ -85,3 +105,28 @@ def test_upsample_bilinear2d_aten_parity_u8(self):
                 align_corners=align_corners,
                 atol=2,
             )
+
+    def test_upsample_bilinear2d_aten_parity_u8_channels_last(self):
+        N = [1, 2]  # batch sizes
+        C = [1, 3]  # channel counts
+        H = [1, 3, 50, 1001]  # input heights
+        W = [1, 2, 62, 1237]  # input widths
+        OUT_H = [5, 21]  # output heights
+        OUT_W = [7, 31]  # output widths
+        ALIGN_CORNERS = [True, False]
+
+        for n, c, h, w, out_h, out_w, align_corners in itertools.product(
+            N, C, H, W, OUT_H, OUT_W, ALIGN_CORNERS
+        ):  # every combination of the lists above
+            input = torch.randint(0, 255, (n, c, h, w), dtype=torch.uint8).to(
+                memory_format=torch.channels_last
+            )  # randint's upper bound is exclusive, so values span 0..254
+            self.run_upsample_test(  # output shape given explicitly
+                input, output_size=(out_h, out_w), align_corners=align_corners, atol=2
+            )
+            self.run_upsample_test(  # output shape given via scale factors
+                input,
+                scale_factors=(out_h / h, out_w / w),
+                align_corners=align_corners,
+                atol=2,
+            )
diff --git a/kernels/portable/test/op_upsample_nearest2d_test.py b/kernels/portable/test/op_upsample_nearest2d_test.py
@@ -69,3 +69,35 @@ def test_upsample_nearest2d_aten_parity_u8(self):
                 scale_factors=(out_h / h, out_w / w),
                 atol=2,
             )
+
+    def test_upsample_nearest2d_aten_parity_f32_channels_last(self):
+        N = [1, 2]  # batch sizes
+        C = [1, 3]  # channel counts
+        H = [1, 3, 50, 1001]  # input heights
+        W = [1, 2, 62, 1237]  # input widths
+        OUT_H = [5, 21]  # output heights
+        OUT_W = [7, 31]  # output widths
+
+        for n, c, h, w, out_h, out_w in itertools.product(N, C, H, W, OUT_H, OUT_W):
+            input = torch.randn(n, c, h, w).to(memory_format=torch.channels_last)
+            self.run_upsample_test(input, output_size=(out_h, out_w))  # explicit size
+            self.run_upsample_test(input, scale_factors=(out_h / h, out_w / w))
+
+    def test_upsample_nearest2d_aten_parity_u8_channels_last(self):
+        N = [1, 2]  # batch sizes
+        C = [1, 3]  # channel counts
+        H = [1, 3, 50, 1001]  # input heights
+        W = [1, 2, 62, 1237]  # input widths
+        OUT_H = [5, 21]  # output heights
+        OUT_W = [7, 31]  # output widths
+
+        for n, c, h, w, out_h, out_w in itertools.product(N, C, H, W, OUT_H, OUT_W):
+            input = torch.randint(0, 255, (n, c, h, w), dtype=torch.uint8).to(
+                memory_format=torch.channels_last
+            )  # randint's upper bound is exclusive, so values span 0..254
+            # NOTE(review): atol differs between the two calls below (1 vs 2).
+            self.run_upsample_test(input, output_size=(out_h, out_w), atol=1)
+            self.run_upsample_test(
+                input,
+                scale_factors=(out_h / h, out_w / w),
+                atol=2,
+            )
diff --git a/kernels/test/op_upsample_bilinear2d_test.cpp b/kernels/test/op_upsample_bilinear2d_test.cpp
@@ -468,6 +468,28 @@ TEST_F(OpUpsampleBilinear2dTest, ZeroComputedOutputSizeDies) {
           out));
 }
 
+TEST_F(OpUpsampleBilinear2dTest, MismatchedDimOrderDies) {
+  if (SupportedFeatures::get()->is_aten) {
+    GTEST_SKIP() << "The current kernel supports mismatched dim order";
+  }
+
+  TensorFactory<ScalarType::Float> tf;
+
+  const auto input = tf.ones({1, 1, 1, 2}); // built with the default factory
+  auto out = tf.zeros_channels_last({1, 1, 2, 4}); // built as channels_last
+  std::array<double, 2> scale_factors = {2, 2};
+
+  ET_EXPECT_KERNEL_FAILURE( // in/out were created differently: must fail
+      context_,
+      op_upsample_bilinear2d_vec_out(
+          input,
+          {},
+          false,
+          OptionalArrayRef<double>(
+              {scale_factors.data(), scale_factors.size()}),
+          out));
+}
+
 TEST_F(OpUpsampleBilinear2dTest, NumericsCheck) {
   TensorFactory<ScalarType::Float> tf;
 
@@ -577,3 +599,57 @@ TEST_F(OpUpsampleBilinear2dTest, Simple5x1To4x1AlignCorners) {
 
   EXPECT_TENSOR_CLOSE(out, expected);
 }
+
+TEST_F(OpUpsampleBilinear2dTest, Simple1x2To1x4ChannelsLast) {
+  TensorFactory<ScalarType::Float> tf;
+
+  const auto input = tf.make_channels_last({1, 1, 1, 2}, {1.0, 4.0});
+  std::array<int64_t, 2> output_size = {1, 4}; // width 2 -> 4, height kept
+  auto out = tf.zeros_channels_last({1, 1, 1, 4});
+
+  op_upsample_bilinear2d_vec_out(
+      input,
+      OptionalArrayRef<int64_t>({output_size.data(), output_size.size()}),
+      false, // align_corners
+      {}, // no scale factors; output_size is used instead
+      out);
+
+  const auto expected =
+      tf.make_channels_last({1, 1, 1, 4}, {1.0, 1.75, 3.25, 4.0});
+
+  EXPECT_TENSOR_EQ(out, expected);
+}
+
+TEST_F(OpUpsampleBilinear2dTest, SmokeTestChannelsLast) {
+  TensorFactory<ScalarType::Float> tf;
+
+  const auto input = tf.make_channels_last( // NOTE(review): covers N == 1 only
+      {1, 2, 3, 4}, {0.0, 12, 1, 13, 2, 14, 3, 15, 4,  16, 5,  17,
+                     6,   18, 7, 19, 8, 20, 9, 21, 10, 22, 11, 23});
+  std::array<int64_t, 2> output_size = {6, 8}; // 2x the 3x4 input
+  auto out = tf.zeros_channels_last({1, 2, 6, 8});
+
+  op_upsample_bilinear2d_vec_out(
+      input,
+      OptionalArrayRef<int64_t>({output_size.data(), output_size.size()}),
+      false, // align_corners
+      {}, // no scale factors; output_size is used instead
+      out);
+
+  const auto expected = tf.make_channels_last( // values alternate by 12
+      {1, 2, 6, 8},
+      {0.0000, 12.0000, 0.2500,  12.2500, 0.7500,  12.7500, 1.2500,  13.2500,
+       1.7500, 13.7500, 2.2500,  14.2500, 2.7500,  14.7500, 3.0000,  15.0000,
+       1.0000, 13.0000, 1.2500,  13.2500, 1.7500,  13.7500, 2.2500,  14.2500,
+       2.7500, 14.7500, 3.2500,  15.2500, 3.7500,  15.7500, 4.0000,  16.0000,
+       3.0000, 15.0000, 3.2500,  15.2500, 3.7500,  15.7500, 4.2500,  16.2500,
+       4.7500, 16.7500, 5.2500,  17.2500, 5.7500,  17.7500, 6.0000,  18.0000,
+       5.0000, 17.0000, 5.2500,  17.2500, 5.7500,  17.7500, 6.2500,  18.2500,
+       6.7500, 18.7500, 7.2500,  19.2500, 7.7500,  19.7500, 8.0000,  20.0000,
+       7.0000, 19.0000, 7.2500,  19.2500, 7.7500,  19.7500, 8.2500,  20.2500,
+       8.7500, 20.7500, 9.2500,  21.2500, 9.7500,  21.7500, 10.0000, 22.0000,
+       8.0000, 20.0000, 8.2500,  20.2500, 8.7500,  20.7500, 9.2500,  21.2500,
+       9.7500, 21.7500, 10.2500, 22.2500, 10.7500, 22.7500, 11.0000, 23.0000});
+
+  EXPECT_TENSOR_CLOSE(out, expected);
+}
diff --git a/kernels/test/op_upsample_nearest2d_test.cpp b/kernels/test/op_upsample_nearest2d_test.cpp
diff --git a/runtime/core/exec_aten/testing_util/tensor_factory.h b/runtime/core/exec_aten/testing_util/tensor_factory.h