Skip to content

Commit 4ed96bc

Browse files
pytorchbot authored and kirklandsign committed
[ET-VK] Moving repeat functionality from copy_packed_dim_offset into a separate repeat shader. (#9814)
This diff includes changes to the Vulkan backend for ExecuTorch, specifically in the implementation of the Copy and Repeat ops. The changes add a new GLSL shader for the repeat op, which allows for repeated copying from an input tensor. Differential Revision: [D71511822](https://our.internmc.facebook.com/intern/diff/D71511822/)
1 parent dda8f4e commit 4ed96bc

File tree

6 files changed

+229
-167
lines changed

6 files changed

+229
-167
lines changed

backends/vulkan/runtime/graph/ops/glsl/copy_packed_dim_offset.glsl

Lines changed: 6 additions & 108 deletions
Original file line numberDiff line numberDiff line change
@@ -21,16 +21,10 @@ ${layout_declare_tensor(B, "r", "t_in", DTYPE, STORAGE)}
2121
layout(push_constant) uniform restrict Block {
2222
ivec4 range;
2323

24-
// if not repeating
2524
// xyz is source offset w is channel size
26-
// if repeating
27-
// xyzw is source tensor sizes in WHCB dims respectively
2825
ivec4 src_offset;
2926

30-
// if not repeating
3127
// xyz is destination offset w is channel size
32-
// if repeating
33-
// xyzw is destination tensor sizes in WHCB dims respectively
3428
ivec4 dst_offset;
3529
};
3630

@@ -45,9 +39,13 @@ const lowp int packed_dim = unhash_packed_dim(out_layout);
4539
${layout_declare_spec_const(C, "int", "in_layout", "DEFAULT_LAYOUT")}
4640
const lowp ivec4 in_axis_map = unhash_axis_map(in_layout);
4741

48-
${layout_declare_spec_const(C, "int", "repeat", "0")}
42+
void main() {
43+
const ivec3 pos = ivec3(gl_GlobalInvocationID);
44+
45+
if (any(greaterThanEqual(pos, range.xyz))) {
46+
return;
47+
}
4948

50-
void no_repeat_copy(ivec3 pos) {
5149
// Position in input tensor
5250
ivec3 in_pos = pos + src_offset.xyz;
5351
in_pos[packed_dim] = pos[packed_dim] + (src_offset[packed_dim] >> 2);
@@ -135,103 +133,3 @@ void no_repeat_copy(ivec3 pos) {
135133
out_value,
136134
out_axis_map);
137135
}
138-
139-
void repeat_copy(ivec3 pos) {
140-
// expand position in packed dim
141-
pos[packed_dim] <<= 2;
142-
143-
// channel size aligned by 4 when tensors are channel packed raw value otherwise
144-
const int channel_size = (packed_dim == C_DIM ? alignup4(src_offset.z) : src_offset.z);
145-
146-
// find input texel's WHCB index
147-
const int width_index = pos.x % src_offset.x;
148-
const int height_index = pos.y % src_offset.y;
149-
int channel_index;
150-
int batch_index;
151-
152-
// if tensors are channel packed
153-
if (packed_dim == C_DIM) {
154-
// the output channels in a batch will be channel size * channel repetitions aligned by 4
155-
const int out_channel_size = alignup4(src_offset.z * dst_offset.z);
156-
157-
// batch index in the output
158-
const int out_pos_batch_index = pos.z / out_channel_size;
159-
160-
// source batch index for based on current output pos
161-
batch_index = out_pos_batch_index % src_offset.w;
162-
163-
// batch repetition count for current output pos
164-
const int batch_repetition_index = out_pos_batch_index / src_offset.w;
165-
166-
// calculate input channel index based on current output pos and batch index
167-
// its done this way because we want source channel to restart from zero when a batch index increments
168-
// also batch_index will reset to zero after hitting batch repetition count
169-
// so track the current repetition in batch_repetition_index so it can be used for determining current_index
170-
channel_index = (pos.z - (batch_index + batch_repetition_index * src_offset.w) * out_channel_size) % src_offset.z;
171-
} else {
172-
// the output channels in a batch will be channel size * channel repetitions
173-
const int out_channel_size = src_offset.z * dst_offset.z;
174-
175-
// source batch index for based on current output pos
176-
batch_index = (pos.z / out_channel_size) % src_offset.w;
177-
178-
// source channel index is current output pos wrapped based on channel count
179-
channel_index = pos.z % src_offset.z;
180-
}
181-
182-
// input texel's WCB position
183-
const ivec3 in_pos = ivec3(width_index, height_index, channel_index);
184-
185-
// squeeze position in packed dim
186-
pos[packed_dim] >>= 2;
187-
188-
// packed dim index of texel last fetched
189-
int fetched_in_pos_packed_dim = -1;
190-
191-
// fetched input texel
192-
VEC4_T in_value;
193-
194-
// output texel value
195-
VEC4_T out_value = VEC4_T(0);
196-
197-
int src_lane_offset = in_pos[packed_dim];
198-
199-
for (int i=0; i<4; i++) {
200-
if ((src_lane_offset >> 2) != fetched_in_pos_packed_dim) {
201-
fetched_in_pos_packed_dim = (src_lane_offset >> 2);
202-
203-
ivec3 curr_in_pos = in_pos;
204-
curr_in_pos[packed_dim] = src_lane_offset;
205-
curr_in_pos.z = curr_in_pos.z + batch_index * channel_size;
206-
curr_in_pos[packed_dim] >>= 2;
207-
208-
in_value = load_texel_lpos(t_in, curr_in_pos, in_axis_map);
209-
}
210-
211-
out_value[i] = in_value[src_lane_offset & 0x3];
212-
213-
src_lane_offset++;
214-
// if packed index exceeded source packed dim round to zero
215-
src_lane_offset = mix(src_lane_offset, 0, src_lane_offset >= src_offset[packed_dim]);
216-
}
217-
218-
write_texel_lpos(
219-
t_out,
220-
pos,
221-
out_value,
222-
out_axis_map);
223-
}
224-
225-
void main() {
226-
const ivec3 pos = ivec3(gl_GlobalInvocationID);
227-
228-
if (any(greaterThanEqual(pos, range.xyz))) {
229-
return;
230-
}
231-
232-
if (repeat == 1) {
233-
repeat_copy(pos);
234-
} else {
235-
no_repeat_copy(pos);
236-
}
237-
}
Lines changed: 129 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,129 @@
1+
/*
2+
* Copyright (c) Meta Platforms, Inc. and affiliates.
3+
* All rights reserved.
4+
*
5+
* This source code is licensed under the BSD-style license found in the
6+
* LICENSE file in the root directory of this source tree.
7+
*/
8+
9+
#version 450 core
10+
11+
#define PRECISION ${PRECISION}
12+
13+
#define VEC4_T ${texel_type(DTYPE)}
14+
15+
layout(std430) buffer;
16+
17+
${layout_declare_tensor(B, "w", "t_out", DTYPE, STORAGE)}
18+
${layout_declare_tensor(B, "r", "t_in", DTYPE, STORAGE)}
19+
20+
layout(push_constant) uniform restrict Block {
21+
ivec4 range;
22+
// source tensor sizes in WHCB dims respectively
23+
ivec4 src_dims;
24+
// destination tensor repeats in WHCB dims respectively
25+
ivec4 dst_repeats;
26+
};
27+
28+
#include "indexing_utils.h"
29+
30+
layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in;
31+
32+
${layout_declare_spec_const(C, "int", "out_layout", "DEFAULT_LAYOUT")}
33+
const lowp ivec4 out_axis_map = unhash_axis_map(out_layout);
34+
const lowp int packed_dim = unhash_packed_dim(out_layout);
35+
36+
${layout_declare_spec_const(C, "int", "in_layout", "DEFAULT_LAYOUT")}
37+
const lowp ivec4 in_axis_map = unhash_axis_map(in_layout);
38+
39+
void main() {
40+
ivec3 pos = ivec3(gl_GlobalInvocationID);
41+
42+
if (any(greaterThanEqual(pos, range.xyz))) {
43+
return;
44+
}
45+
46+
// expand position in packed dim
47+
pos[packed_dim] <<= 2;
48+
49+
// channel size aligned by 4 when tensors are channel packed raw value otherwise
50+
const int channel_size = (packed_dim == C_DIM ? alignup4(src_dims.z) : src_dims.z);
51+
52+
// find input texel's WHCB index
53+
const int width_index = pos.x % src_dims.x;
54+
const int height_index = pos.y % src_dims.y;
55+
int channel_index;
56+
int batch_index;
57+
58+
// if tensors are channel packed
59+
if (packed_dim == C_DIM) {
60+
// the output channels in a batch will be channel size * channel repetitions aligned by 4
61+
const int out_channel_size = alignup4(src_dims.z * dst_repeats.z);
62+
63+
// batch index in the output
64+
const int out_pos_batch_index = pos.z / out_channel_size;
65+
66+
// source batch index for based on current output pos
67+
batch_index = out_pos_batch_index % src_dims.w;
68+
69+
// batch repetition count for current output pos
70+
const int batch_repetition_index = out_pos_batch_index / src_dims.w;
71+
72+
// calculate input channel index based on current output pos and batch index
73+
// its done this way because we want source channel to restart from zero when a batch index increments
74+
// also batch_index will reset to zero after hitting batch repetition count
75+
// so track the current repetition in batch_repetition_index so it can be used for determining current_index
76+
channel_index = (pos.z - (batch_index + batch_repetition_index * src_dims.w) * out_channel_size) % src_dims.z;
77+
} else {
78+
// the output channels in a batch will be channel size * channel repetitions
79+
const int out_channel_size = src_dims.z * dst_repeats.z;
80+
81+
// source batch index for based on current output pos
82+
batch_index = (pos.z / out_channel_size) % src_dims.w;
83+
84+
// source channel index is current output pos wrapped based on channel count
85+
channel_index = pos.z % src_dims.z;
86+
}
87+
88+
// input texel's WCB position
89+
const ivec3 in_pos = ivec3(width_index, height_index, channel_index);
90+
91+
// squeeze position in packed dim
92+
pos[packed_dim] >>= 2;
93+
94+
// packed dim index of texel last fetched
95+
int fetched_in_pos_packed_dim = -1;
96+
97+
// fetched input texel
98+
VEC4_T in_value;
99+
100+
// output texel value
101+
VEC4_T out_value = VEC4_T(0);
102+
103+
int src_lane_offset = in_pos[packed_dim];
104+
105+
for (int i=0; i<4; i++) {
106+
if ((src_lane_offset >> 2) != fetched_in_pos_packed_dim) {
107+
fetched_in_pos_packed_dim = (src_lane_offset >> 2);
108+
109+
ivec3 curr_in_pos = in_pos;
110+
curr_in_pos[packed_dim] = src_lane_offset;
111+
curr_in_pos.z = curr_in_pos.z + batch_index * channel_size;
112+
curr_in_pos[packed_dim] >>= 2;
113+
114+
in_value = VEC4_T(load_texel_lpos(t_in, curr_in_pos, in_axis_map));
115+
}
116+
117+
out_value[i] = in_value[src_lane_offset & 0x3];
118+
119+
src_lane_offset++;
120+
// if packed index exceeded source packed dim round to zero
121+
src_lane_offset = mix(src_lane_offset, 0, src_lane_offset >= src_dims[packed_dim]);
122+
}
123+
124+
write_texel_lpos(
125+
t_out,
126+
pos,
127+
out_value,
128+
out_axis_map);
129+
}
Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,14 @@
1+
repeat:
2+
parameter_names_with_default_values:
3+
DTYPE: float
4+
NDIM: 3
5+
STORAGE: texture3d
6+
generate_variant_forall:
7+
DTYPE:
8+
- VALUE: half
9+
- VALUE: float
10+
- VALUE: int
11+
- VALUE: int8
12+
- VALUE: uint8
13+
shader_variants:
14+
- NAME: repeat

backends/vulkan/runtime/graph/ops/impl/Copy.cpp

Lines changed: 44 additions & 50 deletions
Original file line numberDiff line numberDiff line change
@@ -71,21 +71,17 @@ void add_copy_packed_dim_offset_node(
7171
const ivec3& range,
7272
const ivec4& src_offset,
7373
const ivec4& dst_offset,
74-
const ValueRef out,
75-
bool repeat) {
74+
const ValueRef out) {
7675
vTensorPtr t_in = graph.get_tensor(in);
7776
vTensorPtr t_out = graph.get_tensor(out);
7877

79-
// Check the packed dimension is same for both tensors
80-
VK_CHECK_COND(check_same_packed_dim(*t_in, *t_out));
81-
if (!repeat) {
82-
// For non repeat copy also check if the packed dimension is Width or
83-
// Height. Since the function does not support channel packing.
84-
VK_CHECK_COND(
85-
check_same_packed_dim(*t_in, *t_out) &&
86-
(check_packed_dim_is(*t_in, WHCN::kWidthDim) ||
87-
check_packed_dim_is(*t_in, WHCN::kHeightDim)));
88-
}
78+
// Check the packed dimension is same for both tensors, also check if the
79+
// packed dimension is Width or Height. Since the function does not support
80+
// channel packing.
81+
VK_CHECK_COND(
82+
check_same_packed_dim(*t_in, *t_out) &&
83+
(check_packed_dim_is(*t_in, WHCN::kWidthDim) ||
84+
check_packed_dim_is(*t_in, WHCN::kHeightDim)));
8985

9086
std::string kernel_name = "copy_packed_dim_offset";
9187
kernel_name.reserve(kShaderNameReserve);
@@ -96,43 +92,41 @@ void add_copy_packed_dim_offset_node(
9692
range[0], range[1], range[2], dim_at(t_in->sizes(), kBatch4D)};
9793
ivec3 global_wg_size = t_out->logical_limits();
9894

99-
if (!repeat) {
100-
const auto packed_dim = t_in->packed_dim();
101-
// The starting offset in a texel where this tensor will start copying from
102-
const auto src_lane_offset = src_offset[packed_dim] & 0x3;
103-
// The starting offset in a texel where this tensor will start copying to
104-
const auto dst_lane_offset = dst_offset[packed_dim] & 0x3;
105-
106-
// The total packed texels this tensor will be copied from
107-
// The first texel of tensor data in packed dimension will be copied from
108-
// remaining lanes from current source Hence (4 - src_lane_offset) is added
109-
// to tensor size in packed dimension
110-
const auto src_packed_size = utils::div_up_4(
111-
(4 - src_lane_offset) +
112-
dim_at(t_out->sizes(), normalize_to_dim_index(*t_out, packed_dim)));
113-
114-
// The total packed texels this tensor will be copied to
115-
// The first texel of tensor data in packed dimension will be copied to
116-
// remaining lanes from previous write Hence (4 - dst_lane_offset) is added
117-
// to tensor size in packed dimension
118-
const auto dst_packed_size = utils::div_up_4(
119-
(4 - dst_lane_offset) +
120-
dim_at(t_in->sizes(), normalize_to_dim_index(*t_in, packed_dim)));
121-
122-
// If the starting src offset is not 0, and the total packed texels is
123-
// greater than the source texel range
124-
const bool has_additional_src_work =
125-
src_lane_offset != 0 && src_packed_size > final_range[packed_dim];
126-
// If the starting dst offset is not 0, and the total packed texels is
127-
// greater than the source texel range
128-
const bool has_additional_dst_work =
129-
dst_lane_offset != 0 && dst_packed_size > final_range[packed_dim];
130-
131-
if (has_additional_src_work || has_additional_dst_work) {
132-
global_wg_size[packed_dim]++; // Increase the global work group size in
133-
// packed dimension
134-
final_range[packed_dim]++; // Increase the range in packed dimension
135-
}
95+
const auto packed_dim = t_in->packed_dim();
96+
// The starting offset in a texel where this tensor will start copying from
97+
const auto src_lane_offset = src_offset[packed_dim] & 0x3;
98+
// The starting offset in a texel where this tensor will start copying to
99+
const auto dst_lane_offset = dst_offset[packed_dim] & 0x3;
100+
101+
// The total packed texels this tensor will be copied from
102+
// The first texel of tensor data in packed dimension will be copied from
103+
// remaining lanes from current source Hence (4 - src_lane_offset) is added
104+
// to tensor size in packed dimension
105+
const auto src_packed_size = utils::div_up_4(
106+
(4 - src_lane_offset) +
107+
dim_at(t_out->sizes(), normalize_to_dim_index(*t_out, packed_dim)));
108+
109+
// The total packed texels this tensor will be copied to
110+
// The first texel of tensor data in packed dimension will be copied to
111+
// remaining lanes from previous write Hence (4 - dst_lane_offset) is added
112+
// to tensor size in packed dimension
113+
const auto dst_packed_size = utils::div_up_4(
114+
(4 - dst_lane_offset) +
115+
dim_at(t_in->sizes(), normalize_to_dim_index(*t_in, packed_dim)));
116+
117+
// If the starting src offset is not 0, and the total packed texels is
118+
// greater than the source texel range
119+
const bool has_additional_src_work =
120+
src_lane_offset != 0 && src_packed_size > final_range[packed_dim];
121+
// If the starting dst offset is not 0, and the total packed texels is
122+
// greater than the source texel range
123+
const bool has_additional_dst_work =
124+
dst_lane_offset != 0 && dst_packed_size > final_range[packed_dim];
125+
126+
if (has_additional_src_work || has_additional_dst_work) {
127+
global_wg_size[packed_dim]++; // Increase the global work group size in
128+
// packed dimension
129+
final_range[packed_dim]++; // Increase the range in packed dimension
136130
}
137131

138132
auto shader = VK_KERNEL_FROM_STR(kernel_name);
@@ -151,7 +145,7 @@ void add_copy_packed_dim_offset_node(
151145
// Parameter buffers
152146
{},
153147
// Specialization Constants
154-
{graph.hashed_layout_of(out), graph.hashed_layout_of(in), repeat ? 1 : 0},
148+
{graph.hashed_layout_of(out), graph.hashed_layout_of(in)},
155149
nullptr,
156150
{},
157151
{

0 commit comments

Comments (0)