Skip to content

Commit 4ed96bc

Browse files
pytorchbot authored and kirklandsign committed
[ET-VK] Moving repeat functionality from copy_packed_dim_offset into a separate repeat shader. (#9814)
This diff includes changes to the Vulkan backend for ExecuTorch, specifically in the implementation of the Copy and Repeat ops. The changes add a new GLSL shader for the repeat op, which allows for repeated copying from an input tensor. Differential Revision: [D71511822](https://our.internmc.facebook.com/intern/diff/D71511822/)
1 parent dda8f4e commit 4ed96bc

File tree

6 files changed

+229
-167
lines changed

6 files changed

+229
-167
lines changed

backends/vulkan/runtime/graph/ops/glsl/copy_packed_dim_offset.glsl

Lines changed: 6 additions & 108 deletions
Original file line numberDiff line numberDiff line change
@@ -21,16 +21,10 @@ ${layout_declare_tensor(B, "r", "t_in", DTYPE, STORAGE)}
2121
layout(push_constant) uniform restrict Block {
2222
ivec4 range;
2323

24-
// if not repeating
2524
// xyz is source offset w is channel size
26-
// if repeating
27-
// xyzw is source tensor sizes in WHCB dims respectively
2825
ivec4 src_offset;
2926

30-
// if not repeating
3127
// xyz is destination offset w is channel size
32-
// if repeating
33-
// xyzw is destination tensor sizes in WHCB dims respectively
3428
ivec4 dst_offset;
3529
};
3630

@@ -45,9 +39,13 @@ const lowp int packed_dim = unhash_packed_dim(out_layout);
4539
${layout_declare_spec_const(C, "int", "in_layout", "DEFAULT_LAYOUT")}
4640
const lowp ivec4 in_axis_map = unhash_axis_map(in_layout);
4741

48-
${layout_declare_spec_const(C, "int", "repeat", "0")}
42+
void main() {
43+
const ivec3 pos = ivec3(gl_GlobalInvocationID);
44+
45+
if (any(greaterThanEqual(pos, range.xyz))) {
46+
return;
47+
}
4948

50-
void no_repeat_copy(ivec3 pos) {
5149
// Position in input tensor
5250
ivec3 in_pos = pos + src_offset.xyz;
5351
in_pos[packed_dim] = pos[packed_dim] + (src_offset[packed_dim] >> 2);
@@ -135,103 +133,3 @@ void no_repeat_copy(ivec3 pos) {
135133
out_value,
136134
out_axis_map);
137135
}
138-
139-
void repeat_copy(ivec3 pos) {
140-
// expand position in packed dim
141-
pos[packed_dim] <<= 2;
142-
143-
// channel size aligned by 4 when tensors are channel packed raw value otherwise
144-
const int channel_size = (packed_dim == C_DIM ? alignup4(src_offset.z) : src_offset.z);
145-
146-
// find input texel's WHCB index
147-
const int width_index = pos.x % src_offset.x;
148-
const int height_index = pos.y % src_offset.y;
149-
int channel_index;
150-
int batch_index;
151-
152-
// if tensors are channel packed
153-
if (packed_dim == C_DIM) {
154-
// the output channels in a batch will be channel size * channel repetitions aligned by 4
155-
const int out_channel_size = alignup4(src_offset.z * dst_offset.z);
156-
157-
// batch index in the output
158-
const int out_pos_batch_index = pos.z / out_channel_size;
159-
160-
// source batch index for based on current output pos
161-
batch_index = out_pos_batch_index % src_offset.w;
162-
163-
// batch repetition count for current output pos
164-
const int batch_repetition_index = out_pos_batch_index / src_offset.w;
165-
166-
// calculate input channel index based on current output pos and batch index
167-
// its done this way because we want source channel to restart from zero when a batch index increments
168-
// also batch_index will reset to zero after hitting batch repetition count
169-
// so track the current repetition in batch_repetition_index so it can be used for determining current_index
170-
channel_index = (pos.z - (batch_index + batch_repetition_index * src_offset.w) * out_channel_size) % src_offset.z;
171-
} else {
172-
// the output channels in a batch will be channel size * channel repetitions
173-
const int out_channel_size = src_offset.z * dst_offset.z;
174-
175-
// source batch index for based on current output pos
176-
batch_index = (pos.z / out_channel_size) % src_offset.w;
177-
178-
// source channel index is current output pos wrapped based on channel count
179-
channel_index = pos.z % src_offset.z;
180-
}
181-
182-
// input texel's WCB position
183-
const ivec3 in_pos = ivec3(width_index, height_index, channel_index);
184-
185-
// squeeze position in packed dim
186-
pos[packed_dim] >>= 2;
187-
188-
// packed dim index of texel last fetched
189-
int fetched_in_pos_packed_dim = -1;
190-
191-
// fetched input texel
192-
VEC4_T in_value;
193-
194-
// output texel value
195-
VEC4_T out_value = VEC4_T(0);
196-
197-
int src_lane_offset = in_pos[packed_dim];
198-
199-
for (int i=0; i<4; i++) {
200-
if ((src_lane_offset >> 2) != fetched_in_pos_packed_dim) {
201-
fetched_in_pos_packed_dim = (src_lane_offset >> 2);
202-
203-
ivec3 curr_in_pos = in_pos;
204-
curr_in_pos[packed_dim] = src_lane_offset;
205-
curr_in_pos.z = curr_in_pos.z + batch_index * channel_size;
206-
curr_in_pos[packed_dim] >>= 2;
207-
208-
in_value = load_texel_lpos(t_in, curr_in_pos, in_axis_map);
209-
}
210-
211-
out_value[i] = in_value[src_lane_offset & 0x3];
212-
213-
src_lane_offset++;
214-
// if packed index exceeded source packed dim round to zero
215-
src_lane_offset = mix(src_lane_offset, 0, src_lane_offset >= src_offset[packed_dim]);
216-
}
217-
218-
write_texel_lpos(
219-
t_out,
220-
pos,
221-
out_value,
222-
out_axis_map);
223-
}
224-
225-
void main() {
226-
const ivec3 pos = ivec3(gl_GlobalInvocationID);
227-
228-
if (any(greaterThanEqual(pos, range.xyz))) {
229-
return;
230-
}
231-
232-
if (repeat == 1) {
233-
repeat_copy(pos);
234-
} else {
235-
no_repeat_copy(pos);
236-
}
237-
}
Lines changed: 129 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,129 @@
1+
/*
2+
* Copyright (c) Meta Platforms, Inc. and affiliates.
3+
* All rights reserved.
4+
*
5+
* This source code is licensed under the BSD-style license found in the
6+
* LICENSE file in the root directory of this source tree.
7+
*/
8+
9+
#version 450 core
10+
11+
#define PRECISION ${PRECISION}
12+
13+
#define VEC4_T ${texel_type(DTYPE)}
14+
15+
layout(std430) buffer;
16+
17+
${layout_declare_tensor(B, "w", "t_out", DTYPE, STORAGE)}
18+
${layout_declare_tensor(B, "r", "t_in", DTYPE, STORAGE)}
19+
20+
layout(push_constant) uniform restrict Block {
21+
ivec4 range;
22+
// source tensor sizes in WHCB dims respectively
23+
ivec4 src_dims;
24+
// destination tensor repeats in WHCB dims respectively
25+
ivec4 dst_repeats;
26+
};
27+
28+
#include "indexing_utils.h"
29+
30+
layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in;
31+
32+
${layout_declare_spec_const(C, "int", "out_layout", "DEFAULT_LAYOUT")}
33+
const lowp ivec4 out_axis_map = unhash_axis_map(out_layout);
34+
const lowp int packed_dim = unhash_packed_dim(out_layout);
35+
36+
${layout_declare_spec_const(C, "int", "in_layout", "DEFAULT_LAYOUT")}
37+
const lowp ivec4 in_axis_map = unhash_axis_map(in_layout);
38+
39+
void main() {
40+
ivec3 pos = ivec3(gl_GlobalInvocationID);
41+
42+
if (any(greaterThanEqual(pos, range.xyz))) {
43+
return;
44+
}
45+
46+
// expand position in packed dim
47+
pos[packed_dim] <<= 2;
48+
49+
// channel size aligned by 4 when tensors are channel packed raw value otherwise
50+
const int channel_size = (packed_dim == C_DIM ? alignup4(src_dims.z) : src_dims.z);
51+
52+
// find input texel's WHCB index
53+
const int width_index = pos.x % src_dims.x;
54+
const int height_index = pos.y % src_dims.y;
55+
int channel_index;
56+
int batch_index;
57+
58+
// if tensors are channel packed
59+
if (packed_dim == C_DIM) {
60+
// the output channels in a batch will be channel size * channel repetitions aligned by 4
61+
const int out_channel_size = alignup4(src_dims.z * dst_repeats.z);
62+
63+
// batch index in the output
64+
const int out_pos_batch_index = pos.z / out_channel_size;
65+
66+
// source batch index for based on current output pos
67+
batch_index = out_pos_batch_index % src_dims.w;
68+
69+
// batch repetition count for current output pos
70+
const int batch_repetition_index = out_pos_batch_index / src_dims.w;
71+
72+
// calculate input channel index based on current output pos and batch index
73+
// its done this way because we want source channel to restart from zero when a batch index increments
74+
// also batch_index will reset to zero after hitting batch repetition count
75+
// so track the current repetition in batch_repetition_index so it can be used for determining current_index
76+
channel_index = (pos.z - (batch_index + batch_repetition_index * src_dims.w) * out_channel_size) % src_dims.z;
77+
} else {
78+
// the output channels in a batch will be channel size * channel repetitions
79+
const int out_channel_size = src_dims.z * dst_repeats.z;
80+
81+
// source batch index for based on current output pos
82+
batch_index = (pos.z / out_channel_size) % src_dims.w;
83+
84+
// source channel index is current output pos wrapped based on channel count
85+
channel_index = pos.z % src_dims.z;
86+
}
87+
88+
// input texel's WCB position
89+
const ivec3 in_pos = ivec3(width_index, height_index, channel_index);
90+
91+
// squeeze position in packed dim
92+
pos[packed_dim] >>= 2;
93+
94+
// packed dim index of texel last fetched
95+
int fetched_in_pos_packed_dim = -1;
96+
97+
// fetched input texel
98+
VEC4_T in_value;
99+
100+
// output texel value
101+
VEC4_T out_value = VEC4_T(0);
102+
103+
int src_lane_offset = in_pos[packed_dim];
104+
105+
for (int i=0; i<4; i++) {
106+
if ((src_lane_offset >> 2) != fetched_in_pos_packed_dim) {
107+
fetched_in_pos_packed_dim = (src_lane_offset >> 2);
108+
109+
ivec3 curr_in_pos = in_pos;
110+
curr_in_pos[packed_dim] = src_lane_offset;
111+
curr_in_pos.z = curr_in_pos.z + batch_index * channel_size;
112+
curr_in_pos[packed_dim] >>= 2;
113+
114+
in_value = VEC4_T(load_texel_lpos(t_in, curr_in_pos, in_axis_map));
115+
}
116+
117+
out_value[i] = in_value[src_lane_offset & 0x3];
118+
119+
src_lane_offset++;
120+
// if packed index exceeded source packed dim round to zero
121+
src_lane_offset = mix(src_lane_offset, 0, src_lane_offset >= src_dims[packed_dim]);
122+
}
123+
124+
write_texel_lpos(
125+
t_out,
126+
pos,
127+
out_value,
128+
out_axis_map);
129+
}
Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,14 @@
1+
repeat:
2+
parameter_names_with_default_values:
3+
DTYPE: float
4+
NDIM: 3
5+
STORAGE: texture3d
6+
generate_variant_forall:
7+
DTYPE:
8+
- VALUE: half
9+
- VALUE: float
10+
- VALUE: int
11+
- VALUE: int8
12+
- VALUE: uint8
13+
shader_variants:
14+
- NAME: repeat

backends/vulkan/runtime/graph/ops/impl/Copy.cpp

Lines changed: 44 additions & 50 deletions
Original file line numberDiff line numberDiff line change
@@ -71,21 +71,17 @@ void add_copy_packed_dim_offset_node(
7171
const ivec3& range,
7272
const ivec4& src_offset,
7373
const ivec4& dst_offset,
74-
const ValueRef out,
75-
bool repeat) {
74+
const ValueRef out) {
7675
vTensorPtr t_in = graph.get_tensor(in);
7776
vTensorPtr t_out = graph.get_tensor(out);
7877

79-
// Check the packed dimension is same for both tensors
80-
VK_CHECK_COND(check_same_packed_dim(*t_in, *t_out));
81-
if (!repeat) {
82-
// For non repeat copy also check if the packed dimension is Width or
83-
// Height. Since the function does not support channel packing.
84-
VK_CHECK_COND(
85-
check_same_packed_dim(*t_in, *t_out) &&
86-
(check_packed_dim_is(*t_in, WHCN::kWidthDim) ||
87-
check_packed_dim_is(*t_in, WHCN::kHeightDim)));
88-
}
78+
// Check the packed dimension is same for both tensors, also check if the
79+
// packed dimension is Width or Height. Since the function does not support
80+
// channel packing.
81+
VK_CHECK_COND(
82+
check_same_packed_dim(*t_in, *t_out) &&
83+
(check_packed_dim_is(*t_in, WHCN::kWidthDim) ||
84+
check_packed_dim_is(*t_in, WHCN::kHeightDim)));
8985

9086
std::string kernel_name = "copy_packed_dim_offset";
9187
kernel_name.reserve(kShaderNameReserve);
@@ -96,43 +92,41 @@ void add_copy_packed_dim_offset_node(
9692
range[0], range[1], range[2], dim_at(t_in->sizes(), kBatch4D)};
9793
ivec3 global_wg_size = t_out->logical_limits();
9894

99-
if (!repeat) {
100-
const auto packed_dim = t_in->packed_dim();
101-
// The starting offset in a texel where this tensor will start copying from
102-
const auto src_lane_offset = src_offset[packed_dim] & 0x3;
103-
// The starting offset in a texel where this tensor will start copying to
104-
const auto dst_lane_offset = dst_offset[packed_dim] & 0x3;
105-
106-
// The total packed texels this tensor will be copied from
107-
// The first texel of tensor data in packed dimension will be copied from
108-
// remaining lanes from current source Hence (4 - src_lane_offset) is added
109-
// to tensor size in packed dimension
110-
const auto src_packed_size = utils::div_up_4(
111-
(4 - src_lane_offset) +
112-
dim_at(t_out->sizes(), normalize_to_dim_index(*t_out, packed_dim)));
113-
114-
// The total packed texels this tensor will be copied to
115-
// The first texel of tensor data in packed dimension will be copied to
116-
// remaining lanes from previous write Hence (4 - dst_lane_offset) is added
117-
// to tensor size in packed dimension
118-
const auto dst_packed_size = utils::div_up_4(
119-
(4 - dst_lane_offset) +
120-
dim_at(t_in->sizes(), normalize_to_dim_index(*t_in, packed_dim)));
121-
122-
// If the starting src offset is not 0, and the total packed texels is
123-
// greater than the source texel range
124-
const bool has_additional_src_work =
125-
src_lane_offset != 0 && src_packed_size > final_range[packed_dim];
126-
// If the starting dst offset is not 0, and the total packed texels is
127-
// greater than the source texel range
128-
const bool has_additional_dst_work =
129-
dst_lane_offset != 0 && dst_packed_size > final_range[packed_dim];
130-
131-
if (has_additional_src_work || has_additional_dst_work) {
132-
global_wg_size[packed_dim]++; // Increase the global work group size in
133-
// packed dimension
134-
final_range[packed_dim]++; // Increase the range in packed dimension
135-
}
95+
const auto packed_dim = t_in->packed_dim();
96+
// The starting offset in a texel where this tensor will start copying from
97+
const auto src_lane_offset = src_offset[packed_dim] & 0x3;
98+
// The starting offset in a texel where this tensor will start copying to
99+
const auto dst_lane_offset = dst_offset[packed_dim] & 0x3;
100+
101+
// The total packed texels this tensor will be copied from
102+
// The first texel of tensor data in packed dimension will be copied from
103+
// remaining lanes from current source Hence (4 - src_lane_offset) is added
104+
// to tensor size in packed dimension
105+
const auto src_packed_size = utils::div_up_4(
106+
(4 - src_lane_offset) +
107+
dim_at(t_out->sizes(), normalize_to_dim_index(*t_out, packed_dim)));
108+
109+
// The total packed texels this tensor will be copied to
110+
// The first texel of tensor data in packed dimension will be copied to
111+
// remaining lanes from previous write Hence (4 - dst_lane_offset) is added
112+
// to tensor size in packed dimension
113+
const auto dst_packed_size = utils::div_up_4(
114+
(4 - dst_lane_offset) +
115+
dim_at(t_in->sizes(), normalize_to_dim_index(*t_in, packed_dim)));
116+
117+
// If the starting src offset is not 0, and the total packed texels is
118+
// greater than the source texel range
119+
const bool has_additional_src_work =
120+
src_lane_offset != 0 && src_packed_size > final_range[packed_dim];
121+
// If the starting dst offset is not 0, and the total packed texels is
122+
// greater than the source texel range
123+
const bool has_additional_dst_work =
124+
dst_lane_offset != 0 && dst_packed_size > final_range[packed_dim];
125+
126+
if (has_additional_src_work || has_additional_dst_work) {
127+
global_wg_size[packed_dim]++; // Increase the global work group size in
128+
// packed dimension
129+
final_range[packed_dim]++; // Increase the range in packed dimension
136130
}
137131

138132
auto shader = VK_KERNEL_FROM_STR(kernel_name);
@@ -151,7 +145,7 @@ void add_copy_packed_dim_offset_node(
151145
// Parameter buffers
152146
{},
153147
// Specialization Constants
154-
{graph.hashed_layout_of(out), graph.hashed_layout_of(in), repeat ? 1 : 0},
148+
{graph.hashed_layout_of(out), graph.hashed_layout_of(in)},
155149
nullptr,
156150
{},
157151
{

0 commit comments

Comments (0)