@@ -31,6 +31,8 @@ layout(push_constant) uniform PRECISION restrict Block {
31
31
layout (local_size_x_id = 0 , local_size_y_id = 1 , local_size_z_id = 2 ) in ;
32
32
layout (constant_id = 3 ) const int packed_dim = C_DIM;
33
33
34
+ #extension GL_EXT_control_flow_attributes : require
35
+
34
36
void main() {
35
37
ivec3 pos = ivec3 (gl_GlobalInvocationID);
36
38
@@ -54,11 +56,16 @@ void main() {
54
56
in_bchw_pos[out_ndims[2 ]] = pos.y;
55
57
in_bchw_pos[out_ndims[3 ]] = pos.x;
56
58
57
- for (int j = 0 ; j < 4 ; ++ j) {
59
+ const int in_packed_dim_size = in_sizes[3 - out_ndims[in_packed_dim_bchw_index]];
60
+
61
+ [[unroll]] for (int j = 0 , bchw_index = in_bchw_pos[out_ndims[in_packed_dim_bchw_index]]; j < 4 ; ++ j, ++ bchw_index) {
58
62
// terminate the loop if trying to access input texture out of bounds
59
- if (any ( greaterThanEqual (in_bchw_pos.wzyx, in_sizes.xyzw)) ) {
63
+ if (bchw_index >= in_packed_dim_size ) {
60
64
break ;
61
65
}
66
+ // go to the position in the input that is mapped to the packed dim in the output
67
+ in_bchw_pos[out_ndims[in_packed_dim_bchw_index]] = bchw_index;
68
+
62
69
ivec3 fetch_pos;
63
70
64
71
fetch_pos.xy = in_bchw_pos.wz;
@@ -74,9 +81,6 @@ void main() {
74
81
// fetch input texel
75
82
VEC4_T inval = VEC4_T(load_texel(t_in, fetch_pos));
76
83
outval[j] = inval[in_packed_dim_lane_index];
77
-
78
- // go to next position in the input, that is mapped to the packed dim in the output
79
- in_bchw_pos[out_ndims[in_packed_dim_bchw_index]]++ ;
80
84
}
81
85
82
86
pos[packed_dim] = int (gl_GlobalInvocationID[packed_dim]);
0 commit comments