@@ -189,3 +189,168 @@ module attributes {transform.with_named_sequence} {
189
189
transform.yield
190
190
}
191
191
}
192
// -----

// Masked vectorisation of a 1-D dynamic `linalg.reduce` using a scalable
// vector size ([4]): the reduction dim is vectorised as vector<[4]xf32> and
// guarded by a vector.create_mask built from the dynamic dim size.
func.func @vectorize_dynamic_reduction_scalable_1d(%arg0: tensor<?xf32>,
                                                   %arg1: tensor<f32>) -> tensor<f32> {

  %0 = linalg.reduce ins(%arg0 : tensor<?xf32>) outs(%arg1 : tensor<f32>) dimensions = [0]
    (%in: f32, %init: f32) {
      %0 = arith.addf %in, %init : f32
      linalg.yield %0 : f32
    }
  return %0 : tensor<f32>
}

// CHECK-LABEL: func.func @vectorize_dynamic_reduction_scalable_1d(
// CHECK-SAME:    %[[ARG_0:.*]]: tensor<?xf32>, %[[ARG_1:.*]]: tensor<f32>) -> tensor<f32> {
// CHECK:         %[[C0_idx:.*]] = arith.constant 0 : index
// CHECK:         %[[DIM_A0_0:.*]] = tensor.dim %[[ARG_0]], %[[C0_idx]] : tensor<?xf32>
// CHECK:         %[[C0_idx:.*]] = arith.constant 0 : index
// CHECK:         %[[C0_f32:.*]] = arith.constant 0.000000e+00 : f32
// CHECK:         %[[MASK:.*]] = vector.create_mask %[[DIM_A0_0]] : vector<[4]xi1>
// CHECK:         %[[VEC_RD_0:.*]] = vector.mask %[[MASK]] { vector.transfer_read %[[ARG_0]][%[[C0_idx]]], %[[C0_f32]] {in_bounds = [true]} : tensor<?xf32>, vector<[4]xf32> } : vector<[4]xi1> -> vector<[4]xf32>
// CHECK:         %[[C0_F32:.*]] = arith.constant 0.000000e+00 : f32
// CHECK:         %[[VEC_RD_1:.*]] = vector.transfer_read %[[ARG_1]][], %[[C0_F32]] : tensor<f32>, vector<f32>
// CHECK:         %[[ACC_f32:.*]] = vector.extractelement %[[VEC_RD_1]][] : vector<f32>
// CHECK:         %[[REDUCE:.*]] = vector.mask %[[MASK]] { vector.multi_reduction <add>, %[[VEC_RD_0]], %[[ACC_f32]] [0] : vector<[4]xf32> to f32 } : vector<[4]xi1> -> f32
// CHECK:         %[[VEC_f32:.*]] = vector.broadcast %[[REDUCE]] : f32 to vector<f32>
// CHECK:         %{{.*}} = vector.transfer_write %[[VEC_f32]], %[[ARG_1]][] : vector<f32>, tensor<f32>

module attributes {transform.with_named_sequence} {
  transform.named_sequence @__transform_main(%arg1: !transform.any_op {transform.readonly}) {
    %0 = transform.structured.match ops{["linalg.reduce"]} in %arg1 : (!transform.any_op) -> !transform.any_op
    // [[4]] requests a scalable vector size for the (sole) reduction dim.
    transform.structured.vectorize %0 vector_sizes [[4]] : !transform.any_op
    transform.yield
  }
}
// -----

// Note: scalable version of `vectorize_dynamic_reduction` in test/Dialect/Linalg/vectorization.mlir.
// The trailing (reduction) dim uses a scalable size [8]; the leading parallel
// dim uses a fixed size 4, so masks are vector<4x[8]xi1> / vector<4xi1>.
func.func @vectorize_dynamic_reduction_scalable_2d(%arg0: tensor<?x?xf32>,
                                                   %arg1: tensor<?xf32>) -> tensor<?xf32> {
  %0 = linalg.generic { indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>,
                                         affine_map<(d0, d1) -> (d0)>],
                        iterator_types = ["parallel", "reduction"] }
    ins(%arg0 : tensor<?x?xf32>)
    outs(%arg1 : tensor<?xf32>) {
    ^bb(%in: f32, %out: f32) :
      %0 = arith.addf %in, %out : f32
      linalg.yield %0 : f32
    } -> tensor<?xf32>
  return %0 : tensor<?xf32>
}

// CHECK-LABEL: func.func @vectorize_dynamic_reduction_scalable_2d(
// CHECK-SAME:    %[[ARG_0:.*]]: tensor<?x?xf32>, %[[ARG_1:.*]]: tensor<?xf32>) -> tensor<?xf32> {
// CHECK:         %[[C0_idx:.*]] = arith.constant 0 : index
// CHECK:         %[[DIM_A0_0:.*]] = tensor.dim %[[ARG_0]], %[[C0_idx]] : tensor<?x?xf32>
// CHECK:         %[[C1_idx:.*]] = arith.constant 1 : index
// CHECK:         %[[DIM_A0_1:.*]] = tensor.dim %[[ARG_0]], %[[C1_idx]] : tensor<?x?xf32>
// CHECK:         %[[C0_idx:.*]] = arith.constant 0 : index
// CHECK:         %[[C0_f32:.*]] = arith.constant 0.000000e+00 : f32
// CHECK:         %[[MASK_2d:.*]] = vector.create_mask %[[DIM_A0_0]], %[[DIM_A0_1]] : vector<4x[8]xi1>
// CHECK:         %[[VEC_RD_0:.*]] = vector.mask %[[MASK_2d]] { vector.transfer_read %[[ARG_0]][%[[C0_idx]], %[[C0_idx]]], %[[C0_f32]] {in_bounds = [true, true]} : tensor<?x?xf32>, vector<4x[8]xf32> } : vector<4x[8]xi1> -> vector<4x[8]xf32>
// CHECK:         %[[C0_f32:.*]] = arith.constant 0.000000e+00 : f32
// CHECK:         %[[MASK_1d:.*]] = vector.create_mask %[[DIM_A0_0]] : vector<4xi1>
// CHECK:         %[[VEC_RD_1:.*]] = vector.mask %[[MASK_1d]] { vector.transfer_read %[[ARG_1]][%[[C0_idx]]], %[[C0_f32]] {in_bounds = [true]} : tensor<?xf32>, vector<4xf32> } : vector<4xi1> -> vector<4xf32>
// CHECK:         %[[REDUCE:.*]] = vector.mask %[[MASK_2d]] { vector.multi_reduction <add>, %[[VEC_RD_0]], %[[VEC_RD_1]] [1] : vector<4x[8]xf32> to vector<4xf32> } : vector<4x[8]xi1> -> vector<4xf32>
// CHECK:         %[[C0_idx:.*]] = arith.constant 0 : index
// CHECK:         %{{.*}} = vector.mask %[[MASK_1d]] { vector.transfer_write %[[REDUCE]], %[[ARG_1]][%[[C0_idx]]] {in_bounds = [true]} : vector<4xf32>, tensor<?xf32> } : vector<4xi1> -> tensor<?xf32>

module attributes {transform.with_named_sequence} {
  transform.named_sequence @__transform_main(%arg1: !transform.any_op {transform.readonly}) {
    %0 = transform.structured.match ops{["linalg.generic"]} in %arg1 : (!transform.any_op) -> !transform.any_op
    // [4, [8]]: fixed parallel dim, scalable reduction dim.
    transform.structured.vectorize %0 vector_sizes [4, [8]] : !transform.any_op
    transform.yield
  }
}
// -----

// Masked vectorisation of `linalg.matvec` with a scalable trailing
// (reduction) dim: vector sizes [4, [4]]. The matvec result is intentionally
// left unused; the test only checks the vectorised IR.
func.func @vectorize_dynamic_matvec_trailing_reduction_dim(%arg0: tensor<?x?xf32>,
                                                           %arg1: tensor<?xf32>,
                                                           %arg2: tensor<?xf32>) {
  linalg.matvec ins(%arg0, %arg1 : tensor<?x?xf32>, tensor<?xf32>)
                outs(%arg2 : tensor<?xf32>) -> tensor<?xf32>
  return
}

// CHECK-LABEL: func.func @vectorize_dynamic_matvec_trailing_reduction_dim(
// CHECK-SAME:    %[[ARG_0:.*]]: tensor<?x?xf32>, %[[ARG_1:.*]]: tensor<?xf32>, %[[ARG_2:.*]]: tensor<?xf32>) {
// CHECK:         %[[C0_idx:.*]] = arith.constant 0 : index
// CHECK:         %[[DIM_A0_0:.*]] = tensor.dim %[[ARG_0]], %[[C0_idx]] : tensor<?x?xf32>
// CHECK:         %[[C1_idx:.*]] = arith.constant 1 : index
// CHECK:         %[[DIM_A0_1:.*]] = tensor.dim %[[ARG_0]], %[[C1_idx]] : tensor<?x?xf32>
// CHECK:         %[[C0_idx:.*]] = arith.constant 0 : index
// CHECK:         %[[C0_f32:.*]] = arith.constant 0.000000e+00 : f32
// CHECK:         %[[MASK_2d:.*]] = vector.create_mask %[[DIM_A0_0]], %[[DIM_A0_1]] : vector<4x[4]xi1>
// CHECK:         %[[VEC_RD_0:.*]] = vector.mask %[[MASK_2d]] { vector.transfer_read %[[ARG_0]][%[[C0_idx]], %[[C0_idx]]], %[[C0_f32]] {in_bounds = [true, true]} : tensor<?x?xf32>, vector<4x[4]xf32> } : vector<4x[4]xi1> -> vector<4x[4]xf32>
// CHECK:         %[[C0_f32:.*]] = arith.constant 0.000000e+00 : f32
// CHECK:         %[[MASK_d1:.*]] = vector.create_mask %[[DIM_A0_1]] : vector<[4]xi1>
// CHECK:         %[[VEC_RD_1:.*]] = vector.mask %[[MASK_d1]] { vector.transfer_read %[[ARG_1]][%[[C0_idx]]], %[[C0_f32]] {in_bounds = [true, true], permutation_map = #map} : tensor<?xf32>, vector<4x[4]xf32> } : vector<[4]xi1> -> vector<4x[4]xf32>
// CHECK:         %[[C0_f32:.*]] = arith.constant 0.000000e+00 : f32
// CHECK:         %[[MASK_d2:.*]] = vector.create_mask %[[DIM_A0_0]] : vector<4xi1>
// CHECK:         %[[VEC_RD_2:.*]] = vector.mask %[[MASK_d2]] { vector.transfer_read %[[ARG_2]][%[[C0_idx]]], %[[C0_f32]] {in_bounds = [true]} : tensor<?xf32>, vector<4xf32> } : vector<4xi1> -> vector<4xf32>
// CHECK:         %[[MUL:.*]] = arith.mulf %[[VEC_RD_0:.*]], %[[VEC_RD_1:.*]] : vector<4x[4]xf32>
// CHECK:         %[[REDUCE:.*]] = vector.mask %[[MASK_2d]] { vector.multi_reduction <add>, %[[MUL]], %[[VEC_RD_2]] [1] : vector<4x[4]xf32> to vector<4xf32> } : vector<4x[4]xi1> -> vector<4xf32>
// CHECK:         %[[C0_idx:.*]] = arith.constant 0 : index
// CHECK:         %{{.*}} = vector.mask %[[MASK_d2]] { vector.transfer_write %[[REDUCE]], %[[ARG_2]][%[[C0_idx]]] {in_bounds = [true]} : vector<4xf32>, tensor<?xf32> } : vector<4xi1> -> tensor<?xf32>

module attributes {transform.with_named_sequence} {
  transform.named_sequence @__transform_main(%arg1: !transform.any_op {transform.readonly}) {
    %0 = transform.structured.match ops{["linalg.matvec"]} in %arg1 : (!transform.any_op) -> !transform.any_op
    // [4, [4]]: fixed parallel dim, scalable trailing reduction dim.
    transform.structured.vectorize %0 vector_sizes [4, [4]] : !transform.any_op
    transform.yield
  }
}
// -----

// Generic-form matvec where the *leading* (parallel) dim is scalable:
// vector sizes [[4], 4] produce vector<[4]x4xf32> reads and a
// vector<[4]xf32> reduction result.
func.func @vectorize_dynamic_generic_matvec_leading_parallel_dim(%arg0: tensor<?x?xf32>,
                                                                 %arg1: tensor<?xf32>,
                                                                 %arg2: tensor<?xf32>) -> tensor<?xf32> {
  %0 = linalg.generic { indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>,
                                         affine_map<(d0, d1) -> (d1)>,
                                         affine_map<(d0, d1) -> (d0)>],
                        iterator_types = ["parallel", "reduction"] }
    ins(%arg0, %arg1 : tensor<?x?xf32>, tensor<?xf32>)
    outs(%arg2 : tensor<?xf32>) {
    ^bb(%mat: f32, %vec: f32, %res: f32) :
      %0 = arith.mulf %mat, %vec : f32
      %1 = arith.addf %res, %0 : f32
      linalg.yield %1 : f32
    } -> tensor<?xf32>
  return %0 : tensor<?xf32>
}

// CHECK-LABEL: func.func @vectorize_dynamic_generic_matvec_leading_parallel_dim(
// CHECK-SAME:    %[[ARG_0:.*]]: tensor<?x?xf32>, %[[ARG_1:.*]]: tensor<?xf32>, %[[ARG_2:.*]]: tensor<?xf32>) -> tensor<?xf32> {
// CHECK:         %[[C0_idx:.*]] = arith.constant 0 : index
// CHECK:         %[[DIM_A0_0:.*]] = tensor.dim %[[ARG_0]], %[[C0_idx]] : tensor<?x?xf32>
// CHECK:         %[[C1_idx:.*]] = arith.constant 1 : index
// CHECK:         %[[DIM_A0_1:.*]] = tensor.dim %[[ARG_0]], %[[C1_idx]] : tensor<?x?xf32>
// CHECK:         %[[C0_idx:.*]] = arith.constant 0 : index
// CHECK:         %[[C0_f32:.*]] = arith.constant 0.000000e+00 : f32
// CHECK:         %[[MASK_2d:.*]] = vector.create_mask %[[DIM_A0_0]], %[[DIM_A0_1]] : vector<[4]x4xi1>
// CHECK:         %[[VEC_RD_0:.*]] = vector.mask %[[MASK_2d]] { vector.transfer_read %[[ARG_0]][%[[C0_idx]], %[[C0_idx]]], %[[C0_f32]] {in_bounds = [true, true]} : tensor<?x?xf32>, vector<[4]x4xf32> } : vector<[4]x4xi1> -> vector<[4]x4xf32>
// CHECK:         %[[C0_f32:.*]] = arith.constant 0.000000e+00 : f32
// CHECK:         %[[MASK_d1:.*]] = vector.create_mask %[[DIM_A0_1]] : vector<4xi1>
// CHECK:         %[[VEC_RD_1:.*]] = vector.mask %[[MASK_d1]] { vector.transfer_read %[[ARG_1]][%[[C0_idx]]], %[[C0_f32]] {in_bounds = [true, true], permutation_map = #map} : tensor<?xf32>, vector<[4]x4xf32> } : vector<4xi1> -> vector<[4]x4xf32>
// CHECK:         %[[C0_f32:.*]] = arith.constant 0.000000e+00 : f32
// CHECK:         %[[MASK_d2:.*]] = vector.create_mask %[[DIM_A0_0]] : vector<[4]xi1>
// CHECK:         %[[VEC_RD_2:.*]] = vector.mask %[[MASK_d2]] { vector.transfer_read %[[ARG_2]][%[[C0_idx]]], %[[C0_f32]] {in_bounds = [true]} : tensor<?xf32>, vector<[4]xf32> } : vector<[4]xi1> -> vector<[4]xf32>
// CHECK:         %[[MUL:.*]] = arith.mulf %[[VEC_RD_0:.*]], %[[VEC_RD_1:.*]] : vector<[4]x4xf32>
// CHECK:         %[[REDUCE:.*]] = vector.mask %[[MASK_2d]] { vector.multi_reduction <add>, %[[MUL]], %[[VEC_RD_2]] [1] : vector<[4]x4xf32> to vector<[4]xf32> } : vector<[4]x4xi1> -> vector<[4]xf32>
// CHECK:         %[[C0_idx:.*]] = arith.constant 0 : index
// CHECK:         %{{.*}} = vector.mask %[[MASK_d2]] { vector.transfer_write %[[REDUCE]], %[[ARG_2]][%[[C0_idx]]] {in_bounds = [true]} : vector<[4]xf32>, tensor<?xf32> } : vector<[4]xi1> -> tensor<?xf32>

module attributes {transform.with_named_sequence} {
  transform.named_sequence @__transform_main(%arg1: !transform.any_op {transform.readonly}) {
    %0 = transform.structured.match ops{["linalg.generic"]} in %arg1 : (!transform.any_op) -> !transform.any_op
    // [[4], 4]: scalable leading parallel dim, fixed reduction dim.
    transform.structured.vectorize %0 vector_sizes [[4], 4] : !transform.any_op
    transform.yield
  }
}