Skip to content

Commit 14aac43

Browse files
committed
[mlir][ArmSME] Lower transfer_write + transpose to vertical store
This patch extends the lowering of vector.transfer_write in VectorToArmSME to support in-flight transpose via SME vertical store.
1 parent f5bbd5a commit 14aac43

File tree

3 files changed

+260
-3
lines changed

3 files changed

+260
-3
lines changed

mlir/lib/Conversion/VectorToArmSME/VectorToArmSME.cpp

Lines changed: 44 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -136,13 +136,31 @@ struct TransferReadToArmSMELowering
136136

137137
/// Conversion pattern for vector.transfer_write.
138138
///
139-
/// vector.transfer_write %vector, %source[%c0, %c0] : vector<[16]x[16]xi8>,
140-
/// memref<?x?xi8>
139+
/// ---
140+
///
141+
/// Example 1: op with identity permutation map to horizontal
142+
/// arm_sme.tile_store:
143+
///
144+
/// vector.transfer_write %vector, %source[%c0, %c0]
145+
/// {in_bounds = [true, true]} : vector<[16]x[16]xi8>, memref<?x?xi8>
141146
///
142147
/// is converted to:
143148
///
144149
/// arm_sme.tile_store %vector, %source[%c0, %c0] : memref<?x?xi8>,
145150
/// vector<[16]x[16]xi8>
151+
/// ---
152+
///
153+
/// Example 2: op with transpose permutation map to vertical arm_sme.tile_store
154+
/// (in-flight transpose):
155+
///
156+
/// vector.transfer_write %vector, %source[%c0, %c0]
157+
/// {permutation_map = affine_map<(d0, d1) -> (d1, d0)>,
158+
/// in_bounds = [true, true]} : vector<[16]x[16]xi8>, memref<?x?xi8>
159+
///
160+
/// is converted to:
161+
///
162+
/// arm_sme.tile_store %vector, %source[%c0, %c0] layout<vertical>
163+
/// : memref<?x?xi8>, vector<[16]x[16]xi8>
146164
struct TransferWriteToArmSMELowering
147165
: public OpRewritePattern<vector::TransferWriteOp> {
148166
using OpRewritePattern<vector::TransferWriteOp>::OpRewritePattern;
@@ -153,12 +171,35 @@ struct TransferWriteToArmSMELowering
153171
if (!arm_sme::isValidSMETileVectorType(vType))
154172
return failure();
155173

174+
assert(writeOp.getTransferRank() == 2 &&
175+
"expected a permutation_map with result dims of the same rank as "
176+
"the vector type");
177+
156178
if (!llvm::isa<MemRefType>(writeOp.getSource().getType()))
157179
return failure();
158180

181+
// Out-of-bounds dims are not supported.
182+
if (writeOp.hasOutOfBoundsDim())
183+
return rewriter.notifyMatchFailure(writeOp,
184+
"not inbounds transfer write");
185+
186+
arm_sme::TileSliceLayout layout;
187+
188+
AffineExpr d0, d1;
189+
bindDims(writeOp.getContext(), d0, d1);
190+
AffineMap map = writeOp.getPermutationMap();
191+
if (map.isIdentity())
192+
layout = arm_sme::TileSliceLayout::Horizontal;
193+
else if (map == AffineMap::get(map.getNumDims(), 0, {d1, d0},
194+
writeOp.getContext()))
195+
layout = arm_sme::TileSliceLayout::Vertical;
196+
else
197+
return rewriter.notifyMatchFailure(writeOp,
198+
"unsupported permutation map");
199+
159200
rewriter.replaceOpWithNewOp<arm_sme::TileStoreOp>(
160201
writeOp, writeOp.getVector(), writeOp.getSource(), writeOp.getIndices(),
161-
writeOp.getMask());
202+
writeOp.getMask(), layout);
162203
return success();
163204
}
164205
};

mlir/test/Dialect/ArmSME/vector-ops-to-sme.mlir

Lines changed: 42 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -337,6 +337,37 @@ func.func @transfer_write_2d_with_mask_f64(%vector : vector<[2]x[2]xf64>, %dest
337337

338338
// -----
339339

340+
/// In-flight transpose via vertical store.
341+
342+
// CHECK-LABEL: func.func @transfer_write_2d_transpose_i64(
343+
// CHECK-SAME: %[[VECTOR:.*]]: vector<[2]x[2]xi64>,
344+
// CHECK-SAME: %[[DEST:.*]]: memref<?x?xi64>) {
345+
// CHECK: %[[C0:.*]] = arith.constant 0 : index
346+
// CHECK: arm_sme.tile_store %[[VECTOR]], %[[DEST]]{{\[}}%[[C0]], %[[C0]]] layout<vertical> : memref<?x?xi64>, vector<[2]x[2]xi64>
347+
func.func @transfer_write_2d_transpose_i64(%vector : vector<[2]x[2]xi64>, %dest : memref<?x?xi64>) {
348+
%c0 = arith.constant 0 : index
349+
vector.transfer_write %vector, %dest[%c0, %c0] {permutation_map = affine_map<(d0, d1) -> (d1, d0)>, in_bounds = [true, true]} : vector<[2]x[2]xi64>, memref<?x?xi64>
350+
return
351+
}
352+
353+
// -----
354+
355+
/// In-flight transpose via vertical store with mask.
356+
357+
// CHECK-LABEL: func.func @transfer_write_2d_transpose_with_mask_bf16(
358+
// CHECK-SAME: %[[VECTOR:.*]]: vector<[8]x[8]xbf16>,
359+
// CHECK-SAME: %[[DEST:.*]]: memref<?x?xbf16>,
360+
// CHECK-SAME: %[[MASK:.*]]: vector<[8]x[8]xi1>) {
361+
// CHECK: %[[C0:.*]] = arith.constant 0 : index
362+
// CHECK: arm_sme.tile_store %[[VECTOR]], %[[DEST]]{{\[}}%[[C0]], %[[C0]]], %[[MASK]] layout<vertical> : memref<?x?xbf16>, vector<[8]x[8]xbf16>
363+
func.func @transfer_write_2d_transpose_with_mask_bf16(%vector : vector<[8]x[8]xbf16>, %dest : memref<?x?xbf16>, %mask : vector<[8]x[8]xi1>) {
364+
%c0 = arith.constant 0 : index
365+
vector.transfer_write %vector, %dest[%c0, %c0], %mask {permutation_map = affine_map<(d0, d1) -> (d1, d0)>, in_bounds = [true, true]} : vector<[8]x[8]xbf16>, memref<?x?xbf16>
366+
return
367+
}
368+
369+
// -----
370+
340371
// The following tests check the 'vector.transfer_write' -> 'arm_sme.intr.zero'
341372
// lowering only occurs for vector types of correct rank, shape, element size
342373
// and number of scalable dims.
@@ -398,6 +429,17 @@ func.func @transfer_write_2d__fixed(%vector : vector<16x16xi8>, %dest : memref<?
398429
return
399430
}
400431

432+
// -----
433+
434+
// CHECK-LABEL: @transfer_write_2d__out_of_bounds
435+
// CHECK: vector.transfer_write
436+
// CHECK-NOT: arm_sme.tile_store
437+
func.func @transfer_write_2d__out_of_bounds(%vector : vector<[4]x[4]xf32>, %dest : memref<?x?xf32>) {
438+
%c0 = arith.constant 0 : index
439+
vector.transfer_write %vector, %dest[%c0, %c0] : vector<[4]x[4]xf32>, memref<?x?xf32>
440+
return
441+
}
442+
401443
//===----------------------------------------------------------------------===//
402444
// vector.broadcast
403445
//===----------------------------------------------------------------------===//
Lines changed: 174 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,174 @@
1+
// DEFINE: %{entry_point} = entry
2+
// DEFINE: %{compile} = mlir-opt %s \
3+
// DEFINE: -enable-arm-streaming="mode=locally enable-za" \
4+
// DEFINE: -convert-vector-to-arm-sme -convert-arm-sme-to-scf \
5+
// DEFINE: -convert-vector-to-llvm="enable-arm-sme" -cse -canonicalize \
6+
// DEFINE: -allocate-arm-sme-tiles -test-lower-to-llvm
7+
// DEFINE: %{run} = %mcr_aarch64_cmd \
8+
// DEFINE: -march=aarch64 -mattr=+sve,+sme \
9+
// DEFINE: -e %{entry_point} -entry-point-result=void \
10+
// DEFINE: -shared-libs=%mlir_runner_utils,%mlir_c_runner_utils
11+
12+
// RUN: %{compile} | %{run} | FileCheck %s
13+
14+
llvm.func @printCString(!llvm.ptr<i8>)
15+
16+
// TODO: replace with vector.print <str> once #68695 lands.
17+
func.func @print_str(%str: !llvm.ptr<array<17 x i8>>) attributes { enable_arm_streaming_ignore } {
18+
%c0 = llvm.mlir.constant(0 : index) : i64
19+
%str_bytes = llvm.getelementptr %str[%c0, %c0]
20+
: (!llvm.ptr<array<17 x i8>>, i64, i64) -> !llvm.ptr<i8>
21+
llvm.call @printCString(%str_bytes) : (!llvm.ptr<i8>) -> ()
22+
return
23+
}
24+
25+
// Vector store.
26+
func.func @transfer_write_2d(%A : memref<?x?xf32>, %base1: index, %base2: index) {
27+
%c0 = arith.constant 0.0 : f32
28+
%zero = vector.splat %c0 : vector<[4]x[4]xf32>
29+
vector.transfer_write %zero, %A[%base1, %base2] {in_bounds=[true, true]} :
30+
vector<[4]x[4]xf32>, memref<?x?xf32>
31+
return
32+
}
33+
34+
// Masked vector store.
35+
func.func @transfer_write_2d_mask(%A : memref<?x?xf32>, %base1: index, %base2: index) {
36+
%c0 = arith.constant 0.0 : f32
37+
%c2 = arith.constant 2 : index
38+
%c3 = arith.constant 3 : index
39+
%mask = vector.create_mask %c2, %c3 : vector<[4]x[4]xi1>
40+
%zero = vector.splat %c0 : vector<[4]x[4]xf32>
41+
vector.transfer_write %zero, %A[%base1, %base2], %mask {in_bounds=[true, true]} :
42+
vector<[4]x[4]xf32>, memref<?x?xf32>
43+
return
44+
}
45+
46+
// Vector store + transpose.
47+
func.func @transfer_write_2d_transposed(%A : memref<?x?xf32>, %base1: index, %base2: index) {
48+
%0 = vector.load %A[%base1, %base2] : memref<?x?xf32>, vector<[4]x[4]xf32>
49+
vector.transfer_write %0, %A[%base1, %base2] {permutation_map = affine_map<(d0, d1) -> (d1, d0)>, in_bounds=[true, true]} :
50+
vector<[4]x[4]xf32>, memref<?x?xf32>
51+
return
52+
}
53+
54+
// Masked vector store + transpose.
55+
func.func @transfer_write_2d_mask_transposed(%A : memref<?x?xf32>, %base1: index, %base2: index) {
56+
%c2 = arith.constant 2 : index
57+
%c4 = arith.constant 4 : index
58+
%mask = vector.create_mask %c4, %c2 : vector<[4]x[4]xi1>
59+
%0 = vector.load %A[%base1, %base2] : memref<?x?xf32>, vector<[4]x[4]xf32>
60+
vector.transfer_write %0, %A[%base1, %base2], %mask {permutation_map = affine_map<(d0, d1) -> (d1, d0)>, in_bounds=[true, true]} :
61+
vector<[4]x[4]xf32>, memref<?x?xf32>
62+
return
63+
}
64+
65+
// Vector load + print.
66+
func.func @load_and_print(%A : memref<?x?xf32>, %base1: index, %base2: index) {
67+
%tile_begin_str = llvm.mlir.addressof @tile_begin : !llvm.ptr<array<17 x i8>>
68+
69+
%0 = vector.load %A[%base1, %base2] : memref<?x?xf32>, vector<[4]x[4]xf32>
70+
71+
func.call @print_str(%tile_begin_str) : (!llvm.ptr<array<17 x i8>>) -> ()
72+
vector.print %0: vector<[4]x[4]xf32>
73+
74+
return
75+
}
76+
77+
// Allocate heap memory of size 'd0' x 'd1' and initialize.
78+
//
79+
// Example:
80+
//
81+
// initialize_memory(%c4, %c5)
82+
//
83+
// 0, 1, 2, 3, 4
84+
// 10, 11, 12, 13, 14
85+
// 20, 21, 22, 23, 24
86+
// 30, 31, 32, 33, 34
87+
//
88+
// Returns dynamic memref. It's the caller's responsibility to free the returned
89+
// memref.
90+
func.func @initialize_memory(%d0 : index, %d1 : index) -> memref<?x?xf32> {
91+
%c0 = arith.constant 0 : index
92+
%c1 = arith.constant 1 : index
93+
%c1_f32 = arith.constant 1.0 : f32
94+
%c10_f32 = arith.constant 10.0 : f32
95+
96+
%A = memref.alloc(%d0, %d1) : memref<?x?xf32>
97+
98+
%init = arith.constant 0.0 : f32
99+
scf.for %i = %c0 to %d0 step %c1 iter_args(%val = %init) -> f32 {
100+
scf.for %j = %c0 to %d1 step %c1 iter_args(%inner_val = %val) -> f32 {
101+
memref.store %inner_val, %A[%i, %j] : memref<?x?xf32>
102+
%inner_val_next = arith.addf %inner_val, %c1_f32 : f32
103+
scf.yield %inner_val_next : f32
104+
}
105+
%val_next = arith.addf %val, %c10_f32 : f32
106+
scf.yield %val_next : f32
107+
}
108+
109+
return %A : memref<?x?xf32>
110+
}
111+
112+
func.func @entry() {
113+
%c0 = arith.constant 0 : index
114+
%c2 = arith.constant 2 : index
115+
%c4 = arith.constant 4 : index
116+
117+
// Allocate enough memory to load a 32-bit tile plus a tiny bit more to test
118+
// non-zero offsets while remaining inbounds.
119+
%vscale = vector.vscale
120+
%svl_s = arith.muli %c4, %vscale : index
121+
%svl_s_plus_two = arith.addi %svl_s, %c2 : index
122+
123+
// 1. Initialize memory
124+
// CHECK-LABEL: TILE BEGIN:
125+
// CHECK-NEXT: ( 0, 1, 2, 3
126+
// CHECK-NEXT: ( 10, 11, 12, 13
127+
// CHECK-NEXT: ( 20, 21, 22, 23
128+
// CHECK-NEXT: ( 30, 31, 32, 33
129+
%A = call @initialize_memory(%svl_s_plus_two, %svl_s_plus_two) : (index, index) -> memref<?x?xf32>
130+
call @load_and_print(%A, %c0, %c0) : (memref<?x?xf32>, index, index) -> ()
131+
132+
// 2. Write 2-D vector of zeroes to 1. at offset [2, 2].
133+
// CHECK-LABEL: TILE BEGIN:
134+
// CHECK-NEXT: ( 0, 1, 2, 3
135+
// CHECK-NEXT: ( 10, 11, 12, 13
136+
// CHECK-NEXT: ( 20, 21, 0, 0
137+
// CHECK-NEXT: ( 30, 31, 0, 0
138+
call @transfer_write_2d(%A, %c2, %c2) : (memref<?x?xf32>, index, index) -> ()
139+
call @load_and_print(%A, %c0, %c0) : (memref<?x?xf32>, index, index) -> ()
140+
141+
// 3. Write 2-D vector of zeroes to 2. but with mask (nrows=2, ncols=3).
142+
// CHECK-LABEL: TILE BEGIN:
143+
// CHECK-NEXT: ( 0, 0, 0, 3
144+
// CHECK-NEXT: ( 0, 0, 0, 13
145+
// CHECK-NEXT: ( 20, 21, 0, 0
146+
// CHECK-NEXT: ( 30, 31, 0, 0
147+
call @transfer_write_2d_mask(%A, %c0, %c0) : (memref<?x?xf32>, index, index) -> ()
148+
call @load_and_print(%A, %c0, %c0) : (memref<?x?xf32>, index, index) -> ()
149+
150+
// 4. Reload 3. + store + transpose.
151+
// CHECK-LABEL: TILE BEGIN:
152+
// CHECK-NEXT: ( 0, 0, 20, 30
153+
// CHECK-NEXT: ( 0, 0, 21, 31
154+
// CHECK-NEXT: ( 0, 0, 0, 0
155+
// CHECK-NEXT: ( 3, 13, 0, 0
156+
call @transfer_write_2d_transposed(%A, %c0, %c0) : (memref<?x?xf32>, index, index) -> ()
157+
call @load_and_print(%A, %c0, %c0) : (memref<?x?xf32>, index, index) -> ()
158+
159+
// 5. Reload 4. + store + transpose but with mask (nrows=4, ncols=2).
160+
// The mask applies after the permutation.
161+
// CHECK-LABEL: TILE BEGIN:
162+
// CHECK-NEXT: ( 0, 0, 20, 30
163+
// CHECK-NEXT: ( 0, 0, 21, 31
164+
// CHECK-NEXT: ( 20, 21, 0, 0
165+
// CHECK-NEXT: ( 30, 31, 0, 0
166+
call @transfer_write_2d_mask_transposed(%A, %c0, %c0) : (memref<?x?xf32>, index, index) -> ()
167+
call @load_and_print(%A, %c0, %c0) : (memref<?x?xf32>, index, index) -> ()
168+
169+
memref.dealloc %A : memref<?x?xf32>
170+
171+
return
172+
}
173+
174+
llvm.mlir.global internal constant @tile_begin("TILE BEGIN: \0A\00")

0 commit comments

Comments
 (0)