Skip to content

Commit f5bbd5a

Browse files
committed
[mlir][ArmSME] Add support for lowering masked tile_store ops
This patch extends ArmSMEToSCF to support lowering of masked tile_store ops. Only masks created by 'vector.create_mask' are currently supported.

Example:

  %mask = vector.create_mask %c3, %c2 : vector<[4]x[4]xi1>
  arm_sme.tile_store %tile, %dest[%c0, %c0], %mask : memref<?x?xi32>, vector<[4]x[4]xi32>

Produces:

  %num_rows = arith.constant 3 : index
  %num_cols = vector.create_mask %c2 : vector<[4]xi1>
  scf.for %slice_idx = %c0 to %num_rows step %c1
    arm_sme.store_tile_slice %tile, %slice_idx, %num_cols, %dest[%slice_idx, %c0] : memref<?x?xi32>, vector<[4]xi1>, vector<[4]x[4]xi32>
1 parent e5e3e14 commit f5bbd5a

File tree

2 files changed

+66
-24
lines changed

2 files changed

+66
-24
lines changed

mlir/lib/Conversion/ArmSMEToSCF/ArmSMEToSCF.cpp

Lines changed: 43 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -420,38 +420,59 @@ struct TileStoreOpConversion : public OpRewritePattern<arm_sme::TileStoreOp> {
420420
auto tileType = tileStoreOp.getVectorType();
421421
auto tileElementType = tileType.getElementType();
422422

423-
// Create a loop that stores each ZA tile slice from memory.
423+
auto predicateType =
424+
VectorType::get(tileType.getDimSize(1), rewriter.getI1Type(), true);
425+
426+
Value maskCols;
427+
Value upperBound;
428+
auto maskOp = tileStoreOp.getMask();
429+
if (maskOp) {
430+
auto createMaskOp = maskOp.getDefiningOp<vector::CreateMaskOp>();
431+
if (!createMaskOp)
432+
return rewriter.notifyMatchFailure(
433+
tileStoreOp, "unsupported mask op, only 'vector.create_mask' is "
434+
"currently supported");
435+
436+
auto numRows = createMaskOp.getOperands()[0];
437+
auto numCols = createMaskOp.getOperands()[1];
438+
439+
upperBound = numRows;
440+
maskCols =
441+
rewriter.create<vector::CreateMaskOp>(loc, predicateType, numCols);
442+
} else {
443+
// Store all tile slices if no mask.
444+
auto minTileSlices = rewriter.create<arith::ConstantIndexOp>(
445+
loc, arm_sme::getSMETileSliceMinNumElts(tileElementType));
446+
auto vscale =
447+
rewriter.create<vector::VectorScaleOp>(loc, rewriter.getIndexType());
448+
// This describes both the number of ZA tile slices and the number of
449+
// elements in a vector of SVL bits for a given element type (SVL_B,
450+
// SVL_H,
451+
// ..., SVL_Q).
452+
auto numTileSlices =
453+
rewriter.create<arith::MulIOp>(loc, minTileSlices, vscale);
454+
455+
upperBound = numTileSlices;
456+
// Create an 'all true' predicate for the tile slice.
457+
maskCols = rewriter.create<arith::ConstantOp>(
458+
loc, DenseElementsAttr::get(predicateType, true));
459+
}
460+
461+
// Create a loop that stores each (active) ZA tile slice to memory.
424462
auto step = rewriter.create<arith::ConstantIndexOp>(loc, 1);
425-
auto minTileSlices = rewriter.create<arith::ConstantIndexOp>(
426-
loc, arm_sme::getSMETileSliceMinNumElts(tileElementType));
427-
auto vscale =
428-
rewriter.create<vector::VectorScaleOp>(loc, rewriter.getIndexType());
429463
auto lowerBound = rewriter.create<arith::ConstantIndexOp>(loc, 0);
430-
// This describes both the number of ZA tile slices and the number of
431-
// elements in a vector of SVL bits for a given element type (SVL_B, SVL_H,
432-
// ..., SVL_Q).
433-
auto numTileSlices =
434-
rewriter.create<arith::MulIOp>(loc, minTileSlices, vscale);
435-
auto forOp =
436-
rewriter.create<scf::ForOp>(loc, lowerBound, numTileSlices, step);
464+
auto forOp = rewriter.create<scf::ForOp>(loc, lowerBound, upperBound, step);
437465

438466
rewriter.setInsertionPointToStart(forOp.getBody());
439467

440-
// Create an 'all true' predicate for the tile slice.
441-
auto predicateType =
442-
VectorType::get(tileType.getDimSize(1), rewriter.getI1Type(), true);
443-
auto allTruePredicate = rewriter.create<arith::ConstantOp>(
444-
loc, DenseElementsAttr::get(predicateType, true));
445-
446468
SmallVector<Value> memrefIndices;
447469
auto tileSliceIndex = forOp.getInductionVar();
448470
getMemrefIndices(tileStoreOp.getIndices(),
449471
tileStoreOp.getMemRefType().getRank(), tileSliceIndex,
450-
numTileSlices, memrefIndices, loc, rewriter);
472+
upperBound, memrefIndices, loc, rewriter);
451473
rewriter.replaceOpWithNewOp<arm_sme::StoreTileSliceOp>(
452-
tileStoreOp, tileStoreOp.getValueToStore(), tileSliceIndex,
453-
allTruePredicate, tileStoreOp.getBase(), memrefIndices,
454-
tileStoreOp.getLayout());
474+
tileStoreOp, tileStoreOp.getValueToStore(), tileSliceIndex, maskCols,
475+
tileStoreOp.getBase(), memrefIndices, tileStoreOp.getLayout());
455476

456477
return success();
457478
}

mlir/test/Conversion/ArmSMEToSCF/arm-sme-to-scf.mlir

Lines changed: 23 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -102,9 +102,9 @@ func.func @arm_sme_tile_load_hor_with_mask_and_nonzero_pad(%src : memref<?x?xi32
102102
// CHECK-DAG: %[[C1:.*]] = arith.constant 1 : index
103103
// CHECK-DAG: %[[C4:.*]] = arith.constant 4 : index
104104
// CHECK-DAG: %[[VSCALE:.*]] = vector.vscale
105-
// CHECK: %[[NUM_TILE_SLICES:.*]] = arith.muli %[[C4]], %[[VSCALE]] : index
105+
// CHECK-DAG: %[[PTRUE_S:.*]] = arith.constant dense<true> : vector<[4]xi1>
106+
// CHECK-DAG: %[[NUM_TILE_SLICES:.*]] = arith.muli %[[C4]], %[[VSCALE]] : index
106107
// CHECK: scf.for %[[TILE_SLICE_INDEX:.*]] = %[[C0]] to %[[NUM_TILE_SLICES]] step %[[C1]] {
107-
// CHECK: %[[PTRUE_S:.*]] = arith.constant dense<true> : vector<[4]xi1>
108108
// CHECK: %[[OFFSET:.*]] = arith.addi %[[C0]], %[[TILE_SLICE_INDEX]] : index
109109
// CHECK: arm_sme.store_tile_slice %[[TILE]], %[[TILE_SLICE_INDEX]], %[[PTRUE_S]], %[[DEST]]{{\[}}%[[OFFSET]], %[[C0]]] : memref<?x?xi32>, vector<[4]xi1>, vector<[4]x[4]xi32>
110110
func.func @arm_sme_tile_store_hor(%tile : vector<[4]x[4]xi32>, %dest : memref<?x?xi32>) {
@@ -123,6 +123,27 @@ func.func @arm_sme_tile_store_ver(%tile : vector<[4]x[4]xi32>, %dest : memref<?x
123123
return
124124
}
125125

126+
// -----
127+
128+
// CHECK-LABEL: func.func @arm_sme_tile_store_hor_with_mask(
129+
// CHECK-SAME: %[[TILE:.*]]: vector<[4]x[4]xi32>,
130+
// CHECK-SAME: %[[DEST:.*]]: memref<?x?xi32>) {
131+
// CHECK-DAG: %[[C0:.*]] = arith.constant 0 : index
132+
// CHECK-DAG: %[[C1:.*]] = arith.constant 1 : index
133+
// CHECK-DAG: %[[NUM_ROWS:.*]] = arith.constant 3 : index
134+
// CHECK-DAG: %[[NUM_COLS:.*]] = vector.create_mask %c2 : vector<[4]xi1>
135+
// CHECK-NEXT: scf.for %[[TILE_SLICE_INDEX:.*]] = %[[C0]] to %[[NUM_ROWS]] step %[[C1]] {
136+
// CHECK-NEXT: %[[OFFSET:.*]] = arith.addi %[[C0]], %[[TILE_SLICE_INDEX]] : index
137+
// CHECK-NEXT: arm_sme.store_tile_slice %[[TILE]], %[[TILE_SLICE_INDEX]], %[[NUM_COLS]], %[[DEST]]{{\[}}%[[OFFSET]], %[[C0]]] : memref<?x?xi32>, vector<[4]xi1>, vector<[4]x[4]xi32>
138+
func.func @arm_sme_tile_store_hor_with_mask(%tile : vector<[4]x[4]xi32>, %dest : memref<?x?xi32>) {
139+
%c0 = arith.constant 0 : index
140+
%c2 = arith.constant 2 : index
141+
%c3 = arith.constant 3 : index
142+
%mask = vector.create_mask %c3, %c2 : vector<[4]x[4]xi1>
143+
arm_sme.tile_store %tile, %dest[%c0, %c0], %mask : memref<?x?xi32>, vector<[4]x[4]xi32>
144+
return
145+
}
146+
126147
//===----------------------------------------------------------------------===//
127148
// vector.print
128149
//===----------------------------------------------------------------------===//

0 commit comments

Comments
 (0)