
Commit a8406b3

fixup! [mlir][linalg] Split GenericPadOpVectorizationPattern into two patterns
* Incorporate suggestions from Hanhan
* Add a negative test to document when vectorization of tensor.insert_slice might fail
* Update `@pad_and_insert_slice_dest` that was added in #112504 (this change means that _all_ qualifying `tensor.insert_slice` Ops are vectorized)
* Add more tests to demonstrate other cases (e.g. default vs non-default pad value)
1 parent 45318f3 commit a8406b3
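
In short, the new InsertSliceVectorizePattern rewrites a qualifying tensor.insert_slice into a vector.transfer_read + vector.transfer_write pair. A minimal before/after sketch, distilled from the @insert_static_slice_default_pad test updated below (value names are illustrative):

// Before: a statically shaped slice insertion.
%res = tensor.insert_slice %src into %dest[0, 0, 0, 0, 0, 0] [1, 1, 1, 1, 2, 3] [1, 1, 1, 1, 1, 1]
    : tensor<1x2x3xf32> into tensor<9x8x7x1x2x3xf32>

// After: an in-bounds read of the source (the 0.0 pad value is never
// actually read here) written into the destination at the slice offsets.
%c0 = arith.constant 0 : index
%pad = arith.constant 0.000000e+00 : f32
%read = vector.transfer_read %src[%c0, %c0, %c0], %pad {in_bounds = [true, true, true]}
    : tensor<1x2x3xf32>, vector<1x2x3xf32>
%res = vector.transfer_write %read, %dest[%c0, %c0, %c0, %c0, %c0, %c0] {in_bounds = [true, true, true]}
    : vector<1x2x3xf32>, tensor<9x8x7x1x2x3xf32>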

5 files changed: +115 -40 lines changed


mlir/lib/Dialect/Linalg/TransformOps/LinalgTransformOps.cpp

Lines changed: 1 addition & 0 deletions
@@ -256,6 +256,7 @@ void transform::ApplyFoldAddIntoDestPatternsOp::populatePatterns(
 void transform::ApplyPadVectorizationPatternsOp::populatePatterns(
     RewritePatternSet &patterns) {
   linalg::populatePadOpVectorizationPatterns(patterns);
+  linalg::populateInsertSliceVectorizationPatterns(patterns);
 }
 
 //===----------------------------------------------------------------------===//
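
With this change, ApplyPadVectorizationPatternsOp hands out the insert_slice patterns as well. A minimal sketch of driving it from the transform dialect, mirroring the harness used in vectorization-pad-patterns.mlir (the op's mnemonic, apply_patterns.linalg.pad_vectorization, is assumed from its definition):

module attributes {transform.with_named_sequence} {
  transform.named_sequence @__transform_main(%arg1: !transform.any_op {transform.readonly}) {
    // Grab the enclosing function and apply the (now extended) pattern set.
    %func = transform.structured.match ops{["func.func"]} in %arg1 : (!transform.any_op) -> !transform.any_op
    transform.apply_patterns to %func {
      transform.apply_patterns.linalg.pad_vectorization
    } : !transform.any_op
    transform.yield
  }
}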

mlir/lib/Dialect/Linalg/Transforms/Vectorization.cpp

Lines changed: 18 additions & 32 deletions
@@ -2514,35 +2514,18 @@ struct PadOpVectorizationWithTransferWritePattern
   }
 };
 
-/// Given an ArrayRef of OpFoldResults, return a vector of Values.
-/// IntegerAttrs are converted to ConstantIndexOps. Other attribute types are
-/// not supported.
-static SmallVector<Value> ofrToIndexValues(RewriterBase &rewriter, Location loc,
-                                           ArrayRef<OpFoldResult> ofrs) {
-  SmallVector<Value> result;
-  for (auto o : ofrs) {
-    if (auto val = llvm::dyn_cast_if_present<Value>(o)) {
-      result.push_back(val);
-    } else {
-      result.push_back(rewriter.create<arith::ConstantIndexOp>(
-          loc, cast<IntegerAttr>(cast<Attribute>(o)).getInt()));
-    }
-  }
-  return result;
-}
-
 /// Returns the effective Pad value for the input op, provided it's a scalar.
 ///
 /// Many Ops exhibit pad-like behaviour, but this isn't always explicit. If
 /// this Op performs padding, retrieve the padding value provided that it's
 /// a scalar and static/fixed for all the padded values. Returns an empty value
 /// otherwise.
-static Value getStaticPadVl(Operation *op) {
+static Value getStaticPadVal(Operation *op) {
   if (!op)
     return {};
 
-  // 1. vector.broadcast - return the value that's being broadcast,
-  // provided that it's a scalar.
+  // 1. vector.broadcast (f32 -> vector <...xf32>) - return the value that's
+  // being broadcast, provided that it's a scalar.
   if (auto bcast = llvm::dyn_cast<vector::BroadcastOp>(op)) {
     auto source = bcast.getSource();
     if (llvm::dyn_cast<VectorType>(source.getType()))
@@ -2551,31 +2534,31 @@ static Value getStaticPadVl(Operation *op) {
     return source;
   }
 
-  // 1. linalg.fill - use the scalar input value that used to fill the output
+  // 2. linalg.fill - use the scalar input value that used to fill the output
   // tensor.
   if (auto fill = llvm::dyn_cast<linalg::FillOp>(op)) {
     return fill.getInputs()[0];
   }
 
-  // 2. tensor.generateOp - can't guarantee the value is fixed without
+  // 3. tensor.generateOp - can't guarantee the value is fixed without
   // analysing, bail out.
   if (auto generate = llvm::dyn_cast<tensor::GenerateOp>(op)) {
     return {};
   }
 
-  // 3. vector.transfer_write - inspect the input vector that's written from. If
+  // 4. vector.transfer_write - inspect the input vector that's written from. If
   // it contains a single value that has been broadcast (e.g. via
   // vector.broadcast), extract it, fail otherwise.
   if (auto xferWrite = llvm::dyn_cast<vector::TransferWriteOp>(op))
-    return getStaticPadVl(xferWrite.getVector().getDefiningOp());
+    return getStaticPadVal(xferWrite.getVector().getDefiningOp());
 
-  // 4. tensor.insert_slice - inspect the destination tensor. If it's larger
+  // 5. tensor.insert_slice - inspect the destination tensor. If it's larger
   // than the input tensor, then, provided it's constant, we'll extract the
   // value that was used to generate it (via e.g. linalg.fill), fail otherwise.
   // TODO: Clarify the semantics when the input tensor is larger than the
   // destination.
   if (auto slice = llvm::dyn_cast<tensor::InsertSliceOp>(op))
-    return getStaticPadVl(slice.getDest().getDefiningOp());
+    return getStaticPadVal(slice.getDest().getDefiningOp());
 
   return {};
 }
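
To make the producer-chasing concrete, here is a hypothetical chain that getStaticPadVal resolves: for the tensor.insert_slice below, case 5 follows the destination to the linalg.fill, where case 2 returns %pad. This is exactly the setup exercised by @insert_static_slice_non_zero_pad in the tests further down:

%init = tensor.empty() : tensor<9x8x7x1x2x3xf32>
// The fill value becomes the effective pad value for the whole destination.
%fill = linalg.fill ins(%pad : f32) outs(%init : tensor<9x8x7x1x2x3xf32>) -> tensor<9x8x7x1x2x3xf32>
// getStaticPadVal on this insert_slice returns %pad.
%res = tensor.insert_slice %src into %fill[0, 0, 0, 0, 0, 0] [1, 1, 1, 1, 2, 3] [1, 1, 1, 1, 1, 1]
    : tensor<1x2x3xf32> into tensor<9x8x7x1x2x3xf32>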
@@ -2619,7 +2602,7 @@ struct InsertSliceVectorizePattern
     // remains a TODO.
     //
     // When the value is not known and not needed, use 0. Otherwise, bail out.
-    Value padValue = getStaticPadVl(sliceOp);
+    Value padValue = getStaticPadVal(sliceOp);
     bool isOutOfBoundsRead = !sourceType.hasStaticShape();
 
     if (!padValue && isOutOfBoundsRead) {
@@ -2637,6 +2620,7 @@ struct InsertSliceVectorizePattern
     SmallVector<int64_t> vecShape;
     SmallVector<bool> readInBounds;
     SmallVector<bool> writeInBounds;
+    size_t rankDiff = resultType.getRank() - sourceType.getRank();
     for (unsigned i = 0; i < sourceType.getRank(); ++i) {
       if (!sourceType.isDynamicDim(i)) {
         vecShape.push_back(sourceType.getDimSize(i));
@@ -2648,7 +2632,9 @@ struct InsertSliceVectorizePattern
         // Source shape is not statically known, but result shape is.
         // Vectorize with size of result shape. This may be larger than the
         // source size.
-        vecShape.push_back(resultType.getDimSize(i));
+        // FIXME: Using rankDiff implies that the source tensor is inserted at
+        // the end of the destination tensor. However, that's not required.
+        vecShape.push_back(resultType.getDimSize(rankDiff + i));
         // Read may be out-of-bounds because the result size could be larger
         // than the source size.
         readInBounds.push_back(false);
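
A worked example of the rankDiff indexing, matching the @insert_dynamic_slice_non_zero_pad test added below:

  source = tensor<1x?x3xf32> (rank 3), result = tensor<9x8x7x1x2x3xf32> (rank 6)
  rankDiff = 6 - 3 = 3
  dim i = 1 is dynamic => vecShape[1] = resultType.getDimSize(3 + 1) = 2
  inferred vector type: vector<1x2x3xf32>, with in_bounds = [true, false, true] on the read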
@@ -2673,8 +2659,8 @@ struct InsertSliceVectorizePattern
         ArrayRef<bool>{readInBounds});
 
     // 4. Generate TransferWriteOp.
-    auto writeIndices =
-        ofrToIndexValues(rewriter, sliceOp.getLoc(), sliceOp.getMixedOffsets());
+    auto writeIndices = getValueOrCreateConstantIndexOp(
+        rewriter, sliceOp.getLoc(), sliceOp.getMixedOffsets());
 
     // 5. Finalize
     rewriter.replaceOpWithNewOp<vector::TransferWriteOp>(
@@ -2761,8 +2747,8 @@ struct PadOpVectorizationWithInsertSlicePattern
     // Generate TransferWriteOp: Write to InsertSliceOp's dest tensor at
     // specified offsets. Write is fully in-bounds because an InsertSliceOp's
     // source must fit into the destination at the specified offsets.
-    auto writeIndices =
-        ofrToIndexValues(rewriter, padOp.getLoc(), insertOp.getMixedOffsets());
+    auto writeIndices = getValueOrCreateConstantIndexOp(
+        rewriter, padOp.getLoc(), insertOp.getMixedOffsets());
     SmallVector<bool> inBounds(vecRank, true);
     rewriter.replaceOpWithNewOp<vector::TransferWriteOp>(
         insertOp, read, insertOp.getDest(), writeIndices,

mlir/test/Dialect/Linalg/vectorization-pad-patterns.mlir

Lines changed: 7 additions & 4 deletions
@@ -161,7 +161,8 @@ module attributes {transform.with_named_sequence} {
 
 ///----------------------------------------------------------------------------------------
 /// tensor::PadOp -> tensor::EmptyOp + linalg::FillOp/tensor::GenerateOp + tensor::InsertSliceOp
-/// [Pattern: GenericPadOpVectorizationPattern]
+/// [Pattern: GenericPadOpVectorizationPattern + InsertSliceVectorizePattern]
+/// TODO: Split the test into two, one for each pattern.
 ///----------------------------------------------------------------------------------------
 
 func.func private @make_vector() -> tensor<12x13xf32>
@@ -174,12 +175,14 @@ func.func private @make_vector() -> tensor<12x13xf32>
 // CHECK-NOT: tensor.pad
 // CHECK-DAG: %[[C0:.*]] = arith.constant 0 : index
 // CHECK-DAG: %[[PAD:.*]] = arith.constant 5.000000e+00 : f32
+// CHECK-DAG: %[[PAD_READ:.*]] = arith.constant 0.000000e+00 : f32
 // CHECK: %[[EMPTY:.*]] = tensor.empty() : tensor<1x12x13xf32>
 // CHECK: %[[FILL:.*]] = linalg.fill ins(%[[PAD]] : f32) outs(%[[EMPTY]] : tensor<1x12x13xf32>) -> tensor<1x12x13xf32>
-// CHECK: %[[READ:.*]] = vector.transfer_read %[[ARG_0]]{{\[}}%[[C0]], %[[C0]], %[[C0]]], %[[PAD]] {in_bounds = [true, true, true]} : tensor<1x5x6xf32>, vector<1x5x6xf32>
-// CHECK: %[[WRITE:.*]] = vector.transfer_write %[[READ]], %[[FILL]]{{\[}}%[[C0]], %[[C0]], %[[C0]]] {in_bounds = [true, true, true]} : vector<1x5x6xf32>, tensor<1x12x13xf32>
+// CHECK: %[[READ_1:.*]] = vector.transfer_read %[[ARG_0]]{{\[}}%[[C0]], %[[C0]], %[[C0]]], %[[PAD]] {in_bounds = [true, true, true]} : tensor<1x5x6xf32>, vector<1x5x6xf32>
+// CHECK: %[[WRITE_1:.*]] = vector.transfer_write %[[READ_1]], %[[FILL]]{{\[}}%[[C0]], %[[C0]], %[[C0]]] {in_bounds = [true, true, true]} : vector<1x5x6xf32>, tensor<1x12x13xf32>
 // CHECK: %[[VEC:.*]] = call @make_vector() : () -> tensor<12x13xf32>
-// CHECK: %[[RES:.*]] = tensor.insert_slice %[[VEC]] into %[[WRITE]][0, 0, 0] [1, 12, 13] [1, 1, 1] : tensor<12x13xf32> into tensor<1x12x13xf32>
+// CHECK: %[[READ_2:.*]] = vector.transfer_read %[[VEC]]{{\[}}%[[C0]], %[[C0]]], %[[PAD_READ]] {in_bounds = [true, true]} : tensor<12x13xf32>, vector<12x13xf32>
+// CHECK: %[[RES:.*]] = vector.transfer_write %[[READ_2]], %[[WRITE_1]]{{\[}}%[[C0]], %[[C0]], %[[C0]]] {in_bounds = [true, true]} : vector<12x13xf32>, tensor<1x12x13xf32>
 // CHECK: return %[[RES]] : tensor<1x12x13xf32>
 
 func.func @pad_and_insert_slice_dest(

mlir/test/Dialect/Linalg/vectorization-unsupported.mlir

Lines changed: 22 additions & 0 deletions
@@ -253,3 +253,25 @@
     transform.yield
   }
 }
+
+// -----
+
+// With a dynamically shaped source, the vectorizer infers the vector size for
+// xfer Ops from the destination tensor and, conservatively, assumes
+// out-of-bounds accesses. Out-of-bounds accesses require a pad value, but
+// that's impossible to recover in this example. Hence the vectorization fails.
+
+func.func @insert_slice_default_pad(%arg0: tensor<1x?x3xf32>, %arg1: tensor<9x8x7x1x2x3xf32>, %size: index) -> tensor<9x8x7x1x2x3xf32> {
+  // expected-error @+1 {{Attempted to vectorize, but failed}}
+  %res = tensor.insert_slice %arg0 into %arg1[0, 0, 0, 0, 0, 0] [1, 1, 1, 1, %size, 3][1, 1, 1, 1, 1, 1] : tensor<1x?x3xf32> into tensor<9x8x7x1x2x3xf32>
+  return %res : tensor<9x8x7x1x2x3xf32>
+}
+
+module attributes {transform.with_named_sequence} {
+  transform.named_sequence @__transform_main(%arg1: !transform.any_op {transform.readonly}) {
+    %0 = transform.structured.match ops{["tensor.insert_slice"]} in %arg1 : (!transform.any_op) -> !transform.any_op
+    %1 = transform.get_parent_op %0 {isolated_from_above} : (!transform.any_op) -> !transform.any_op
+    %2 = transform.structured.vectorize_children_and_apply_patterns %1 { vectorize_padding } : (!transform.any_op) -> !transform.any_op
+    transform.yield
+  }
+}
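
For contrast with this failure, the same dynamically shaped source does vectorize once the pad value is recoverable from the destination; a sketch of the qualifying IR (this is the @insert_dynamic_slice_non_zero_pad case in vectorization-with-patterns.mlir below):

// linalg.fill makes %pad recoverable, so the out-of-bounds read is legal.
%fill = linalg.fill ins(%pad : f32) outs(%init : tensor<9x8x7x1x2x3xf32>) -> tensor<9x8x7x1x2x3xf32>
%res = tensor.insert_slice %arg0 into %fill[0, 0, 0, 0, 0, 0] [1, 1, 1, 1, %size, 3] [1, 1, 1, 1, 1, 1]
    : tensor<1x?x3xf32> into tensor<9x8x7x1x2x3xf32>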

mlir/test/Dialect/Linalg/vectorization-with-patterns.mlir

Lines changed: 67 additions & 4 deletions
@@ -1935,17 +1935,80 @@ module attributes {transform.with_named_sequence} {
 /// tensor.insert_slice
 ///----------------------------------------------------------------------------------------
 
-// CHECK-LABEL: func @insert_slice
+// The pad value for xfer-read is neither needed nor available - use the default (0.0).
+
+// CHECK-LABEL: func @insert_static_slice_default_pad
 // CHECK-SAME: %[[ARG_0:.*]]: tensor<1x2x3xf32>,
 // CHECK-SAME: %[[ARG_1:.*]]: tensor<9x8x7x1x2x3xf32>) -> tensor<9x8x7x1x2x3xf32> {
 // CHECK: %[[PAD:.*]] = arith.constant 0.000000e+00 : f32
 // CHECK: %[[C0:.*]] = arith.constant 0 : index
 // CHECK: %[[READ:.*]] = vector.transfer_read %[[ARG_0]]{{\[}}%[[C0]], %[[C0]], %[[C0]]], %[[PAD]] {in_bounds = [true, true, true]} : tensor<1x2x3xf32>, vector<1x2x3xf32>
 // CHECK: %[[WRITE:.*]] = vector.transfer_write %[[READ]], %[[ARG_1]]{{\[}}%[[C0]], %[[C0]], %[[C0]], %[[C0]], %[[C0]], %[[C0]]] {in_bounds = [true, true, true]} : vector<1x2x3xf32>, tensor<9x8x7x1x2x3xf32>
 // CHECK: return %[[WRITE]] : tensor<9x8x7x1x2x3xf32>
-func.func @insert_slice(%arg0: tensor<1x2x3xf32>, %arg1: tensor<9x8x7x1x2x3xf32>) -> tensor<9x8x7x1x2x3xf32> {
-  %0 = tensor.insert_slice %arg0 into %arg1[0, 0, 0, 0, 0, 0] [1, 1, 1, 1, 2, 3][1, 1, 1, 1, 1, 1] : tensor<1x2x3xf32> into tensor<9x8x7x1x2x3xf32>
-  return %0 : tensor<9x8x7x1x2x3xf32>
+func.func @insert_static_slice_default_pad(%arg0: tensor<1x2x3xf32>, %arg1: tensor<9x8x7x1x2x3xf32>) -> tensor<9x8x7x1x2x3xf32> {
+  %res = tensor.insert_slice %arg0 into %arg1[0, 0, 0, 0, 0, 0] [1, 1, 1, 1, 2, 3][1, 1, 1, 1, 1, 1] : tensor<1x2x3xf32> into tensor<9x8x7x1x2x3xf32>
+  return %res : tensor<9x8x7x1x2x3xf32>
+}
+
+module attributes {transform.with_named_sequence} {
+  transform.named_sequence @__transform_main(%arg1: !transform.any_op {transform.readonly}) {
+    %0 = transform.structured.match ops{["tensor.insert_slice"]} in %arg1 : (!transform.any_op) -> !transform.any_op
+    %1 = transform.get_parent_op %0 {isolated_from_above} : (!transform.any_op) -> !transform.any_op
+    %2 = transform.structured.vectorize_children_and_apply_patterns %1 { vectorize_padding } : (!transform.any_op) -> !transform.any_op
+    transform.yield
+  }
+}
+
+// -----
+
+// Same as above, but there's a pad value available that should be used instead of the default value.
+
+// CHECK-LABEL: func.func @insert_static_slice_non_zero_pad
+// CHECK-SAME: %[[ARG_0:.*]]: tensor<1x2x3xf32>,
+// CHECK-SAME: %[[PAD:.*]]: f32) -> tensor<9x8x7x1x2x3xf32> {
+// CHECK: %[[EMPTY:.*]] = tensor.empty() : tensor<9x8x7x1x2x3xf32>
+// CHECK: %[[BC:.*]] = vector.broadcast %[[PAD]] : f32 to vector<9x8x7x1x2x3xf32>
+// CHECK: %[[WRITE:.*]] = vector.transfer_write %[[BC]], %[[EMPTY]]{{.*}} {in_bounds = [true, true, true, true, true, true]} : vector<9x8x7x1x2x3xf32>, tensor<9x8x7x1x2x3xf32>
+// CHECK: %[[READ:.*]] = vector.transfer_read %[[ARG_0]]{{.*}}, %[[PAD]] {in_bounds = [true, true, true]} : tensor<1x2x3xf32>, vector<1x2x3xf32>
+// CHECK: %[[RES:.*]] = vector.transfer_write %[[READ]], %[[WRITE]]{{.*}} {in_bounds = [true, true, true]} : vector<1x2x3xf32>, tensor<9x8x7x1x2x3xf32>
+// CHECK: return %[[RES]] : tensor<9x8x7x1x2x3xf32>
+func.func @insert_static_slice_non_zero_pad(%arg0: tensor<1x2x3xf32>, %pad : f32) -> tensor<9x8x7x1x2x3xf32> {
+  %init = tensor.empty() : tensor<9x8x7x1x2x3xf32>
+  %fill = linalg.fill ins(%pad : f32) outs(%init : tensor<9x8x7x1x2x3xf32>) -> tensor<9x8x7x1x2x3xf32>
+  %res = tensor.insert_slice %arg0 into %fill[0, 0, 0, 0, 0, 0] [1, 1, 1, 1, 2, 3][1, 1, 1, 1, 1, 1] : tensor<1x2x3xf32> into tensor<9x8x7x1x2x3xf32>
+  return %res : tensor<9x8x7x1x2x3xf32>
+}
+
+module attributes {transform.with_named_sequence} {
+  transform.named_sequence @__transform_main(%arg1: !transform.any_op {transform.readonly}) {
+    %0 = transform.structured.match ops{["tensor.insert_slice"]} in %arg1 : (!transform.any_op) -> !transform.any_op
+    %1 = transform.get_parent_op %0 {isolated_from_above} : (!transform.any_op) -> !transform.any_op
+    %2 = transform.structured.vectorize_children_and_apply_patterns %1 { vectorize_padding } : (!transform.any_op) -> !transform.any_op
+    transform.yield
+  }
+}
+
+// -----
+
+// Same as above, but the source type is dynamically shaped. This means
+// that the pad value is now required and the vector dim corresponding to the
+// dynamic shape has to be inferred from the shape of the destination tensor.
+
+// CHECK-LABEL: func.func @insert_dynamic_slice_non_zero_pad(
+// CHECK-SAME: %[[ARG_0:.*]]: tensor<1x?x3xf32>,
+// CHECK-SAME: %[[PAD:.*]]: f32,
+// CHECK-SAME: %[[SIZE:.*]]: index) -> tensor<9x8x7x1x2x3xf32> {
+// CHECK: %[[EMPTY:.*]] = tensor.empty() : tensor<9x8x7x1x2x3xf32>
+// CHECK: %[[BC:.*]] = vector.broadcast %[[PAD]] : f32 to vector<9x8x7x1x2x3xf32>
+// CHECK: %[[WRITE:.*]] = vector.transfer_write %[[BC]], %[[EMPTY]]{{.*}} {in_bounds = [true, true, true, true, true, true]} : vector<9x8x7x1x2x3xf32>, tensor<9x8x7x1x2x3xf32>
+// CHECK: %[[READ:.*]] = vector.transfer_read %[[ARG_0]]{{.*}}, %[[PAD]] {in_bounds = [true, false, true]} : tensor<1x?x3xf32>, vector<1x2x3xf32>
+// CHECK: %[[RES:.*]] = vector.transfer_write %[[READ]], %[[WRITE]]{{.*}} {in_bounds = [true, true, true]} : vector<1x2x3xf32>, tensor<9x8x7x1x2x3xf32>
+// CHECK: return %[[RES]] : tensor<9x8x7x1x2x3xf32>
+func.func @insert_dynamic_slice_non_zero_pad(%arg0: tensor<1x?x3xf32>, %pad : f32, %size: index) -> tensor<9x8x7x1x2x3xf32> {
+  %init = tensor.empty() : tensor<9x8x7x1x2x3xf32>
+  %fill = linalg.fill ins(%pad : f32) outs(%init : tensor<9x8x7x1x2x3xf32>) -> tensor<9x8x7x1x2x3xf32>
+  %res = tensor.insert_slice %arg0 into %fill[0, 0, 0, 0, 0, 0] [1, 1, 1, 1, %size, 3][1, 1, 1, 1, 1, 1] : tensor<1x?x3xf32> into tensor<9x8x7x1x2x3xf32>
+  return %res : tensor<9x8x7x1x2x3xf32>
 }
 
 module attributes {transform.with_named_sequence}
