simplified to support single nested scf.for

Yun-Fly · Yun-Fly · commit 0b9355bb3780 · 2024-09-08T21:57:28.000-07:00
diff --git a/mlir/lib/Dialect/SCF/Transforms/TileUsingInterface.cpp b/mlir/lib/Dialect/SCF/Transforms/TileUsingInterface.cpp
@@ -1464,38 +1464,12 @@ checkAssumptionForFusingConsumer(tensor::InsertSliceOp candidateSliceOp) {
 /// failure otherwise.
 static FailureOr<OpOperand *> getConsumerFromUses(Value val,
                                                   Block *containingOpBlock) {
-  // Step 1. Check that the value has exactly one use excluding `insertSliceOp`
-  // or `ParallelInsertSliceOp`.
-  OpOperand *operand = nullptr;
-  for (auto &use : val.getUses()) {
-    Operation *user = use.getOwner();
-    if (isa<tensor::ParallelInsertSliceOp>(user))
-      continue;
-    if (isa<tensor::InsertSliceOp>(user)) {
-      // The only one use is expected as dummy extractSliceOp without any uses.
-      // For more details, please refer to:
-      // https://github.com/llvm/llvm-project/pull/88712#discussion_r1609384470
-      if (user->hasOneUse()) {
-        if (auto extractOp =
-                dyn_cast<tensor::ExtractSliceOp>(*user->getUsers().begin());
-            extractOp && extractOp->use_empty()) {
-          // Erase dummy extractSliceOp.
-          extractOp.erase();
-          // DO NOT erase `user` inside iteration of `getUses`.
-          user->moveBefore(&containingOpBlock->getOperations().back());
-          continue;
-        }
-      }
-      // Otherwise return.
-      return failure();
-    }
-    // Only one valid use expected
-    if (operand)
-      return failure();
-    operand = &use;
-  }
+  // Step 1. Check that the value has exactly one use.
+  if (!llvm::hasSingleElement(val.getUses()))
+    return failure();
   // Step 2. Get uses.
-  Operation *consumerOp = operand->getOwner();
+  OpOperand &operand = (*val.getUses().begin());
+  Operation *consumerOp = operand.getOwner();
   // TODO: We have to init result of consumer before scf.for, use
   //       DestinationStyleOpInterface to get result shape from init for now.
   //       Add support for other op such as op has InferTypeOpInterface.
@@ -1504,7 +1478,7 @@ static FailureOr<OpOperand *> getConsumerFromUses(Value val,
     return failure();
   if (containingOpBlock != consumerOp->getBlock())
     return failure();
-  return operand;
+  return &operand;
 }
 
 /// Recursively find the outer nest loops of given loop(included) while the
@@ -1693,9 +1667,9 @@ fixTerminatorSCFInParallel(RewriterBase &rewriter, scf::ForallOp newForallOp,
 
 /// Implementation of fusing consumer of a single slice by computing the
 /// slice of the consumer in-place for scf loop.
-static FailureOr<scf::SCFFuseConsumerOfSliceResult>
-tileAndFuseConsumerOfSliceImpl(RewriterBase &rewriter,
-                               Operation *candidateSliceOp) {
+FailureOr<scf::SCFFuseConsumerOfSliceResult>
+mlir::scf::tileAndFuseConsumerOfSlice(RewriterBase &rewriter,
+                                      Operation *candidateSliceOp) {
   if (!isa<tensor::InsertSliceOp, tensor::ParallelInsertSliceOp>(
           candidateSliceOp))
     return failure();
@@ -1742,7 +1716,7 @@ tileAndFuseConsumerOfSliceImpl(RewriterBase &rewriter,
   // 2. inner-most `scf.for` insider nest `scf.loop` structure, where the
   // top-level loop is the outer-most one of these nested loops.
   Operation *oldTopLevelLoop = oldLoopOp;
-  SmallVector<LoopLikeOpInterface> oldNestedForOps, newNestedForOps;
+  SmallVector<LoopLikeOpInterface> oldNestedForOps;
   if (isInsertSliceOp) {
     oldNestedForOps =
         getOuterNestLoopsWhile(cast<LoopLikeOpInterface>(oldTopLevelLoop),
@@ -1781,6 +1755,7 @@ tileAndFuseConsumerOfSliceImpl(RewriterBase &rewriter,
   // 3.a Create new outer scf loops with new Inits only if nested `scf.for`
   // case was found.
   bool isNestedForOps = isInsertSliceOp && oldNestedForOps.size() > 1;
+  SmallVector<LoopLikeOpInterface> newNestedForOps;
   if (isNestedForOps) {
     for (auto &&[index, loopOp] :
          llvm::enumerate(MutableArrayRef(oldNestedForOps).drop_back())) {
@@ -1979,110 +1954,6 @@ tileAndFuseConsumerOfSliceImpl(RewriterBase &rewriter,
       tileAndFuseResult->tiledOps};
 }
 
-/// Get the real consumers from candidate InsertSliceOp. E.g
-///
-/// ```
-/// %1 = scf.for
-///  %2 = scf.for
-///   %3 = scf.for
-///      ...
-///      %4 = insert
-///      yield %4
-///   %5 = insert %3
-///   yield %5
-///  yield %2
-/// %6 = consumerOp ins(%1)
-/// ```
-///
-/// @param candidateSliceOp: %4 = insert
-/// @param forwardSlice: in-out parameter populated by forward insertSliceOps
-/// @return OpOperand consumers: %6 = consumerOp ins(%1)
-static FailureOr<SmallVector<OpOperand *>> getRealConsumersFromInsertSliceOp(
-    Operation *candidateSliceOp,
-    SmallVector<OffsetSizeAndStrideOpInterface> &forwardSlice,
-    unsigned curDepth = 0, unsigned maxDepth = 5) {
-  if (!isa<tensor::InsertSliceOp, tensor::ParallelInsertSliceOp>(
-          candidateSliceOp))
-    return failure();
-  // Control recursive time in avoid of stack overflow
-  if (curDepth > maxDepth)
-    return failure();
-
-  forwardSlice.push_back(
-      cast<OffsetSizeAndStrideOpInterface>(candidateSliceOp));
-  Value resultOfLoop;
-  if (auto sliceOp =
-          dyn_cast<tensor::ParallelInsertSliceOp>(candidateSliceOp)) {
-    Value destValue = sliceOp.getDest();
-    auto iterArg = cast<BlockArgument>(destValue);
-    auto forallOp = dyn_cast<scf::ForallOp>(iterArg.getOwner()->getParentOp());
-    if (!forallOp)
-      return failure();
-    resultOfLoop = forallOp.getTiedOpResult(forallOp.getTiedOpOperand(iterArg));
-  } else if (auto sliceOp = dyn_cast<tensor::InsertSliceOp>(candidateSliceOp)) {
-    Value resultValue = sliceOp.getResult();
-    for (auto &useOperand : resultValue.getUses()) {
-      if (auto yieldOp = dyn_cast<scf::YieldOp>(useOperand.getOwner())) {
-        if (llvm::detail::isPresent(resultOfLoop))
-          return failure();
-        auto forOp = dyn_cast<LoopLikeOpInterface>(yieldOp->getParentOp());
-        if (!forOp)
-          return failure();
-        resultOfLoop = forOp->getResult(useOperand.getOperandNumber());
-      }
-    }
-  }
-
-  if (!llvm::detail::isPresent(resultOfLoop))
-    return failure();
-
-  bool traverseUpperLoop;
-  do {
-    traverseUpperLoop = false;
-    for (OpOperand &useOperand : resultOfLoop.getUses()) {
-      if (auto sliceOp =
-              dyn_cast<OffsetSizeAndStrideOpInterface>(useOperand.getOwner())) {
-        return getRealConsumersFromInsertSliceOp(sliceOp, forwardSlice,
-                                                 curDepth + 1);
-      }
-      if (auto yieldOp = dyn_cast<scf::YieldOp>(useOperand.getOwner())) {
-        // Walk through outer loop.
-        auto forOp = dyn_cast<LoopLikeOpInterface>(yieldOp->getParentOp());
-        if (!forOp)
-          return failure();
-        resultOfLoop = forOp->getResult(useOperand.getOperandNumber());
-        traverseUpperLoop = true;
-        break;
-      }
-    }
-  } while (traverseUpperLoop);
-  // Return all operands using result of top level loop.
-  return llvm::map_to_vector(resultOfLoop.getUses(),
-                             [](OpOperand &u) -> OpOperand * { return &u; });
-}
-
-/// Fusing real consumer of a single slice even within complex nested loops via
-/// multiple application of `tileAndFuseConsumerOfSliceImpl`.
-FailureOr<scf::SCFFuseConsumerOfSliceResult>
-mlir::scf::tileAndFuseConsumerOfSlice(RewriterBase &rewriter,
-                                      Operation *candidateSliceOp) {
-  SmallVector<OffsetSizeAndStrideOpInterface> forwardSlice;
-  if (failed(getRealConsumersFromInsertSliceOp(candidateSliceOp, forwardSlice)))
-    return failure();
-
-  FailureOr<scf::SCFFuseConsumerOfSliceResult> fuseConsumerResult;
-  // Reverse forward slice from outer to inner.
-  std::reverse(forwardSlice.begin(), forwardSlice.end());
-  // Multiple application of `tileAndFuseConsumerOfSliceImpl`.
-  for (auto &sliceOp : forwardSlice) {
-    fuseConsumerResult = tileAndFuseConsumerOfSliceImpl(rewriter, sliceOp);
-    if (failed(fuseConsumerResult))
-      return rewriter.notifyMatchFailure(sliceOp,
-                                         "could not fuse consumer of sliceOp");
-  }
-  return fuseConsumerResult;
-}
-
 //===----------------------------------------------------------------------===//
 // lowerToLoopsUsingSCFForOp implementation.
 //===----------------------------------------------------------------------===//
diff --git a/mlir/test/Interfaces/TilingInterface/tile-and-fuse-consumer.mlir b/mlir/test/Interfaces/TilingInterface/tile-and-fuse-consumer.mlir
@@ -374,38 +374,27 @@ module attributes {transform.with_named_sequence} {
 
 // -----
 
-#map = affine_map<(d0) -> (d0 * 128)>
 module {
-  func.func @fuse_tilable_consumer_nested_scf_loop(%arg0: tensor<256x512xf32>, %arg1: tensor<512x256xf32>, %arg2: tensor<256x256xf32>) -> tensor<256x256xf32> {
+  func.func @fuse_add_consumer_into_nested_scf_for(%arg0: tensor<256x512xf32>, %arg1: tensor<512x256xf32>, %arg2: tensor<256x256xf32>) -> tensor<256x256xf32> {
     %c0 = arith.constant 0 : index
     %c64 = arith.constant 64 : index
-    %c128 = arith.constant 128 : index
+    %c256 = arith.constant 256 : index
     %cst = arith.constant 0.000000e+00 : f32
     %dest0 = tensor.empty() : tensor<256x256xf32>
     %dest1 = linalg.fill ins(%cst : f32) outs(%dest0 : tensor<256x256xf32>) -> tensor<256x256xf32>
-    %1 = scf.forall (%arg3, %arg4) in (2, 2) shared_outs(%arg5 = %dest1) -> tensor<256x256xf32> {
-      %iv0 = affine.apply #map(%arg3)
-      %iv1 = affine.apply #map(%arg4)
-      %extracted_slice_1 = tensor.extract_slice %arg5[%iv0, %iv1] [128, 128] [1, 1] : tensor<256x256xf32> to tensor<128x128xf32>
-      %extracted_slice_2 = tensor.extract_slice %arg0[%iv0, 0] [128, 512] [1, 1] : tensor<256x512xf32> to tensor<128x512xf32>
-      %extracted_slice_3 = tensor.extract_slice %arg1[0, %iv1] [512, 128] [1, 1] : tensor<512x256xf32> to tensor<512x128xf32>
-      %2 = scf.for %arg6 = %c0 to %c128 step %c64 iter_args(%arg7 = %extracted_slice_1) -> (tensor<128x128xf32>) {
-        %3 = scf.for %arg8 = %c0 to %c128 step %c64 iter_args(%arg9 = %arg7) -> (tensor<128x128xf32>) {
-          %extracted_slice_4 = tensor.extract_slice %arg9[%arg6, %arg8] [64, 64] [1, 1] : tensor<128x128xf32> to tensor<64x64xf32>
-          %extracted_slice_5 = tensor.extract_slice %extracted_slice_2[%arg6, 0] [64, 512] [1, 1] : tensor<128x512xf32> to tensor<64x512xf32>
-          %extracted_slice_6 = tensor.extract_slice %extracted_slice_3[0, %arg8] [512, 64] [1, 1] : tensor<512x128xf32> to tensor<512x64xf32>
-          %4 = linalg.matmul ins(%extracted_slice_5, %extracted_slice_6 : tensor<64x512xf32>, tensor<512x64xf32>) outs(%extracted_slice_4 : tensor<64x64xf32>) -> tensor<64x64xf32>
-          %insert_slice = tensor.insert_slice %4 into %arg9[%arg6, %arg8] [64, 64] [1, 1] : tensor<64x64xf32> into tensor<128x128xf32>
-          scf.yield %insert_slice : tensor<128x128xf32>
-        }
-        scf.yield %3 : tensor<128x128xf32>
-      }
-      scf.forall.in_parallel {
-         tensor.parallel_insert_slice %2 into %arg5[%iv0, %iv1] [128, 128] [1, 1] : tensor<128x128xf32> into tensor<256x256xf32>
+    %1 = scf.for %arg3 = %c0 to %c256 step %c64 iter_args(%arg4 = %dest1) -> (tensor<256x256xf32>) {
+      %2 = scf.for %arg5 = %c0 to %c256 step %c64 iter_args(%arg6 = %arg4) -> (tensor<256x256xf32>) {
+        %extracted_slice_1 = tensor.extract_slice %arg6[%arg3, %arg5] [64, 64] [1, 1] : tensor<256x256xf32> to tensor<64x64xf32>
+        %extracted_slice_2 = tensor.extract_slice %arg0[%arg3, 0] [64, 512] [1, 1] : tensor<256x512xf32> to tensor<64x512xf32>
+        %extracted_slice_3 = tensor.extract_slice %arg1[0, %arg5] [512, 64] [1, 1] : tensor<512x256xf32> to tensor<512x64xf32>
+        %3 = linalg.matmul ins(%extracted_slice_2, %extracted_slice_3 : tensor<64x512xf32>, tensor<512x64xf32>) outs(%extracted_slice_1 : tensor<64x64xf32>) -> tensor<64x64xf32>
+        %insert_slice = tensor.insert_slice %3 into %arg6[%arg3, %arg5] [64, 64] [1, 1] : tensor<64x64xf32> into tensor<256x256xf32>
+        scf.yield %insert_slice : tensor<256x256xf32>
       }
+      scf.yield %2 : tensor<256x256xf32>
     }
-    %5 = linalg.add ins(%1, %arg2 : tensor<256x256xf32>, tensor<256x256xf32>) outs(%dest0 : tensor<256x256xf32>) -> tensor<256x256xf32>
-    return %5 : tensor<256x256xf32>
+    %4 = linalg.add ins(%1, %arg2 : tensor<256x256xf32>, tensor<256x256xf32>) outs(%dest0 : tensor<256x256xf32>) -> tensor<256x256xf32>
+    return %4 : tensor<256x256xf32>
   }
 }
 
@@ -418,49 +407,33 @@ module attributes {transform.with_named_sequence} {
     transform.yield
   }
 }
-//      CHECK: #[[MAP0:.*]] =  affine_map<(d0) -> (d0 * 128)>
-//      CHECK: func.func @fuse_tilable_consumer_nested_scf_loop(
+//      CHECK: func.func @fuse_add_consumer_into_nested_scf_for(
 // CHECK-SAME:     %[[ARG0:[a-zA-Z0-9]+]]: tensor<256x512xf32>
 // CHECK-SAME:     %[[ARG1:[a-zA-Z0-9]+]]: tensor<512x256xf32>
 // CHECK-SAME:     %[[ARG2:[a-zA-Z0-9]+]]: tensor<256x256xf32>
 //      CHECK:   %[[dest0:.*]] = tensor.empty() : tensor<256x256xf32>
 //      CHECK:   %[[dest1:.*]] = linalg.fill
 // CHECK-SAME:          outs(%[[dest0]] :
-//      CHECK:   %[[FINAL_RESULT:.*]]:2 = scf.forall (%[[IV1:.*]], %[[IV2:.*]]) in (2, 2)
-// CHECK-SAME:      shared_outs(%[[FIRST_OUT_ARG0:.*]] = %[[dest1]], %[[SECOND_OUT_ARG0:.*]] = %[[dest0]])
+//      CHECK:   %[[LOOP_RESULT1:.*]]:2 = scf.for %[[IV1:.*]] = %[[C0]]
+// CHECK-SAME:       iter_args(%[[FIRST_OUT_ARG1:.*]] = %[[dest1]], %[[SECOND_OUT_ARG1:.*]] = %[[dest0]])
 // CHECK-SAME:   {
-//      CHECK:      %[[AFFINE_IV1:.*]] = affine.apply #[[MAP0]](%[[IV1]])
-//      CHECK:      %[[AFFINE_IV2:.*]] = affine.apply #[[MAP0]](%[[IV2]])
-//      CHECK:      %[[MAT_OUT_SLICE0:.*]] = tensor.extract_slice %[[FIRST_OUT_ARG0]][%[[AFFINE_IV1]], %[[AFFINE_IV2]]] [128, 128] [1, 1]
-//      CHECK:      %[[INPUT_SLICE0:.*]] = tensor.extract_slice %[[ARG0]][%[[AFFINE_IV1]], 0] [128, 512] [1, 1]
-//      CHECK:      %[[WEIGHT_SLICE0:.*]] = tensor.extract_slice %[[ARG1]][0, %[[AFFINE_IV2]]] [512, 128] [1, 1]
-//      CHECK:      %[[ADD_OPERAND2_SLICE0:.*]] = tensor.extract_slice %[[ARG2]][%[[AFFINE_IV1]], %[[AFFINE_IV2]]] [128, 128] [1, 1]
-//      CHECK:      %[[ADD_OUT_SLICE0:.*]] = tensor.extract_slice %[[SECOND_OUT_ARG0]][%[[AFFINE_IV1]], %[[AFFINE_IV2]]] [128, 128] [1, 1]
-//      CHECK:      %[[LOOP_RESULT1:.*]]:2 = scf.for %[[IV3:.*]] = %[[C0]]
-// CHECK-SAME:          iter_args(%[[FIRST_OUT_ARG1:.*]] = %[[MAT_OUT_SLICE0]], %[[SECOND_OUT_ARG1:.*]] = %[[ADD_OUT_SLICE0]])
-// CHECK-SAME:      {
-//      CHECK:          %[[LOOP_RESULT2:.*]]:2 = scf.for %[[IV4:.*]] = %[[C0]]
-// CHECK-SAME:            iter_args(%[[FIRST_OUT_ARG2:.*]] = %[[FIRST_OUT_ARG1]], %[[SECOND_OUT_ARG2:.*]] = %[[SECOND_OUT_ARG1]])
-// CHECK-SAME:          {
-//      CHECK:            %[[MAT_OUT_SLICE1:.*]] = tensor.extract_slice %[[FIRST_OUT_ARG2]][%[[IV3]], %[[IV4]]] [64, 64] [1, 1]
-//      CHECK:            %[[INPUT_SLICE1:.*]] = tensor.extract_slice %[[INPUT_SLICE0]][%[[IV3]], 0] [64, 512] [1, 1]
-//      CHECK:            %[[WEIGHT_SLICE1:.*]] = tensor.extract_slice %[[WEIGHT_SLICE0]][0, %[[IV4]]] [512, 64] [1, 1]
+//      CHECK:       %[[LOOP_RESULT2:.*]]:2 = scf.for %[[IV2:.*]] = %[[C0]]
+// CHECK-SAME:         iter_args(%[[FIRST_OUT_ARG2:.*]] = %[[FIRST_OUT_ARG1]], %[[SECOND_OUT_ARG2:.*]] = %[[SECOND_OUT_ARG1]])
+// CHECK-SAME:         {
+//      CHECK:            %[[MAT_OUT_SLICE:.*]] = tensor.extract_slice %[[FIRST_OUT_ARG2]][%[[IV1]], %[[IV2]]] [64, 64] [1, 1]
+//      CHECK:            %[[INPUT_SLICE:.*]] = tensor.extract_slice %[[ARG0]][%[[IV1]], 0] [64, 512] [1, 1]
+//      CHECK:            %[[WEIGHT_SLICE:.*]] = tensor.extract_slice %[[ARG1]][0, %[[IV2]]] [512, 64] [1, 1]
 //      CHECK:            %[[TILED_MAT_OUT:.*]] = linalg.matmul
-// CHECK-SAME:                  outs(%[[MAT_OUT_SLICE1]] :
-//      CHECK:            %[[INSERT_MAT:.*]] = tensor.insert_slice %[[TILED_MAT_OUT]] into %[[FIRST_OUT_ARG2]][%[[IV3]], %[[IV4]]] [64, 64] [1, 1]
-//      CHECK:            %[[ADD_OPERAND2_SLICE1:.*]] = tensor.extract_slice %[[ADD_OPERAND2_SLICE0]][%[[IV3]], %[[IV4]]] [64, 64] [1, 1]
-//      CHECK:            %[[ADD_OUT_SLICE1:.*]] = tensor.extract_slice %[[SECOND_OUT_ARG2]][%[[IV3]], %[[IV4]]] [64, 64] [1, 1]
+// CHECK-SAME:                  outs(%[[MAT_OUT_SLICE]] :
+//      CHECK:            %[[INSERT_MAT:.*]] = tensor.insert_slice %[[TILED_MAT_OUT]] into %[[FIRST_OUT_ARG2]][%[[IV1]], %[[IV2]]] [64, 64] [1, 1]
+//      CHECK:            %[[ADD_OPERAND2_SLICE:.*]] = tensor.extract_slice %[[ARG2]][%[[IV1]], %[[IV2]]] [64, 64] [1, 1]
+//      CHECK:            %[[ADD_OUT_SLICE:.*]] = tensor.extract_slice %[[SECOND_OUT_ARG2]][%[[IV1]], %[[IV2]]] [64, 64] [1, 1]
 //      CHECK:            %[[TILED_ADD_OUT:.*]] = linalg.add
-// CHECK-SAME:              ins(%[[TILED_MAT_OUT]], %[[ADD_OPERAND2_SLICE1]] :
-// CHECK-SAME:              outs(%[[ADD_OUT_SLICE1]] :
-//      CHECK:            %[[INSERT_ADD:.*]] = tensor.insert_slice %[[TILED_ADD_OUT]] into %[[SECOND_OUT_ARG2]][%[[IV3]], %[[IV4]]] [64, 64] [1, 1]
+// CHECK-SAME:              ins(%[[TILED_MAT_OUT]], %[[ADD_OPERAND2_SLICE]] :
+// CHECK-SAME:              outs(%[[ADD_OUT_SLICE]] :
+//      CHECK:            %[[INSERT_ADD:.*]] = tensor.insert_slice %[[TILED_ADD_OUT]] into %[[SECOND_OUT_ARG2]][%[[IV1]], %[[IV2]]] [64, 64] [1, 1]
 //      CHECK:            scf.yield %[[INSERT_MAT]], %[[INSERT_ADD]] :
-//      CHECK:          }
-//      CHECK:          scf.yield %[[LOOP_RESULT2]]#0, %[[LOOP_RESULT2]]#1 :
-//      CHECK:      }
-//      CHECK:      scf.forall.in_parallel {
-//      CHECK:          tensor.parallel_insert_slice %[[LOOP_RESULT1]]#1 into %[[SECOND_OUT_ARG0]][%[[AFFINE_IV1]], %[[AFFINE_IV2]]] [128, 128] [1, 1]
-//      CHECK:          tensor.parallel_insert_slice %[[LOOP_RESULT1]]#0 into %[[FIRST_OUT_ARG0]][%[[AFFINE_IV1]], %[[AFFINE_IV2]]] [128, 128] [1, 1]
-//      CHECK:       }
+//      CHECK:         }
+//      CHECK:         scf.yield %[[LOOP_RESULT2]]#0, %[[LOOP_RESULT2]]#1 :
 //      CHECK:   }
-//      CHECK:   return %[[FINAL_RESULT]]#1 :
+//      CHECK:   return %[[LOOP_RESULT1]]#1 :