Add pattern to fold insert_slice of extract_slice

Jerry Wu · Jerry Wu · commit dceb7b78f571 · 2024-03-22T21:09:09.000Z
diff --git a/mlir/lib/Dialect/Tensor/Transforms/FoldTensorSubsetOps.cpp b/mlir/lib/Dialect/Tensor/Transforms/FoldTensorSubsetOps.cpp
@@ -67,6 +67,7 @@ class InsertSliceOfTransferWriteOpFolder final
                                 PatternRewriter &rewriter) const override;
 };
 
+/// Merge insert_slice operation with extract_slice operation.
 class InsertSliceOfExtractSliceFolder final
     : public OpRewritePattern<tensor::InsertSliceOp> {
 public:
@@ -158,41 +159,69 @@ LogicalResult InsertSliceOfTransferWriteOpFolder::matchAndRewrite(
   return success();
 }
 
+/// Merge insert_slice operation with extract_slice operation.
+///
+/// This can be done when the insert_slice op purely expands ranks (adds unit
+/// dims) and the extract_slice drops corresponding unit dims. For example:
+///
+/// %extracted_slice = tensor.extract_slice %in[0, 0] [1, 8] [1, 1]
+///     : tensor<2x8xf32> to tensor<8xf32>
+/// %inserted_slice = tensor.insert_slice %extracted_slice
+///     into %dest[0, 0] [1, 8] [1, 1]
+///     : tensor<8xf32> into tensor<1x8xf32>
+///
+/// can be folded into:
+///
+/// %extracted_slice = tensor.extract_slice %in[0, 0] [1, 8] [1, 1]
+///     : tensor<2x8xf32> to tensor<1x8xf32>
 LogicalResult InsertSliceOfExtractSliceFolder::matchAndRewrite(
     tensor::InsertSliceOp insertSliceOp, PatternRewriter &rewriter) const {
   auto extractSliceOp =
       insertSliceOp.getSource().getDefiningOp<tensor::ExtractSliceOp>();
   if (!extractSliceOp)
     return failure();
 
+  // Can't fold if the extract_slice op has other users.
   if (!extractSliceOp->hasOneUse())
     return failure();
 
+  // Check if the insert_slice op purely expands ranks (adds unit dims).
   if (!isCastLikeInsertSliceOp(insertSliceOp))
     return failure();
 
   llvm::SmallBitVector extractDroppedDims = extractSliceOp.getDroppedDims();
   llvm::SmallBitVector insertExpandedDims = insertSliceOp.getDroppedDims();
+  // Can't fold if the insert_slice op expands to more dims.
   if (extractDroppedDims.size() < insertExpandedDims.size())
     return failure();
 
-  int64_t insertPos = 0;
-  for (int64_t extractPos = 0; extractPos < extractDroppedDims.size();
-       ++extractPos) {
-    if (insertPos == insertExpandedDims.size())
+  // Try to match the dropped unit dims to the expanded unit dims. This is done
+  // by scanning the dims of extract_slice and finding the left-most one that
+  // can match the current dim of insert_slice. If a match is found, advance to
+  // the next dim of insert_slice.
+  unsigned insertDimPos = 0;
+  for (unsigned extractDimPos = 0; extractDimPos < extractDroppedDims.size();
+       ++extractDimPos) {
+    // Matched all expanded dims.
+    if (insertDimPos == insertExpandedDims.size())
       break;
 
-    bool isDropped = extractDroppedDims[extractPos];
-    bool isExpanded = insertExpandedDims[insertPos];
+    bool isDropped = extractDroppedDims[extractDimPos];
+    bool isExpanded = insertExpandedDims[insertDimPos];
+    // Match if both sides drop/keep the dim. Advance and match the next dim of
+    // insert_slice.
     if (isDropped == isExpanded) {
-      insertPos += 1;
-    } else {
-      if (!isDropped && isExpanded) {
-        return failure();
-      }
+      insertDimPos += 1;
+    } else if (!isDropped && isExpanded) {
+      // Not enough dropped unit dims to match the expanded unit dims.
+      return failure();
     }
+    // If the dim is dropped by extract_slice but not expanded by insert_slice,
+    // look at the next dim of extract_slice to see if it can match the current
+    // dim of insert_slice.
   }
-  if (insertPos != insertExpandedDims.size())
+  // Some expanded dims could not be matched.
+  if (insertDimPos != insertExpandedDims.size())
     return failure();
 
   rewriter.replaceOpWithNewOp<tensor::ExtractSliceOp>(
diff --git a/mlir/lib/Dialect/Tensor/Utils/Utils.cpp b/mlir/lib/Dialect/Tensor/Utils/Utils.cpp
@@ -147,6 +147,8 @@ bool mlir::tensor::isCastLikeInsertSliceOp(InsertSliceOp op) {
   // same size.
   for (int64_t resultDim = 0; resultDim < resultType.getRank(); ++resultDim) {
     if (droppedDims.test(resultDim)) {
+      // InsertSlice may expand unit dimensions that result from inserting a
+      // size-1 slice into a non-size-1 result dimension.
       if (resultType.getDimSize(resultDim) != 1)
         return false;
       continue;
diff --git a/mlir/test/Dialect/Tensor/fold-tensor-subset-ops.mlir b/mlir/test/Dialect/Tensor/fold-tensor-subset-ops.mlir
@@ -390,3 +390,68 @@ func.func @parallel_insert_slice_of_insert_slice_dynamic(
   }
   return %0: tensor<12x34xf32>
 }
+
+// -----
+
+func.func @fold_casting_insert_slice_of_extract_slice(%in : tensor<?x8x2x8xf32>, %dest : tensor<8x1x8xf32>) -> tensor<8x1x8xf32> {
+  %extracted_slice = tensor.extract_slice %in[0, 0, 0, 0] [1, 8, 1, 8] [1, 1, 1, 1] : tensor<?x8x2x8xf32> to tensor<8x8xf32>
+  %inserted_slice = tensor.insert_slice %extracted_slice into %dest[0, 0, 0] [8, 1, 8] [1, 1, 1] : tensor<8x8xf32> into tensor<8x1x8xf32>
+  return %inserted_slice : tensor<8x1x8xf32>
+}
+// CHECK-LABEL: func.func @fold_casting_insert_slice_of_extract_slice(
+// CHECK-SAME:      %[[ARG0:.*]]: tensor<?x8x2x8xf32>
+// CHECK:         %[[EXTRACTED_SLICE:.*]] = tensor.extract_slice %[[ARG0]][0, 0, 0, 0] [1, 8, 1, 8] [1, 1, 1, 1]
+// CHECK-SAME:      : tensor<?x8x2x8xf32> to tensor<8x1x8xf32>
+// CHECK:         return %[[EXTRACTED_SLICE]] : tensor<8x1x8xf32>
+
+// -----
+
+func.func @fold_casting_insert_slice_of_strided_extract_slice(%in : tensor<?x8x2x8xf32>, %dest : tensor<1x4x8xf32>) -> tensor<1x4x8xf32> {
+  %extracted_slice = tensor.extract_slice %in[0, 0, 0, 0] [1, 4, 1, 8] [1, 2, 1, 1] : tensor<?x8x2x8xf32> to tensor<4x8xf32>
+  %inserted_slice = tensor.insert_slice %extracted_slice into %dest[0, 0, 0] [1, 4, 8] [1, 1, 1] : tensor<4x8xf32> into tensor<1x4x8xf32>
+  return %inserted_slice : tensor<1x4x8xf32>
+}
+// CHECK-LABEL: func.func @fold_casting_insert_slice_of_strided_extract_slice(
+// CHECK-SAME:      %[[ARG0:.*]]: tensor<?x8x2x8xf32>
+// CHECK:         %[[EXTRACTED_SLICE:.*]] = tensor.extract_slice %[[ARG0]][0, 0, 0, 0] [1, 4, 1, 8] [1, 2, 1, 1]
+// CHECK-SAME:      : tensor<?x8x2x8xf32> to tensor<1x4x8xf32>
+// CHECK:         return %[[EXTRACTED_SLICE]] : tensor<1x4x8xf32>
+
+// -----
+
+func.func @no_fold_more_unit_dims_insert_slice_of_extract_slice(%in : tensor<?x8x8xf32>, %dest : tensor<1x1x8x8xf32>) -> tensor<1x1x8x8xf32> {
+  %extracted_slice = tensor.extract_slice %in[0, 0, 0] [1, 8, 8] [1, 1, 1] : tensor<?x8x8xf32> to tensor<8x8xf32>
+  %inserted_slice = tensor.insert_slice %extracted_slice into %dest[0, 0, 0, 0] [1, 1, 8, 8] [1, 1, 1, 1] : tensor<8x8xf32> into tensor<1x1x8x8xf32>
+  return %inserted_slice : tensor<1x1x8x8xf32>
+}
+// CHECK-LABEL: func.func @no_fold_more_unit_dims_insert_slice_of_extract_slice(
+// CHECK-SAME:      %[[ARG0:.*]]: tensor<?x8x8xf32>
+// CHECK:         %[[EXTRACTED_SLICE:.*]] = tensor.extract_slice %[[ARG0]]
+// CHECK:         %[[INSERTED_SLICE:.*]] = tensor.insert_slice %[[EXTRACTED_SLICE]]
+// CHECK:         return %[[INSERTED_SLICE]] : tensor<1x1x8x8xf32>
+
+// -----
+
+func.func @no_fold_strided_insert_slice_of_extract_slice(%in : tensor<?x8x2x8xf32>, %dest : tensor<1x4x4xf32>) -> tensor<1x4x4xf32> {
+  %extracted_slice = tensor.extract_slice %in[0, 0, 0, 0] [1, 8, 1, 8] [1, 1, 1, 1] : tensor<?x8x2x8xf32> to tensor<8x8xf32>
+  %inserted_slice = tensor.insert_slice %extracted_slice into %dest[0, 0, 0] [1, 8, 8] [1, 2, 2] : tensor<8x8xf32> into tensor<1x4x4xf32>
+  return %inserted_slice : tensor<1x4x4xf32>
+}
+// CHECK-LABEL: func.func @no_fold_strided_insert_slice_of_extract_slice(
+// CHECK-SAME:      %[[ARG0:.*]]: tensor<?x8x2x8xf32>
+// CHECK:         %[[EXTRACTED_SLICE:.*]] = tensor.extract_slice %[[ARG0]]
+// CHECK:         %[[INSERTED_SLICE:.*]] = tensor.insert_slice %[[EXTRACTED_SLICE]]
+// CHECK:         return %[[INSERTED_SLICE]] : tensor<1x4x4xf32>
+
+// -----
+
+func.func @no_fold_non_casting_insert_slice_of_extract_slice(%in : tensor<1x1x1x8x8xf32>, %dest : tensor<2x8x8xf32>) -> tensor<2x8x8xf32> {
+  %extracted_slice = tensor.extract_slice %in[0, 0, 0, 0, 0] [1, 1, 1, 8, 8] [1, 1, 1, 1, 1] : tensor<1x1x1x8x8xf32> to tensor<8x8xf32>
+  %inserted_slice = tensor.insert_slice %extracted_slice into %dest[0, 0, 0] [1, 8, 8] [1, 1, 1] : tensor<8x8xf32> into tensor<2x8x8xf32>
+  return %inserted_slice : tensor<2x8x8xf32>
+}
+// CHECK-LABEL: func.func @no_fold_non_casting_insert_slice_of_extract_slice(
+// CHECK-SAME:      %[[ARG0:.*]]: tensor<1x1x1x8x8xf32>
+// CHECK:         %[[EXTRACTED_SLICE:.*]] = tensor.extract_slice %[[ARG0]]
+// CHECK:         %[[INSERTED_SLICE:.*]] = tensor.insert_slice %[[EXTRACTED_SLICE]]
+// CHECK:         return %[[INSERTED_SLICE]] : tensor<2x8x8xf32>