Skip to content

Commit cde70c4

Browse files
committed
Amend AMDGPU transfer-read to use the new linearized size
1 parent 2e39f5d commit cde70c4

File tree

2 files changed

+9
-49
lines changed

2 files changed

+9
-49
lines changed

mlir/lib/Dialect/AMDGPU/Transforms/TransferReadToLoad.cpp

Lines changed: 4 additions & 44 deletions
Original file line number | Diff line number | Diff line change
@@ -162,60 +162,20 @@ struct TransferReadLowering final : OpRewritePattern<vector::TransferReadOp> {
162162
stridedMetadata.getConstifiedMixedStrides();
163163
SmallVector<OpFoldResult> sizes = stridedMetadata.getConstifiedMixedSizes();
164164
OpFoldResult offset = stridedMetadata.getConstifiedMixedOffset();
165+
memref::LinearizedMemRefInfo linearizedInfo;
165166
OpFoldResult linearizedIndices;
166-
std::tie(std::ignore, linearizedIndices) =
167+
std::tie(linearizedInfo, linearizedIndices) =
167168
memref::getLinearizedMemRefOffsetAndSize(rewriter, loc, elementBitWidth,
168169
elementBitWidth, offset, sizes,
169170
strides, indices);
170171

171-
// TODO(jerryyin): Fix the getLinearizedMemRefOffsetAndSize() function
172-
// Note below doesn't give the correct result for the linearized size.
173-
// Value totalSize = getValueOrCreateConstantIndexOp(
174-
// rewriter, loc, linearizedInfo.linearizedSize);
175-
// It computes the multiplied sizes of all dimensions instead of taking
176-
// the maximum of each dimension size * stride.
177-
SmallVector<AffineExpr> productExpressions;
178-
unsigned sourceRank = cast<ShapedType>(src.getType()).getRank();
179-
180-
SmallVector<AffineExpr> symbols(2 * sourceRank);
181-
SmallVector<Value> offsetValues;
182-
bindSymbolsList(rewriter.getContext(), MutableArrayRef{symbols});
183-
184-
size_t symbolIndex = 0;
185-
for (size_t i = 0; i < sourceRank; ++i) {
186-
AffineExpr strideExpr, sizeExpr;
187-
OpFoldResult stride = strides[i];
188-
OpFoldResult size = sizes[i];
189-
if (auto constantStride = getConstantIntValue(stride)) {
190-
strideExpr = rewriter.getAffineConstantExpr(*constantStride);
191-
} else {
192-
strideExpr = symbols[symbolIndex++];
193-
offsetValues.push_back(
194-
getValueOrCreateConstantIndexOp(rewriter, loc, stride));
195-
}
196-
197-
if (auto constantSize = getConstantIntValue(size)) {
198-
sizeExpr = rewriter.getAffineConstantExpr(*constantSize);
199-
} else {
200-
sizeExpr = symbols[symbolIndex++];
201-
offsetValues.push_back(
202-
getValueOrCreateConstantIndexOp(rewriter, loc, size));
203-
}
204-
205-
productExpressions.push_back(strideExpr * sizeExpr);
206-
}
207-
208-
AffineMap maxMap = AffineMap::get(
209-
/*dimCount=*/0, /*symbolCount=*/symbolIndex, productExpressions,
210-
rewriter.getContext());
211-
Value totalSize =
212-
rewriter.create<affine::AffineMaxOp>(loc, maxMap, offsetValues);
213-
214172
// delta = bufferSize - linearizedOffset
215173
Value vectorSizeOffset =
216174
rewriter.create<arith::ConstantIndexOp>(loc, vectorSize);
217175
Value linearIndex =
218176
getValueOrCreateConstantIndexOp(rewriter, loc, linearizedIndices);
177+
Value totalSize = getValueOrCreateConstantIndexOp(
178+
rewriter, loc, linearizedInfo.linearizedSize);
219179
Value delta = rewriter.create<arith::SubIOp>(loc, totalSize, linearIndex);
220180

221181
// 1) check if delta < vectorSize

mlir/test/Dialect/AMDGPU/transfer-read-to-load.mlir

Lines changed: 5 additions & 5 deletions
Original file line number | Diff line number | Diff line change
@@ -52,9 +52,9 @@ func.func @transfer_to_maskedload_fatrawbuffer_f16(%mem : memref<8x8xf16, #amdgp
5252

5353
// -----
5454

55-
// CHECK: #map = affine_map<()[s0, s1, s2] -> (s0 * s1 + s2)>
56-
// CHECK: #map1 = affine_map<()[s0, s1, s2] -> (s0 * s1, s2)>
57-
// CHECK-LABEL: func @transfer_to_maskedload_fatrawbuffer_dynamic_i8(
55+
// CHECK-DAG: #[[MAP:.+]] = affine_map<()[s0, s1, s2] -> (s0 * s1 + s2)>
56+
// CHECK-DAG: #[[MAP1:.+]] = affine_map<()[s0, s1, s2] -> (s0 * s1, s2)>
57+
// CHECK: func @transfer_to_maskedload_fatrawbuffer_dynamic_i8(
5858
// CHECK-SAME: %[[ARG0:.*]]: memref<?x?xi8, #amdgpu.address_space<fat_raw_buffer>>
5959
// CHECK-SAME: %[[ARG1:.*]]: index, %[[ARG2:.*]]: index
6060
// CHECK-SAME: %[[ARG3:.*]]: vector<4xi1>
@@ -68,8 +68,8 @@ func.func @transfer_to_maskedload_fatrawbuffer_dynamic_i8(%mem : memref<?x?xi8,
6868
// CHECK: %[[C0:.*]] = arith.constant 0 : index
6969
// CHECK: %[[C4:.*]] = arith.constant 4 : index
7070
// CHECK: %[[BASE:.*]], %[[OFFSET:.*]], %[[SIZES:.*]]:2, %[[STRIDES:.*]]:2 = memref.extract_strided_metadata %[[ARG0]]
71-
// CHECK: %[[LINEAR:.*]] = affine.apply #map()[%[[ARG1]], %[[STRIDES]]#0, %[[ARG2]]]
72-
// CHECK: %[[SIZE:.*]] = affine.max #map1()[%[[STRIDES]]#0, %[[SIZES]]#0, %[[SIZES]]#1]
71+
// CHECK: %[[SIZE:.*]] = affine.max #[[MAP1]]()[%[[STRIDES]]#0, %[[SIZES]]#0, %[[SIZES]]#1]
72+
// CHECK: %[[LINEAR:.*]] = affine.apply #[[MAP]]()[%[[ARG1]], %[[STRIDES]]#0, %[[ARG2]]]
7373
// CHECK: %[[IF:.*]] = scf.if
7474
// CHECK: return
7575

0 commit comments

Comments
 (0)