llvm · linuxlonelyeagle · Feb 17, 2025 · Feb 22, 2025 · Feb 26, 2025 · Feb 28, 2025
diff --git a/mlir/include/mlir/Dialect/Affine/Analysis/LoopAnalysis.h b/mlir/include/mlir/Dialect/Affine/Analysis/LoopAnalysis.h
@@ -43,6 +43,10 @@ void getTripCountMapAndOperands(AffineForOp forOp, AffineMap *map,
 /// constant trip count in non-trivial cases.
 std::optional<uint64_t> getConstantTripCount(AffineForOp forOp);
 
+/// Within gpu.launch, the trip count of a loop can differ between threads.
+/// Returns the constant trip count obtained with every thread id set to 0.
+std::optional<uint64_t> getMaxConstantTripCount(AffineForOp forOp);
+
 /// Returns the greatest known integral divisor of the trip count. Affine
 /// expression analysis is used (indirectly through getTripCount), and
 /// this method is thus able to determine non-trivial divisors.

diff --git a/mlir/include/mlir/Dialect/Affine/LoopUtils.h b/mlir/include/mlir/Dialect/Affine/LoopUtils.h
@@ -86,6 +86,9 @@ LogicalResult loopUnrollJamUpToFactor(AffineForOp forOp,
 /// was known to have a single iteration.
 LogicalResult promoteIfSingleIteration(AffineForOp forOp);
 
+/// Erases `forOp` if its constant trip counts are 0; returns failure if not.
+LogicalResult removeInvalidLoop(AffineForOp forOp);
+
 /// Promotes all single iteration AffineForOp's in the Function, i.e., moves
 /// their body into the containing Block.
 void promoteSingleIterationLoops(func::FuncOp f);

diff --git a/mlir/include/mlir/Dialect/GPU/IR/GPUOps.td b/mlir/include/mlir/Dialect/GPU/IR/GPUOps.td
@@ -1035,6 +1035,12 @@ def GPU_LaunchOp : GPU_Op<"launch", [
     static StringRef getNumWorkgroupAttributionsAttrName() {
       return "workgroup_attributions";
     }
+
+    /// Block size on the axis whose thread id is `threadId`; null if none.
+    Value getBlockSizeOnAxis(Value threadId);
+
+    /// Returns the block size of this launch along `dimension`.
+    Value getBlockSizeOnAxis(Dimension dimension);
   }];
 
   let hasCanonicalizer = 1;

diff --git a/mlir/lib/Dialect/Affine/Analysis/LoopAnalysis.cpp b/mlir/lib/Dialect/Affine/Analysis/LoopAnalysis.cpp
@@ -18,6 +18,7 @@
 #include "mlir/Dialect/Affine/Analysis/NestedMatcher.h"
 #include "mlir/Dialect/Affine/IR/AffineOps.h"
 #include "mlir/Dialect/Affine/IR/AffineValueMap.h"
+#include "mlir/Dialect/GPU/IR/GPUDialect.h"
 #include "llvm/Support/MathExtras.h"
 
 #include "llvm/ADT/DenseSet.h"
@@ -84,6 +85,67 @@ void mlir::affine::getTripCountMapAndOperands(
                             tripCountValueMap.getOperands().end());
 }
 
+/// Rewrites the symbol operands of a trip count map so that GPU thread ids no
+/// longer appear in it. Does nothing unless `forOp` is nested in a gpu.launch.
+///
+/// `operands` holds the map operands, dims first; only the entries from index
+/// `numDim` on (the symbols) are inspected. An operand that is a thread id of
+/// the enclosing launch (block argument or gpu.thread_id result) is swapped
+/// for the block size on the same axis. `symReplacements` receives exactly one
+/// expression per inspected symbol, in symbol order, because the replacement
+/// list is looked up by symbol position: `s - 1` (the largest thread id) for
+/// a thread id, the constant 0 (the smallest) if `replaceWithZero` is set,
+/// and the unchanged symbol for every other operand.
+static void replaceGPUOperands(AffineForOp forOp,
+                               SmallVectorImpl<Value> &operands,
+                               SmallVectorImpl<AffineExpr> &symReplacements,
+                               unsigned numDim, bool replaceWithZero = false) {
+  auto launchOp = forOp->getParentOfType<gpu::LaunchOp>();
+  if (!launchOp)
+    return;
+
+  // `b` is only used to create `AffineExpr`.
+  Builder b(forOp.getContext());
+  for (unsigned i = numDim, e = operands.size(); i < e; ++i) {
+    Value operand = operands[i];
+    AffineExpr symbol = b.getAffineSymbolExpr(i - numDim);
+
+    // Thread ids are either launch block arguments or gpu.thread_id results.
+    Value blockSize = launchOp.getBlockSizeOnAxis(operand);
+    if (!blockSize) {
+      if (auto threadIdOp = operand.getDefiningOp<gpu::ThreadIdOp>())
+        blockSize = launchOp.getBlockSizeOnAxis(threadIdOp.getDimension());
+    }
+
+    if (!blockSize) {
+      // Identity entry, so that later thread id entries keep their position.
+      symReplacements.push_back(symbol);
+      continue;
+    }
+    // The symbol now stands for the block size; ids range over [0, size).
+    operands[i] = blockSize;
+    symReplacements.push_back(replaceWithZero ? b.getAffineConstantExpr(0)
+                                              : symbol - 1);
+  }
+}
+
+/// Returns the smallest result of `map` when every result is a constant
+/// expression, and std::nullopt as soon as a non-constant result is found.
+static std::optional<uint64_t>
+getConstantTripCountFromAffineMap(AffineMap map) {
+  std::optional<uint64_t> minCount;
+  for (AffineExpr expr : map.getResults()) {
+    auto cst = dyn_cast<AffineConstantExpr>(expr);
+    if (!cst)
+      return std::nullopt;
+    // Results are compared as unsigned 64-bit values.
+    const auto count = static_cast<uint64_t>(cst.getValue());
+    if (!minCount || count < *minCount)
+      minCount = count;
+  }
+  return minCount;
+}
+
 /// Returns the trip count of the loop if it's a constant, std::nullopt
 /// otherwise. This method uses affine expression analysis (in turn using
 /// getTripCount) and is able to determine constant trip count in non-trivial
@@ -95,20 +157,34 @@ std::optional<uint64_t> mlir::affine::getConstantTripCount(AffineForOp forOp) {
 
   if (!map)
     return std::nullopt;
+  SmallVector<AffineExpr, 4> symReplacements;
+  replaceGPUOperands(forOp, operands, symReplacements, map.getNumDims());
+  map = map.replaceDimsAndSymbols({}, symReplacements, map.getNumDims(),
+                                  map.getNumSymbols());
+  affine::AffineValueMap valueMap(map, operands);
+  (void)valueMap.canonicalize();
+  map = valueMap.getAffineMap();
+  return getConstantTripCountFromAffineMap(map);
+}
 
-  // Take the min if all trip counts are constant.
-  std::optional<uint64_t> tripCount;
-  for (auto resultExpr : map.getResults()) {
-    if (auto constExpr = dyn_cast<AffineConstantExpr>(resultExpr)) {
-      if (tripCount.has_value())
-        tripCount =
-            std::min(*tripCount, static_cast<uint64_t>(constExpr.getValue()));
-      else
-        tripCount = constExpr.getValue();
-    } else
-      return std::nullopt;
-  }
-  return tripCount;
+/// Trip count of `forOp` with every GPU thread id operand replaced by 0, or
+/// std::nullopt if that count is not constant. Threads of a gpu.launch may
+/// iterate the same loop a different number of times.
+std::optional<uint64_t>
+mlir::affine::getMaxConstantTripCount(AffineForOp forOp) {
+  AffineMap tripMap;
+  SmallVector<Value, 4> mapOperands;
+  getTripCountMapAndOperands(forOp, &tripMap, &mapOperands);
+  if (!tripMap)
+    return std::nullopt;
+  SmallVector<AffineExpr, 4> symExprs;
+  replaceGPUOperands(forOp, mapOperands, symExprs, tripMap.getNumDims(),
+                     /*replaceWithZero=*/true);
+  AffineMap zeroed = tripMap.replaceDimsAndSymbols(
+      {}, symExprs, tripMap.getNumDims(), tripMap.getNumSymbols());
+  affine::AffineValueMap simplified(zeroed, mapOperands);
+  (void)simplified.canonicalize();
+  return getConstantTripCountFromAffineMap(simplified.getAffineMap());
 }
 
 /// Returns the greatest known integral divisor of the trip count. Affine
@@ -121,7 +197,13 @@ uint64_t mlir::affine::getLargestDivisorOfTripCount(AffineForOp forOp) {
 
   if (!map)
     return 1;
-
+  SmallVector<AffineExpr, 4> symReplacements;
+  replaceGPUOperands(forOp, operands, symReplacements, map.getNumDims());
+  map = map.replaceDimsAndSymbols({}, symReplacements, map.getNumDims(),
+                                  map.getNumSymbols());
+  affine::AffineValueMap valueMap(map, operands);
+  (void)valueMap.canonicalize();
+  map = valueMap.getAffineMap();
   // The largest divisor of the trip count is the GCD of the individual largest
   // divisors.
   assert(map.getNumResults() >= 1 && "expected one or more results");

diff --git a/mlir/lib/Dialect/Affine/Utils/LoopUtils.cpp b/mlir/lib/Dialect/Affine/Utils/LoopUtils.cpp
@@ -17,6 +17,7 @@
 #include "mlir/Dialect/Affine/IR/AffineValueMap.h"
 #include "mlir/Dialect/Affine/Utils.h"
 #include "mlir/Dialect/Func/IR/FuncOps.h"
+#include "mlir/Dialect/GPU/IR/GPUDialect.h"
 #include "mlir/Dialect/MemRef/IR/MemRef.h"
 #include "mlir/Dialect/SCF/IR/SCF.h"
 #include "mlir/IR/IRMapping.h"
@@ -113,11 +114,29 @@ static void replaceIterArgsAndYieldResults(AffineForOp forOp) {
     std::get<0>(e).replaceAllUsesWith(std::get<1>(e));
 }
 
+/// Erases `forOp` if both of its constant trip counts are 0, forwarding each
+/// loop result to the matching init operand; returns failure otherwise.
+LogicalResult mlir::affine::removeInvalidLoop(AffineForOp forOp) {
+  std::optional<uint64_t> minTrips = getConstantTripCount(forOp);
+  std::optional<uint64_t> maxTrips = getMaxConstantTripCount(forOp);
+  bool neverRuns = minTrips && *minTrips == 0 && maxTrips && *maxTrips == 0;
+  if (!neverRuns)
+    return failure();
+  // With no iteration to run, each result is just its init operand.
+  for (auto [loopResult, init] :
+       llvm::zip(forOp.getResults(), forOp.getInits()))
+    loopResult.replaceAllUsesWith(init);
+  IRRewriter rewriter(forOp);
+  rewriter.eraseOp(forOp);
+  return success();
+}
+
 /// Promotes the loop body of a forOp to its containing block if the forOp
 /// was known to have a single iteration.
 LogicalResult mlir::affine::promoteIfSingleIteration(AffineForOp forOp) {
   std::optional<uint64_t> tripCount = getConstantTripCount(forOp);
-  if (!tripCount || *tripCount != 1)
+  std::optional<uint64_t> maxTripCount = getMaxConstantTripCount(forOp);
+  if (!tripCount || *tripCount != 1 || !maxTripCount || *maxTripCount != 1)
     return failure();
 
   // TODO: extend this for arbitrary affine bounds.
@@ -160,7 +179,8 @@ LogicalResult mlir::affine::promoteIfSingleIteration(AffineForOp forOp) {
   forOp.getBody()->back().erase();
   parentBlock->getOperations().splice(Block::iterator(forOp),
                                       forOp.getBody()->getOperations());
-  forOp.erase();
+  IRRewriter b(forOp.getContext());
+  b.eraseOp(forOp);
   return success();
 }
 
@@ -884,15 +904,27 @@ void mlir::affine::getTileableBands(
 /// Unrolls this loop completely.
 LogicalResult mlir::affine::loopUnrollFull(AffineForOp forOp) {
   std::optional<uint64_t> mayBeConstantTripCount = getConstantTripCount(forOp);
-  if (mayBeConstantTripCount.has_value()) {
-    uint64_t tripCount = *mayBeConstantTripCount;
-    if (tripCount == 0)
-      return success();
-    if (tripCount == 1)
-      return promoteIfSingleIteration(forOp);
-    return loopUnrollByFactor(forOp, tripCount);
-  }
-  return failure();
+  std::optional<uint64_t> maxMayBeConstantTripCount =
+      getMaxConstantTripCount(forOp);
+
+  // Both counts are dereferenced below, so both must be known constants.
+  if (!mayBeConstantTripCount || !maxMayBeConstantTripCount)
+    return failure();
+
+  uint64_t tripCount = *mayBeConstantTripCount;
+  uint64_t maxTripCount = *maxMayBeConstantTripCount;
+
+  // The loop never runs, whatever the thread: delete it.
+  if (tripCount == 0 && maxTripCount == 0)
+    return removeInvalidLoop(forOp);
+
+  // Some threads skip the loop: keep it, an unroll factor of 0 is invalid.
+  if (tripCount == 0)
+    return success();
+
+  if (tripCount == 1 && maxTripCount == 1)
+    return promoteIfSingleIteration(forOp);
+  return loopUnrollByFactor(forOp, tripCount);
 }
 
 /// Unrolls this loop by the specified factor or by the trip count (if constant)
@@ -1013,8 +1045,11 @@ LogicalResult mlir::affine::loopUnrollByFactor(
   assert(unrollFactor > 0 && "unroll factor should be positive");
 
   std::optional<uint64_t> mayBeConstantTripCount = getConstantTripCount(forOp);
+  std::optional<uint64_t> maxMayBeConstantTripCount =
+      getMaxConstantTripCount(forOp);
   if (unrollFactor == 1) {
     if (mayBeConstantTripCount && *mayBeConstantTripCount == 1 &&
+        maxMayBeConstantTripCount && *maxMayBeConstantTripCount == 1 &&
         failed(promoteIfSingleIteration(forOp)))
       return failure();
     return success();

diff --git a/mlir/lib/Dialect/GPU/IR/GPUDialect.cpp b/mlir/lib/Dialect/GPU/IR/GPUDialect.cpp
@@ -799,6 +799,26 @@ std::optional<KernelDim3> LaunchOp::getClusterSizeOperandValues() {
   return KernelDim3{operands[6], operands[7], operands[8]};
 }
 
+/// Block size operand along `dimension`; anything but x or y selects z.
+Value LaunchOp::getBlockSizeOnAxis(Dimension dimension) {
+  if (dimension == Dimension::x)
+    return getBlockSizeX();
+  if (dimension == Dimension::y)
+    return getBlockSizeY();
+  return getBlockSizeZ();
+}
+
+/// Block size operand on the axis whose thread id is `threadId`, or a null
+/// Value when `threadId` is none of the three thread ids of this launch.
+Value LaunchOp::getBlockSizeOnAxis(Value threadId) {
+  KernelDim3 ids = getThreadIds();
+  if (threadId == ids.x)
+    return getBlockSizeX();
+  if (threadId == ids.y)
+    return getBlockSizeY();
+  return threadId == ids.z ? Value(getBlockSizeZ()) : Value();
+}
+
 LogicalResult LaunchOp::verify() {
   if (!(hasClusterSize()) &&
       (getClusterSizeX() || getClusterSizeY() || getClusterSizeZ()))