llvm · linuxlonelyeagle · Feb 17, 2025 · Feb 22, 2025 · Feb 26, 2025 · Feb 28, 2025
diff --git a/mlir/include/mlir/Dialect/Affine/Analysis/LoopAnalysis.h b/mlir/include/mlir/Dialect/Affine/Analysis/LoopAnalysis.h
@@ -43,6 +43,11 @@ void getTripCountMapAndOperands(AffineForOp forOp, AffineMap *map,
 /// constant trip count in non-trivial cases.
 std::optional<uint64_t> getConstantTripCount(AffineForOp forOp);
 
+/// Returns the maximum trip count when the operand of forOp has a range. If the
+/// operand of forOp is a constant, the return value is the same as
+/// `getConstantTripCount`.
+std::optional<uint64_t> getUpperBoundOnTripCount(AffineForOp forOp);
+
 /// Returns the greatest known integral divisor of the trip count. Affine
 /// expression analysis is used (indirectly through getTripCount), and
 /// this method is thus able to determine non-trivial divisors.

diff --git a/mlir/lib/Dialect/Affine/Analysis/LoopAnalysis.cpp b/mlir/lib/Dialect/Affine/Analysis/LoopAnalysis.cpp
@@ -18,6 +18,8 @@
 #include "mlir/Dialect/Affine/Analysis/NestedMatcher.h"
 #include "mlir/Dialect/Affine/Analysis/Utils.h"
 #include "mlir/Dialect/Affine/IR/AffineValueMap.h"
+#include "mlir/Interfaces/FunctionInterfaces.h"
+#include "mlir/Interfaces/ValueBoundsOpInterface.h"
 #include "llvm/Support/MathExtras.h"
 
 #include "llvm/ADT/DenseSet.h"
@@ -212,6 +214,40 @@ void mlir::affine::getTripCountMapAndOperands(
                             tripCountValueMap.getOperands().end());
 }
 
+/// Take the min if all trip counts are constant.
+static std::optional<uint64_t>
+getKnownTripCountBound(AffineMap map, SmallVectorImpl<Value> &operands,
+                       presburger::BoundType type) {
+  std::optional<uint64_t> tripCount;
+  for (auto resultExpr : map.getResults()) {
+    AffineMap subMap =
+        AffineMap::get(map.getNumDims(), map.getNumSymbols(), resultExpr);
+    ValueBoundsConstraintSet::Variable var(subMap, operands);
+    auto lbBound = ValueBoundsConstraintSet::computeConstantBound(
+        mlir::presburger::BoundType::LB, var);
+    auto ubBound = ValueBoundsConstraintSet::computeConstantBound(
+        mlir::presburger::BoundType::UB, var, nullptr, true);
+    if (failed(lbBound) || failed(ubBound))
+      return std::nullopt;
+    if (type == presburger::BoundType::LB) {
+      if (tripCount.has_value())
+        tripCount =
+            std::min(*tripCount, static_cast<uint64_t>(lbBound.value()));
+      else
+        tripCount = lbBound.value();
+    } else if (type == presburger::BoundType::UB) {
+      if (tripCount.has_value())
+        tripCount =
+            std::min(*tripCount, static_cast<uint64_t>(ubBound.value()));
+      else
+        tripCount = ubBound.value();
+    } else {
+      return std::nullopt;
+    }
+  }
+  return tripCount;
+}
+
 /// Returns the trip count of the loop if it's a constant, std::nullopt
 /// otherwise. This method uses affine expression analysis (in turn using
 /// getTripCount) and is able to determine constant trip count in non-trivial
@@ -223,20 +259,21 @@ std::optional<uint64_t> mlir::affine::getConstantTripCount(AffineForOp forOp) {
 
   if (!map)
     return std::nullopt;
+  return getKnownTripCountBound(map, operands, presburger::BoundType::LB);
+}
 
-  // Take the min if all trip counts are constant.
-  std::optional<uint64_t> tripCount;
-  for (auto resultExpr : map.getResults()) {
-    if (auto constExpr = dyn_cast<AffineConstantExpr>(resultExpr)) {
-      if (tripCount.has_value())
-        tripCount =
-            std::min(*tripCount, static_cast<uint64_t>(constExpr.getValue()));
-      else
-        tripCount = constExpr.getValue();
-    } else
-      return std::nullopt;
-  }
-  return tripCount;
+/// Returns the maximum trip count when the operand of forOp has a range. If the
+/// operand of forOp is a constant, the return value is the same as
+/// `getConstantTripCount`.
+std::optional<uint64_t>
+mlir::affine::getUpperBoundOnTripCount(AffineForOp forOp) {
+  SmallVector<Value, 4> operands;
+  AffineMap map;
+  getTripCountMapAndOperands(forOp, &map, &operands);
+
+  if (!map)
+    return std::nullopt;
+  return getKnownTripCountBound(map, operands, presburger::BoundType::UB);
 }
 
 /// Returns the greatest known integral divisor of the trip count. Affine
@@ -256,8 +293,13 @@ uint64_t mlir::affine::getLargestDivisorOfTripCount(AffineForOp forOp) {
   std::optional<uint64_t> gcd;
   for (auto resultExpr : map.getResults()) {
     uint64_t thisGcd;
-    if (auto constExpr = dyn_cast<AffineConstantExpr>(resultExpr)) {
-      uint64_t tripCount = constExpr.getValue();
+    AffineMap subMap =
+        AffineMap::get(map.getNumDims(), map.getNumSymbols(), resultExpr);
+    ValueBoundsConstraintSet::Variable var(subMap, operands);
+    auto lbBound = ValueBoundsConstraintSet::computeConstantBound(
+        mlir::presburger::BoundType::LB, var);
+    if (!failed(lbBound)) {
+      uint64_t tripCount = lbBound.value();
       // 0 iteration loops (greatest divisor is 2^64 - 1).
       if (tripCount == 0)
         thisGcd = std::numeric_limits<uint64_t>::max();

diff --git a/mlir/lib/Dialect/Affine/Utils/LoopUtils.cpp b/mlir/lib/Dialect/Affine/Utils/LoopUtils.cpp
@@ -116,8 +116,10 @@ static void replaceIterArgsAndYieldResults(AffineForOp forOp) {
 /// Promotes the loop body of a forOp to its containing block if the forOp
 /// was known to have a single iteration.
 LogicalResult mlir::affine::promoteIfSingleIteration(AffineForOp forOp) {
-  std::optional<uint64_t> tripCount = getConstantTripCount(forOp);
-  if (!tripCount || *tripCount != 1)
+  std::optional<uint64_t> minTripCount = getConstantTripCount(forOp);
+  std::optional<uint64_t> maxTripCount = getUpperBoundOnTripCount(forOp);
+  if (!minTripCount || *minTripCount != 1 || !maxTripCount ||
+      *maxTripCount != 1)
     return failure();
 
   // TODO: extend this for arbitrary affine bounds.
@@ -160,7 +162,8 @@ LogicalResult mlir::affine::promoteIfSingleIteration(AffineForOp forOp) {
   forOp.getBody()->back().erase();
   parentBlock->getOperations().splice(Block::iterator(forOp),
                                       forOp.getBody()->getOperations());
-  forOp.erase();
+  IRRewriter b(forOp.getContext());
+  b.eraseOp(forOp);
   return success();
 }
 
@@ -884,15 +887,23 @@ void mlir::affine::getTileableBands(
 /// Unrolls this loop completely.
 LogicalResult mlir::affine::loopUnrollFull(AffineForOp forOp) {
   std::optional<uint64_t> mayBeConstantTripCount = getConstantTripCount(forOp);
-  if (mayBeConstantTripCount.has_value()) {
-    uint64_t tripCount = *mayBeConstantTripCount;
-    if (tripCount == 0)
-      return success();
-    if (tripCount == 1)
-      return promoteIfSingleIteration(forOp);
-    return loopUnrollByFactor(forOp, tripCount);
-  }
-  return failure();
+  std::optional<uint64_t> maxMayBeConstantTripCount =
+      getUpperBoundOnTripCount(forOp);
+
+  if (!mayBeConstantTripCount.has_value() &&
+      !maxMayBeConstantTripCount.has_value())
+    return failure();
+
+  uint64_t tripCount = *mayBeConstantTripCount;
+  uint64_t maxTripCount = *maxMayBeConstantTripCount;
+
+  // Trip equals 0, this loop cannot unroll.
+  if (tripCount <= 0)
+    return success();
+
+  if (tripCount == 1 && maxTripCount == 1)
+    return promoteIfSingleIteration(forOp);
+  return loopUnrollByFactor(forOp, tripCount);
 }
 
 /// Unrolls this loop by the specified factor or by the trip count (if constant)
@@ -1013,8 +1024,11 @@ LogicalResult mlir::affine::loopUnrollByFactor(
   assert(unrollFactor > 0 && "unroll factor should be positive");
 
   std::optional<uint64_t> mayBeConstantTripCount = getConstantTripCount(forOp);
+  std::optional<uint64_t> maxMayBeConstantTripCount =
+      getUpperBoundOnTripCount(forOp);
   if (unrollFactor == 1) {
     if (mayBeConstantTripCount && *mayBeConstantTripCount == 1 &&
+        maxMayBeConstantTripCount && *maxMayBeConstantTripCount == 1 &&
         failed(promoteIfSingleIteration(forOp)))
       return failure();
     return success();

@@ -646,7 +646,8 @@ FailureOr<int64_t> ValueBoundsConstraintSet::computeConstantBound(
   // Compute constant bound for `valueDim`.
   int64_t ubAdjustment = closedUB ? 0 : 1;
   if (auto bound = cstr.cstr.getConstantBound64(type, pos))
-    return type == BoundType::UB ? *bound + ubAdjustment : *bound;
+    if (bound.has_value())
+      return type == BoundType::UB ? *bound + ubAdjustment : *bound;
   return failure();
 }
 

diff --git a/mlir/test/Dialect/Affine/unroll.mlir b/mlir/test/Dialect/Affine/unroll.mlir
@@ -23,6 +23,7 @@
 // UNROLL-BY-4-DAG: [[$MAP5:#map[0-9]*]] = affine_map<(d0)[s0] -> (d0 + s0 + 1)>
 // UNROLL-BY-4-DAG: [[$MAP6:#map[0-9]*]] = affine_map<(d0, d1) -> (d0 * 16 + d1)>
 // UNROLL-BY-4-DAG: [[$MAP11:#map[0-9]*]] = affine_map<(d0) -> (d0)>
+// UNROLL-BY-4-DAG: [[$MAP7:#map[0-9]*]] = affine_map<()[s0] -> (s0 + (((-s0 + 11) ceildiv 2) floordiv 4) * 8)>
 
 // UNROLL-FULL-LABEL: func @loop_nest_simplest() {
 func.func @loop_nest_simplest() {
@@ -258,6 +259,71 @@ gpu.module @unroll_full {
   }
 }
 
+// UNROLL-FULL-LABEL: func @thread_partial_execution
+func.func @thread_partial_execution() {
+  %0 = arith.constant 0 :index
+  %1 = arith.constant 2 : index    
+  // UNROLL-FULL: %[[C0:.*]] = arith.constant 0 : index
+  gpu.launch blocks(%bx, %by, %bz) in (%sz_bx = %1, %sz_by = %1, %sz_bz = %1)
+             threads(%tx, %ty, %tz) in (%sz_tx = %1, %sz_ty = %1, %sz_tz = %1) {
+    affine.for %iv = %tx to 3 step 2 iter_args(%arg = %0) -> index {
+      %3 = arith.addi %arg, %0 : index
+      affine.yield %3 : index
+    }
+    // UNROLL-FULL: affine.for %{{.*}} = %{{.*}} to 3 step 2 iter_args(%[[ARG:.*]] = %[[C0]]) -> (index) {
+    // UNROLL-FULL-NEXT:   %[[SUM:.*]] = arith.addi %[[ARG]], %[[C0]] : index
+    // UNROLL-FULL-NEXT:   affine.yield %[[SUM]] : index
+    // UNROLL-FULL-NEXT: }
+    gpu.terminator
+  }
+  return
+}
+
+// UNROLL-FULL-LABEL: func @unroll_all_thread
+func.func @unroll_all_thread() {
+  %0 = arith.constant 0 :index
+  %1 = arith.constant 2 : index
+  // UNROLL-FULL: %[[C0:.*]] = arith.constant 0 : index
+  gpu.launch blocks(%bx, %by, %bz) in (%sz_bx = %1, %sz_by = %1, %sz_bz = %1)
+             threads(%tx, %ty, %tz) in (%sz_tx = %1, %sz_ty = %1, %sz_tz = %1) {
+    %threadid = gpu.thread_id x
+    %4 = affine.for %iv = %threadid to 6 step 2 iter_args(%arg = %0) -> index {
+      %3 = arith.addi %arg, %0 : index
+      affine.yield %3 : index
+    }
+    // UNROLL-FULL: %[[SUM_0:.*]] = arith.addi %[[C0]], %[[C0]] : index
+    // UNROLL-FULL-NEXT: %[[SUM_1:.*]] = arith.addi %[[SUM_0]], %[[C0]] : index
+    // UNROLL-FULL-NEXT: %[[SUM_2:.*]] = arith.addi %[[SUM_1]], %[[C0]] : index
+    gpu.terminator
+  }
+  return
+}
+
+// UNROLL-FULL-LABEL: func.func @partial_unroll_factor_4
+func.func @partial_unroll_factor_4() {
+  %0 = arith.constant 0 :index
+  %1 = arith.constant 2 : index
+  // UNROLL-FULL: %[[C0:.*]] = arith.constant 0 : index
+  gpu.launch blocks(%bx, %by, %bz) in (%sz_bx = %1, %sz_by = %1, %sz_bz = %1)
+             threads(%tx, %ty, %tz) in (%sz_tx = %1, %sz_ty = %1, %sz_tz = %1) {
+    %threadid = gpu.thread_id x
+    affine.for %iv = %threadid to 9 step 2 iter_args(%arg = %0) -> index {
+      %3 = arith.addi %arg, %0 : index
+      affine.yield %3 : index
+    }
+    gpu.terminator
+  }
+  // UNROLL-FULL: %[[ID:.*]] = gpu.thread_id  x
+  // UNROLL-FULL-NEXT: affine.for %{{.*}} = %[[ID]] to 9 step 8 iter_args(%[[ARG:.*]] = %[[C0]]) -> (index) {
+  // UNROLL-FULL-NEXT:   %[[SUM_0:.*]] = arith.addi %[[ARG]], %[[C0]] : index
+  // UNROLL-FULL-NEXT:   %[[SUM_1:.*]] = arith.addi %[[SUM_0]], %[[C0]] : index
+  // UNROLL-FULL-NEXT:   %[[SUM_2:.*]] = arith.addi %[[SUM_1]], %[[C0]] : index
+  // UNROLL-FULL-NEXT:   %[[SUM_3:.*]] = arith.addi %[[SUM_2]], %[[C0]] : index
+  // UNROLL-FULL-NEXT:   affine.yield %[[SUM_3]] : index
+  // UNROLL-FULL-NEXT: }
+  return
+}
+
 // SHORT-LABEL: func @loop_nest_outer_unroll() {
 func.func @loop_nest_outer_unroll() {
   // SHORT:      affine.for %arg0 = 0 to 4 {
@@ -701,6 +767,32 @@ func.func @unroll_with_iter_args_and_promotion(%arg0 : f32, %arg1 : f32) -> f32
   return %sum : f32
 }
 
+// UNROLL-BY-4-LABEL: func @gpu_launch_unroll_by_factor_4
+func.func @gpu_launch_unroll_by_factor_4() {
+  %0 = arith.constant 0 :index
+  %1 = arith.constant 2 : index
+  // UNROLL-BY-4: %[[C0:.*]] = arith.constant 0 : index
+  gpu.launch blocks(%bx, %by, %bz) in (%sz_bx = %1, %sz_by = %1, %sz_bz = %1)
+             threads(%tx, %ty, %tz) in (%sz_tx = %1, %sz_ty = %1, %sz_tz = %1) {
+    %threadid = gpu.thread_id x
+    affine.for %iv = %threadid to 11 step 2 iter_args(%arg = %0) -> index {
+      %3 = arith.addi %arg, %0 : index
+      affine.yield %3 : index
+    }
+    gpu.terminator
+  }
+  // UNROLL-BY-4: %[[ID:.*]] = gpu.thread_id  x
+  // UNROLL-BY-4-NEXT: %[[SUM_0:.*]] = arith.addi %[[C0]], %[[C0]] : index
+  // UNROLL-BY-4-NEXT: %[[SUM_1:.*]] = arith.addi %[[SUM_0]], %[[C0]] : index
+  // UNROLL-BY-4-NEXT: %[[SUM_2:.*]] = arith.addi %[[SUM_1]], %[[C0]] : index
+  // UNROLL-BY-4-NEXT: %[[SUM_3:.*]] = arith.addi %[[SUM_2]], %[[C0]] : index
+  // UNROLL-BY-4-NEXT: affine.for %[[VAL_20:.*]] = [[$MAP7]](){{\[}}%[[ID]]] to 11 step 2 iter_args(%[[ARG:.*]] = %[[SUM_3]]) -> (index) {
+  // UNROLL-BY-4-NEXT:   %[[SUM_4:.*]] = arith.addi %[[ARG]], %[[C0]] : index
+  // UNROLL-BY-4-NEXT:   affine.yield %[[SUM_4]] : index
+  // UNROLL-BY-4-NEXT: }
+  return
+}
+
 // UNROLL-FULL: func @unroll_zero_trip_count_case
 func.func @unroll_zero_trip_count_case() {
   // CHECK-NEXT: affine.for %{{.*}} = 0 to 0