Skip to content

[mlir][affine] Use value bound inference to determine minimum/maximum trip counts in loop analysis #128113

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 8 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from 6 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 5 additions & 0 deletions mlir/include/mlir/Dialect/Affine/Analysis/LoopAnalysis.h
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,11 @@ void getTripCountMapAndOperands(AffineForOp forOp, AffineMap *map,
/// constant trip count in non-trivial cases.
std::optional<uint64_t> getConstantTripCount(AffineForOp forOp);

/// Returns a constant upper bound on the trip count of `forOp` when its bound
/// operands are not constants but have a known value range. Returns
/// std::nullopt when no such bound can be determined. If the bound operands
/// of `forOp` are constants, the return value is the same as
/// `getConstantTripCount`.
std::optional<uint64_t> getUpperBoundOnTripCount(AffineForOp forOp);

/// Returns the greatest known integral divisor of the trip count. Affine
/// expression analysis is used (indirectly through getTripCount), and
/// this method is thus able to determine non-trivial divisors.
Expand Down
72 changes: 57 additions & 15 deletions mlir/lib/Dialect/Affine/Analysis/LoopAnalysis.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,8 @@
#include "mlir/Dialect/Affine/Analysis/NestedMatcher.h"
#include "mlir/Dialect/Affine/Analysis/Utils.h"
#include "mlir/Dialect/Affine/IR/AffineValueMap.h"
#include "mlir/Interfaces/FunctionInterfaces.h"
#include "mlir/Interfaces/ValueBoundsOpInterface.h"
#include "llvm/Support/MathExtras.h"

#include "llvm/ADT/DenseSet.h"
Expand Down Expand Up @@ -212,6 +214,40 @@ void mlir::affine::getTripCountMapAndOperands(
tripCountValueMap.getOperands().end());
}

/// Returns a constant bound of kind `type` (LB or UB) on the trip count
/// described by applying `map` to `operands`, or std::nullopt if it cannot be
/// determined.
///
/// Every result of `map` is treated as one candidate trip count expression and
/// the minimum of the per-result bounds is returned. Both the lower bound and
/// the closed upper bound of every result must be computable through value
/// bound inference; otherwise std::nullopt is returned. Any `type` other than
/// LB or UB also yields std::nullopt.
///
/// A negative inferred bound is clamped to zero before being stored: a trip
/// count cannot be negative, and converting a negative int64_t directly to
/// uint64_t would turn it into a huge bogus trip count.
static std::optional<uint64_t>
getKnownTripCountBound(AffineMap map, SmallVectorImpl<Value> &operands,
                       presburger::BoundType type) {
  const bool wantLowerBound = type == presburger::BoundType::LB;
  if (!wantLowerBound && type != presburger::BoundType::UB)
    return std::nullopt;

  std::optional<uint64_t> tripCount;
  for (AffineExpr resultExpr : map.getResults()) {
    // Analyze each result on its own by wrapping it in a single-result map.
    AffineMap subMap =
        AffineMap::get(map.getNumDims(), map.getNumSymbols(), resultExpr);
    ValueBoundsConstraintSet::Variable var(subMap, operands);
    FailureOr<int64_t> lbBound = ValueBoundsConstraintSet::computeConstantBound(
        presburger::BoundType::LB, var);
    FailureOr<int64_t> ubBound = ValueBoundsConstraintSet::computeConstantBound(
        presburger::BoundType::UB, var, nullptr, /*closedUB=*/true);
    // Both bounds are required regardless of which one was requested.
    if (failed(lbBound) || failed(ubBound))
      return std::nullopt;

    const int64_t bound = wantLowerBound ? *lbBound : *ubBound;
    const uint64_t clamped =
        static_cast<uint64_t>(std::max<int64_t>(bound, 0));
    tripCount = tripCount ? std::min(*tripCount, clamped) : clamped;
  }
  return tripCount;
}

/// Returns the trip count of the loop if it's a constant, std::nullopt
/// otherwise. This method uses affine expression analysis (in turn using
/// getTripCount) and is able to determine constant trip count in non-trivial
Expand All @@ -223,20 +259,21 @@ std::optional<uint64_t> mlir::affine::getConstantTripCount(AffineForOp forOp) {

if (!map)
return std::nullopt;
return getKnownTripCountBound(map, operands, presburger::BoundType::LB);
}

// Take the min if all trip counts are constant.
std::optional<uint64_t> tripCount;
for (auto resultExpr : map.getResults()) {
if (auto constExpr = dyn_cast<AffineConstantExpr>(resultExpr)) {
if (tripCount.has_value())
tripCount =
std::min(*tripCount, static_cast<uint64_t>(constExpr.getValue()));
else
tripCount = constExpr.getValue();
} else
return std::nullopt;
}
return tripCount;
/// Returns the maximum trip count when the operand of forOp has a range. If the
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

So what this does internally is compute an upper bound on each expression "potential upper bound - single lower bound" and take a minimum of that. Can you provide a mathematical justification as to why this provides a correct (and tight?) upper bound?

/// operand of forOp is a constant, the return value is the same as
/// `getConstantTripCount`.
std::optional<uint64_t>
mlir::affine::getUpperBoundOnTripCount(AffineForOp forOp) {
  // Ask the shared helper for the trip count map and the operands it is
  // applied to.
  AffineMap tripCountMap;
  SmallVector<Value, 4> tripCountOperands;
  getTripCountMapAndOperands(forOp, &tripCountMap, &tripCountOperands);

  // Only a non-null map can be analyzed; request the UB flavour of the bound.
  if (tripCountMap)
    return getKnownTripCountBound(tripCountMap, tripCountOperands,
                                  presburger::BoundType::UB);
  return std::nullopt;
}

/// Returns the greatest known integral divisor of the trip count. Affine
Expand All @@ -256,8 +293,13 @@ uint64_t mlir::affine::getLargestDivisorOfTripCount(AffineForOp forOp) {
std::optional<uint64_t> gcd;
for (auto resultExpr : map.getResults()) {
uint64_t thisGcd;
if (auto constExpr = dyn_cast<AffineConstantExpr>(resultExpr)) {
uint64_t tripCount = constExpr.getValue();
AffineMap subMap =
AffineMap::get(map.getNumDims(), map.getNumSymbols(), resultExpr);
ValueBoundsConstraintSet::Variable var(subMap, operands);
auto lbBound = ValueBoundsConstraintSet::computeConstantBound(
mlir::presburger::BoundType::LB, var);
if (!failed(lbBound)) {
uint64_t tripCount = lbBound.value();
// 0 iteration loops (greatest divisor is 2^64 - 1).
if (tripCount == 0)
thisGcd = std::numeric_limits<uint64_t>::max();
Expand Down
38 changes: 26 additions & 12 deletions mlir/lib/Dialect/Affine/Utils/LoopUtils.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -116,8 +116,10 @@ static void replaceIterArgsAndYieldResults(AffineForOp forOp) {
/// Promotes the loop body of a forOp to its containing block if the forOp
/// was known to have a single iteration.
LogicalResult mlir::affine::promoteIfSingleIteration(AffineForOp forOp) {
std::optional<uint64_t> tripCount = getConstantTripCount(forOp);
if (!tripCount || *tripCount != 1)
std::optional<uint64_t> minTripCount = getConstantTripCount(forOp);
std::optional<uint64_t> maxTripCount = getUpperBoundOnTripCount(forOp);
if (!minTripCount || *minTripCount != 1 || !maxTripCount ||
*maxTripCount != 1)
return failure();

// TODO: extend this for arbitrary affine bounds.
Expand Down Expand Up @@ -160,7 +162,8 @@ LogicalResult mlir::affine::promoteIfSingleIteration(AffineForOp forOp) {
forOp.getBody()->back().erase();
parentBlock->getOperations().splice(Block::iterator(forOp),
forOp.getBody()->getOperations());
forOp.erase();
IRRewriter b(forOp.getContext());
b.eraseOp(forOp);
return success();
}

Expand Down Expand Up @@ -884,15 +887,23 @@ void mlir::affine::getTileableBands(
/// Unrolls this loop completely.
LogicalResult mlir::affine::loopUnrollFull(AffineForOp forOp) {
std::optional<uint64_t> mayBeConstantTripCount = getConstantTripCount(forOp);
if (mayBeConstantTripCount.has_value()) {
uint64_t tripCount = *mayBeConstantTripCount;
if (tripCount == 0)
return success();
if (tripCount == 1)
return promoteIfSingleIteration(forOp);
return loopUnrollByFactor(forOp, tripCount);
}
return failure();
std::optional<uint64_t> maxMayBeConstantTripCount =
    getUpperBoundOnTripCount(forOp);

// Both optionals are dereferenced right below, so give up as soon as either
// of them is empty (dereferencing an empty std::optional is undefined).
if (!mayBeConstantTripCount.has_value() ||
    !maxMayBeConstantTripCount.has_value())
  return failure();

uint64_t tripCount = *mayBeConstantTripCount;
uint64_t maxTripCount = *maxMayBeConstantTripCount;

// A trip count of zero means there is nothing to unroll.
if (tripCount == 0)
  return success();

if (tripCount == 1 && maxTripCount == 1)
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

If the trip count is known to be one, how can the max trip count be anything other than one?!

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Maybe maxTripCount will be equal to 2.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Why would getConstantMaxTripCount return a value different from the constant trip count when the trip count is known to be so? It shouldn't - otherwise, it's trivially loose.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

You are obviously talking about the CPU, which is indeed constant, but for hardware like GPU, threadId is a dynamic thing. The smallest threadid is 0, and the largest threadid is blocksize -1. The value of (upper - thread) / step is obviously not constant.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Could you please look at the comments below (if you have the time)? I'm wondering whether the fact that affine-loop-unroll is not a pattern pass is causing this issue. I'll continue to work on it.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

If it can run it will definitely be a huge improvement, it's really exciting.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

A lot of this confusion would be cleared up if tripCount were minTripCount

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'm not sure it would. The semantics of affine loops is to take a minimum of values produced by each individual expression in the upper bound, it's unclear to me why we would need to reason about the upper bound.

return promoteIfSingleIteration(forOp);
return loopUnrollByFactor(forOp, tripCount);
}

/// Unrolls this loop by the specified factor or by the trip count (if constant)
Expand Down Expand Up @@ -1013,8 +1024,11 @@ LogicalResult mlir::affine::loopUnrollByFactor(
assert(unrollFactor > 0 && "unroll factor should be positive");

std::optional<uint64_t> mayBeConstantTripCount = getConstantTripCount(forOp);
std::optional<uint64_t> maxMayBeConstantTripCount =
getUpperBoundOnTripCount(forOp);
if (unrollFactor == 1) {
if (mayBeConstantTripCount && *mayBeConstantTripCount == 1 &&
maxMayBeConstantTripCount && *maxMayBeConstantTripCount == 1 &&
failed(promoteIfSingleIteration(forOp)))
return failure();
return success();
Expand Down
3 changes: 2 additions & 1 deletion mlir/lib/Interfaces/ValueBoundsOpInterface.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -646,7 +646,8 @@ FailureOr<int64_t> ValueBoundsConstraintSet::computeConstantBound(
// Compute constant bound for `valueDim`.
int64_t ubAdjustment = closedUB ? 0 : 1;
if (auto bound = cstr.cstr.getConstantBound64(type, pos))
return type == BoundType::UB ? *bound + ubAdjustment : *bound;
if (bound.has_value())
return type == BoundType::UB ? *bound + ubAdjustment : *bound;
return failure();
}

Expand Down
92 changes: 92 additions & 0 deletions mlir/test/Dialect/Affine/unroll.mlir
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@
// UNROLL-BY-4-DAG: [[$MAP5:#map[0-9]*]] = affine_map<(d0)[s0] -> (d0 + s0 + 1)>
// UNROLL-BY-4-DAG: [[$MAP6:#map[0-9]*]] = affine_map<(d0, d1) -> (d0 * 16 + d1)>
// UNROLL-BY-4-DAG: [[$MAP11:#map[0-9]*]] = affine_map<(d0) -> (d0)>
// UNROLL-BY-4-DAG: [[$MAP7:#map[0-9]*]] = affine_map<()[s0] -> (s0 + (((-s0 + 11) ceildiv 2) floordiv 4) * 8)>

// UNROLL-FULL-LABEL: func @loop_nest_simplest() {
func.func @loop_nest_simplest() {
Expand Down Expand Up @@ -258,6 +259,71 @@ gpu.module @unroll_full {
}
}

// UNROLL-FULL-LABEL: func @thread_partial_execution
func.func @thread_partial_execution() {
%0 = arith.constant 0 :index
%1 = arith.constant 2 : index
// UNROLL-FULL: %[[C0:.*]] = arith.constant 0 : index
gpu.launch blocks(%bx, %by, %bz) in (%sz_bx = %1, %sz_by = %1, %sz_bz = %1)
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Could we avoid using GPU dialect operations here? I suppose we have tests for the bound analysis somewhere that must be using test ops with known bounds, we could use those instead and not spuriously rely on the logic of another dialect here.

threads(%tx, %ty, %tz) in (%sz_tx = %1, %sz_ty = %1, %sz_tz = %1) {
affine.for %iv = %tx to 3 step 2 iter_args(%arg = %0) -> index {
%3 = arith.addi %arg, %0 : index
affine.yield %3 : index
}
// UNROLL-FULL: affine.for %{{.*}} = %{{.*}} to 3 step 2 iter_args(%[[ARG:.*]] = %[[C0]]) -> (index) {
// UNROLL-FULL-NEXT: %[[SUM:.*]] = arith.addi %[[ARG]], %[[C0]] : index
// UNROLL-FULL-NEXT: affine.yield %[[SUM]] : index
// UNROLL-FULL-NEXT: }
gpu.terminator
}
return
}

// Full unrolling with a non-constant lower bound: the loop starts at
// `gpu.thread_id x` and runs to 6 with step 2. Every launch dimension is
// given the size %1 (= 2); presumably value-bound inference therefore sees
// the thread id as 0 or 1 (confirm against the GPU dialect's bound
// implementation). For both of those start values the loop runs 3 times, and
// the expected output below is the loop body replicated as three chained
// `arith.addi` ops.
// UNROLL-FULL-LABEL: func @unroll_all_thread
func.func @unroll_all_thread() {
%0 = arith.constant 0 :index
%1 = arith.constant 2 : index
// UNROLL-FULL: %[[C0:.*]] = arith.constant 0 : index
gpu.launch blocks(%bx, %by, %bz) in (%sz_bx = %1, %sz_by = %1, %sz_bz = %1)
threads(%tx, %ty, %tz) in (%sz_tx = %1, %sz_ty = %1, %sz_tz = %1) {
%threadid = gpu.thread_id x
%4 = affine.for %iv = %threadid to 6 step 2 iter_args(%arg = %0) -> index {
%3 = arith.addi %arg, %0 : index
affine.yield %3 : index
}
// UNROLL-FULL: %[[SUM_0:.*]] = arith.addi %[[C0]], %[[C0]] : index
// UNROLL-FULL-NEXT: %[[SUM_1:.*]] = arith.addi %[[SUM_0]], %[[C0]] : index
// UNROLL-FULL-NEXT: %[[SUM_2:.*]] = arith.addi %[[SUM_1]], %[[C0]] : index
gpu.terminator
}
return
}

// The loop starts at `gpu.thread_id x` and runs to 9 with step 2, so the
// number of iterations depends on the start value: 5 when it is 0 and 4 when
// it is 1. The expected output below keeps a loop, now with step 8 and a body
// made of four chained `arith.addi` ops, and checks for no cleanup loop.
// NOTE(review): with a start value of 0 the original loop runs for 0, 2, 4, 6
// and 8 (5 iterations), whereas a step-8 loop with four bodies would run at 0
// and again at 8 (8 bodies). Please verify that this expectation preserves the
// semantics of the input loop.
// UNROLL-FULL-LABEL: func.func @partial_unroll_factor_4
func.func @partial_unroll_factor_4() {
%0 = arith.constant 0 :index
%1 = arith.constant 2 : index
// UNROLL-FULL: %[[C0:.*]] = arith.constant 0 : index
gpu.launch blocks(%bx, %by, %bz) in (%sz_bx = %1, %sz_by = %1, %sz_bz = %1)
threads(%tx, %ty, %tz) in (%sz_tx = %1, %sz_ty = %1, %sz_tz = %1) {
%threadid = gpu.thread_id x
affine.for %iv = %threadid to 9 step 2 iter_args(%arg = %0) -> index {
%3 = arith.addi %arg, %0 : index
affine.yield %3 : index
}
gpu.terminator
}
// UNROLL-FULL: %[[ID:.*]] = gpu.thread_id x
// UNROLL-FULL-NEXT: affine.for %{{.*}} = %[[ID]] to 9 step 8 iter_args(%[[ARG:.*]] = %[[C0]]) -> (index) {
// UNROLL-FULL-NEXT: %[[SUM_0:.*]] = arith.addi %[[ARG]], %[[C0]] : index
// UNROLL-FULL-NEXT: %[[SUM_1:.*]] = arith.addi %[[SUM_0]], %[[C0]] : index
// UNROLL-FULL-NEXT: %[[SUM_2:.*]] = arith.addi %[[SUM_1]], %[[C0]] : index
// UNROLL-FULL-NEXT: %[[SUM_3:.*]] = arith.addi %[[SUM_2]], %[[C0]] : index
// UNROLL-FULL-NEXT: affine.yield %[[SUM_3]] : index
// UNROLL-FULL-NEXT: }
return
}

// SHORT-LABEL: func @loop_nest_outer_unroll() {
func.func @loop_nest_outer_unroll() {
// SHORT: affine.for %arg0 = 0 to 4 {
Expand Down Expand Up @@ -701,6 +767,32 @@ func.func @unroll_with_iter_args_and_promotion(%arg0 : f32, %arg1 : f32) -> f32
return %sum : f32
}

// Unroll-by-4 with a non-constant lower bound: the loop starts at
// `gpu.thread_id x` and runs to 11 with step 2 (6 iterations for a start of
// 0, 5 for a start of 1). The expected output below is four chained
// `arith.addi` ops that are no longer inside a loop, followed by a remainder
// loop that keeps step 2 and whose lower bound is computed from the thread id
// through the $MAP7 affine map; its `iter_args` continue from the last
// unrolled sum.
// UNROLL-BY-4-LABEL: func @gpu_launch_unroll_by_factor_4
func.func @gpu_launch_unroll_by_factor_4() {
%0 = arith.constant 0 :index
%1 = arith.constant 2 : index
// UNROLL-BY-4: %[[C0:.*]] = arith.constant 0 : index
gpu.launch blocks(%bx, %by, %bz) in (%sz_bx = %1, %sz_by = %1, %sz_bz = %1)
threads(%tx, %ty, %tz) in (%sz_tx = %1, %sz_ty = %1, %sz_tz = %1) {
%threadid = gpu.thread_id x
affine.for %iv = %threadid to 11 step 2 iter_args(%arg = %0) -> index {
%3 = arith.addi %arg, %0 : index
affine.yield %3 : index
}
gpu.terminator
}
// UNROLL-BY-4: %[[ID:.*]] = gpu.thread_id x
// UNROLL-BY-4-NEXT: %[[SUM_0:.*]] = arith.addi %[[C0]], %[[C0]] : index
// UNROLL-BY-4-NEXT: %[[SUM_1:.*]] = arith.addi %[[SUM_0]], %[[C0]] : index
// UNROLL-BY-4-NEXT: %[[SUM_2:.*]] = arith.addi %[[SUM_1]], %[[C0]] : index
// UNROLL-BY-4-NEXT: %[[SUM_3:.*]] = arith.addi %[[SUM_2]], %[[C0]] : index
// UNROLL-BY-4-NEXT: affine.for %[[VAL_20:.*]] = [[$MAP7]](){{\[}}%[[ID]]] to 11 step 2 iter_args(%[[ARG:.*]] = %[[SUM_3]]) -> (index) {
// UNROLL-BY-4-NEXT: %[[SUM_4:.*]] = arith.addi %[[ARG]], %[[C0]] : index
// UNROLL-BY-4-NEXT: affine.yield %[[SUM_4]] : index
// UNROLL-BY-4-NEXT: }
return
}

// UNROLL-FULL: func @unroll_zero_trip_count_case
func.func @unroll_zero_trip_count_case() {
// CHECK-NEXT: affine.for %{{.*}} = 0 to 0
Expand Down