Skip to content

[mlir][affine] Use value bound inference to determine minimum/maximum trip counts in loop analysis #128113

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 8 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions mlir/include/mlir/Dialect/Affine/Analysis/LoopAnalysis.h
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,10 @@ void getTripCountMapAndOperands(AffineForOp forOp, AffineMap *map,
/// constant trip count in non-trivial cases.
std::optional<uint64_t> getConstantTripCount(AffineForOp forOp);

/// On a GPU, the trip count of a loop may differ from thread to thread.
/// This function returns the maximum trip count.
std::optional<uint64_t> getMaxConstantTripCount(AffineForOp forOp);

/// Returns the greatest known integral divisor of the trip count. Affine
/// expression analysis is used (indirectly through getTripCount), and
/// this method is thus able to determine non-trivial divisors.
Expand Down
3 changes: 3 additions & 0 deletions mlir/include/mlir/Dialect/Affine/LoopUtils.h
Original file line number Diff line number Diff line change
Expand Up @@ -86,6 +86,9 @@ LogicalResult loopUnrollJamUpToFactor(AffineForOp forOp,
/// was known to have a single iteration.
LogicalResult promoteIfSingleIteration(AffineForOp forOp);

/// Eliminate loops that will never actually execute.
LogicalResult removeInvalidLoop(AffineForOp forOp);

/// Promotes all single iteration AffineForOp's in the Function, i.e., moves
/// their body into the containing Block.
void promoteSingleIterationLoops(func::FuncOp f);
Expand Down
6 changes: 6 additions & 0 deletions mlir/include/mlir/Dialect/GPU/IR/GPUOps.td
Original file line number Diff line number Diff line change
Expand Up @@ -1035,6 +1035,12 @@ def GPU_LaunchOp : GPU_Op<"launch", [
static StringRef getNumWorkgroupAttributionsAttrName() {
return "workgroup_attributions";
}

/// Find BlockSize via the BlockArgument of gpu.launch.
Value getBlockSizeOnAxis(Value threadId);

/// Find BlockSize via the Dimension Information.
Value getBlockSizeOnAxis(Dimension dimension);
}];

let hasCanonicalizer = 1;
Expand Down
110 changes: 96 additions & 14 deletions mlir/lib/Dialect/Affine/Analysis/LoopAnalysis.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@
#include "mlir/Dialect/Affine/Analysis/NestedMatcher.h"
#include "mlir/Dialect/Affine/IR/AffineOps.h"
#include "mlir/Dialect/Affine/IR/AffineValueMap.h"
#include "mlir/Dialect/GPU/IR/GPUDialect.h"
#include "llvm/Support/MathExtras.h"

#include "llvm/ADT/DenseSet.h"
Expand Down Expand Up @@ -84,6 +85,67 @@ void mlir::affine::getTripCountMapAndOperands(
tripCountValueMap.getOperands().end());
}

/// Rewrites the symbol operands of a trip-count map that refer to a thread id
/// of the enclosing `gpu.launch`, so the map can be evaluated at an extreme
/// value of that thread id.
///
/// For every operand in `operands[numDim..]` (the symbol operands) that is
/// either a thread-id block argument of the launch or the result of a
/// `gpu.thread_id` op, the operand is swapped for the block size on the same
/// axis. The expression appended to `symReplacements` for that symbol is:
///   * `symbol - 1` (the maximum thread id), by default;
///   * the constant `0` (the minimum thread id), if `replaceWithZero` is true.
///
/// Exactly one replacement is appended per symbol operand, in symbol order;
/// operands that are not thread ids get the identity symbol expression. This
/// keeps `symReplacements[k]` aligned with symbol `k`, which is how
/// `AffineMap::replaceDimsAndSymbols` indexes the list. Appending only for
/// rewritten operands would apply a replacement to the wrong symbol whenever
/// an untouched operand precedes a thread id.
///
/// Nothing is changed when `forOp` is not nested inside a `gpu.launch`.
static void replaceGPUOperands(AffineForOp forOp,
                               SmallVectorImpl<Value> &operands,
                               SmallVectorImpl<AffineExpr> &symReplacements,
                               unsigned numDim, bool replaceWithZero = false) {
  auto launchOp = forOp->getParentOfType<gpu::LaunchOp>();
  if (!launchOp)
    return;

  // `b` is only used to create `AffineExpr`.
  Builder b(forOp.getContext());

  for (unsigned i = numDim, e = operands.size(); i < e; ++i) {
    // Position of this operand among the symbols of the map.
    unsigned symPos = i - numDim;
    Value operand = operands[i];

    // First try the thread-id block arguments of the launch, then fall back
    // to an explicit `gpu.thread_id` op.
    Value blockSize = launchOp.getBlockSizeOnAxis(operand);
    if (!blockSize) {
      if (Operation *defOp = operand.getDefiningOp()) {
        if (auto threadIdOp = mlir::dyn_cast<gpu::ThreadIdOp>(defOp))
          blockSize = launchOp.getBlockSizeOnAxis(threadIdOp.getDimension());
      }
    }

    // Not a thread id: keep the symbol as is.
    if (!blockSize) {
      symReplacements.push_back(b.getAffineSymbolExpr(symPos));
      continue;
    }

    operands[i] = blockSize;
    if (replaceWithZero)
      symReplacements.push_back(b.getAffineConstantExpr(0));
    else
      symReplacements.push_back(b.getAffineSymbolExpr(symPos) - 1);
  }
}

/// Folds the results of `map` into a single constant trip count, namely the
/// smallest of the result values. Returns std::nullopt as soon as a result is
/// not an AffineConstantExpr, and also when `map` has no results at all.
static std::optional<uint64_t>
getConstantTripCountFromAffineMap(AffineMap map) {
  std::optional<uint64_t> smallest;
  for (AffineExpr expr : map.getResults()) {
    auto cst = dyn_cast<AffineConstantExpr>(expr);
    // A single non-constant result makes the whole trip count unknown.
    if (!cst)
      return std::nullopt;
    const uint64_t value = static_cast<uint64_t>(cst.getValue());
    smallest = smallest ? std::min(*smallest, value) : value;
  }
  return smallest;
}

/// Returns the trip count of the loop if it's a constant, std::nullopt
/// otherwise. This method uses affine expression analysis (in turn using
/// getTripCount) and is able to determine constant trip count in non-trivial
Expand All @@ -95,20 +157,34 @@ std::optional<uint64_t> mlir::affine::getConstantTripCount(AffineForOp forOp) {

if (!map)
return std::nullopt;
SmallVector<AffineExpr, 4> symReplacements;
replaceGPUOperands(forOp, operands, symReplacements, map.getNumDims());
map = map.replaceDimsAndSymbols({}, symReplacements, map.getNumDims(),
map.getNumSymbols());
affine::AffineValueMap valueMap(map, operands);
(void)valueMap.canonicalize();
map = valueMap.getAffineMap();
return getConstantTripCountFromAffineMap(map);
}

// Take the min if all trip counts are constant.
std::optional<uint64_t> tripCount;
for (auto resultExpr : map.getResults()) {
if (auto constExpr = dyn_cast<AffineConstantExpr>(resultExpr)) {
if (tripCount.has_value())
tripCount =
std::min(*tripCount, static_cast<uint64_t>(constExpr.getValue()));
else
tripCount = constExpr.getValue();
} else
return std::nullopt;
}
return tripCount;
/// In some scenarios, such as on a GPU, the trip count of a loop can differ
/// from thread to thread. This function returns the maximum trip count, or
/// std::nullopt when the trip-count map is null or does not fold to constants.
/// It is computed from the trip-count map after `replaceGPUOperands` is asked
/// to substitute zero for thread ids.
std::optional<uint64_t>
mlir::affine::getMaxConstantTripCount(AffineForOp forOp) {
  AffineMap tripCountMap;
  SmallVector<Value, 4> tripCountOperands;
  getTripCountMapAndOperands(forOp, &tripCountMap, &tripCountOperands);
  if (!tripCountMap)
    return std::nullopt;

  const unsigned numDims = tripCountMap.getNumDims();
  const unsigned numSyms = tripCountMap.getNumSymbols();

  SmallVector<AffineExpr, 4> replacements;
  replaceGPUOperands(forOp, tripCountOperands, replacements, numDims,
                     /*replaceWithZero=*/true);
  AffineMap rewritten =
      tripCountMap.replaceDimsAndSymbols({}, replacements, numDims, numSyms);

  // Canonicalize so that results that are constant after the substitution
  // actually show up as constant expressions.
  affine::AffineValueMap valueMap(rewritten, tripCountOperands);
  (void)valueMap.canonicalize();
  return getConstantTripCountFromAffineMap(valueMap.getAffineMap());
}

/// Returns the greatest known integral divisor of the trip count. Affine
Expand All @@ -121,7 +197,13 @@ uint64_t mlir::affine::getLargestDivisorOfTripCount(AffineForOp forOp) {

if (!map)
return 1;

SmallVector<AffineExpr, 4> symReplacements;
replaceGPUOperands(forOp, operands, symReplacements, map.getNumDims());
map = map.replaceDimsAndSymbols({}, symReplacements, map.getNumDims(),
map.getNumSymbols());
affine::AffineValueMap valueMap(map, operands);
(void)valueMap.canonicalize();
map = valueMap.getAffineMap();
// The largest divisor of the trip count is the GCD of the individual largest
// divisors.
assert(map.getNumResults() >= 1 && "expected one or more results");
Expand Down
57 changes: 46 additions & 11 deletions mlir/lib/Dialect/Affine/Utils/LoopUtils.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@
#include "mlir/Dialect/Affine/IR/AffineValueMap.h"
#include "mlir/Dialect/Affine/Utils.h"
#include "mlir/Dialect/Func/IR/FuncOps.h"
#include "mlir/Dialect/GPU/IR/GPUDialect.h"
#include "mlir/Dialect/MemRef/IR/MemRef.h"
#include "mlir/Dialect/SCF/IR/SCF.h"
#include "mlir/IR/IRMapping.h"
Expand Down Expand Up @@ -113,11 +114,29 @@ static void replaceIterArgsAndYieldResults(AffineForOp forOp) {
std::get<0>(e).replaceAllUsesWith(std::get<1>(e));
}

/// Erases `forOp` when it is known never to execute, i.e. when both
/// getConstantTripCount and getMaxConstantTripCount report a constant trip
/// count of zero. Uses of the loop results are redirected to the
/// corresponding init operands before the op is erased. Returns failure, and
/// leaves the IR untouched, in every other case.
LogicalResult mlir::affine::removeInvalidLoop(AffineForOp forOp) {
  std::optional<uint64_t> constTrips = getConstantTripCount(forOp);
  std::optional<uint64_t> maxTrips = getMaxConstantTripCount(forOp);

  const bool neverExecutes =
      constTrips && *constTrips == 0 && maxTrips && *maxTrips == 0;
  if (!neverExecutes)
    return failure();

  // A loop that runs zero times yields its initial iter_args values.
  for (auto [loopResult, initValue] :
       llvm::zip(forOp.getResults(), forOp.getInits()))
    loopResult.replaceAllUsesWith(initValue);

  IRRewriter rewriter(forOp);
  rewriter.eraseOp(forOp);
  return success();
}

/// Promotes the loop body of a forOp to its containing block if the forOp
/// was known to have a single iteration.
LogicalResult mlir::affine::promoteIfSingleIteration(AffineForOp forOp) {
std::optional<uint64_t> tripCount = getConstantTripCount(forOp);
if (!tripCount || *tripCount != 1)
std::optional<uint64_t> maxTripCount = getMaxConstantTripCount(forOp);
if (!tripCount || *tripCount != 1 || !maxTripCount || *maxTripCount != 1)
return failure();

// TODO: extend this for arbitrary affine bounds.
Expand Down Expand Up @@ -160,7 +179,8 @@ LogicalResult mlir::affine::promoteIfSingleIteration(AffineForOp forOp) {
forOp.getBody()->back().erase();
parentBlock->getOperations().splice(Block::iterator(forOp),
forOp.getBody()->getOperations());
forOp.erase();
IRRewriter b(forOp.getContext());
b.eraseOp(forOp);
return success();
}

Expand Down Expand Up @@ -884,15 +904,27 @@ void mlir::affine::getTileableBands(
/// Unrolls this loop completely.
LogicalResult mlir::affine::loopUnrollFull(AffineForOp forOp) {
std::optional<uint64_t> mayBeConstantTripCount = getConstantTripCount(forOp);
if (mayBeConstantTripCount.has_value()) {
uint64_t tripCount = *mayBeConstantTripCount;
if (tripCount == 0)
return success();
if (tripCount == 1)
return promoteIfSingleIteration(forOp);
return loopUnrollByFactor(forOp, tripCount);
}
return failure();
std::optional<uint64_t> maxMayBeConstantTripCount =
getMaxConstantTripCount(forOp);

if (!mayBeConstantTripCount.has_value() &&
!maxMayBeConstantTripCount.has_value())
return failure();

uint64_t tripCount = *mayBeConstantTripCount;
uint64_t maxTripCount = *maxMayBeConstantTripCount;

// The values of Trip are all 0, and the invalid loop is deleted.
if (tripCount <= 0 && maxTripCount <= 0)
return removeInvalidLoop(forOp);

// In special cases, such as in a GPU, only some threads execute this loop.
if (tripCount == 0 && maxTripCount == 1)
return success();

if (tripCount == 1 && maxTripCount == 1)
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

If the trip count is known to be one, how can the max trip count be anything other than one?!

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Maybe maxTripCount will be equal to 2.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Why would getConstantMaxTripCount return a value different from the constant trip count when the trip count is known to be so? It shouldn't - otherwise, it's trivially loose.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

You are obviously talking about the CPU, which is indeed constant, but for hardware like GPU, threadId is a dynamic thing. The smallest threadid is 0, and the largest threadid is blocksize -1. The value of (upper - thread) / step is obviously not constant.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Could you please look at the comments below, I'm wondering if affine-loop-unroll is not a pattern pass causing this issue (if you have the time. I'll continue to work on it.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

If it can run it will definitely be a huge improvement, it's really exciting.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

A lot of this confusion would be cleared up if tripCount were minTripCount

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'm not sure it would. The semantics of affine loops is to take a minimum of values produced by each individual expression in the upper bound, it's unclear to me why we would need to reason about the upper bound.

return promoteIfSingleIteration(forOp);
return loopUnrollByFactor(forOp, tripCount);
}

/// Unrolls this loop by the specified factor or by the trip count (if constant)
Expand Down Expand Up @@ -1013,8 +1045,11 @@ LogicalResult mlir::affine::loopUnrollByFactor(
assert(unrollFactor > 0 && "unroll factor should be positive");

std::optional<uint64_t> mayBeConstantTripCount = getConstantTripCount(forOp);
std::optional<uint64_t> maxMayBeConstantTripCount =
getMaxConstantTripCount(forOp);
if (unrollFactor == 1) {
if (mayBeConstantTripCount && *mayBeConstantTripCount == 1 &&
maxMayBeConstantTripCount && *maxMayBeConstantTripCount == 1 &&
failed(promoteIfSingleIteration(forOp)))
return failure();
return success();
Expand Down
20 changes: 20 additions & 0 deletions mlir/lib/Dialect/GPU/IR/GPUDialect.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -799,6 +799,26 @@ std::optional<KernelDim3> LaunchOp::getClusterSizeOperandValues() {
return KernelDim3{operands[6], operands[7], operands[8]};
}

/// Returns the block size of this launch along `dimension`. Any dimension
/// other than x or y maps to the z block size.
Value LaunchOp::getBlockSizeOnAxis(Dimension dimension) {
  switch (dimension) {
  case Dimension::x:
    return getBlockSizeX();
  case Dimension::y:
    return getBlockSizeY();
  default:
    return getBlockSizeZ();
  }
}

/// Returns the block size on the axis whose thread id (as reported by
/// getThreadIds()) is `threadId`, or a null Value when `threadId` is not one
/// of this launch's thread ids.
Value LaunchOp::getBlockSizeOnAxis(Value threadId) {
  const KernelDim3 ids = getThreadIds();
  if (threadId == ids.x)
    return getBlockSizeX();
  if (threadId == ids.y)
    return getBlockSizeY();
  if (threadId == ids.z)
    return getBlockSizeZ();
  // Not a thread id of this launch.
  return Value();
}

LogicalResult LaunchOp::verify() {
if (!(hasClusterSize()) &&
(getClusterSizeX() || getClusterSizeY() || getClusterSizeZ()))
Expand Down
Loading
Loading