[AMDGPU] Hoist readlane/readfirstlane through unary/binary operands #129037

Merged · 9 commits · May 13, 2025
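
In short: for a lane intrinsic II applied to a single-user, same-block instruction OpInst, the patch teaches InstCombine to rewrite (II (OpInst x)) into (OpInst (II x)), so the hoisted-over operation can run on scalar hardware. A minimal hand-written sketch of the binary-operator case (function names are illustrative, not taken from the PR's tests):

declare i32 @llvm.amdgcn.readfirstlane.i32(i32)

; Before: the add runs on the VALU, once per lane.
define i32 @before(i32 %x) {
  %op = add i32 %x, 42
  %r = call i32 @llvm.amdgcn.readfirstlane.i32(i32 %op)
  ret i32 %r
}

; After: the constant 42 is trivially uniform, so the intrinsic is hoisted
; through the add, which can then be selected as a scalar (SALU) instruction.
define i32 @after(i32 %x) {
  %r = call i32 @llvm.amdgcn.readfirstlane.i32(i32 %x)
  %op = add i32 %r, 42
  ret i32 %op
}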
llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp (98 additions, 25 deletions)
@@ -18,6 +18,7 @@
#include "AMDGPUTargetTransformInfo.h"
#include "GCNSubtarget.h"
#include "llvm/ADT/FloatingPointMode.h"
#include "llvm/IR/Dominators.h"
#include "llvm/IR/IntrinsicsAMDGPU.h"
#include "llvm/Transforms/InstCombine/InstCombiner.h"
#include <optional>
@@ -481,6 +482,98 @@ bool GCNTTIImpl::simplifyDemandedLaneMaskArg(InstCombiner &IC,
  return false;
}

static CallInst *rewriteCall(IRBuilderBase &B, CallInst &Old,
                             Function &NewCallee, ArrayRef<Value *> Ops) {
  SmallVector<OperandBundleDef, 2> OpBundles;
  Old.getOperandBundlesAsDefs(OpBundles);

  CallInst *NewCall = B.CreateCall(&NewCallee, Ops, OpBundles);
  NewCall->takeName(&Old);
  return NewCall;
}

Instruction *
GCNTTIImpl::hoistLaneIntrinsicThroughOperand(InstCombiner &IC,
                                             IntrinsicInst &II) const {
  const auto IID = II.getIntrinsicID();
  assert(IID == Intrinsic::amdgcn_readlane ||
         IID == Intrinsic::amdgcn_readfirstlane ||
         IID == Intrinsic::amdgcn_permlane64);

  Instruction *OpInst = dyn_cast<Instruction>(II.getOperand(0));

  // Only do this if both instructions are in the same block
  // (so the exec mask won't change) and the readlane is the only user of its
  // operand.
  if (!OpInst || !OpInst->hasOneUser() || OpInst->getParent() != II.getParent())
    return nullptr;

  const bool IsReadLane = (IID == Intrinsic::amdgcn_readlane);

  // If this is a readlane, check that the second operand is a constant, or is
  // defined before OpInst so we know it's safe to move this intrinsic higher.
  Value *LaneID = nullptr;
  if (IsReadLane) {
    LaneID = II.getOperand(1);

    // readlane takes an extra operand for the lane ID, so we must check that
    // the LaneID value can be used at the point where we want to move the
    // intrinsic.
    if (auto *LaneIDInst = dyn_cast<Instruction>(LaneID)) {
      if (!IC.getDominatorTree().dominates(LaneIDInst, OpInst))
        return nullptr;
    }
  }

  // Hoist the intrinsic (II) through OpInst.
  //
  // (II (OpInst x)) -> (OpInst (II x))
  const auto DoIt = [&](unsigned OpIdx,
                        Function *NewIntrinsic) -> Instruction * {
Review thread on lines +531 to +532:

Reviewer (Contributor): Move this to a properly named utility function; the same thing is already duplicated elsewhere.

Author (Contributor): Do you mean the CreateCall + operand bundle copy bit? Or the entire lambda? I only see one other place in the file that calls getOperandBundlesAsDefs.

Reviewer (Contributor): Yes. The clone is also suspicious; I'd expect a Create or a clone, not both.

Author (Contributor): Create is for the intrinsic call, because we may need to remangle it if we hoist through a cast. Clone is for the instruction that was previously the operand of the intrinsic. So if we have (readfirstlane (zext x)), readfirstlane is II (re-Created) and zext is OpInst (Cloned).
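
A hypothetical IR rendering of that example (not taken from the PR's tests) makes the two roles concrete:

declare i32 @llvm.amdgcn.readfirstlane.i32(i32)
declare i64 @llvm.amdgcn.readfirstlane.i64(i64)

define i64 @before(i32 %x) {
  %ext = zext i32 %x to i64                              ; OpInst
  %r = call i64 @llvm.amdgcn.readfirstlane.i64(i64 %ext) ; II
  ret i64 %r
}

define i64 @after(i32 %x) {
  ; II is re-Created from a remangled .i32 declaration ...
  %r = call i32 @llvm.amdgcn.readfirstlane.i32(i32 %x)
  ; ... and OpInst is Cloned to consume its result.
  %ext = zext i32 %r to i64
  ret i64 %ext
}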

    SmallVector<Value *, 2> Ops{OpInst->getOperand(OpIdx)};
    if (IsReadLane)
      Ops.push_back(LaneID);

    // Rewrite the intrinsic call.
    CallInst *NewII = rewriteCall(IC.Builder, II, *NewIntrinsic, Ops);

    // Rewrite OpInst so it takes the result of the intrinsic now.
    Instruction &NewOp = *OpInst->clone();
    NewOp.setOperand(OpIdx, NewII);
    return &NewOp;
  };

  // TODO(?): Should we do more with permlane64?
  if (IID == Intrinsic::amdgcn_permlane64 && !isa<BitCastInst>(OpInst))
    return nullptr;

  if (isa<UnaryOperator>(OpInst))
    return DoIt(0, II.getCalledFunction());

  if (isa<CastInst>(OpInst)) {
    Value *Src = OpInst->getOperand(0);
    Type *SrcTy = Src->getType();
    if (!isTypeLegal(SrcTy))
      return nullptr;

    Function *Remangled =
        Intrinsic::getOrInsertDeclaration(II.getModule(), IID, {SrcTy});
    return DoIt(0, Remangled);
  }

  // We can also hoist through binary operators if the other operand is uniform.
  if (isa<BinaryOperator>(OpInst)) {
    // FIXME: If we had access to UniformityInfo here we could just check
    // if the operand is uniform.
    if (isTriviallyUniform(OpInst->getOperandUse(0)))
      return DoIt(1, II.getCalledFunction());
    if (isTriviallyUniform(OpInst->getOperandUse(1)))
      return DoIt(0, II.getCalledFunction());
  }

  return nullptr;
}
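
Note the asymmetry for readlane: the rewrite conceptually moves the call up to OpInst's position, so the lane ID must already be available there, which is what the dominance check above enforces. A hypothetical sketch of a rejected case (@get.lane.id is a made-up placeholder for a non-constant lane source):

declare i32 @llvm.amdgcn.readlane.i32(i32, i32)
declare i32 @get.lane.id()

define i32 @not_hoisted(i32 %x) {
  %op = add i32 %x, 42                 ; OpInst
  %lane = call i32 @get.lane.id()      ; LaneID is defined after OpInst ...
  ; ... so it does not dominate OpInst, and the hoist is skipped.
  %r = call i32 @llvm.amdgcn.readlane.i32(i32 %op, i32 %lane)
  ret i32 %r
}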

std::optional<Instruction *>
GCNTTIImpl::instCombineIntrinsic(InstCombiner &IC, IntrinsicInst &II) const {
  Intrinsic::ID IID = II.getIntrinsicID();
Expand Down Expand Up @@ -1173,31 +1266,6 @@ GCNTTIImpl::instCombineIntrinsic(InstCombiner &IC, IntrinsicInst &II) const {
        simplifyDemandedLaneMaskArg(IC, II, 1))
      return &II;

    // readfirstlane.ty0 (bitcast ty1 x to ty0) -> bitcast (readfirstlane.ty1)
    if (auto *BC = dyn_cast<BitCastInst>(Src);
        BC && BC->hasOneUse() && IID != Intrinsic::amdgcn_ds_bpermute) {
      Value *BCSrc = BC->getOperand(0);

      // TODO: Handle this for update_dpp, mov_ddp8, and all permlane variants.
      if (isTypeLegal(BCSrc->getType())) {
        Module *M = IC.Builder.GetInsertBlock()->getModule();
        Function *Remangled =
            Intrinsic::getOrInsertDeclaration(M, IID, {BCSrc->getType()});

        // Make sure convergence tokens are preserved.
        // TODO: CreateIntrinsic should allow directly copying bundles
        SmallVector<OperandBundleDef, 2> OpBundles;
        II.getOperandBundlesAsDefs(OpBundles);

        SmallVector<Value *, 3> Args(II.args());
        Args[0] = BCSrc;

        CallInst *NewCall = IC.Builder.CreateCall(Remangled, Args, OpBundles);
        NewCall->takeName(&II);
        return new BitCastInst(NewCall, II.getType());
      }
    }

    // If the lane argument of bpermute is uniform, change it to readlane. This
    // generates better code and can enable further optimizations because
    // readlane is AlwaysUniform.
@@ -1214,6 +1282,11 @@ GCNTTIImpl::instCombineIntrinsic(InstCombiner &IC, IntrinsicInst &II) const {
      }
    }
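
For context on the existing bpermute combine mentioned in the comment above: ds_bpermute addresses lanes in bytes (index = 4 * lane), so with a uniform index the same value can be read with readlane. A rough hand-written sketch (not from the PR's tests):

declare i32 @llvm.amdgcn.ds.bpermute(i32, i32)
declare i32 @llvm.amdgcn.readlane.i32(i32, i32)

define i32 @before(i32 %x) {
  %v = call i32 @llvm.amdgcn.ds.bpermute(i32 20, i32 %x) ; byte index 20 = lane 5
  ret i32 %v
}

define i32 @after(i32 %x) {
  %v = call i32 @llvm.amdgcn.readlane.i32(i32 %x, i32 5) ; readlane is AlwaysUniform
  ret i32 %v
}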

    if (IID != Intrinsic::amdgcn_ds_bpermute) {
      if (Instruction *Res = hoistLaneIntrinsicThroughOperand(IC, II))
        return Res;
    }

    return std::nullopt;
  }
  case Intrinsic::amdgcn_writelane: {
llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h (3 additions, 0 deletions)
@@ -236,6 +236,9 @@ class GCNTTIImpl final : public BasicTTIImplBase<GCNTTIImpl> {
                                      const APInt &DemandedElts,
                                      APInt &UndefElts) const;

  Instruction *hoistLaneIntrinsicThroughOperand(InstCombiner &IC,
                                                IntrinsicInst &II) const;

  std::optional<Value *> simplifyDemandedVectorEltsIntrinsic(
      InstCombiner &IC, IntrinsicInst &II, APInt DemandedElts, APInt &UndefElts,
      APInt &UndefElts2, APInt &UndefElts3,