Commit 2278f5e
[AMDGPU] Hoist readlane/readfirstlane through unary/binary operands (#129037)
When a read(first)lane is used on a binary operator and the intrinsic is the only user of the operator, we can move the read(first)lane onto one operand if the other operand is uniform. Unfortunately, InstCombine doesn't let us access UniformityAnalysis, so we can't truly check uniformity; we have to make do with a basic check that only accepts constants or trivially uniform intrinsic calls. We can do the same for unary and cast operators.
1 parent d05854d commit 2278f5e
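As an illustration of the binary-operator case, here is a minimal hand-written before/after sketch in LLVM IR (not taken from the commit's tests; the function and value names are made up, and the constant 42 stands in for any trivially uniform operand):

declare i32 @llvm.amdgcn.readfirstlane.i32(i32)

; Input pattern: the readfirstlane is the only user of the add, and the
; add's other operand is a constant, i.e. trivially uniform.
define i32 @before(i32 %v) {
  %op = add i32 %v, 42
  %r = call i32 @llvm.amdgcn.readfirstlane.i32(i32 %op)
  ret i32 %r
}

; Expected shape after the hoist: the lane read moves onto the possibly
; divergent operand and the add now operates on uniform values only.
define i32 @after(i32 %v) {
  %v.uni = call i32 @llvm.amdgcn.readfirstlane.i32(i32 %v)
  %r = add i32 %v.uni, 42
  ret i32 %r
}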

File tree: 4 files changed, +1008 −25 lines


llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp

Lines changed: 98 additions & 25 deletions
@@ -18,6 +18,7 @@
 #include "AMDGPUTargetTransformInfo.h"
 #include "GCNSubtarget.h"
 #include "llvm/ADT/FloatingPointMode.h"
+#include "llvm/IR/Dominators.h"
 #include "llvm/IR/IntrinsicsAMDGPU.h"
 #include "llvm/Transforms/InstCombine/InstCombiner.h"
 #include <optional>
@@ -503,6 +504,98 @@ bool GCNTTIImpl::simplifyDemandedLaneMaskArg(InstCombiner &IC,
   return false;
 }
 
+static CallInst *rewriteCall(IRBuilderBase &B, CallInst &Old,
+                             Function &NewCallee, ArrayRef<Value *> Ops) {
+  SmallVector<OperandBundleDef, 2> OpBundles;
+  Old.getOperandBundlesAsDefs(OpBundles);
+
+  CallInst *NewCall = B.CreateCall(&NewCallee, Ops, OpBundles);
+  NewCall->takeName(&Old);
+  return NewCall;
+}
+
+Instruction *
+GCNTTIImpl::hoistLaneIntrinsicThroughOperand(InstCombiner &IC,
+                                             IntrinsicInst &II) const {
+  const auto IID = II.getIntrinsicID();
+  assert(IID == Intrinsic::amdgcn_readlane ||
+         IID == Intrinsic::amdgcn_readfirstlane ||
+         IID == Intrinsic::amdgcn_permlane64);
+
+  Instruction *OpInst = dyn_cast<Instruction>(II.getOperand(0));
+
+  // Only do this if both instructions are in the same block
+  // (so the exec mask won't change) and the readlane is the only user of its
+  // operand.
+  if (!OpInst || !OpInst->hasOneUser() || OpInst->getParent() != II.getParent())
+    return nullptr;
+
+  const bool IsReadLane = (IID == Intrinsic::amdgcn_readlane);
+
+  // If this is a readlane, check that the second operand is a constant, or is
+  // defined before OpInst so we know it's safe to move this intrinsic higher.
+  Value *LaneID = nullptr;
+  if (IsReadLane) {
+    LaneID = II.getOperand(1);
+
+    // readlane take an extra operand for the lane ID, so we must check if that
+    // LaneID value can be used at the point where we want to move the
+    // intrinsic.
+    if (auto *LaneIDInst = dyn_cast<Instruction>(LaneID)) {
+      if (!IC.getDominatorTree().dominates(LaneIDInst, OpInst))
+        return nullptr;
+    }
+  }
+
+  // Hoist the intrinsic (II) through OpInst.
+  //
+  // (II (OpInst x)) -> (OpInst (II x))
+  const auto DoIt = [&](unsigned OpIdx,
+                        Function *NewIntrinsic) -> Instruction * {
+    SmallVector<Value *, 2> Ops{OpInst->getOperand(OpIdx)};
+    if (IsReadLane)
+      Ops.push_back(LaneID);
+
+    // Rewrite the intrinsic call.
+    CallInst *NewII = rewriteCall(IC.Builder, II, *NewIntrinsic, Ops);
+
+    // Rewrite OpInst so it takes the result of the intrinsic now.
+    Instruction &NewOp = *OpInst->clone();
+    NewOp.setOperand(OpIdx, NewII);
+    return &NewOp;
+  };
+
+  // TODO(?): Should we do more with permlane64?
+  if (IID == Intrinsic::amdgcn_permlane64 && !isa<BitCastInst>(OpInst))
+    return nullptr;
+
+  if (isa<UnaryOperator>(OpInst))
+    return DoIt(0, II.getCalledFunction());
+
+  if (isa<CastInst>(OpInst)) {
+    Value *Src = OpInst->getOperand(0);
+    Type *SrcTy = Src->getType();
+    if (!isTypeLegal(SrcTy))
+      return nullptr;
+
+    Function *Remangled =
+        Intrinsic::getOrInsertDeclaration(II.getModule(), IID, {SrcTy});
+    return DoIt(0, Remangled);
+  }
+
+  // We can also hoist through binary operators if the other operand is uniform.
+  if (isa<BinaryOperator>(OpInst)) {
+    // FIXME: If we had access to UniformityInfo here we could just check
+    // if the operand is uniform.
+    if (isTriviallyUniform(OpInst->getOperandUse(0)))
+      return DoIt(1, II.getCalledFunction());
+    if (isTriviallyUniform(OpInst->getOperandUse(1)))
+      return DoIt(0, II.getCalledFunction());
+  }
+
+  return nullptr;
+}
+
 std::optional<Instruction *>
 GCNTTIImpl::instCombineIntrinsic(InstCombiner &IC, IntrinsicInst &II) const {
   Intrinsic::ID IID = II.getIntrinsicID();
@@ -1264,31 +1357,6 @@ GCNTTIImpl::instCombineIntrinsic(InstCombiner &IC, IntrinsicInst &II) const {
       simplifyDemandedLaneMaskArg(IC, II, 1))
     return &II;
 
-  // readfirstlane.ty0 (bitcast ty1 x to ty0) -> bitcast (readfirstlane.ty1)
-  if (auto *BC = dyn_cast<BitCastInst>(Src);
-      BC && BC->hasOneUse() && IID != Intrinsic::amdgcn_ds_bpermute) {
-    Value *BCSrc = BC->getOperand(0);
-
-    // TODO: Handle this for update_dpp, mov_ddp8, and all permlane variants.
-    if (isTypeLegal(BCSrc->getType())) {
-      Module *M = IC.Builder.GetInsertBlock()->getModule();
-      Function *Remangled =
-          Intrinsic::getOrInsertDeclaration(M, IID, {BCSrc->getType()});
-
-      // Make sure convergence tokens are preserved.
-      // TODO: CreateIntrinsic should allow directly copying bundles
-      SmallVector<OperandBundleDef, 2> OpBundles;
-      II.getOperandBundlesAsDefs(OpBundles);
-
-      SmallVector<Value *, 3> Args(II.args());
-      Args[0] = BCSrc;
-
-      CallInst *NewCall = IC.Builder.CreateCall(Remangled, Args, OpBundles);
-      NewCall->takeName(&II);
-      return new BitCastInst(NewCall, II.getType());
-    }
-  }
-
   // If the lane argument of bpermute is uniform, change it to readlane. This
   // generates better code and can enable further optimizations because
   // readlane is AlwaysUniform.
@@ -1305,6 +1373,11 @@ GCNTTIImpl::instCombineIntrinsic(InstCombiner &IC, IntrinsicInst &II) const {
     }
   }
 
+  if (IID != Intrinsic::amdgcn_ds_bpermute) {
+    if (Instruction *Res = hoistLaneIntrinsicThroughOperand(IC, II))
+      return Res;
+  }
+
   return std::nullopt;
 }
 case Intrinsic::amdgcn_writelane: {
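The bitcast-only fold removed above is subsumed by the generalized cast handling in hoistLaneIntrinsicThroughOperand, which remangles the intrinsic to the cast's source type. A hand-written sketch of the expected rewrite, assuming a float source bitcast to i32 (function and value names are illustrative, not from the commit's tests):

declare i32 @llvm.amdgcn.readfirstlane.i32(i32)
declare float @llvm.amdgcn.readfirstlane.f32(float)

; Input pattern: readfirstlane of a bitcast.
define i32 @before(float %x) {
  %bc = bitcast float %x to i32
  %r = call i32 @llvm.amdgcn.readfirstlane.i32(i32 %bc)
  ret i32 %r
}

; Expected shape after the hoist: the intrinsic is remangled to the
; cast's source type and the bitcast now consumes its result.
define i32 @after(float %x) {
  %x.uni = call float @llvm.amdgcn.readfirstlane.f32(float %x)
  %r = bitcast float %x.uni to i32
  ret i32 %r
}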

llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h

Lines changed: 3 additions & 0 deletions
@@ -224,6 +224,9 @@ class GCNTTIImpl final : public BasicTTIImplBase<GCNTTIImpl> {
                                     const APInt &DemandedElts,
                                     APInt &UndefElts) const;
 
+  Instruction *hoistLaneIntrinsicThroughOperand(InstCombiner &IC,
+                                                IntrinsicInst &II) const;
+
   std::optional<Value *> simplifyDemandedVectorEltsIntrinsic(
       InstCombiner &IC, IntrinsicInst &II, APInt DemandedElts, APInt &UndefElts,
       APInt &UndefElts2, APInt &UndefElts3,
