Skip to content

Commit f142f8a

Browse files
authored
[AMDGPU] Improve uniform argument handling in InstCombineIntrinsic (#105812)
Unify the handling of intrinsics that are a no-op on uniform arguments. This catches a couple of new cases: (1) readlane (readlane x, y), z -> readlane x, y (for any z; it does not have to equal y); and (2) permlane64 (readfirstlane x) -> readfirstlane x (and likewise for any other uniform argument to permlane64).
1 parent c9b6339 commit f142f8a

File tree

2 files changed

+36
-40
lines changed

2 files changed

+36
-40
lines changed

llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp

Lines changed: 19 additions & 38 deletions
Original file line numberDiff line numberDiff line change
@@ -440,6 +440,21 @@ static bool canContractSqrtToRsq(const FPMathOperator *SqrtOp) {
440440
SqrtOp->getType()->isHalfTy();
441441
}
442442

443+
/// Return true if we can easily prove that use U is uniform.
444+
static bool isTriviallyUniform(const Use &U) {
445+
Value *V = U.get();
446+
if (isa<Constant>(V))
447+
return true;
448+
if (const auto *II = dyn_cast<IntrinsicInst>(V)) {
449+
if (!AMDGPU::isIntrinsicAlwaysUniform(II->getIntrinsicID()))
450+
return false;
451+
// If II and U are in different blocks then there is a possibility of
452+
// temporal divergence.
453+
return II->getParent() == cast<Instruction>(U.getUser())->getParent();
454+
}
455+
return false;
456+
}
457+
443458
std::optional<Instruction *>
444459
GCNTTIImpl::instCombineIntrinsic(InstCombiner &IC, IntrinsicInst &II) const {
445460
Intrinsic::ID IID = II.getIntrinsicID();
@@ -1060,46 +1075,12 @@ GCNTTIImpl::instCombineIntrinsic(InstCombiner &IC, IntrinsicInst &II) const {
10601075
return IC.replaceOperand(II, 0, UndefValue::get(VDstIn->getType()));
10611076
}
10621077
case Intrinsic::amdgcn_permlane64:
1063-
// A constant value is trivially uniform.
1064-
if (Constant *C = dyn_cast<Constant>(II.getArgOperand(0))) {
1065-
return IC.replaceInstUsesWith(II, C);
1066-
}
1067-
break;
10681078
case Intrinsic::amdgcn_readfirstlane:
10691079
case Intrinsic::amdgcn_readlane: {
1070-
// A constant value is trivially uniform.
1071-
if (Constant *C = dyn_cast<Constant>(II.getArgOperand(0))) {
1072-
return IC.replaceInstUsesWith(II, C);
1073-
}
1074-
1075-
// The rest of these may not be safe if the exec may not be the same between
1076-
// the def and use.
1077-
Value *Src = II.getArgOperand(0);
1078-
Instruction *SrcInst = dyn_cast<Instruction>(Src);
1079-
if (SrcInst && SrcInst->getParent() != II.getParent())
1080-
break;
1081-
1082-
// readfirstlane (readfirstlane x) -> readfirstlane x
1083-
// readlane (readfirstlane x), y -> readfirstlane x
1084-
if (match(Src,
1085-
PatternMatch::m_Intrinsic<Intrinsic::amdgcn_readfirstlane>())) {
1086-
return IC.replaceInstUsesWith(II, Src);
1087-
}
1088-
1089-
if (IID == Intrinsic::amdgcn_readfirstlane) {
1090-
// readfirstlane (readlane x, y) -> readlane x, y
1091-
if (match(Src, PatternMatch::m_Intrinsic<Intrinsic::amdgcn_readlane>())) {
1092-
return IC.replaceInstUsesWith(II, Src);
1093-
}
1094-
} else {
1095-
// readlane (readlane x, y), y -> readlane x, y
1096-
if (match(Src, PatternMatch::m_Intrinsic<Intrinsic::amdgcn_readlane>(
1097-
PatternMatch::m_Value(),
1098-
PatternMatch::m_Specific(II.getArgOperand(1))))) {
1099-
return IC.replaceInstUsesWith(II, Src);
1100-
}
1101-
}
1102-
1080+
// If the first argument is uniform these intrinsics return it unchanged.
1081+
const Use &Src = II.getArgOperandUse(0);
1082+
if (isTriviallyUniform(Src))
1083+
return IC.replaceInstUsesWith(II, Src.get());
11031084
break;
11041085
}
11051086
case Intrinsic::amdgcn_trig_preop: {

llvm/test/Transforms/InstCombine/AMDGPU/amdgcn-intrinsics.ll

Lines changed: 17 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -2888,8 +2888,7 @@ define i32 @readlane_idempotent(i32 %arg, i32 %lane) {
28882888
define i32 @readlane_idempotent_different_lanes(i32 %arg, i32 %lane0, i32 %lane1) {
28892889
; CHECK-LABEL: @readlane_idempotent_different_lanes(
28902890
; CHECK-NEXT: [[READ0:%.*]] = call i32 @llvm.amdgcn.readlane.i32(i32 [[ARG:%.*]], i32 [[LANE0:%.*]])
2891-
; CHECK-NEXT: [[READ1:%.*]] = call i32 @llvm.amdgcn.readlane.i32(i32 [[READ0]], i32 [[LANE1:%.*]])
2892-
; CHECK-NEXT: ret i32 [[READ1]]
2891+
; CHECK-NEXT: ret i32 [[READ0]]
28932892
;
28942893
%read0 = call i32 @llvm.amdgcn.readlane(i32 %arg, i32 %lane0)
28952894
%read1 = call i32 @llvm.amdgcn.readlane(i32 %read0, i32 %lane1)
@@ -3061,6 +3060,22 @@ define amdgpu_kernel void @permlanex16_fetch_invalid_bound_ctrl(ptr addrspace(1)
30613060
ret void
30623061
}
30633062

3063+
; --------------------------------------------------------------------
3064+
; llvm.amdgcn.permlane64
3065+
; --------------------------------------------------------------------
3066+
3067+
define amdgpu_kernel void @permlane64_uniform(ptr addrspace(1) %out, i32 %src0) {
3068+
; CHECK-LABEL: @permlane64_uniform(
3069+
; CHECK-NEXT: [[SRC1:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[SRC0:%.*]])
3070+
; CHECK-NEXT: store i32 [[SRC1]], ptr addrspace(1) [[OUT:%.*]], align 4
3071+
; CHECK-NEXT: ret void
3072+
;
3073+
%src1 = call i32 @llvm.amdgcn.readfirstlane(i32 %src0)
3074+
%res = call i32 @llvm.amdgcn.permlane64(i32 %src1)
3075+
store i32 %res, ptr addrspace(1) %out
3076+
ret void
3077+
}
3078+
30643079
; --------------------------------------------------------------------
30653080
; llvm.amdgcn.image.sample a16
30663081
; --------------------------------------------------------------------

0 commit comments

Comments
 (0)