Skip to content

Commit 66f2817

Browse files
committed
Add permlane64 + cast instructions
1 parent 6355608 commit 66f2817

File tree

5 files changed

+427
-68
lines changed

5 files changed

+427
-68
lines changed

llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp

Lines changed: 36 additions & 44 deletions
Original file line numberDiff line numberDiff line change
@@ -482,8 +482,14 @@ bool GCNTTIImpl::simplifyDemandedLaneMaskArg(InstCombiner &IC,
482482
return false;
483483
}
484484

485-
Instruction *GCNTTIImpl::hoistReadLaneThroughOperand(InstCombiner &IC,
486-
IntrinsicInst &II) const {
485+
Instruction *
486+
GCNTTIImpl::hoistLaneIntrinsicThroughOperand(InstCombiner &IC,
487+
IntrinsicInst &II) const {
488+
const auto IID = II.getIntrinsicID();
489+
assert(IID == Intrinsic::amdgcn_readlane ||
490+
IID == Intrinsic::amdgcn_readfirstlane ||
491+
IID == Intrinsic::amdgcn_permlane64);
492+
487493
Instruction *Op = dyn_cast<Instruction>(II.getOperand(0));
488494

489495
// Only do this if both instructions are in the same block
@@ -492,7 +498,8 @@ Instruction *GCNTTIImpl::hoistReadLaneThroughOperand(InstCombiner &IC,
492498
if (!Op || !Op->hasOneUser() || Op->getParent() != II.getParent())
493499
return nullptr;
494500

495-
const bool IsReadLane = (II.getIntrinsicID() == Intrinsic::amdgcn_readlane);
501+
const bool IsReadLane = (IID == Intrinsic::amdgcn_readlane);
502+
const bool IsPermLane = (IID == Intrinsic::amdgcn_permlane64);
496503

497504
// If this is a readlane, check that the second operand is a constant, or is
498505
// defined before Op so we know it's safe to move this intrinsic higher.
@@ -505,7 +512,8 @@ Instruction *GCNTTIImpl::hoistReadLaneThroughOperand(InstCombiner &IC,
505512
return nullptr;
506513
}
507514

508-
const auto DoIt = [&](unsigned OpIdx) -> Instruction * {
515+
const auto DoIt = [&](unsigned OpIdx,
516+
Function *NewIntrinsic) -> Instruction * {
509517
SmallVector<Value *, 2> Ops{Op->getOperand(OpIdx)};
510518
if (IsReadLane)
511519
Ops.push_back(LaneID);
@@ -515,27 +523,40 @@ Instruction *GCNTTIImpl::hoistReadLaneThroughOperand(InstCombiner &IC,
515523
SmallVector<OperandBundleDef, 2> OpBundles;
516524
II.getOperandBundlesAsDefs(OpBundles);
517525

518-
CallInst *NewII =
519-
IC.Builder.CreateCall(II.getCalledFunction(), Ops, OpBundles);
526+
CallInst *NewII = IC.Builder.CreateCall(NewIntrinsic, Ops, OpBundles);
527+
NewII->takeName(&II);
520528

521529
Instruction &NewOp = *Op->clone();
522530
NewOp.setOperand(OpIdx, NewII);
523531
return &NewOp;
524532
};
525533

526-
// TODO: Are any operations more expensive on the SALU than VALU, and thus
527-
// need to be excluded here?
528-
529534
if (isa<UnaryOperator>(Op))
530-
return DoIt(0);
535+
return DoIt(0, II.getCalledFunction());
536+
537+
if (isa<CastInst>(Op)) {
538+
Value *Src = Op->getOperand(0);
539+
Type *SrcTy = Src->getType();
540+
if (!isTypeLegal(SrcTy))
541+
return nullptr;
542+
543+
Function *Remangled =
544+
Intrinsic::getOrInsertDeclaration(II.getModule(), IID, {SrcTy});
545+
return DoIt(0, Remangled);
546+
}
531547

532-
if (isa<BinaryOperator>(Op)) {
548+
// Don't hoist through a binary operator for permlane64. It doesn't
549+
// achieve anything and we'd need to repeat the call on every operand.
550+
//
551+
// We can do it for read(first)lane if other operands are already scalar
552+
// because then we don't need to repeat the call.
553+
if (!IsPermLane && isa<BinaryOperator>(Op)) {
533554
// FIXME: If we had access to UniformityInfo here we could just check
534555
// if the operand is uniform.
535556
if (isTriviallyUniform(Op->getOperandUse(0)))
536-
return DoIt(1);
557+
return DoIt(1, II.getCalledFunction());
537558
if (isTriviallyUniform(Op->getOperandUse(1)))
538-
return DoIt(0);
559+
return DoIt(0, II.getCalledFunction());
539560
}
540561

541562
return nullptr;
@@ -1188,37 +1209,8 @@ GCNTTIImpl::instCombineIntrinsic(InstCombiner &IC, IntrinsicInst &II) const {
11881209
simplifyDemandedLaneMaskArg(IC, II, 1))
11891210
return &II;
11901211

1191-
// readfirstlane.ty0 (bitcast ty1 x to ty0) -> bitcast (readfirstlane.ty1)
1192-
if (auto *BC = dyn_cast<BitCastInst>(Src); BC && BC->hasOneUse()) {
1193-
Value *BCSrc = BC->getOperand(0);
1194-
1195-
// TODO: Handle this for update_dpp, mov_dpp8, and all permlane variants.
1196-
if (isTypeLegal(BCSrc->getType())) {
1197-
Module *M = IC.Builder.GetInsertBlock()->getModule();
1198-
Function *Remangled =
1199-
Intrinsic::getOrInsertDeclaration(M, IID, {BCSrc->getType()});
1200-
1201-
// Make sure convergence tokens are preserved.
1202-
// TODO: CreateIntrinsic should allow directly copying bundles
1203-
SmallVector<OperandBundleDef, 2> OpBundles;
1204-
II.getOperandBundlesAsDefs(OpBundles);
1205-
1206-
SmallVector<Value *, 3> Args(II.args());
1207-
Args[0] = BCSrc;
1208-
1209-
CallInst *NewCall = IC.Builder.CreateCall(Remangled, Args, OpBundles);
1210-
NewCall->takeName(&II);
1211-
return new BitCastInst(NewCall, II.getType());
1212-
}
1213-
}
1214-
1215-
// If the readfirstlane reads the result of an operation that exists
1216-
// both in the SALU and VALU, we may be able to hoist it higher in order
1217-
// to scalarize the expression.
1218-
if (IID != Intrinsic::amdgcn_permlane64) {
1219-
if (Instruction *Res = hoistReadLaneThroughOperand(IC, II))
1220-
return Res;
1221-
}
1212+
if (Instruction *Res = hoistLaneIntrinsicThroughOperand(IC, II))
1213+
return Res;
12221214

12231215
return std::nullopt;
12241216
}

llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -224,8 +224,8 @@ class GCNTTIImpl final : public BasicTTIImplBase<GCNTTIImpl> {
224224
bool simplifyDemandedLaneMaskArg(InstCombiner &IC, IntrinsicInst &II,
225225
unsigned LaneAgIdx) const;
226226

227-
Instruction *hoistReadLaneThroughOperand(InstCombiner &IC,
228-
IntrinsicInst &II) const;
227+
Instruction *hoistLaneIntrinsicThroughOperand(InstCombiner &IC,
228+
IntrinsicInst &II) const;
229229

230230
std::optional<Instruction *> instCombineIntrinsic(InstCombiner &IC,
231231
IntrinsicInst &II) const;
Lines changed: 153 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,153 @@
1+
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
2+
; RUN: opt -mtriple=amdgcn-- -mcpu=gfx1030 -passes=instcombine -S < %s | FileCheck %s
3+
4+
; The readfirstlane version of this test covers all the interesting cases of the
5+
; shared logic. This testcase focuses on permlane64-specific pitfalls.
6+
7+
; test unary
8+
9+
define float @hoist_fneg_f32(float %arg) {
10+
; CHECK-LABEL: define float @hoist_fneg_f32(
11+
; CHECK-SAME: float [[ARG:%.*]]) #[[ATTR0:[0-9]+]] {
12+
; CHECK-NEXT: [[BB:.*:]]
13+
; CHECK-NEXT: [[TMP0:%.*]] = call float @llvm.amdgcn.readfirstlane.f32(float [[ARG]])
14+
; CHECK-NEXT: [[RFL:%.*]] = fneg float [[TMP0]]
15+
; CHECK-NEXT: ret float [[RFL]]
16+
;
17+
bb:
18+
%val = fneg float %arg
19+
%pl = call float @llvm.amdgcn.readfirstlane.f32(float %val)
20+
ret float %pl
21+
}
22+
23+
define double @hoist_fneg_f64(double %arg) {
24+
; CHECK-LABEL: define double @hoist_fneg_f64(
25+
; CHECK-SAME: double [[ARG:%.*]]) #[[ATTR0]] {
26+
; CHECK-NEXT: [[BB:.*:]]
27+
; CHECK-NEXT: [[TMP0:%.*]] = call double @llvm.amdgcn.readfirstlane.f64(double [[ARG]])
28+
; CHECK-NEXT: [[RFL:%.*]] = fneg double [[TMP0]]
29+
; CHECK-NEXT: ret double [[RFL]]
30+
;
31+
bb:
32+
%val = fneg double %arg
33+
%pl = call double @llvm.amdgcn.readfirstlane.f64(double %val)
34+
ret double %pl
35+
}
36+
37+
; test casts
38+
39+
define i32 @hoist_trunc(i64 %arg) {
40+
; CHECK-LABEL: define i32 @hoist_trunc(
41+
; CHECK-SAME: i64 [[ARG:%.*]]) #[[ATTR0]] {
42+
; CHECK-NEXT: [[BB:.*:]]
43+
; CHECK-NEXT: [[RFL:%.*]] = call i64 @llvm.amdgcn.readfirstlane.i64(i64 [[ARG]])
44+
; CHECK-NEXT: [[TMP0:%.*]] = trunc i64 [[RFL]] to i32
45+
; CHECK-NEXT: ret i32 [[TMP0]]
46+
;
47+
bb:
48+
%val = trunc i64 %arg to i32
49+
%pl = call i32 @llvm.amdgcn.readfirstlane.i32(i32 %val)
50+
ret i32 %pl
51+
}
52+
53+
define i64 @hoist_zext(i32 %arg) {
54+
; CHECK-LABEL: define i64 @hoist_zext(
55+
; CHECK-SAME: i32 [[ARG:%.*]]) #[[ATTR0]] {
56+
; CHECK-NEXT: [[BB:.*:]]
57+
; CHECK-NEXT: [[RFL:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[ARG]])
58+
; CHECK-NEXT: [[TMP0:%.*]] = zext i32 [[RFL]] to i64
59+
; CHECK-NEXT: ret i64 [[TMP0]]
60+
;
61+
bb:
62+
%val = zext i32 %arg to i64
63+
%pl = call i64 @llvm.amdgcn.readfirstlane.i64(i64 %val)
64+
ret i64 %pl
65+
}
66+
67+
; test binary i32
68+
69+
define i32 @hoist_add_i32(i32 %arg) {
70+
; CHECK-LABEL: define i32 @hoist_add_i32(
71+
; CHECK-SAME: i32 [[ARG:%.*]]) #[[ATTR0]] {
72+
; CHECK-NEXT: [[BB:.*:]]
73+
; CHECK-NEXT: [[VAL:%.*]] = add i32 [[ARG]], 16777215
74+
; CHECK-NEXT: [[RFL:%.*]] = call i32 @llvm.amdgcn.permlane64.i32(i32 [[VAL]])
75+
; CHECK-NEXT: ret i32 [[RFL]]
76+
;
77+
bb:
78+
%val = add i32 %arg, 16777215
79+
%pl = call i32 @llvm.amdgcn.permlane64.i32(i32 %val)
80+
ret i32 %pl
81+
}
82+
83+
define float @hoist_fadd_f32(float %arg) {
84+
; CHECK-LABEL: define float @hoist_fadd_f32(
85+
; CHECK-SAME: float [[ARG:%.*]]) #[[ATTR0]] {
86+
; CHECK-NEXT: [[BB:.*:]]
87+
; CHECK-NEXT: [[VAL:%.*]] = fadd float [[ARG]], 1.280000e+02
88+
; CHECK-NEXT: [[RFL:%.*]] = call float @llvm.amdgcn.permlane64.f32(float [[VAL]])
89+
; CHECK-NEXT: ret float [[RFL]]
90+
;
91+
bb:
92+
%val = fadd float %arg, 128.0
93+
%pl = call float @llvm.amdgcn.permlane64.f32(float %val)
94+
ret float %pl
95+
}
96+
97+
; test cases where hoisting isn't possible
98+
99+
define float @cross_block_hoisting(i1 %cond, float %arg) {
100+
; CHECK-LABEL: define float @cross_block_hoisting(
101+
; CHECK-SAME: i1 [[COND:%.*]], float [[ARG:%.*]]) #[[ATTR0]] {
102+
; CHECK-NEXT: [[BB:.*]]:
103+
; CHECK-NEXT: [[VAL:%.*]] = fneg float [[ARG]]
104+
; CHECK-NEXT: br i1 [[COND]], label %[[THEN:.*]], label %[[END:.*]]
105+
; CHECK: [[THEN]]:
106+
; CHECK-NEXT: [[RFL:%.*]] = call float @llvm.amdgcn.permlane64.f32(float [[VAL]])
107+
; CHECK-NEXT: br label %[[END]]
108+
; CHECK: [[END]]:
109+
; CHECK-NEXT: [[RES:%.*]] = phi float [ [[RFL]], %[[THEN]] ], [ [[VAL]], %[[BB]] ]
110+
; CHECK-NEXT: ret float [[RES]]
111+
;
112+
bb:
113+
%val = fneg float %arg
114+
br i1 %cond, label %then, label %end
115+
116+
then:
117+
%pl = call float @llvm.amdgcn.permlane64.f32(float %val)
118+
br label %end
119+
120+
end:
121+
%res = phi float [%pl, %then], [%val, %bb]
122+
ret float %res
123+
}
124+
125+
; test that convergence tokens are preserved
126+
127+
define float @hoist_preserves_convergence_token(i1 %cond, float %arg) convergent {
128+
; CHECK-LABEL: define float @hoist_preserves_convergence_token(
129+
; CHECK-SAME: i1 [[COND:%.*]], float [[ARG:%.*]]) #[[ATTR1:[0-9]+]] {
130+
; CHECK-NEXT: [[BB:.*]]:
131+
; CHECK-NEXT: [[ENTRY:%.*]] = call token @llvm.experimental.convergence.entry()
132+
; CHECK-NEXT: br i1 [[COND]], label %[[THEN:.*]], label %[[END:.*]]
133+
; CHECK: [[THEN]]:
134+
; CHECK-NEXT: [[RFL:%.*]] = call float @llvm.amdgcn.permlane64.f32(float [[ARG]]) [ "convergencectrl"(token [[ENTRY]]) ]
135+
; CHECK-NEXT: [[TMP0:%.*]] = fneg float [[RFL]]
136+
; CHECK-NEXT: br label %[[END]]
137+
; CHECK: [[END]]:
138+
; CHECK-NEXT: [[RES:%.*]] = phi float [ [[TMP0]], %[[THEN]] ], [ [[ARG]], %[[BB]] ]
139+
; CHECK-NEXT: ret float [[RES]]
140+
;
141+
bb:
142+
%entry = call token @llvm.experimental.convergence.entry()
143+
br i1 %cond, label %then, label %end
144+
145+
then:
146+
%val = fneg float %arg
147+
%pl = call float @llvm.amdgcn.permlane64.f32(float %val) [ "convergencectrl"(token %entry)]
148+
br label %end
149+
150+
end:
151+
%res = phi float [%pl, %then], [%arg, %bb]
152+
ret float %res
153+
}

0 commit comments

Comments
 (0)