Commit c36d5fd

Revert "Revert "AMDGPU: Fix handling of infinite loops in fragment shaders""
This reverts commit b6c8ff9.
Parent: 1d901ab

2 files changed: +141, -6 lines

llvm/lib/Target/AMDGPU/AMDGPUUnifyDivergentExitNodes.cpp

Lines changed: 73 additions & 6 deletions
@@ -34,6 +34,7 @@
 #include "llvm/IR/InstrTypes.h"
 #include "llvm/IR/Instructions.h"
 #include "llvm/IR/Intrinsics.h"
+#include "llvm/IR/IRBuilder.h"
 #include "llvm/IR/Type.h"
 #include "llvm/InitializePasses.h"
 #include "llvm/Pass.h"
@@ -117,24 +118,58 @@ static bool isUniformlyReached(const LegacyDivergenceAnalysis &DA,
   return true;
 }
 
+static void removeDoneExport(Function &F) {
+  ConstantInt *BoolFalse = ConstantInt::getFalse(F.getContext());
+  for (BasicBlock &BB : F) {
+    for (Instruction &I : BB) {
+      if (IntrinsicInst *Intrin = llvm::dyn_cast<IntrinsicInst>(&I)) {
+        if (Intrin->getIntrinsicID() == Intrinsic::amdgcn_exp) {
+          Intrin->setArgOperand(6, BoolFalse); // done
+        } else if (Intrin->getIntrinsicID() == Intrinsic::amdgcn_exp_compr) {
+          Intrin->setArgOperand(4, BoolFalse); // done
+        }
+      }
+    }
+  }
+}
+
 static BasicBlock *unifyReturnBlockSet(Function &F,
                                        ArrayRef<BasicBlock *> ReturningBlocks,
+                                       bool InsertExport,
                                        const TargetTransformInfo &TTI,
                                        StringRef Name) {
   // Otherwise, we need to insert a new basic block into the function, add a PHI
   // nodes (if the function returns values), and convert all of the return
   // instructions into unconditional branches.
   BasicBlock *NewRetBlock = BasicBlock::Create(F.getContext(), Name, &F);
+  IRBuilder<> B(NewRetBlock);
+
+  if (InsertExport) {
+    // Ensure that there's only one "done" export in the shader by removing the
+    // "done" bit set on the original final export. More than one "done" export
+    // can lead to undefined behavior.
+    removeDoneExport(F);
+
+    Value *Undef = UndefValue::get(B.getFloatTy());
+    B.CreateIntrinsic(Intrinsic::amdgcn_exp, { B.getFloatTy() },
+                      {
+                        B.getInt32(9), // target, SQ_EXP_NULL
+                        B.getInt32(0), // enabled channels
+                        Undef, Undef, Undef, Undef, // values
+                        B.getTrue(), // done
+                        B.getTrue(), // valid mask
+                      });
+  }
 
   PHINode *PN = nullptr;
   if (F.getReturnType()->isVoidTy()) {
-    ReturnInst::Create(F.getContext(), nullptr, NewRetBlock);
+    B.CreateRetVoid();
   } else {
     // If the function doesn't return void... add a PHI node to the block...
-    PN = PHINode::Create(F.getReturnType(), ReturningBlocks.size(),
-                         "UnifiedRetVal");
-    NewRetBlock->getInstList().push_back(PN);
-    ReturnInst::Create(F.getContext(), PN, NewRetBlock);
+    PN = B.CreatePHI(F.getReturnType(), ReturningBlocks.size(),
+                     "UnifiedRetVal");
+    assert(!InsertExport);
+    B.CreateRet(PN);
  }
 
   // Loop over all of the blocks, replacing the return instruction with an
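
A minimal sketch (not part of the diff): assuming a void AMDGPU_PS shader for which InsertExport ends up true, the IRBuilder calls above would leave the unified return block looking roughly like the IR below. Target 9 is SQ_EXP_NULL, no channels are enabled, the values are undef, and the two trailing i1 true operands are the "done" and valid-mask flags.

; illustration only, not emitted verbatim by the commit
UnifiedReturnBlock:
  call void @llvm.amdgcn.exp.f32(i32 9, i32 0, float undef, float undef, float undef, float undef, i1 true, i1 true)
  ret void
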
@@ -173,6 +208,8 @@ bool AMDGPUUnifyDivergentExitNodes::runOnFunction(Function &F) {
   // Dummy return block for infinite loop.
   BasicBlock *DummyReturnBB = nullptr;
 
+  bool InsertExport = false;
+
   for (BasicBlock *BB : PDT.getRoots()) {
     if (isa<ReturnInst>(BB->getTerminator())) {
       if (!isUniformlyReached(DA, *BB))
@@ -188,6 +225,36 @@ bool AMDGPUUnifyDivergentExitNodes::runOnFunction(Function &F) {
                                            "DummyReturnBlock", &F);
         Type *RetTy = F.getReturnType();
         Value *RetVal = RetTy->isVoidTy() ? nullptr : UndefValue::get(RetTy);
+
+        // For pixel shaders, the producer guarantees that an export is
+        // executed before each return instruction. However, if there is an
+        // infinite loop and we insert a return ourselves, we need to uphold
+        // that guarantee by inserting a null export. This can happen e.g. in
+        // an infinite loop with kill instructions, which is supposed to
+        // terminate. However, we don't need to do this if there is a non-void
+        // return value, since then there is an epilog afterwards which will
+        // still export.
+        //
+        // Note: In the case where only some threads enter the infinite loop,
+        // this can result in the null export happening redundantly after the
+        // original exports. However, the last "real" export happens after all
+        // the threads that didn't enter an infinite loop converged, which
+        // means that the only extra threads to execute the null export are
+        // threads that entered the infinite loop, and they could only have
+        // exited through being killed, which sets their exec bit to 0.
+        // Therefore, unless there's an actual infinite loop, which can have
+        // invalid results, or there's a kill after the last export, which we
+        // assume the frontend won't do, this export will have the same exec
+        // mask as the last "real" export, and therefore the valid mask will be
+        // overwritten with the same value and will still be correct. Also,
+        // even though this forces an extra unnecessary export wait, we assume
+        // that this happens rarely enough in practice that we don't have to
+        // worry about performance.
+        if (F.getCallingConv() == CallingConv::AMDGPU_PS &&
+            RetTy->isVoidTy()) {
+          InsertExport = true;
+        }
+
         ReturnInst::Create(F.getContext(), RetVal, DummyReturnBB);
         ReturningBlocks.push_back(DummyReturnBB);
       }
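
As an aside (not part of the diff), the interplay the note above describes between the last "real" export and the new null export can be seen at the IR level: removeDoneExport only clears the "done" operand of existing export intrinsics, so a shader's final color export such as

  call void @llvm.amdgcn.exp.f32(i32 0, i32 15, float %r, float %g, float %b, float %a, i1 true, i1 true)

would be rewritten to

  call void @llvm.amdgcn.exp.f32(i32 0, i32 15, float %r, float %g, float %b, float %a, i1 false, i1 true)

(the %r..%a names are placeholders for whatever the shader exports), leaving the null export inserted into the unified return block as the only export carrying the "done" bit.
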
@@ -260,6 +327,6 @@ bool AMDGPUUnifyDivergentExitNodes::runOnFunction(Function &F) {
   const TargetTransformInfo &TTI
     = getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
 
-  unifyReturnBlockSet(F, ReturningBlocks, TTI, "UnifiedReturnBlock");
+  unifyReturnBlockSet(F, ReturningBlocks, InsertExport, TTI, "UnifiedReturnBlock");
   return true;
 }
Lines changed: 68 additions & 0 deletions
@@ -0,0 +1,68 @@
+; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -enable-var-scope %s
+; Although it's modeled without any control flow in order to get better code
+; out of the structurizer, @llvm.amdgcn.kill actually ends the thread that calls
+; it with "true". In case it's called in a provably infinite loop, we still
+; need to successfully exit and export something, even if we can't know where
+; to jump to in the LLVM IR. Therefore we insert a null export ourselves in
+; this case right before the s_endpgm to avoid GPU hangs, which is what this
+; tests.
+
+; CHECK-LABEL: return_void
+; Make sure that we remove the done bit from the original export
+; CHECK: exp mrt0 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} vm
+; CHECK: exp null off, off, off, off done vm
+; CHECK-NEXT: s_endpgm
+define amdgpu_ps void @return_void(float %0) #0 {
+main_body:
+  %cmp = fcmp olt float %0, 1.000000e+01
+  br i1 %cmp, label %end, label %loop
+
+loop:
+  call void @llvm.amdgcn.kill(i1 false) #3
+  br label %loop
+
+end:
+  call void @llvm.amdgcn.exp.f32(i32 0, i32 15, float 0., float 0., float 0., float 1., i1 true, i1 true) #3
+  ret void
+}
+
+; Check that we also remove the done bit from compressed exports correctly.
+; CHECK-LABEL: return_void_compr
+; CHECK: exp mrt0 v{{[0-9]+}}, off, v{{[0-9]+}}, off compr vm
+; CHECK: exp null off, off, off, off done vm
+; CHECK-NEXT: s_endpgm
+define amdgpu_ps void @return_void_compr(float %0) #0 {
+main_body:
+  %cmp = fcmp olt float %0, 1.000000e+01
+  br i1 %cmp, label %end, label %loop
+
+loop:
+  call void @llvm.amdgcn.kill(i1 false) #3
+  br label %loop
+
+end:
+  call void @llvm.amdgcn.exp.compr.v2i16(i32 0, i32 5, <2 x i16> < i16 0, i16 0 >, <2 x i16> < i16 0, i16 0 >, i1 true, i1 true) #3
+  ret void
+}
+
+; In case there's an epilog, we shouldn't have to do this.
+; CHECK-LABEL: return_nonvoid
+; CHECK-NOT: exp null off, off, off, off done vm
+define amdgpu_ps float @return_nonvoid(float %0) #0 {
+main_body:
+  %cmp = fcmp olt float %0, 1.000000e+01
+  br i1 %cmp, label %end, label %loop
+
+loop:
+  call void @llvm.amdgcn.kill(i1 false) #3
+  br label %loop
+
+end:
+  ret float 0.
+}
+
+declare void @llvm.amdgcn.kill(i1) #0
+declare void @llvm.amdgcn.exp.f32(i32 immarg, i32 immarg, float, float, float, float, i1 immarg, i1 immarg) #0
+declare void @llvm.amdgcn.exp.compr.v2i16(i32 immarg, i32 immarg, <2 x i16>, <2 x i16>, i1 immarg, i1 immarg) #0
+
+attributes #0 = { nounwind }
