Commit 87d98c1

AMDGPU: Fix handling of infinite loops in fragment shaders
Summary:
Because kill is just a normal intrinsic, even though it is supposed to terminate the thread, we can end up with provably infinite loops that are actually supposed to end successfully. The AMDGPUUnifyDivergentExitNodes pass breaks up these loops, but because there's no obvious place to make the loop branch to, it just makes the loop return immediately, which skips the exports that are supposed to happen at the end and hangs the GPU if all the threads end up being killed.

While it would be nice if the fact that kill terminates the thread were modeled in the IR, I think the structurizer as-is would make a mess if we did that when the kill is inside control flow. For now, we just add a null export at the end to make sure that the shader always exports something, which fixes the immediate problem without penalizing the more common case. This means that we sometimes do two "done" exports when only some of the threads enter the discard loop, but testing suggests the hardware is fine with that.

This fixes dEQP-VK.graphicsfuzz.while-inside-switch with radv.

Reviewers: arsenm, nhaehnle

Subscribers: kzhuravl, jvesely, wdng, yaxunl, dstuttard, tpr, t-tye, hiraditya, llvm-commits

Tags: #llvm

Differential Revision: https://reviews.llvm.org/D70781
1 parent 94e8ef4 commit 87d98c1
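
For illustration only (this sketch is not taken from the diff, and every name in it is made up), here is roughly the IR shape that AMDGPUUnifyDivergentExitNodes now leaves behind for a void pixel shader whose infinite discard loop it has broken up: the original export loses its "done" bit, and the unified return block gains a null export with "done" and the valid mask set right before the return.

; Hand-written sketch, not part of the commit; block names are illustrative.
define amdgpu_ps void @sketch(float %v) {
entry:
  %cmp = fcmp olt float %v, 1.0
  br i1 %cmp, label %end, label %loop

loop:                                             ; provably infinite discard loop
  call void @llvm.amdgcn.kill(i1 false)
  ; the pass rewrites the backedge so the loop has an exit to branch to
  br i1 true, label %loop, label %DummyReturnBlock

DummyReturnBlock:
  br label %UnifiedReturnBlock

end:
  ; original export: removeDoneExport has cleared its "done" bit (i1 false)
  call void @llvm.amdgcn.exp.f32(i32 0, i32 15, float 0.0, float 0.0, float 0.0, float 1.0, i1 false, i1 true)
  br label %UnifiedReturnBlock

UnifiedReturnBlock:
  ; inserted null export: tgt = 9 (SQ_EXP_NULL), en = 0, done = true, vm = true
  call void @llvm.amdgcn.exp.f32(i32 9, i32 0, float undef, float undef, float undef, float undef, i1 true, i1 true)
  ret void
}

declare void @llvm.amdgcn.kill(i1)
declare void @llvm.amdgcn.exp.f32(i32 immarg, i32 immarg, float, float, float, float, i1 immarg, i1 immarg)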

File tree

2 files changed: +141 -6 lines changed


llvm/lib/Target/AMDGPU/AMDGPUUnifyDivergentExitNodes.cpp

Lines changed: 73 additions & 6 deletions
@@ -34,6 +34,7 @@
 #include "llvm/IR/InstrTypes.h"
 #include "llvm/IR/Instructions.h"
 #include "llvm/IR/Intrinsics.h"
+#include "llvm/IR/IRBuilder.h"
 #include "llvm/IR/Type.h"
 #include "llvm/InitializePasses.h"
 #include "llvm/Pass.h"
@@ -117,24 +118,58 @@ static bool isUniformlyReached(const LegacyDivergenceAnalysis &DA,
   return true;
 }
 
+static void removeDoneExport(Function &F) {
+  ConstantInt *BoolFalse = ConstantInt::getFalse(F.getContext());
+  for (BasicBlock &BB : F) {
+    for (Instruction &I : BB) {
+      if (IntrinsicInst *Intrin = llvm::dyn_cast<IntrinsicInst>(&I)) {
+        if (Intrin->getIntrinsicID() == Intrinsic::amdgcn_exp) {
+          Intrin->setArgOperand(6, BoolFalse); // done
+        } else if (Intrin->getIntrinsicID() == Intrinsic::amdgcn_exp_compr) {
+          Intrin->setArgOperand(4, BoolFalse); // done
+        }
+      }
+    }
+  }
+}
+
 static BasicBlock *unifyReturnBlockSet(Function &F,
                                        ArrayRef<BasicBlock *> ReturningBlocks,
+                                       bool InsertExport,
                                        const TargetTransformInfo &TTI,
                                        StringRef Name) {
   // Otherwise, we need to insert a new basic block into the function, add a PHI
   // nodes (if the function returns values), and convert all of the return
   // instructions into unconditional branches.
   BasicBlock *NewRetBlock = BasicBlock::Create(F.getContext(), Name, &F);
+  IRBuilder<> B(NewRetBlock);
+
+  if (InsertExport) {
+    // Ensure that there's only one "done" export in the shader by removing the
+    // "done" bit set on the original final export. More than one "done" export
+    // can lead to undefined behavior.
+    removeDoneExport(F);
+
+    Value *Undef = UndefValue::get(B.getFloatTy());
+    B.CreateIntrinsic(Intrinsic::amdgcn_exp, { B.getFloatTy() },
+                      {
+                        B.getInt32(9), // target, SQ_EXP_NULL
+                        B.getInt32(0), // enabled channels
+                        Undef, Undef, Undef, Undef, // values
+                        B.getTrue(), // done
+                        B.getTrue(), // valid mask
+                      });
+  }
 
   PHINode *PN = nullptr;
   if (F.getReturnType()->isVoidTy()) {
-    ReturnInst::Create(F.getContext(), nullptr, NewRetBlock);
+    B.CreateRetVoid();
   } else {
     // If the function doesn't return void... add a PHI node to the block...
-    PN = PHINode::Create(F.getReturnType(), ReturningBlocks.size(),
-                         "UnifiedRetVal");
-    NewRetBlock->getInstList().push_back(PN);
-    ReturnInst::Create(F.getContext(), PN, NewRetBlock);
+    PN = B.CreatePHI(F.getReturnType(), ReturningBlocks.size(),
+                     "UnifiedRetVal");
+    assert(!InsertExport);
+    B.CreateRet(PN);
   }
 
   // Loop over all of the blocks, replacing the return instruction with an
@@ -173,6 +208,8 @@ bool AMDGPUUnifyDivergentExitNodes::runOnFunction(Function &F) {
   // Dummy return block for infinite loop.
   BasicBlock *DummyReturnBB = nullptr;
 
+  bool InsertExport = false;
+
   for (BasicBlock *BB : PDT.getRoots()) {
     if (isa<ReturnInst>(BB->getTerminator())) {
       if (!isUniformlyReached(DA, *BB))
@@ -188,6 +225,36 @@ bool AMDGPUUnifyDivergentExitNodes::runOnFunction(Function &F) {
                                            "DummyReturnBlock", &F);
         Type *RetTy = F.getReturnType();
         Value *RetVal = RetTy->isVoidTy() ? nullptr : UndefValue::get(RetTy);
+
+        // For pixel shaders, the producer guarantees that an export is
+        // executed before each return instruction. However, if there is an
+        // infinite loop and we insert a return ourselves, we need to uphold
+        // that guarantee by inserting a null export. This can happen e.g. in
+        // an infinite loop with kill instructions, which is supposed to
+        // terminate. However, we don't need to do this if there is a non-void
+        // return value, since then there is an epilog afterwards which will
+        // still export.
+        //
+        // Note: In the case where only some threads enter the infinite loop,
+        // this can result in the null export happening redundantly after the
+        // original exports. However, the last "real" export happens after all
+        // the threads that didn't enter an infinite loop converged, which
+        // means that the only extra threads to execute the null export are
+        // threads that entered the infinite loop, and they only could've
+        // exited through being killed which sets their exec bit to 0.
+        // Therefore, unless there's an actual infinite loop, which can have
+        // invalid results, or there's a kill after the last export, which we
+        // assume the frontend won't do, this export will have the same exec
+        // mask as the last "real" export, and therefore the valid mask will be
+        // overwritten with the same value and will still be correct. Also,
+        // even though this forces an extra unnecessary export wait, we assume
+        // that this happens rarely enough in practice that we don't have to
+        // worry about performance.
+        if (F.getCallingConv() == CallingConv::AMDGPU_PS &&
+            RetTy->isVoidTy()) {
+          InsertExport = true;
+        }
+
         ReturnInst::Create(F.getContext(), RetVal, DummyReturnBB);
         ReturningBlocks.push_back(DummyReturnBB);
       }
@@ -260,6 +327,6 @@ bool AMDGPUUnifyDivergentExitNodes::runOnFunction(Function &F) {
   const TargetTransformInfo &TTI
     = getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
 
-  unifyReturnBlockSet(F, ReturningBlocks, TTI, "UnifiedReturnBlock");
+  unifyReturnBlockSet(F, ReturningBlocks, InsertExport, TTI, "UnifiedReturnBlock");
   return true;
 }
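
As a quick reference (this note is not part of the diff), the operand indices patched by removeDoneExport above correspond to the "done" flag in the export intrinsic signatures that the new test below declares:

; "done" positions cleared by removeDoneExport (0-based argument indices):
;   @llvm.amdgcn.exp.f32(tgt, en, src0, src1, src2, src3, done, vm)   ; done = operand 6
;   @llvm.amdgcn.exp.compr.v2i16(tgt, en, src0, src1, done, vm)       ; done = operand 4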
Lines changed: 68 additions & 0 deletions
@@ -0,0 +1,68 @@
+; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -enable-var-scope %s
+; Although it's modeled without any control flow in order to get better code
+; out of the structurizer, @llvm.amdgcn.kill actually ends the thread that calls
+; it with "true". In case it's called in a provably infinite loop, we still
+; need to successfully exit and export something, even if we can't know where
+; to jump to in the LLVM IR. Therefore we insert a null export ourselves in
+; this case right before the s_endpgm to avoid GPU hangs, which is what this
+; tests.
+
+; CHECK-LABEL: return_void
+; Make sure that we remove the done bit from the original export
+; CHECK: exp mrt0 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} vm
+; CHECK: exp null off, off, off, off done vm
+; CHECK-NEXT: s_endpgm
+define amdgpu_ps void @return_void(float %0) #0 {
+main_body:
+  %cmp = fcmp olt float %0, 1.000000e+01
+  br i1 %cmp, label %end, label %loop
+
+loop:
+  call void @llvm.amdgcn.kill(i1 false) #3
+  br label %loop
+
+end:
+  call void @llvm.amdgcn.exp.f32(i32 0, i32 15, float 0., float 0., float 0., float 1., i1 true, i1 true) #3
+  ret void
+}
+
+; Check that we also remove the done bit from compressed exports correctly.
+; CHECK-LABEL: return_void_compr
+; CHECK: exp mrt0 v{{[0-9]+}}, off, v{{[0-9]+}}, off compr vm
+; CHECK: exp null off, off, off, off done vm
+; CHECK-NEXT: s_endpgm
+define amdgpu_ps void @return_void_compr(float %0) #0 {
+main_body:
+  %cmp = fcmp olt float %0, 1.000000e+01
+  br i1 %cmp, label %end, label %loop
+
+loop:
+  call void @llvm.amdgcn.kill(i1 false) #3
+  br label %loop
+
+end:
+  call void @llvm.amdgcn.exp.compr.v2i16(i32 0, i32 5, <2 x i16> < i16 0, i16 0 >, <2 x i16> < i16 0, i16 0 >, i1 true, i1 true) #3
+  ret void
+}
+
+; In case there's an epilog, we shouldn't have to do this.
+; CHECK-LABEL: return_nonvoid
+; CHECK-NOT: exp null off, off, off, off done vm
+define amdgpu_ps float @return_nonvoid(float %0) #0 {
+main_body:
+  %cmp = fcmp olt float %0, 1.000000e+01
+  br i1 %cmp, label %end, label %loop
+
+loop:
+  call void @llvm.amdgcn.kill(i1 false) #3
+  br label %loop
+
+end:
+  ret float 0.
+}
+
+declare void @llvm.amdgcn.kill(i1) #0
+declare void @llvm.amdgcn.exp.f32(i32 immarg, i32 immarg, float, float, float, float, i1 immarg, i1 immarg) #0
+declare void @llvm.amdgcn.exp.compr.v2i16(i32 immarg, i32 immarg, <2 x i16>, <2 x i16>, i1 immarg, i1 immarg) #0
+
+attributes #0 = { nounwind }
