 #include "llvm/IR/InstrTypes.h"
 #include "llvm/IR/Instructions.h"
 #include "llvm/IR/Intrinsics.h"
+#include "llvm/IR/IRBuilder.h"
 #include "llvm/IR/Type.h"
 #include "llvm/InitializePasses.h"
 #include "llvm/Pass.h"
@@ -117,24 +118,58 @@ static bool isUniformlyReached(const LegacyDivergenceAnalysis &DA,
   return true;
 }
 
+static void removeDoneExport(Function &F) {
+  ConstantInt *BoolFalse = ConstantInt::getFalse(F.getContext());
+  for (BasicBlock &BB : F) {
+    for (Instruction &I : BB) {
+      if (IntrinsicInst *Intrin = llvm::dyn_cast<IntrinsicInst>(&I)) {
+        if (Intrin->getIntrinsicID() == Intrinsic::amdgcn_exp) {
+          Intrin->setArgOperand(6, BoolFalse); // done
+        } else if (Intrin->getIntrinsicID() == Intrinsic::amdgcn_exp_compr) {
+          Intrin->setArgOperand(4, BoolFalse); // done
+        }
+      }
+    }
+  }
+}
+
 static BasicBlock *unifyReturnBlockSet(Function &F,
                                        ArrayRef<BasicBlock *> ReturningBlocks,
+                                       bool InsertExport,
                                        const TargetTransformInfo &TTI,
                                        StringRef Name) {
   // Otherwise, we need to insert a new basic block into the function, add a PHI
   // nodes (if the function returns values), and convert all of the return
   // instructions into unconditional branches.
   BasicBlock *NewRetBlock = BasicBlock::Create(F.getContext(), Name, &F);
+  IRBuilder<> B(NewRetBlock);
+
+  if (InsertExport) {
+    // Ensure that there's only one "done" export in the shader by removing the
+    // "done" bit set on the original final export. More than one "done" export
+    // can lead to undefined behavior.
+    removeDoneExport(F);
+
+    Value *Undef = UndefValue::get(B.getFloatTy());
+    B.CreateIntrinsic(Intrinsic::amdgcn_exp, { B.getFloatTy() },
+                      {
+                        B.getInt32(9), // target, SQ_EXP_NULL
+                        B.getInt32(0), // enabled channels
+                        Undef, Undef, Undef, Undef, // values
+                        B.getTrue(), // done
+                        B.getTrue(), // valid mask
+                      });
+  }
 
   PHINode *PN = nullptr;
   if (F.getReturnType()->isVoidTy()) {
-    ReturnInst::Create(F.getContext(), nullptr, NewRetBlock);
+    B.CreateRetVoid();
   } else {
     // If the function doesn't return void... add a PHI node to the block...
-    PN = PHINode::Create(F.getReturnType(), ReturningBlocks.size(),
-                         "UnifiedRetVal");
-    NewRetBlock->getInstList().push_back(PN);
-    ReturnInst::Create(F.getContext(), PN, NewRetBlock);
+    PN = B.CreatePHI(F.getReturnType(), ReturningBlocks.size(),
+                     "UnifiedRetVal");
+    assert(!InsertExport);
+    B.CreateRet(PN);
   }
 
   // Loop over all of the blocks, replacing the return instruction with an
@@ -173,6 +208,8 @@ bool AMDGPUUnifyDivergentExitNodes::runOnFunction(Function &F) {
   // Dummy return block for infinite loop.
   BasicBlock *DummyReturnBB = nullptr;
 
+  bool InsertExport = false;
+
   for (BasicBlock *BB : PDT.getRoots()) {
     if (isa<ReturnInst>(BB->getTerminator())) {
       if (!isUniformlyReached(DA, *BB))
@@ -188,6 +225,36 @@ bool AMDGPUUnifyDivergentExitNodes::runOnFunction(Function &F) {
                                            "DummyReturnBlock", &F);
         Type *RetTy = F.getReturnType();
         Value *RetVal = RetTy->isVoidTy() ? nullptr : UndefValue::get(RetTy);
+
+        // For pixel shaders, the producer guarantees that an export is
+        // executed before each return instruction. However, if there is an
+        // infinite loop and we insert a return ourselves, we need to uphold
+        // that guarantee by inserting a null export. This can happen e.g. in
+        // an infinite loop with kill instructions, which is supposed to
+        // terminate. However, we don't need to do this if there is a non-void
+        // return value, since then there is an epilog afterwards which will
+        // still export.
+        //
+        // Note: In the case where only some threads enter the infinite loop,
+        // this can result in the null export happening redundantly after the
+        // original exports. However, the last "real" export happens after all
+        // the threads that didn't enter an infinite loop converged, which
+        // means that the only extra threads to execute the null export are
+        // threads that entered the infinite loop, and they could only have
+        // exited through being killed, which sets their exec bit to 0.
+        // Therefore, unless there's an actual infinite loop, which can have
+        // invalid results, or there's a kill after the last export, which we
+        // assume the frontend won't do, this export will have the same exec
+        // mask as the last "real" export, and therefore the valid mask will be
+        // overwritten with the same value and will still be correct. Also,
+        // even though this forces an extra unnecessary export wait, we assume
+        // that this happens rarely enough in practice that we don't have to
+        // worry about performance.
+        if (F.getCallingConv() == CallingConv::AMDGPU_PS &&
+            RetTy->isVoidTy()) {
+          InsertExport = true;
+        }
+
         ReturnInst::Create(F.getContext(), RetVal, DummyReturnBB);
         ReturningBlocks.push_back(DummyReturnBB);
       }
@@ -260,6 +327,6 @@ bool AMDGPUUnifyDivergentExitNodes::runOnFunction(Function &F) {
   const TargetTransformInfo &TTI
     = getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
 
-  unifyReturnBlockSet(F, ReturningBlocks, TTI, "UnifiedReturnBlock");
+  unifyReturnBlockSet(F, ReturningBlocks, InsertExport, TTI, "UnifiedReturnBlock");
   return true;
 }
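
For reference, and not part of the commit itself, the IR below is a rough sketch of the effect on a void AMDGPU_PS function whose divergent infinite loop forced a dummy return. The value names (%r, %g, %b, %a) and the surrounding shader are hypothetical; the f32 overload shown is the one CreateIntrinsic selects for getFloatTy().

  ; The pre-existing final export keeps its data, but removeDoneExport has
  ; cleared its "done" bit (operand 6):
    call void @llvm.amdgcn.exp.f32(i32 0, i32 15, float %r, float %g,
                                   float %b, float %a, i1 false, i1 true)
    ...
  ; The unified return block now ends the shader with a null export
  ; (target 9 = SQ_EXP_NULL, no channels enabled) carrying the single
  ; "done" bit and setting the valid mask:
  UnifiedReturnBlock:
    call void @llvm.amdgcn.exp.f32(i32 9, i32 0, float undef, float undef,
                                   float undef, float undef, i1 true, i1 true)
    ret void

The "done" flag is operand 6 of llvm.amdgcn.exp and operand 4 of llvm.amdgcn.exp.compr, which is why removeDoneExport rewrites exactly those argument positions.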