 #include "llvm/IR/InstrTypes.h"
 #include "llvm/IR/Instructions.h"
 #include "llvm/IR/Intrinsics.h"
-#include "llvm/IR/IRBuilder.h"
 #include "llvm/IR/Type.h"
 #include "llvm/InitializePasses.h"
 #include "llvm/Pass.h"
@@ -118,58 +117,24 @@ static bool isUniformlyReached(const LegacyDivergenceAnalysis &DA,
   return true;
 }
 
-static void removeDoneExport(Function &F) {
-  ConstantInt *BoolFalse = ConstantInt::getFalse(F.getContext());
-  for (BasicBlock &BB : F) {
-    for (Instruction &I : BB) {
-      if (IntrinsicInst *Intrin = llvm::dyn_cast<IntrinsicInst>(&I)) {
-        if (Intrin->getIntrinsicID() == Intrinsic::amdgcn_exp) {
-          Intrin->setArgOperand(6, BoolFalse); // done
-        } else if (Intrin->getIntrinsicID() == Intrinsic::amdgcn_exp_compr) {
-          Intrin->setArgOperand(4, BoolFalse); // done
-        }
-      }
-    }
-  }
-}
-
 static BasicBlock *unifyReturnBlockSet(Function &F,
                                        ArrayRef<BasicBlock *> ReturningBlocks,
-                                       bool InsertExport,
                                        const TargetTransformInfo &TTI,
                                        StringRef Name) {
   // Otherwise, we need to insert a new basic block into the function, add a PHI
   // nodes (if the function returns values), and convert all of the return
   // instructions into unconditional branches.
   BasicBlock *NewRetBlock = BasicBlock::Create(F.getContext(), Name, &F);
-  IRBuilder<> B(NewRetBlock);
-
-  if (InsertExport) {
-    // Ensure that there's only one "done" export in the shader by removing the
-    // "done" bit set on the original final export. More than one "done" export
-    // can lead to undefined behavior.
-    removeDoneExport(F);
-
-    Value *Undef = UndefValue::get(B.getFloatTy());
-    B.CreateIntrinsic(Intrinsic::amdgcn_exp, { B.getFloatTy() },
-                      {
-                        B.getInt32(9), // target, SQ_EXP_NULL
-                        B.getInt32(0), // enabled channels
-                        Undef, Undef, Undef, Undef, // values
-                        B.getTrue(), // done
-                        B.getTrue(), // valid mask
-                      });
-  }
 
   PHINode *PN = nullptr;
   if (F.getReturnType()->isVoidTy()) {
-    B.CreateRetVoid();
+    ReturnInst::Create(F.getContext(), nullptr, NewRetBlock);
   } else {
     // If the function doesn't return void... add a PHI node to the block...
-    PN = B.CreatePHI(F.getReturnType(), ReturningBlocks.size(),
-                     "UnifiedRetVal");
-    assert(!InsertExport);
-    B.CreateRet(PN);
+    PN = PHINode::Create(F.getReturnType(), ReturningBlocks.size(),
+                         "UnifiedRetVal");
+    NewRetBlock->getInstList().push_back(PN);
+    ReturnInst::Create(F.getContext(), PN, NewRetBlock);
   }
 
   // Loop over all of the blocks, replacing the return instruction with an
@@ -208,8 +173,6 @@ bool AMDGPUUnifyDivergentExitNodes::runOnFunction(Function &F) {
   // Dummy return block for infinite loop.
   BasicBlock *DummyReturnBB = nullptr;
 
-  bool InsertExport = false;
-
   for (BasicBlock *BB : PDT.getRoots()) {
     if (isa<ReturnInst>(BB->getTerminator())) {
       if (!isUniformlyReached(DA, *BB))
@@ -225,36 +188,6 @@ bool AMDGPUUnifyDivergentExitNodes::runOnFunction(Function &F) {
                                            "DummyReturnBlock", &F);
         Type *RetTy = F.getReturnType();
         Value *RetVal = RetTy->isVoidTy() ? nullptr : UndefValue::get(RetTy);
-
-        // For pixel shaders, the producer guarantees that an export is
-        // executed before each return instruction. However, if there is an
-        // infinite loop and we insert a return ourselves, we need to uphold
-        // that guarantee by inserting a null export. This can happen e.g. in
-        // an infinite loop with kill instructions, which is supposed to
-        // terminate. However, we don't need to do this if there is a non-void
-        // return value, since then there is an epilog afterwards which will
-        // still export.
-        //
-        // Note: In the case where only some threads enter the infinite loop,
-        // this can result in the null export happening redundantly after the
-        // original exports. However, The last "real" export happens after all
-        // the threads that didn't enter an infinite loop converged, which
-        // means that the only extra threads to execute the null export are
-        // threads that entered the infinite loop, and they only could've
-        // exited through being killed which sets their exec bit to 0.
-        // Therefore, unless there's an actual infinite loop, which can have
-        // invalid results, or there's a kill after the last export, which we
-        // assume the frontend won't do, this export will have the same exec
-        // mask as the last "real" export, and therefore the valid mask will be
-        // overwritten with the same value and will still be correct. Also,
-        // even though this forces an extra unnecessary export wait, we assume
-        // that this happens rare enough in practice to that we don't have to
-        // worry about performance.
-        if (F.getCallingConv() == CallingConv::AMDGPU_PS &&
-            RetTy->isVoidTy()) {
-          InsertExport = true;
-        }
-
         ReturnInst::Create(F.getContext(), RetVal, DummyReturnBB);
         ReturningBlocks.push_back(DummyReturnBB);
       }
@@ -327,6 +260,6 @@ bool AMDGPUUnifyDivergentExitNodes::runOnFunction(Function &F) {
   const TargetTransformInfo &TTI
     = getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
 
-  unifyReturnBlockSet(F, ReturningBlocks, InsertExport, TTI, "UnifiedReturnBlock");
+  unifyReturnBlockSet(F, ReturningBlocks, TTI, "UnifiedReturnBlock");
   return true;
 }