Skip to content

Commit 7f46ac1

Browse files
committed
[AMDGPU] Fix AMDGPUUnifyDivergentExitNodes
Summary:
For the case where "done" bits on existing exports are removed by unifyReturnBlockSet(), unify all return blocks - even the uniformly reached ones. We do not want to end up with a non-unified, uniformly reached block containing a normal export with the "done" bit cleared. That case is believed to be rare - it is possible with infinite loops in pixel shaders.

This is a fix for D71192.

Subscribers: arsenm, kzhuravl, jvesely, wdng, nhaehnle, yaxunl, dstuttard, tpr, t-tye, hiraditya, kerbowa, llvm-commits

Tags: #llvm

Differential Revision: https://reviews.llvm.org/D76364
1 parent 7102251 commit 7f46ac1

File tree

3 files changed

+56
-3
lines changed

3 files changed

+56
-3
lines changed

llvm/lib/Target/AMDGPU/AMDGPUUnifyDivergentExitNodes.cpp

Lines changed: 16 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -208,6 +208,7 @@ bool AMDGPUUnifyDivergentExitNodes::runOnFunction(Function &F) {
208208
// Loop over all of the blocks in a function, tracking all of the blocks that
209209
// return.
210210
SmallVector<BasicBlock *, 4> ReturningBlocks;
211+
SmallVector<BasicBlock *, 4> UniformlyReachedRetBlocks;
211212
SmallVector<BasicBlock *, 4> UnreachableBlocks;
212213

213214
// Dummy return block for infinite loop.
@@ -219,6 +220,8 @@ bool AMDGPUUnifyDivergentExitNodes::runOnFunction(Function &F) {
219220
if (isa<ReturnInst>(BB->getTerminator())) {
220221
if (!isUniformlyReached(DA, *BB))
221222
ReturningBlocks.push_back(BB);
223+
else
224+
UniformlyReachedRetBlocks.push_back(BB);
222225
} else if (isa<UnreachableInst>(BB->getTerminator())) {
223226
if (!isUniformlyReached(DA, *BB))
224227
UnreachableBlocks.push_back(BB);
@@ -332,6 +335,18 @@ bool AMDGPUUnifyDivergentExitNodes::runOnFunction(Function &F) {
332335
const TargetTransformInfo &TTI
333336
= getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
334337

335-
unifyReturnBlockSet(F, ReturningBlocks, InsertExport, TTI, "UnifiedReturnBlock");
338+
// Unify returning blocks. If we are going to insert the export, it is also
339+
// necessary to include blocks that are uniformly reached: in addition to
340+
// inserting the export, the "done" bits on existing exports will be cleared,
341+
// and we do not want to leave a normal export with its "done" bit cleared in
342+
// a non-unified, uniformly reached block.
343+
auto BlocksToUnify = std::move(ReturningBlocks);
344+
if (InsertExport) {
345+
BlocksToUnify.insert(BlocksToUnify.end(), UniformlyReachedRetBlocks.begin(),
346+
UniformlyReachedRetBlocks.end());
347+
}
348+
349+
unifyReturnBlockSet(F, BlocksToUnify, InsertExport, TTI,
350+
"UnifiedReturnBlock");
336351
return true;
337352
}

llvm/test/CodeGen/AMDGPU/multi-divergent-exit-region.ll

Lines changed: 40 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -719,6 +719,46 @@ bb5: ; preds = %bb3
719719
unreachable
720720
}
721721

722+
; Test that there is an extra export inserted after the normal export,
723+
; if the normal export is inside a uniformly reached block and there is
724+
; an infinite loop in the pixel shader.
725+
726+
; IR-LABEL: @uniformly_reached_export
727+
; IR-NEXT: .entry:
728+
; IR: br i1 [[CND:%.*]], label %[[EXP:.*]], label %[[FLOW:.*]]
729+
730+
; IR: [[FLOW]]:
731+
; IR-NEXT: phi
732+
; IR-NEXT: br i1 [[CND2:%.*]], label %[[PREHEADER:.*]], label %[[FLOW2:.*]]
733+
734+
; IR: [[FLOW2]]:
735+
; IR-NEXT: br label %UnifiedReturnBlock
736+
737+
; IR: [[EXP]]:
738+
; IR-NEXT: call void @llvm.amdgcn.exp.compr.v2f16(i32 immarg 0, i32 immarg 15, <2 x half> <half 0xH3C00, half 0xH0000>, <2 x half> <half 0xH0000, half 0xH3C00>, i1 immarg false, i1 immarg true)
739+
; IR-NEXT: br label %[[FLOW]]
740+
741+
; IR: UnifiedReturnBlock:
742+
; IR-NEXT: call void @llvm.amdgcn.exp.f32(i32 9, i32 0, float undef, float undef, float undef, float undef, i1 true, i1 true)
743+
; IR-NEXT: ret void
744+
745+
define amdgpu_ps void @uniformly_reached_export(float inreg %tmp25) {
746+
.entry:
747+
%tmp26 = fcmp olt float %tmp25, 0.000000e+00
748+
br i1 %tmp26, label %.preheader.1, label %bb27
749+
750+
.preheader.1: ; preds = %.entry
751+
br label %bb
752+
753+
bb: ; preds = %bb, %.preheader.1
754+
br label %bb
755+
756+
bb27: ; preds = %.entry
757+
call void @llvm.amdgcn.exp.compr.v2f16(i32 immarg 0, i32 immarg 15, <2 x half> <half 0xH3C00, half 0xH0000>, <2 x half> <half 0xH0000, half 0xH3C00>, i1 immarg true, i1 immarg true)
758+
ret void
759+
}
760+
761+
declare void @llvm.amdgcn.exp.compr.v2f16(i32 immarg, i32 immarg, <2 x half>, <2 x half>, i1 immarg, i1 immarg) #0
722762
declare i32 @llvm.amdgcn.workitem.id.x() #1
723763

724764
attributes #0 = { nounwind }

llvm/test/CodeGen/AMDGPU/update-phi.ll

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -17,8 +17,6 @@ define amdgpu_ps void @_amdgpu_ps_main() local_unnamed_addr #3 {
1717
; IR-NEXT: br i1 true, label [[TRANSITIONBLOCK]], label [[UNIFIEDRETURNBLOCK:%.*]]
1818
; IR: TransitionBlock:
1919
; IR-NEXT: br i1 [[N30]], label [[DOTLOOPEXIT]], label [[N28]]
20-
; IR: n31:
21-
; IR-NEXT: ret void
2220
; IR: UnifiedReturnBlock:
2321
; IR-NEXT: call void @llvm.amdgcn.exp.f32(i32 9, i32 0, float undef, float undef, float undef, float undef, i1 true, i1 true)
2422
; IR-NEXT: ret void

0 commit comments

Comments
 (0)