Skip to content

Commit ca9afc9

Browse files
piotrAMD authored and perlfu committed
[AMDGPU] Optimize conditional discard
Add a new IR level pass to the AMDGPU backend to optimize conditional discard in the pixel shaders. This pass transforms conditional discard of the form: if (condition) discard (false); into: discard (!condition); The transformation is useful, because removing basic blocks simplifies CFG and limits the number of phi nodes used. Change-Id: Ie6eaa42618f999e431e8ac514f0f2535ce26c93e
1 parent 3df0df0 commit ca9afc9

File tree

7 files changed

+337
-2
lines changed

7 files changed

+337
-2
lines changed

llvm/lib/Target/AMDGPU/AMDGPU.h

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -91,6 +91,10 @@ FunctionPass *createAMDGPUAtomicOptimizerPass();
9191
void initializeAMDGPUAtomicOptimizerPass(PassRegistry &);
9292
extern char &AMDGPUAtomicOptimizerID;
9393

94+
FunctionPass *createAMDGPUConditionalDiscardPass();
95+
void initializeAMDGPUConditionalDiscardPass(PassRegistry &);
96+
extern char &AMDGPUConditionalDiscardID;
97+
9498
ModulePass *createAMDGPULowerIntrinsicsPass();
9599
void initializeAMDGPULowerIntrinsicsPass(PassRegistry &);
96100
extern char &AMDGPULowerIntrinsicsID;
Lines changed: 183 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,183 @@
1+
//===-- AMDGPUConditionalDiscard.cpp
2+
//-----------------------------------------===//
3+
//
4+
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
5+
// See https://llvm.org/LICENSE.txt for license information.
6+
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
7+
//
8+
//===----------------------------------------------------------------------===//
9+
//
10+
/// \file
11+
/// This pass transforms conditional discard of the form:
12+
///
13+
/// if (condition)
14+
/// discard (false);
15+
///
16+
/// into:
17+
///
18+
/// discard (!condition);
19+
///
20+
///
21+
/// More specifically,
22+
///
23+
/// ...
24+
/// block:
25+
/// %cond = icmp eq i32 %a, %b
26+
/// br i1 %cond, label %kill_block, label %cont_block
27+
///
28+
/// kill_block:
29+
/// call void @llvm.amdgcn.kill(i1 false)
30+
/// br label %other
31+
/// ...
32+
///
33+
/// gets transformed into:
34+
///
35+
/// ...
36+
/// block:
37+
/// %cond = icmp eq i32 %a, %b
38+
/// %nonkill = not i1 %cond
39+
/// call void @llvm.amdgcn.kill(i1 %nonkill)
40+
/// br label %cont_block
41+
/// ...
42+
///
43+
/// The transformation is useful, because removing basic blocks simplifies CFG
44+
/// and limits the number of phi nodes used.
45+
/// The pass should ideally be placed after code sinking, because some sinking
46+
/// opportunities get lost after the transformation due to the basic block
47+
/// removal.
48+
49+
#include "AMDGPU.h"
50+
#include "AMDGPUSubtarget.h"
51+
#include "llvm/Analysis/LoopInfo.h"
52+
#include "llvm/IR/IRBuilder.h"
53+
#include "llvm/IR/InstVisitor.h"
54+
#include "llvm/InitializePasses.h"
55+
#include "llvm/Transforms/Utils/BasicBlockUtils.h"
56+
57+
#define DEBUG_TYPE "amdgpu-conditional-discard"
58+
59+
using namespace llvm;
60+
using namespace llvm::AMDGPU;
61+
62+
namespace {

/// Legacy-PM function pass that folds a conditional "kill(false)" block into
/// its predecessor's branch condition (see the file header for the exact
/// transformation performed).
class AMDGPUConditionalDiscard : public FunctionPass {
private:
  // Kill blocks made redundant by optimizeBlock(); erased at the end of
  // runOnFunction() so CFG iteration stays valid during the scan.
  SmallVector<BasicBlock *, 4> KillBlocksToRemove;

  // Loop analysis for the current function; kills reached from inside a
  // loop are skipped.
  const LoopInfo *LI;

public:
  static char ID;

  AMDGPUConditionalDiscard() : FunctionPass(ID) {}

  bool runOnFunction(Function &F) override;

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.addRequiredTransitive<LoopInfoWrapperPass>();
  }

  StringRef getPassName() const override { return "AMDGPUConditionalDiscard"; }

  // Attempt the fold for a single candidate kill block.
  void optimizeBlock(BasicBlock &BB);
};

} // namespace

char AMDGPUConditionalDiscard::ID = 0;

char &llvm::AMDGPUConditionalDiscardID = AMDGPUConditionalDiscard::ID;
92+
93+
// Look for a basic block that has only a single predecessor and its
94+
// first instruction is a call to amdgcn_kill, with "false" as argument.
95+
// Transform the branch condition of the block's predecessor and mark
96+
// the block for removal. Clone the call to amdgcn_kill to the predecessor.
97+
void AMDGPUConditionalDiscard::optimizeBlock(BasicBlock &BB) {
98+
99+
if (auto *KillCand = dyn_cast<CallInst>(&BB.front())) {
100+
auto *Callee = KillCand->getCalledFunction();
101+
if (!Callee || Callee->getIntrinsicID() != Intrinsic::amdgcn_kill) {
102+
return;
103+
}
104+
105+
ConstantInt *Val = dyn_cast<ConstantInt>(KillCand->getOperand(0));
106+
if (!Val || !Val->isZero())
107+
return;
108+
109+
auto *PredBlock = BB.getSinglePredecessor();
110+
if (!PredBlock)
111+
return;
112+
113+
// Skip if the kill is in a loop.
114+
if (LI->getLoopFor(PredBlock))
115+
return;
116+
117+
auto *PredTerminator = PredBlock->getTerminator();
118+
auto *PredBranchInst = dyn_cast<BranchInst>(PredTerminator);
119+
120+
if (!PredBranchInst || !PredBranchInst->isConditional())
121+
return;
122+
123+
BasicBlock *LiveBlock = nullptr;
124+
auto *Cond = PredBranchInst->getCondition();
125+
126+
if (PredBranchInst->getSuccessor(0) == &BB) {
127+
// The old kill block could only be reached if
128+
// the condition was true - negate the condition.
129+
Cond = BinaryOperator::CreateNot(Cond, "", PredTerminator);
130+
LiveBlock = PredBranchInst->getSuccessor(1);
131+
} else {
132+
LiveBlock = PredBranchInst->getSuccessor(0);
133+
}
134+
135+
auto *NewKill = cast<CallInst>(KillCand->clone());
136+
137+
NewKill->setArgOperand(0, Cond);
138+
NewKill->insertBefore(PredTerminator);
139+
140+
KillBlocksToRemove.push_back(&BB);
141+
142+
// Change the branch to an unconditional one, targeting the live block.
143+
auto *NewBranchInst = BranchInst::Create(LiveBlock, PredBranchInst);
144+
NewBranchInst->copyMetadata(*PredBranchInst);
145+
PredBranchInst->eraseFromParent();
146+
}
147+
}
148+
149+
bool AMDGPUConditionalDiscard::runOnFunction(Function &F) {
  // Discard (kill) semantics only apply to pixel shaders.
  if (F.getCallingConv() != CallingConv::AMDGPU_PS)
    return false;

  if (skipFunction(F))
    return false;

  LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();

  // Scan every block; eligible kill blocks are folded into their
  // predecessors and queued for deletion (not erased yet, so this
  // iteration stays valid).
  for (BasicBlock &Block : F)
    optimizeBlock(Block);

  const bool Changed = !KillBlocksToRemove.empty();

  // Erase the now-unreachable kill blocks, first dropping their incoming
  // values from any phi nodes in their successors.
  for (BasicBlock *Dead : KillBlocksToRemove) {
    for (BasicBlock *Succ : successors(Dead))
      for (PHINode &Phi : Succ->phis())
        Phi.removeIncomingValue(Dead);
    Dead->eraseFromParent();
  }
  KillBlocksToRemove.clear();

  return Changed;
}
174+
175+
// Register the pass with the legacy pass manager and declare its dependency
// on LoopInfo.
INITIALIZE_PASS_BEGIN(AMDGPUConditionalDiscard, DEBUG_TYPE,
                      "Transform conditional discard", false, false)
INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass)
INITIALIZE_PASS_END(AMDGPUConditionalDiscard, DEBUG_TYPE,
                    "Transform conditional discard", false, false)

// Factory used by the AMDGPU target machine to add this pass to the pipeline.
FunctionPass *llvm::createAMDGPUConditionalDiscardPass() {
  return new AMDGPUConditionalDiscard();
}

llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -165,6 +165,13 @@ static cl::opt<bool> EnableAtomicOptimizations(
165165
cl::init(false),
166166
cl::Hidden);
167167

168+
// Enable conditional discard transformations
169+
static cl::opt<bool> EnableConditionalDiscardTransformations(
170+
"amdgpu-conditional-discard-transformations",
171+
cl::desc("Enable conditional discard transformations"),
172+
cl::init(false),
173+
cl::Hidden);
174+
168175
// Enable Mode register optimization
169176
static cl::opt<bool> EnableSIModeRegisterPass(
170177
"amdgpu-mode-register",
@@ -215,6 +222,7 @@ extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeAMDGPUTarget() {
215222
initializeAMDGPUAnnotateUniformValuesPass(*PR);
216223
initializeAMDGPUArgumentUsageInfoPass(*PR);
217224
initializeAMDGPUAtomicOptimizerPass(*PR);
225+
initializeAMDGPUConditionalDiscardPass(*PR);
218226
initializeAMDGPULowerKernelArgumentsPass(*PR);
219227
initializeAMDGPULowerKernelAttributesPass(*PR);
220228
initializeAMDGPULowerIntrinsicsPass(*PR);
@@ -848,6 +856,7 @@ bool GCNPassConfig::addPreISel() {
848856
addPass(createStructurizeCFGPass(true)); // true -> SkipUniformRegions
849857
}
850858
addPass(createSinkingPass());
859+
851860
// This is a temporary fix for the issue of dealing with in loop uniform values
852861
// where the uses out of the loop are non-uniform. LCSSA creates a PHI at the
853862
// loop exit such that can be marked divergent and can be passed onto ISEL
@@ -856,6 +865,10 @@ bool GCNPassConfig::addPreISel() {
856865
// therefore can't be preserved in LCSSA as needed. The linking/preserving
857866
// outside of the same library needs to be resolved in llvm core code.
858867
addPass(createLCSSAPass());
868+
869+
if (EnableConditionalDiscardTransformations)
870+
addPass(createAMDGPUConditionalDiscardPass());
871+
859872
addPass(createAMDGPUAnnotateUniformValues());
860873
if (!LateCFGStructurize) {
861874
addPass(createSIAnnotateControlFlowPass());

llvm/lib/Target/AMDGPU/CMakeLists.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -40,6 +40,7 @@ add_llvm_target(AMDGPUCodeGen
4040
AMDGPUArgumentUsageInfo.cpp
4141
AMDGPUAsmPrinter.cpp
4242
AMDGPUAtomicOptimizer.cpp
43+
AMDGPUConditionalDiscard.cpp
4344
AMDGPUCallLowering.cpp
4445
AMDGPUCodeGenPrepare.cpp
4546
AMDGPUFixFunctionBitcasts.cpp
Lines changed: 134 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,134 @@
; RUN: llc -amdgpu-conditional-discard-transformations=1 -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s

; Check that the branch is removed by the discard opt.

; GCN-LABEL: {{^}}if_with_kill_true_cond:
; GCN: v_cmp_ne_u32_e32 vcc,
; GCN-NEXT: s_and_b64 exec, exec, vcc
; GCN-NOT: branch
define amdgpu_ps void @if_with_kill_true_cond(i32 %arg) {
.entry:
  %cmp = icmp eq i32 %arg, 32
  br i1 %cmp, label %then, label %endif

then:
  tail call void @llvm.amdgcn.kill(i1 false)
  br label %endif

endif:
  ret void
}

; Check that the branch is removed by the discard opt.

; GCN-LABEL: {{^}}if_with_kill_false_cond:
; GCN: v_cmp_eq_u32_e32 vcc,
; GCN-NEXT: s_and_b64 exec, exec, vcc
; GCN-NOT: branch
define amdgpu_ps void @if_with_kill_false_cond(i32 %arg) {
.entry:
  %cmp = icmp eq i32 %arg, 32
  br i1 %cmp, label %endif, label %then

then:
  tail call void @llvm.amdgcn.kill(i1 false)
  br label %endif

endif:
  ret void
}

; Check that the branch exiting the loop is a divergent one (s_cbranch_vccnz).
; This test exercises a loop with kill as the only exit.

; GCN-LABEL: {{^}}kill_with_loop_exit:
; GCN: s_cbranch_vccnz
; GCN: s_cbranch_vccnz
define amdgpu_ps void @kill_with_loop_exit(float inreg %inp0, float inreg %inp1, <4 x i32> inreg %inp2, float inreg %inp3) {
.entry:
  %tmp24 = fcmp olt float %inp0, 1.280000e+02
  %tmp25 = fcmp olt float %inp1, 1.280000e+02
  %tmp26 = and i1 %tmp24, %tmp25
  br i1 %tmp26, label %bb35, label %.preheader1.preheader

.preheader1.preheader:                            ; preds = %.entry
  %tmp31 = fcmp ogt float %inp3, 0.0
  br label %bb

bb:                                               ; preds = %bb, %.preheader1.preheader
  %tmp30 = phi float [ %tmp32, %bb ], [ 1.500000e+00, %.preheader1.preheader ]
  %tmp32 = fadd reassoc nnan nsz arcp contract float %tmp30, 2.500000e-01
  %tmp34 = fadd reassoc nnan nsz arcp contract float %tmp30, 2.500000e-01
  br i1 %tmp31, label %bb, label %bb33

bb33:                                             ; preds = %bb
  call void @llvm.amdgcn.kill(i1 false)
  br label %bb35

bb35:                                             ; preds = %bb33, %.entry
  %tmp36 = phi float [ %tmp34, %bb33 ], [ 1.000000e+00, %.entry ]
  ; NOTE(review): attribute group #3 is referenced here but only #0 is
  ; defined below - verify this assembles as intended.
  call void @llvm.amdgcn.exp.f32(i32 immarg 0, i32 immarg 15, float %tmp36, float %tmp36, float %tmp36, float %tmp36, i1 immarg true, i1 immarg true) #3
  ret void
}

; Check that the kill inside a loop is not optimized away.

; GCN-LABEL: {{^}}if_with_loop_kill_after:
; GCN: s_and_saveexec_b64 [[SAVEEXEC:s\[[0-9:]+\]]],
; GCN-NEXT: s_xor_b64 s[{{[0-9:]+}}], exec, [[SAVEEXEC]]
define amdgpu_ps void @if_with_loop_kill_after(i32 %arg) {
.entry:
  %cmp = icmp eq i32 %arg, 32
  br i1 %cmp, label %then, label %endif

then:
  %sub = sub i32 %arg, 1
  br label %loop

loop:
  %ind = phi i32 [%sub, %then], [%dec, %loop]
  %dec = sub i32 %ind, 1
  %cc = icmp ne i32 %ind, 0
  br i1 %cc, label %loop, label %break

break:
  tail call void @llvm.amdgcn.kill(i1 false)
  br label %endif

endif:
  ret void
}

; Check that the kill inside a loop is not optimized away.

; GCN-LABEL: {{^}}if_with_kill_inside_loop:
; GCN: s_and_saveexec_b64 [[SAVEEXEC:s\[[0-9:]+\]]],
; GCN-NEXT: s_xor_b64 s[{{[0-9:]+}}], exec, [[SAVEEXEC]]
define amdgpu_ps void @if_with_kill_inside_loop(i32 %arg) {
.entry:
  %cmp = icmp eq i32 %arg, 32
  br i1 %cmp, label %then, label %endif

then:
  %sub = sub i32 %arg, 1
  br label %loop

loop:
  %ind = phi i32 [%sub, %then], [%dec, %loop]
  %dec = sub i32 %ind, 1
  %cc = icmp ne i32 %ind, 0
  tail call void @llvm.amdgcn.kill(i1 false)
  br i1 %cc, label %loop, label %break

break:
  br label %endif

endif:
  ret void
}

attributes #0 = { nounwind }

declare void @llvm.amdgcn.exp.f32(i32 immarg, i32 immarg, float, float, float, float, i1 immarg, i1 immarg) #0
declare void @llvm.amdgcn.kill(i1) #0

llvm/test/CodeGen/AMDGPU/si-lower-control-flow-kill.ll

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
1+
; RUN: llc -amdgpu-conditional-discard-transformations=0 -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
22

33
; GCN-LABEL: {{^}}if_with_kill:
44
; GCN: s_and_saveexec_b64 [[SAVEEXEC:s\[[0-9:]+\]]],

llvm/test/CodeGen/AMDGPU/skip-if-dead.ll

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck %s
1+
; RUN: llc -amdgpu-conditional-discard-transformations=0 -march=amdgcn -verify-machineinstrs < %s | FileCheck %s
22

33
; CHECK-LABEL: {{^}}test_kill_depth_0_imm_pos:
44
; CHECK-NEXT: ; %bb.0:

0 commit comments

Comments
 (0)