Skip to content

Commit 80e004f

Browse files
committed
[AMDGPU] Add transformation of kills to demotes in simple scenarios
Convert conditional kills to demotes. Enable this by using the -amdgpu-conditional-discard-transformations and -amdgpu-transform-discard-to-demote options combined.

V2: simplify options. V3: remove extraneous change details and improve comments. V4: fix pass naming.

Change-Id: Ibe152dadd4728462855fe8a413ea55c41e981f1c
1 parent ca9afc9 commit 80e004f

File tree

3 files changed

+98
-13
lines changed

3 files changed

+98
-13
lines changed

llvm/lib/Target/AMDGPU/AMDGPUConditionalDiscard.cpp

Lines changed: 28 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -45,6 +45,14 @@
4545
/// The pass should ideally be placed after code sinking, because some sinking
4646
/// opportunities get lost after the transformation due to the basic block
4747
/// removal.
48+
///
49+
/// Additionally, this pass can be used to transform kill intrinsics optimized
50+
/// as above into demote operations.
51+
/// This provides a workaround for applications which perform a non-uniform
52+
/// "kill" and later compute (implicit) derivatives.
53+
/// Note that in Vulkan, such applications should be fixed to use demote
54+
/// (OpDemoteToHelperInvocationEXT) instead of kill (OpKill).
55+
///
4856

4957
#include "AMDGPU.h"
5058
#include "AMDGPUSubtarget.h"
@@ -59,6 +67,13 @@
5967
using namespace llvm;
6068
using namespace llvm::AMDGPU;
6169

70+
// Enable conditional discard-to-demote transformations
71+
static cl::opt<bool> EnableTransformDiscardToDemote(
72+
"amdgpu-transform-discard-to-demote",
73+
cl::desc("Enable transformation of optimized discards to demotes"),
74+
cl::init(false),
75+
cl::Hidden);
76+
6277
namespace {
6378

6479
class AMDGPUConditionalDiscard : public FunctionPass {
@@ -78,10 +93,9 @@ class AMDGPUConditionalDiscard : public FunctionPass {
7893
AU.addRequiredTransitive<LoopInfoWrapperPass>();
7994
}
8095

81-
8296
StringRef getPassName() const override { return "AMDGPUConditionalDiscard"; }
8397

84-
void optimizeBlock(BasicBlock &BB);
98+
void optimizeBlock(BasicBlock &BB, bool ConvertToDemote);
8599
};
86100

87101
} // namespace
@@ -94,7 +108,7 @@ char &llvm::AMDGPUConditionalDiscardID = AMDGPUConditionalDiscard::ID;
94108
// first instruction is a call to amdgcn_kill, with "false" as argument.
95109
// Transform the branch condition of the block's predecessor and mark
96110
// the block for removal. Clone the call to amdgcn_kill to the predecessor.
97-
void AMDGPUConditionalDiscard::optimizeBlock(BasicBlock &BB) {
111+
void AMDGPUConditionalDiscard::optimizeBlock(BasicBlock &BB, bool ConvertToDemote) {
98112

99113
if (auto *KillCand = dyn_cast<CallInst>(&BB.front())) {
100114
auto *Callee = KillCand->getCalledFunction();
@@ -111,8 +125,10 @@ void AMDGPUConditionalDiscard::optimizeBlock(BasicBlock &BB) {
111125
return;
112126

113127
// Skip if the kill is in a loop.
114-
if (LI->getLoopFor(PredBlock))
128+
if (LI->getLoopFor(PredBlock)) {
129+
LLVM_DEBUG(dbgs() << "Cannot optimize " << BB.getName() << " due to loop\n");
115130
return;
131+
}
116132

117133
auto *PredTerminator = PredBlock->getTerminator();
118134
auto *PredBranchInst = dyn_cast<BranchInst>(PredTerminator);
@@ -134,6 +150,11 @@ void AMDGPUConditionalDiscard::optimizeBlock(BasicBlock &BB) {
134150

135151
auto *NewKill = cast<CallInst>(KillCand->clone());
136152

153+
if (ConvertToDemote) {
154+
NewKill->setCalledFunction(Intrinsic::getDeclaration(
155+
KillCand->getModule(), Intrinsic::amdgcn_wqm_demote));
156+
}
157+
137158
NewKill->setArgOperand(0, Cond);
138159
NewKill->insertBefore(PredTerminator);
139160

@@ -157,7 +178,7 @@ bool AMDGPUConditionalDiscard::runOnFunction(Function &F) {
157178
LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
158179

159180
for (auto &BB : F)
160-
optimizeBlock(BB);
181+
optimizeBlock(BB, EnableTransformDiscardToDemote);
161182

162183
for (auto *BB : KillBlocksToRemove) {
163184
for (auto *Succ : successors(BB)) {
@@ -173,10 +194,10 @@ bool AMDGPUConditionalDiscard::runOnFunction(Function &F) {
173194
}
174195

175196
INITIALIZE_PASS_BEGIN(AMDGPUConditionalDiscard, DEBUG_TYPE,
176-
"Transform conditional discard", false, false)
197+
"AMDGPUConditionalDiscard", false, false)
177198
INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass)
178199
INITIALIZE_PASS_END(AMDGPUConditionalDiscard, DEBUG_TYPE,
179-
"Transform conditional discard", false, false)
200+
"AMDGPUConditionalDiscard", false, false)
180201

181202
FunctionPass *llvm::createAMDGPUConditionalDiscardPass() {
182203
return new AMDGPUConditionalDiscard();

llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -849,6 +849,9 @@ bool GCNPassConfig::addPreISel() {
849849
// FIXME: We need to run a pass to propagate the attributes when calls are
850850
// supported.
851851

852+
if (EnableConditionalDiscardTransformations)
853+
addPass(createAMDGPUConditionalDiscardPass());
854+
852855
// Merge divergent exit nodes. StructurizeCFG won't recognize the multi-exit
853856
// regions formed by them.
854857
addPass(&AMDGPUUnifyDivergentExitNodesID);
@@ -866,9 +869,6 @@ bool GCNPassConfig::addPreISel() {
866869
// outside of the same library needs to be resolved in llvm core code.
867870
addPass(createLCSSAPass());
868871

869-
if (EnableConditionalDiscardTransformations)
870-
addPass(createAMDGPUConditionalDiscardPass());
871-
872872
addPass(createAMDGPUAnnotateUniformValues());
873873
if (!LateCFGStructurize) {
874874
addPass(createSIAnnotateControlFlowPass());

llvm/test/CodeGen/AMDGPU/discard-optimization.ll

Lines changed: 67 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,11 +1,11 @@
1-
; RUN: llc -amdgpu-conditional-discard-transformations=1 -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
1+
; RUN: llc -amdgpu-conditional-discard-transformations=1 --march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,KILL %s
2+
; RUN: llc -amdgpu-conditional-discard-transformations=1 -amdgpu-transform-discard-to-demote --march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,DEMOTE %s
23

34
; Check that the branch is removed by the discard opt.
45

56
; GCN-LABEL: {{^}}if_with_kill_true_cond:
67
; GCN: v_cmp_ne_u32_e32 vcc,
78
; GCN-NEXT: s_and_b64 exec, exec, vcc
8-
; GCN-NOT: branch
99
define amdgpu_ps void @if_with_kill_true_cond(i32 %arg) {
1010
.entry:
1111
%cmp = icmp eq i32 %arg, 32
@@ -24,7 +24,6 @@ endif:
2424
; GCN-LABEL: {{^}}if_with_kill_false_cond:
2525
; GCN: v_cmp_eq_u32_e32 vcc,
2626
; GCN-NEXT: s_and_b64 exec, exec, vcc
27-
; GCN-NOT: branch
2827
define amdgpu_ps void @if_with_kill_false_cond(i32 %arg) {
2928
.entry:
3029
%cmp = icmp eq i32 %arg, 32
@@ -127,8 +126,73 @@ endif:
127126
ret void
128127
}
129128

129+
130+
; GCN-LABEL: {{^}}wqm_kill_to_demote1:
131+
; GCN-NEXT: ; %.entry
132+
; GCN: s_mov_b64 [[ORIG:s\[[0-9]+:[0-9]+\]]], exec
133+
; GCN: s_wqm_b64 exec, exec
134+
; DEMOTE: s_and_b64 [[LIVE:s\[[0-9]+:[0-9]+\]]], [[ORIG]], vcc
135+
; GCN: image_sample
136+
; GCN: v_add_f32_e32
137+
; DEMOTE: s_and_b64 exec, exec, [[LIVE]]
138+
; KILL: s_and_b64 exec, exec, [[ORIG]]
139+
; GCN: image_sample
140+
define amdgpu_ps <4 x float> @wqm_kill_to_demote1(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, i32 %idx, float %data, float %coord, float %coord2, float %z) {
141+
.entry:
142+
%z.cmp = fcmp olt float %z, 0.0
143+
br i1 %z.cmp, label %.continue, label %.kill
144+
145+
.kill:
146+
call void @llvm.amdgcn.kill(i1 false)
147+
br label %.export
148+
149+
.continue:
150+
%tex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %coord, <8 x i32> %rsrc, <4 x i32> %sampler, i1 0, i32 0, i32 0) #0
151+
%tex0 = extractelement <4 x float> %tex, i32 0
152+
%tex1 = extractelement <4 x float> %tex, i32 0
153+
%coord1 = fadd float %tex0, %tex1
154+
%rtex.src = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %coord1, <8 x i32> %rsrc, <4 x i32> %sampler, i1 0, i32 0, i32 0) #0
155+
br label %.export
156+
157+
.export:
158+
%rtex = phi <4 x float> [ undef, %.kill ], [ %rtex.src, %.continue ]
159+
ret <4 x float> %rtex
160+
}
161+
162+
163+
; GCN-LABEL: {{^}}wqm_kill_to_demote2:
164+
; GCN-NEXT: ; %.entry
165+
; GCN: s_mov_b64 [[ORIG:s\[[0-9]+:[0-9]+\]]], exec
166+
; GCN: s_wqm_b64 exec, exec
167+
; GCN: image_sample
168+
; DEMOTE: s_and_b64 [[LIVE:s\[[0-9]+:[0-9]+\]]], [[ORIG]], vcc
169+
; GCN: v_add_f32_e32
170+
; DEMOTE: s_and_b64 exec, exec, [[LIVE]]
171+
; KILL: s_and_b64 exec, exec, [[ORIG]]
172+
; GCN: image_sample
173+
define amdgpu_ps <4 x float> @wqm_kill_to_demote2(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, i32 %idx, float %data, float %coord, float %coord2, float %z) {
174+
.entry:
175+
%tex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %coord, <8 x i32> %rsrc, <4 x i32> %sampler, i1 0, i32 0, i32 0) #0
176+
%tex0 = extractelement <4 x float> %tex, i32 0
177+
%tex1 = extractelement <4 x float> %tex, i32 0
178+
%z.cmp = fcmp olt float %tex0, 0.0
179+
br i1 %z.cmp, label %.continue, label %.kill
180+
181+
.kill:
182+
call void @llvm.amdgcn.kill(i1 false)
183+
br label %.continue
184+
185+
.continue:
186+
%coord1 = fadd float %tex0, %tex1
187+
%rtex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %coord1, <8 x i32> %rsrc, <4 x i32> %sampler, i1 0, i32 0, i32 0) #0
188+
189+
ret <4 x float> %rtex
190+
}
191+
130192
attributes #0 = { nounwind }
193+
attributes #1 = { nounwind readnone }
131194

132195
declare void @llvm.amdgcn.exp.f32(i32 immarg, i32 immarg, float, float, float, float, i1 immarg, i1 immarg) #0
133196
declare void @llvm.amdgcn.kill(i1) #0
197+
declare <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1
134198

0 commit comments

Comments
 (0)