Skip to content

Commit 80e004f

Browse files
committed
[AMDGPU] Add transformation of kills to demotes in simple scenarios
Convert conditional kills to demotes. Enable this by using the -amdgpu-conditional-discard-transformations and -amdgpu-transform-discard-to-demote options combined.

V2: simplify options. V3: remove extraneous change details and improve comments. V4: fix pass naming.

Change-Id: Ibe152dadd4728462855fe8a413ea55c41e981f1c
1 parent ca9afc9 commit 80e004f

File tree

3 files changed

+98
-13
lines changed

3 files changed

+98
-13
lines changed

llvm/lib/Target/AMDGPU/AMDGPUConditionalDiscard.cpp

Lines changed: 28 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -45,6 +45,14 @@
4545
/// The pass should ideally be placed after code sinking, because some sinking
4646
/// opportunities get lost after the transformation due to the basic block
4747
/// removal.
48+
///
49+
/// Additionally, this pass can be used to transform kill intrinsics optimized
50+
/// as above into demote operations.
51+
/// This provides a workaround for applications which perform a non-uniform
52+
/// "kill" and later compute (implicit) derivatives.
53+
/// Note that in Vulkan, such applications should be fixed to use demote
54+
/// (OpDemoteToHelperInvocationEXT) instead of kill (OpKill).
55+
///
4856

4957
#include "AMDGPU.h"
5058
#include "AMDGPUSubtarget.h"
@@ -59,6 +67,13 @@
5967
using namespace llvm;
6068
using namespace llvm::AMDGPU;
6169

70+
// Enable conditional discard-to-demote transformations
71+
static cl::opt<bool> EnableTransformDiscardToDemote(
72+
"amdgpu-transform-discard-to-demote",
73+
cl::desc("Enable transformation of optimized discards to demotes"),
74+
cl::init(false),
75+
cl::Hidden);
76+
6277
namespace {
6378

6479
class AMDGPUConditionalDiscard : public FunctionPass {
@@ -78,10 +93,9 @@ class AMDGPUConditionalDiscard : public FunctionPass {
7893
AU.addRequiredTransitive<LoopInfoWrapperPass>();
7994
}
8095

81-
8296
StringRef getPassName() const override { return "AMDGPUConditionalDiscard"; }
8397

84-
void optimizeBlock(BasicBlock &BB);
98+
void optimizeBlock(BasicBlock &BB, bool ConvertToDemote);
8599
};
86100

87101
} // namespace
@@ -94,7 +108,7 @@ char &llvm::AMDGPUConditionalDiscardID = AMDGPUConditionalDiscard::ID;
94108
// first instruction is a call to amdgcn_kill, with "false" as argument.
95109
// Transform the branch condition of the block's predecessor and mark
96110
// the block for removal. Clone the call to amdgcn_kill to the predecessor.
97-
void AMDGPUConditionalDiscard::optimizeBlock(BasicBlock &BB) {
111+
void AMDGPUConditionalDiscard::optimizeBlock(BasicBlock &BB, bool ConvertToDemote) {
98112

99113
if (auto *KillCand = dyn_cast<CallInst>(&BB.front())) {
100114
auto *Callee = KillCand->getCalledFunction();
@@ -111,8 +125,10 @@ void AMDGPUConditionalDiscard::optimizeBlock(BasicBlock &BB) {
111125
return;
112126

113127
// Skip if the kill is in a loop.
114-
if (LI->getLoopFor(PredBlock))
128+
if (LI->getLoopFor(PredBlock)) {
129+
LLVM_DEBUG(dbgs() << "Cannot optimize " << BB.getName() << " due to loop\n");
115130
return;
131+
}
116132

117133
auto *PredTerminator = PredBlock->getTerminator();
118134
auto *PredBranchInst = dyn_cast<BranchInst>(PredTerminator);
@@ -134,6 +150,11 @@ void AMDGPUConditionalDiscard::optimizeBlock(BasicBlock &BB) {
134150

135151
auto *NewKill = cast<CallInst>(KillCand->clone());
136152

153+
if (ConvertToDemote) {
154+
NewKill->setCalledFunction(Intrinsic::getDeclaration(
155+
KillCand->getModule(), Intrinsic::amdgcn_wqm_demote));
156+
}
157+
137158
NewKill->setArgOperand(0, Cond);
138159
NewKill->insertBefore(PredTerminator);
139160

@@ -157,7 +178,7 @@ bool AMDGPUConditionalDiscard::runOnFunction(Function &F) {
157178
LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
158179

159180
for (auto &BB : F)
160-
optimizeBlock(BB);
181+
optimizeBlock(BB, EnableTransformDiscardToDemote);
161182

162183
for (auto *BB : KillBlocksToRemove) {
163184
for (auto *Succ : successors(BB)) {
@@ -173,10 +194,10 @@ bool AMDGPUConditionalDiscard::runOnFunction(Function &F) {
173194
}
174195

175196
INITIALIZE_PASS_BEGIN(AMDGPUConditionalDiscard, DEBUG_TYPE,
176-
"Transform conditional discard", false, false)
197+
"AMDGPUConditionalDiscard", false, false)
177198
INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass)
178199
INITIALIZE_PASS_END(AMDGPUConditionalDiscard, DEBUG_TYPE,
179-
"Transform conditional discard", false, false)
200+
"AMDGPUConditionalDiscard", false, false)
180201

181202
FunctionPass *llvm::createAMDGPUConditionalDiscardPass() {
182203
return new AMDGPUConditionalDiscard();

llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -849,6 +849,9 @@ bool GCNPassConfig::addPreISel() {
849849
// FIXME: We need to run a pass to propagate the attributes when calls are
850850
// supported.
851851

852+
if (EnableConditionalDiscardTransformations)
853+
addPass(createAMDGPUConditionalDiscardPass());
854+
852855
// Merge divergent exit nodes. StructurizeCFG won't recognize the multi-exit
853856
// regions formed by them.
854857
addPass(&AMDGPUUnifyDivergentExitNodesID);
@@ -866,9 +869,6 @@ bool GCNPassConfig::addPreISel() {
866869
// outside of the same library needs to be resolved in llvm core code.
867870
addPass(createLCSSAPass());
868871

869-
if (EnableConditionalDiscardTransformations)
870-
addPass(createAMDGPUConditionalDiscardPass());
871-
872872
addPass(createAMDGPUAnnotateUniformValues());
873873
if (!LateCFGStructurize) {
874874
addPass(createSIAnnotateControlFlowPass());

llvm/test/CodeGen/AMDGPU/discard-optimization.ll

Lines changed: 67 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,11 +1,11 @@
1-
; RUN: llc -amdgpu-conditional-discard-transformations=1 -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
1+
; RUN: llc -amdgpu-conditional-discard-transformations=1 --march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,KILL %s
2+
; RUN: llc -amdgpu-conditional-discard-transformations=1 -amdgpu-transform-discard-to-demote --march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,DEMOTE %s
23

34
; Check that the branch is removed by the discard opt.
45

56
; GCN-LABEL: {{^}}if_with_kill_true_cond:
67
; GCN: v_cmp_ne_u32_e32 vcc,
78
; GCN-NEXT: s_and_b64 exec, exec, vcc
8-
; GCN-NOT: branch
99
define amdgpu_ps void @if_with_kill_true_cond(i32 %arg) {
1010
.entry:
1111
%cmp = icmp eq i32 %arg, 32
@@ -24,7 +24,6 @@ endif:
2424
; GCN-LABEL: {{^}}if_with_kill_false_cond:
2525
; GCN: v_cmp_eq_u32_e32 vcc,
2626
; GCN-NEXT: s_and_b64 exec, exec, vcc
27-
; GCN-NOT: branch
2827
define amdgpu_ps void @if_with_kill_false_cond(i32 %arg) {
2928
.entry:
3029
%cmp = icmp eq i32 %arg, 32
@@ -127,8 +126,73 @@ endif:
127126
ret void
128127
}
129128

129+
130+
; GCN-LABEL: {{^}}wqm_kill_to_demote1:
131+
; GCN-NEXT: ; %.entry
132+
; GCN: s_mov_b64 [[ORIG:s\[[0-9]+:[0-9]+\]]], exec
133+
; GCN: s_wqm_b64 exec, exec
134+
; DEMOTE: s_and_b64 [[LIVE:s\[[0-9]+:[0-9]+\]]], [[ORIG]], vcc
135+
; GCN: image_sample
136+
; GCN: v_add_f32_e32
137+
; DEMOTE: s_and_b64 exec, exec, [[LIVE]]
138+
; KILL: s_and_b64 exec, exec, [[ORIG]]
139+
; GCN: image_sample
140+
define amdgpu_ps <4 x float> @wqm_kill_to_demote1(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, i32 %idx, float %data, float %coord, float %coord2, float %z) {
141+
.entry:
142+
%z.cmp = fcmp olt float %z, 0.0
143+
br i1 %z.cmp, label %.continue, label %.kill
144+
145+
.kill:
146+
call void @llvm.amdgcn.kill(i1 false)
147+
br label %.export
148+
149+
.continue:
150+
%tex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %coord, <8 x i32> %rsrc, <4 x i32> %sampler, i1 0, i32 0, i32 0) #0
151+
%tex0 = extractelement <4 x float> %tex, i32 0
152+
%tex1 = extractelement <4 x float> %tex, i32 0
153+
%coord1 = fadd float %tex0, %tex1
154+
%rtex.src = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %coord1, <8 x i32> %rsrc, <4 x i32> %sampler, i1 0, i32 0, i32 0) #0
155+
br label %.export
156+
157+
.export:
158+
%rtex = phi <4 x float> [ undef, %.kill ], [ %rtex.src, %.continue ]
159+
ret <4 x float> %rtex
160+
}
161+
162+
163+
; GCN-LABEL: {{^}}wqm_kill_to_demote2:
164+
; GCN-NEXT: ; %.entry
165+
; GCN: s_mov_b64 [[ORIG:s\[[0-9]+:[0-9]+\]]], exec
166+
; GCN: s_wqm_b64 exec, exec
167+
; GCN: image_sample
168+
; DEMOTE: s_and_b64 [[LIVE:s\[[0-9]+:[0-9]+\]]], [[ORIG]], vcc
169+
; GCN: v_add_f32_e32
170+
; DEMOTE: s_and_b64 exec, exec, [[LIVE]]
171+
; KILL: s_and_b64 exec, exec, [[ORIG]]
172+
; GCN: image_sample
173+
define amdgpu_ps <4 x float> @wqm_kill_to_demote2(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, i32 %idx, float %data, float %coord, float %coord2, float %z) {
174+
.entry:
175+
%tex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %coord, <8 x i32> %rsrc, <4 x i32> %sampler, i1 0, i32 0, i32 0) #0
176+
%tex0 = extractelement <4 x float> %tex, i32 0
177+
%tex1 = extractelement <4 x float> %tex, i32 0
178+
%z.cmp = fcmp olt float %tex0, 0.0
179+
br i1 %z.cmp, label %.continue, label %.kill
180+
181+
.kill:
182+
call void @llvm.amdgcn.kill(i1 false)
183+
br label %.continue
184+
185+
.continue:
186+
%coord1 = fadd float %tex0, %tex1
187+
%rtex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %coord1, <8 x i32> %rsrc, <4 x i32> %sampler, i1 0, i32 0, i32 0) #0
188+
189+
ret <4 x float> %rtex
190+
}
191+
130192
attributes #0 = { nounwind }
193+
attributes #1 = { nounwind readnone }
131194

132195
declare void @llvm.amdgcn.exp.f32(i32 immarg, i32 immarg, float, float, float, float, i1 immarg, i1 immarg) #0
133196
declare void @llvm.amdgcn.kill(i1) #0
197+
declare <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1
134198

0 commit comments

Comments
 (0)