Skip to content

Commit ca9afc9

Browse files
piotrAMD authored and perlfu committed
[AMDGPU] Optimize conditional discard
Add a new IR level pass to the AMDGPU backend to optimize conditional discard in the pixel shaders. This pass transforms conditional discard of the form: if (condition) discard (false); into: discard (!condition); The transformation is useful, because removing basic blocks simplifies CFG and limits the number of phi nodes used. Change-Id: Ie6eaa42618f999e431e8ac514f0f2535ce26c93e
1 parent 3df0df0 commit ca9afc9

File tree

7 files changed

+337
-2
lines changed

7 files changed

+337
-2
lines changed

llvm/lib/Target/AMDGPU/AMDGPU.h

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -91,6 +91,10 @@ FunctionPass *createAMDGPUAtomicOptimizerPass();
9191
void initializeAMDGPUAtomicOptimizerPass(PassRegistry &);
9292
extern char &AMDGPUAtomicOptimizerID;
9393

94+
FunctionPass *createAMDGPUConditionalDiscardPass();
95+
void initializeAMDGPUConditionalDiscardPass(PassRegistry &);
96+
extern char &AMDGPUConditionalDiscardID;
97+
9498
ModulePass *createAMDGPULowerIntrinsicsPass();
9599
void initializeAMDGPULowerIntrinsicsPass(PassRegistry &);
96100
extern char &AMDGPULowerIntrinsicsID;
Lines changed: 183 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,183 @@
1+
//===-- AMDGPUConditionalDiscard.cpp
2+
//-----------------------------------------===//
3+
//
4+
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
5+
// See https://llvm.org/LICENSE.txt for license information.
6+
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
7+
//
8+
//===----------------------------------------------------------------------===//
9+
//
10+
/// \file
11+
/// This pass transforms conditional discard of the form:
12+
///
13+
/// if (condition)
14+
/// discard (false);
15+
///
16+
/// into:
17+
///
18+
/// discard (!condition);
19+
///
20+
///
21+
/// More specifically,
22+
///
23+
/// ...
24+
/// block:
25+
/// %cond = icmp eq i32 %a, %b
26+
/// br i1 %cond, label %kill_block, label %cont_block
27+
///
28+
/// kill_block:
29+
/// call void @llvm.amdgcn.kill(i1 false)
30+
/// br label %other
31+
/// ...
32+
///
33+
/// gets transformed into:
34+
///
35+
/// ...
36+
/// block:
37+
/// %cond = icmp eq i32 %a, %b
38+
/// %nonkill = not i1 %cond
39+
/// call void @llvm.amdgcn.kill(i1 %nonkill)
40+
/// br label %cont_block
41+
/// ...
42+
///
43+
/// The transformation is useful, because removing basic blocks simplifies CFG
44+
/// and limits the number of phi nodes used.
45+
/// The pass should ideally be placed after code sinking, because some sinking
46+
/// opportunities get lost after the transformation due to the basic block
47+
/// removal.
48+
49+
#include "AMDGPU.h"
50+
#include "AMDGPUSubtarget.h"
51+
#include "llvm/Analysis/LoopInfo.h"
52+
#include "llvm/IR/IRBuilder.h"
53+
#include "llvm/IR/InstVisitor.h"
54+
#include "llvm/InitializePasses.h"
55+
#include "llvm/Transforms/Utils/BasicBlockUtils.h"
56+
57+
#define DEBUG_TYPE "amdgpu-conditional-discard"
58+
59+
using namespace llvm;
60+
using namespace llvm::AMDGPU;
61+
62+
namespace {

/// Legacy-PM function pass that folds a conditional "kill(false)" block into
/// its predecessor's branch condition (see the file header for the exact
/// transformation performed).
class AMDGPUConditionalDiscard : public FunctionPass {
private:
  // Kill blocks made redundant by optimizeBlock(); erased at the end of
  // runOnFunction() so CFG iteration stays valid during the scan.
  SmallVector<BasicBlock *, 4> KillBlocksToRemove;

  // Loop analysis for the current function; kills reached from inside a
  // loop are skipped.
  const LoopInfo *LI;

public:
  static char ID;

  AMDGPUConditionalDiscard() : FunctionPass(ID) {}

  bool runOnFunction(Function &F) override;

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.addRequiredTransitive<LoopInfoWrapperPass>();
  }

  StringRef getPassName() const override { return "AMDGPUConditionalDiscard"; }

  // Attempt the fold for a single candidate kill block.
  void optimizeBlock(BasicBlock &BB);
};

} // namespace

char AMDGPUConditionalDiscard::ID = 0;

char &llvm::AMDGPUConditionalDiscardID = AMDGPUConditionalDiscard::ID;
92+
93+
// Look for a basic block that has only a single predecessor and its
94+
// first instruction is a call to amdgcn_kill, with "false" as argument.
95+
// Transform the branch condition of the block's predecessor and mark
96+
// the block for removal. Clone the call to amdgcn_kill to the predecessor.
97+
void AMDGPUConditionalDiscard::optimizeBlock(BasicBlock &BB) {
98+
99+
if (auto *KillCand = dyn_cast<CallInst>(&BB.front())) {
100+
auto *Callee = KillCand->getCalledFunction();
101+
if (!Callee || Callee->getIntrinsicID() != Intrinsic::amdgcn_kill) {
102+
return;
103+
}
104+
105+
ConstantInt *Val = dyn_cast<ConstantInt>(KillCand->getOperand(0));
106+
if (!Val || !Val->isZero())
107+
return;
108+
109+
auto *PredBlock = BB.getSinglePredecessor();
110+
if (!PredBlock)
111+
return;
112+
113+
// Skip if the kill is in a loop.
114+
if (LI->getLoopFor(PredBlock))
115+
return;
116+
117+
auto *PredTerminator = PredBlock->getTerminator();
118+
auto *PredBranchInst = dyn_cast<BranchInst>(PredTerminator);
119+
120+
if (!PredBranchInst || !PredBranchInst->isConditional())
121+
return;
122+
123+
BasicBlock *LiveBlock = nullptr;
124+
auto *Cond = PredBranchInst->getCondition();
125+
126+
if (PredBranchInst->getSuccessor(0) == &BB) {
127+
// The old kill block could only be reached if
128+
// the condition was true - negate the condition.
129+
Cond = BinaryOperator::CreateNot(Cond, "", PredTerminator);
130+
LiveBlock = PredBranchInst->getSuccessor(1);
131+
} else {
132+
LiveBlock = PredBranchInst->getSuccessor(0);
133+
}
134+
135+
auto *NewKill = cast<CallInst>(KillCand->clone());
136+
137+
NewKill->setArgOperand(0, Cond);
138+
NewKill->insertBefore(PredTerminator);
139+
140+
KillBlocksToRemove.push_back(&BB);
141+
142+
// Change the branch to an unconditional one, targeting the live block.
143+
auto *NewBranchInst = BranchInst::Create(LiveBlock, PredBranchInst);
144+
NewBranchInst->copyMetadata(*PredBranchInst);
145+
PredBranchInst->eraseFromParent();
146+
}
147+
}
148+
149+
bool AMDGPUConditionalDiscard::runOnFunction(Function &F) {
  // Discard (kill) semantics only apply to pixel shaders.
  if (F.getCallingConv() != CallingConv::AMDGPU_PS)
    return false;

  if (skipFunction(F))
    return false;

  LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();

  // Scan every block; eligible kill blocks are folded into their
  // predecessors and queued for deletion (not erased yet, so this
  // iteration stays valid).
  for (BasicBlock &Block : F)
    optimizeBlock(Block);

  const bool Changed = !KillBlocksToRemove.empty();

  // Erase the now-unreachable kill blocks, first dropping their incoming
  // values from any phi nodes in their successors.
  for (BasicBlock *Dead : KillBlocksToRemove) {
    for (BasicBlock *Succ : successors(Dead))
      for (PHINode &Phi : Succ->phis())
        Phi.removeIncomingValue(Dead);
    Dead->eraseFromParent();
  }
  KillBlocksToRemove.clear();

  return Changed;
}
174+
175+
// Register the pass with the legacy pass manager and declare its dependency
// on LoopInfo.
INITIALIZE_PASS_BEGIN(AMDGPUConditionalDiscard, DEBUG_TYPE,
                      "Transform conditional discard", false, false)
INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass)
INITIALIZE_PASS_END(AMDGPUConditionalDiscard, DEBUG_TYPE,
                    "Transform conditional discard", false, false)

// Factory used by the AMDGPU target machine to add this pass to the pipeline.
FunctionPass *llvm::createAMDGPUConditionalDiscardPass() {
  return new AMDGPUConditionalDiscard();
}

llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -165,6 +165,13 @@ static cl::opt<bool> EnableAtomicOptimizations(
165165
cl::init(false),
166166
cl::Hidden);
167167

168+
// Enable conditional discard transformations
169+
static cl::opt<bool> EnableConditionalDiscardTransformations(
170+
"amdgpu-conditional-discard-transformations",
171+
cl::desc("Enable conditional discard transformations"),
172+
cl::init(false),
173+
cl::Hidden);
174+
168175
// Enable Mode register optimization
169176
static cl::opt<bool> EnableSIModeRegisterPass(
170177
"amdgpu-mode-register",
@@ -215,6 +222,7 @@ extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeAMDGPUTarget() {
215222
initializeAMDGPUAnnotateUniformValuesPass(*PR);
216223
initializeAMDGPUArgumentUsageInfoPass(*PR);
217224
initializeAMDGPUAtomicOptimizerPass(*PR);
225+
initializeAMDGPUConditionalDiscardPass(*PR);
218226
initializeAMDGPULowerKernelArgumentsPass(*PR);
219227
initializeAMDGPULowerKernelAttributesPass(*PR);
220228
initializeAMDGPULowerIntrinsicsPass(*PR);
@@ -848,6 +856,7 @@ bool GCNPassConfig::addPreISel() {
848856
addPass(createStructurizeCFGPass(true)); // true -> SkipUniformRegions
849857
}
850858
addPass(createSinkingPass());
859+
851860
// This is a temporary fix for the issue of dealing with in loop uniform values
852861
// where the uses out of the loop are non-uniform. LCSSA creates a PHI at the
853862
// loop exit such that can be marked divergent and can be passed onto ISEL
@@ -856,6 +865,10 @@ bool GCNPassConfig::addPreISel() {
856865
// therefore can't be preserved in LCSSA as needed. The linking/preserving
857866
// outside of the same library needs to be resolved in llvm core code.
858867
addPass(createLCSSAPass());
868+
869+
if (EnableConditionalDiscardTransformations)
870+
addPass(createAMDGPUConditionalDiscardPass());
871+
859872
addPass(createAMDGPUAnnotateUniformValues());
860873
if (!LateCFGStructurize) {
861874
addPass(createSIAnnotateControlFlowPass());

llvm/lib/Target/AMDGPU/CMakeLists.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -40,6 +40,7 @@ add_llvm_target(AMDGPUCodeGen
4040
AMDGPUArgumentUsageInfo.cpp
4141
AMDGPUAsmPrinter.cpp
4242
AMDGPUAtomicOptimizer.cpp
43+
AMDGPUConditionalDiscard.cpp
4344
AMDGPUCallLowering.cpp
4445
AMDGPUCodeGenPrepare.cpp
4546
AMDGPUFixFunctionBitcasts.cpp
Lines changed: 134 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,134 @@
; RUN: llc -amdgpu-conditional-discard-transformations=1 -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s

; Check that the branch is removed by the discard opt.

; GCN-LABEL: {{^}}if_with_kill_true_cond:
; GCN: v_cmp_ne_u32_e32 vcc,
; GCN-NEXT: s_and_b64 exec, exec, vcc
; GCN-NOT: branch
define amdgpu_ps void @if_with_kill_true_cond(i32 %arg) {
.entry:
  %cmp = icmp eq i32 %arg, 32
  br i1 %cmp, label %then, label %endif

then:
  tail call void @llvm.amdgcn.kill(i1 false)
  br label %endif

endif:
  ret void
}

; Check that the branch is removed by the discard opt.

; GCN-LABEL: {{^}}if_with_kill_false_cond:
; GCN: v_cmp_eq_u32_e32 vcc,
; GCN-NEXT: s_and_b64 exec, exec, vcc
; GCN-NOT: branch
define amdgpu_ps void @if_with_kill_false_cond(i32 %arg) {
.entry:
  %cmp = icmp eq i32 %arg, 32
  br i1 %cmp, label %endif, label %then

then:
  tail call void @llvm.amdgcn.kill(i1 false)
  br label %endif

endif:
  ret void
}

; Check that the branch exiting the loop is a divergent one (s_cbranch_vccnz).
; This test exercises a loop with kill as the only exit.

; GCN-LABEL: {{^}}kill_with_loop_exit:
; GCN: s_cbranch_vccnz
; GCN: s_cbranch_vccnz
define amdgpu_ps void @kill_with_loop_exit(float inreg %inp0, float inreg %inp1, <4 x i32> inreg %inp2, float inreg %inp3) {
.entry:
  %tmp24 = fcmp olt float %inp0, 1.280000e+02
  %tmp25 = fcmp olt float %inp1, 1.280000e+02
  %tmp26 = and i1 %tmp24, %tmp25
  br i1 %tmp26, label %bb35, label %.preheader1.preheader

.preheader1.preheader:                            ; preds = %.entry
  %tmp31 = fcmp ogt float %inp3, 0.0
  br label %bb

bb:                                               ; preds = %bb, %.preheader1.preheader
  %tmp30 = phi float [ %tmp32, %bb ], [ 1.500000e+00, %.preheader1.preheader ]
  %tmp32 = fadd reassoc nnan nsz arcp contract float %tmp30, 2.500000e-01
  %tmp34 = fadd reassoc nnan nsz arcp contract float %tmp30, 2.500000e-01
  br i1 %tmp31, label %bb, label %bb33

bb33:                                             ; preds = %bb
  call void @llvm.amdgcn.kill(i1 false)
  br label %bb35

bb35:                                             ; preds = %bb33, %.entry
  %tmp36 = phi float [ %tmp34, %bb33 ], [ 1.000000e+00, %.entry ]
  ; NOTE(review): attribute group #3 is referenced here but only #0 is
  ; defined below - verify this assembles as intended.
  call void @llvm.amdgcn.exp.f32(i32 immarg 0, i32 immarg 15, float %tmp36, float %tmp36, float %tmp36, float %tmp36, i1 immarg true, i1 immarg true) #3
  ret void
}

; Check that the kill inside a loop is not optimized away.

; GCN-LABEL: {{^}}if_with_loop_kill_after:
; GCN: s_and_saveexec_b64 [[SAVEEXEC:s\[[0-9:]+\]]],
; GCN-NEXT: s_xor_b64 s[{{[0-9:]+}}], exec, [[SAVEEXEC]]
define amdgpu_ps void @if_with_loop_kill_after(i32 %arg) {
.entry:
  %cmp = icmp eq i32 %arg, 32
  br i1 %cmp, label %then, label %endif

then:
  %sub = sub i32 %arg, 1
  br label %loop

loop:
  %ind = phi i32 [%sub, %then], [%dec, %loop]
  %dec = sub i32 %ind, 1
  %cc = icmp ne i32 %ind, 0
  br i1 %cc, label %loop, label %break

break:
  tail call void @llvm.amdgcn.kill(i1 false)
  br label %endif

endif:
  ret void
}

; Check that the kill inside a loop is not optimized away.

; GCN-LABEL: {{^}}if_with_kill_inside_loop:
; GCN: s_and_saveexec_b64 [[SAVEEXEC:s\[[0-9:]+\]]],
; GCN-NEXT: s_xor_b64 s[{{[0-9:]+}}], exec, [[SAVEEXEC]]
define amdgpu_ps void @if_with_kill_inside_loop(i32 %arg) {
.entry:
  %cmp = icmp eq i32 %arg, 32
  br i1 %cmp, label %then, label %endif

then:
  %sub = sub i32 %arg, 1
  br label %loop

loop:
  %ind = phi i32 [%sub, %then], [%dec, %loop]
  %dec = sub i32 %ind, 1
  %cc = icmp ne i32 %ind, 0
  tail call void @llvm.amdgcn.kill(i1 false)
  br i1 %cc, label %loop, label %break

break:
  br label %endif

endif:
  ret void
}

attributes #0 = { nounwind }

declare void @llvm.amdgcn.exp.f32(i32 immarg, i32 immarg, float, float, float, float, i1 immarg, i1 immarg) #0
declare void @llvm.amdgcn.kill(i1) #0

llvm/test/CodeGen/AMDGPU/si-lower-control-flow-kill.ll

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
1+
; RUN: llc -amdgpu-conditional-discard-transformations=0 -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
22

33
; GCN-LABEL: {{^}}if_with_kill:
44
; GCN: s_and_saveexec_b64 [[SAVEEXEC:s\[[0-9:]+\]]],

llvm/test/CodeGen/AMDGPU/skip-if-dead.ll

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck %s
1+
; RUN: llc -amdgpu-conditional-discard-transformations=0 -march=amdgcn -verify-machineinstrs < %s | FileCheck %s
22

33
; CHECK-LABEL: {{^}}test_kill_depth_0_imm_pos:
44
; CHECK-NEXT: ; %bb.0:

0 commit comments

Comments
 (0)