-
Notifications
You must be signed in to change notification settings - Fork 13.6k
[AMDGPU][SIPreEmitPeephole] pre-commit tests: mustRetainExeczBranch: use a cost model #109816
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
[AMDGPU][SIPreEmitPeephole] pre-commit tests: mustRetainExeczBranch: use a cost model #109816
Conversation
@llvm/pr-subscribers-llvm-transforms @llvm/pr-subscribers-backend-amdgpu Author: Juan Manuel Martinez Caamaño (jmmartinez) Changes: Patch is 20.53 KiB, truncated to 20.00 KiB below; full version: https://github.com/llvm/llvm-project/pull/109816.diff 3 Files Affected:
diff --git a/llvm/lib/Transforms/Scalar/StructurizeCFG.cpp b/llvm/lib/Transforms/Scalar/StructurizeCFG.cpp
index aca8225cebb3fd..563ce402fd44fe 100644
--- a/llvm/lib/Transforms/Scalar/StructurizeCFG.cpp
+++ b/llvm/lib/Transforms/Scalar/StructurizeCFG.cpp
@@ -30,6 +30,7 @@
#include "llvm/IR/Metadata.h"
#include "llvm/IR/PassManager.h"
#include "llvm/IR/PatternMatch.h"
+#include "llvm/IR/ProfDataUtils.h"
#include "llvm/IR/Type.h"
#include "llvm/IR/Use.h"
#include "llvm/IR/Value.h"
@@ -47,6 +48,7 @@
#include "llvm/Transforms/Utils/SSAUpdater.h"
#include <algorithm>
#include <cassert>
+#include <optional>
#include <utility>
using namespace llvm;
@@ -85,7 +87,46 @@ using PhiMap = MapVector<PHINode *, BBValueVector>;
using BB2BBVecMap = MapVector<BasicBlock *, BBVector>;
using BBPhiMap = DenseMap<BasicBlock *, PhiMap>;
-using BBPredicates = DenseMap<BasicBlock *, Value *>;
+
+using MaybeCondBranchWeights = std::optional<class CondBranchWeights>;
+
+class CondBranchWeights { // The two branch weights of a conditional branch, kept as a (true, false) pair.
+ uint32_t TrueWeight; // Filled from Weights[0] in tryParse; written first by setMetadata.
+ uint32_t FalseWeight; // Filled from Weights[1] in tryParse; written second by setMetadata.
+
+public:
+ CondBranchWeights(unsigned T, unsigned F) : TrueWeight(T), FalseWeight(F) {}
+// Read the weights attached to Br; yields std::nullopt when extraction fails or the weight count is not exactly two.
+ static MaybeCondBranchWeights tryParse(const BranchInst &Br) {
+ assert(Br.isConditional());
+
+ SmallVector<uint32_t, 2> Weights;
+ if (!extractBranchWeights(Br, Weights))
+ return std::nullopt;
+
+ if (Weights.size() != 2)
+ return std::nullopt;
+
+ return CondBranchWeights{Weights[0], Weights[1]};
+ }
+// Attach Weights to Br through setBranchWeights; does nothing when Weights is empty.
+ static void setMetadata(BranchInst &Br,
+ MaybeCondBranchWeights const &Weights) {
+ assert(Br.isConditional());
+ if (!Weights)
+ return;
+ uint32_t Arr[] = {Weights->TrueWeight, Weights->FalseWeight};
+ setBranchWeights(Br, Arr, false); // NOTE(review): the trailing `false` is presumably the IsExpected flag; confirm against ProfDataUtils.h.
+ }
+// Return a copy with the true and false weights swapped.
+ CondBranchWeights invert() const {
+ return CondBranchWeights{FalseWeight, TrueWeight};
+ }
+};
+
+using ValueWeightPair = std::pair<Value *, MaybeCondBranchWeights>;
+
+using BBPredicates = DenseMap<BasicBlock *, ValueWeightPair>;
using PredMap = DenseMap<BasicBlock *, BBPredicates>;
using BB2BBMap = DenseMap<BasicBlock *, BasicBlock *>;
@@ -271,7 +312,7 @@ class StructurizeCFG {
void analyzeLoops(RegionNode *N);
- Value *buildCondition(BranchInst *Term, unsigned Idx, bool Invert);
+ ValueWeightPair buildCondition(BranchInst *Term, unsigned Idx, bool Invert);
void gatherPredicates(RegionNode *N);
@@ -449,16 +490,22 @@ void StructurizeCFG::analyzeLoops(RegionNode *N) {
}
/// Build the condition for one edge
-Value *StructurizeCFG::buildCondition(BranchInst *Term, unsigned Idx,
- bool Invert) {
+ValueWeightPair StructurizeCFG::buildCondition(BranchInst *Term, unsigned Idx,
+ bool Invert) {
Value *Cond = Invert ? BoolFalse : BoolTrue;
+ MaybeCondBranchWeights Weights = std::nullopt; // Stays empty when Term is unconditional.
+
if (Term->isConditional()) {
Cond = Term->getCondition();
+ Weights = CondBranchWeights::tryParse(*Term); // Empty when Term has no usable two-entry weights.
- if (Idx != (unsigned)Invert)
+ if (Idx != (unsigned)Invert) {
Cond = invertCondition(Cond);
+ if (Weights) // Swap the weights so they keep matching the inverted condition.
+ Weights = Weights->invert();
+ }
}
- return Cond;
+ return {Cond, Weights};
}
/// Analyze the predecessors of each block and build up predicates
@@ -490,8 +537,8 @@ void StructurizeCFG::gatherPredicates(RegionNode *N) {
if (Visited.count(Other) && !Loops.count(Other) &&
!Pred.count(Other) && !Pred.count(P)) {
- Pred[Other] = BoolFalse;
- Pred[P] = BoolTrue;
+ Pred[Other] = {BoolFalse, std::nullopt};
+ Pred[P] = {BoolTrue, std::nullopt};
continue;
}
}
@@ -512,9 +559,9 @@ void StructurizeCFG::gatherPredicates(RegionNode *N) {
BasicBlock *Entry = R->getEntry();
if (Visited.count(Entry))
- Pred[Entry] = BoolTrue;
+ Pred[Entry] = {BoolTrue, std::nullopt};
else
- LPred[Entry] = BoolFalse;
+ LPred[Entry] = {BoolFalse, std::nullopt};
}
}
}
@@ -578,12 +625,14 @@ void StructurizeCFG::insertConditions(bool Loops) {
Dominator.addBlock(Parent);
Value *ParentValue = nullptr;
- for (std::pair<BasicBlock *, Value *> BBAndPred : Preds) {
+ MaybeCondBranchWeights ParentWeights = std::nullopt;
+ for (std::pair<BasicBlock *, ValueWeightPair> BBAndPred : Preds) {
BasicBlock *BB = BBAndPred.first;
- Value *Pred = BBAndPred.second;
+ Value *Pred = BBAndPred.second.first;
if (BB == Parent) {
ParentValue = Pred;
+ ParentWeights = BBAndPred.second.second;
break;
}
PhiInserter.AddAvailableValue(BB, Pred);
@@ -592,6 +641,7 @@ void StructurizeCFG::insertConditions(bool Loops) {
if (ParentValue) {
Term->setCondition(ParentValue);
+ CondBranchWeights::setMetadata(*Term, ParentWeights);
} else {
if (!Dominator.resultIsRememberedBlock())
PhiInserter.AddAvailableValue(Dominator.result(), Default);
@@ -607,7 +657,7 @@ void StructurizeCFG::simplifyConditions() {
for (auto &I : concat<PredMap::value_type>(Predicates, LoopPreds)) {
auto &Preds = I.second;
for (auto &J : Preds) {
- auto &Cond = J.second;
+ auto &Cond = J.second.first;
Instruction *Inverted;
if (match(Cond, m_Not(m_OneUse(m_Instruction(Inverted)))) &&
!Cond->use_empty()) {
@@ -904,9 +954,10 @@ void StructurizeCFG::setPrevNode(BasicBlock *BB) {
/// Does BB dominate all the predicates of Node?
bool StructurizeCFG::dominatesPredicates(BasicBlock *BB, RegionNode *Node) {
BBPredicates &Preds = Predicates[Node->getEntry()];
- return llvm::all_of(Preds, [&](std::pair<BasicBlock *, Value *> Pred) {
- return DT->dominates(BB, Pred.first);
- });
+ return llvm::all_of(Preds,
+ [&](std::pair<BasicBlock *, ValueWeightPair> Pred) {
+ return DT->dominates(BB, Pred.first);
+ });
}
/// Can we predict that this node will always be called?
@@ -918,9 +969,9 @@ bool StructurizeCFG::isPredictableTrue(RegionNode *Node) {
if (!PrevNode)
return true;
- for (std::pair<BasicBlock*, Value*> Pred : Preds) {
+ for (std::pair<BasicBlock *, ValueWeightPair> Pred : Preds) {
BasicBlock *BB = Pred.first;
- Value *V = Pred.second;
+ Value *V = Pred.second.first;
if (V != BoolTrue)
return false;
diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-demote-scc-branches.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-demote-scc-branches.ll
new file mode 100644
index 00000000000000..33865c04b3fe92
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/amdgpu-demote-scc-branches.ll
@@ -0,0 +1,265 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 < %s | FileCheck -check-prefixes=GFX10,GFX1010 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1030 < %s | FileCheck -check-prefixes=GFX10,GFX1030 %s
+
+define void @convergent_cmp_no_metadata(i32 noundef inreg %value, ptr addrspace(8) nocapture writeonly inreg %res, i32 noundef inreg %v_offset, i32 noundef inreg %0, i32 noundef inreg %flag) {
+; GFX10-LABEL: convergent_cmp_no_metadata:
+; GFX10: ; %bb.0: ; %entry
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: s_cmp_lt_i32 s21, 1
+; GFX10-NEXT: s_cbranch_scc1 .LBB0_2
+; GFX10-NEXT: ; %bb.1: ; %if.then
+; GFX10-NEXT: v_mov_b32_e32 v0, s6
+; GFX10-NEXT: v_mov_b32_e32 v1, s19
+; GFX10-NEXT: s_mov_b32 s11, s18
+; GFX10-NEXT: s_mov_b32 s10, s17
+; GFX10-NEXT: s_mov_b32 s9, s16
+; GFX10-NEXT: s_mov_b32 s8, s7
+; GFX10-NEXT: buffer_store_dword v0, v1, s[8:11], 0 offen
+; GFX10-NEXT: .LBB0_2: ; %if.end
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+entry:
+ %cmp = icmp sgt i32 %flag, 0
+ br i1 %cmp, label %if.then, label %if.end
+
+if.then:
+ tail call void @llvm.amdgcn.raw.ptr.buffer.store.i32(i32 %value, ptr addrspace(8) %res, i32 %v_offset, i32 0, i32 0)
+ br label %if.end
+
+if.end:
+ call void @llvm.amdgcn.s.waitcnt(i32 0)
+ ret void
+}
+
+define void @convergent_cmp_unprofitable(i32 noundef inreg %value, ptr addrspace(8) nocapture writeonly inreg %res, i32 noundef inreg %v_offset, i32 noundef inreg %0, i32 noundef inreg %flag) {
+; GFX10-LABEL: convergent_cmp_unprofitable:
+; GFX10: ; %bb.0: ; %entry
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: s_cmp_lt_i32 s21, 1
+; GFX10-NEXT: s_cbranch_scc1 .LBB1_2
+; GFX10-NEXT: ; %bb.1: ; %if.then
+; GFX10-NEXT: v_mov_b32_e32 v0, s6
+; GFX10-NEXT: v_mov_b32_e32 v1, s19
+; GFX10-NEXT: s_mov_b32 s11, s18
+; GFX10-NEXT: s_mov_b32 s10, s17
+; GFX10-NEXT: s_mov_b32 s9, s16
+; GFX10-NEXT: s_mov_b32 s8, s7
+; GFX10-NEXT: buffer_store_dword v0, v1, s[8:11], 0 offen
+; GFX10-NEXT: .LBB1_2: ; %if.end
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+entry:
+ %cmp = icmp sgt i32 %flag, 0
+ br i1 %cmp, label %if.then, label %if.end, !prof !0
+
+if.then:
+ tail call void @llvm.amdgcn.raw.ptr.buffer.store.i32(i32 %value, ptr addrspace(8) %res, i32 %v_offset, i32 0, i32 0)
+ br label %if.end
+
+if.end:
+ call void @llvm.amdgcn.s.waitcnt(i32 0)
+ ret void
+}
+
+define void @convergent_cmp_profitable(i32 noundef inreg %value, ptr addrspace(8) nocapture writeonly inreg %res, i32 noundef inreg %v_offset, i32 noundef inreg %0, i32 noundef inreg %flag) {
+; GFX10-LABEL: convergent_cmp_profitable:
+; GFX10: ; %bb.0: ; %entry
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: s_cmp_lt_i32 s21, 1
+; GFX10-NEXT: s_cbranch_scc1 .LBB2_2
+; GFX10-NEXT: ; %bb.1: ; %if.then
+; GFX10-NEXT: v_mov_b32_e32 v0, s6
+; GFX10-NEXT: v_mov_b32_e32 v1, s19
+; GFX10-NEXT: s_mov_b32 s11, s18
+; GFX10-NEXT: s_mov_b32 s10, s17
+; GFX10-NEXT: s_mov_b32 s9, s16
+; GFX10-NEXT: s_mov_b32 s8, s7
+; GFX10-NEXT: buffer_store_dword v0, v1, s[8:11], 0 offen
+; GFX10-NEXT: .LBB2_2: ; %if.end
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+entry:
+ %cmp = icmp sgt i32 %flag, 0
+ br i1 %cmp, label %if.then, label %if.end, !prof !1
+
+if.then:
+ tail call void @llvm.amdgcn.raw.ptr.buffer.store.i32(i32 %value, ptr addrspace(8) %res, i32 %v_offset, i32 0, i32 0)
+ br label %if.end
+
+if.end:
+ call void @llvm.amdgcn.s.waitcnt(i32 0)
+ ret void
+}
+
+define void @divergent_cmp_no_metadata(i32 noundef inreg %value, ptr addrspace(8) nocapture writeonly inreg %res, i32 noundef inreg %v_offset, i32 noundef inreg %0, i32 noundef inreg %flag) {
+; GFX1010-LABEL: divergent_cmp_no_metadata:
+; GFX1010: ; %bb.0: ; %entry
+; GFX1010-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1010-NEXT: v_and_b32_e32 v0, 0x3ff, v31
+; GFX1010-NEXT: v_cmp_gt_i32_e32 vcc_lo, s21, v0
+; GFX1010-NEXT: s_and_saveexec_b32 s4, vcc_lo
+; GFX1010-NEXT: s_cbranch_execz .LBB3_2
+; GFX1010-NEXT: ; %bb.1: ; %if.then
+; GFX1010-NEXT: v_mov_b32_e32 v0, s6
+; GFX1010-NEXT: v_mov_b32_e32 v1, s19
+; GFX1010-NEXT: s_mov_b32 s11, s18
+; GFX1010-NEXT: s_mov_b32 s10, s17
+; GFX1010-NEXT: s_mov_b32 s9, s16
+; GFX1010-NEXT: s_mov_b32 s8, s7
+; GFX1010-NEXT: buffer_store_dword v0, v1, s[8:11], 0 offen
+; GFX1010-NEXT: .LBB3_2: ; %if.end
+; GFX1010-NEXT: s_waitcnt_depctr 0xffe3
+; GFX1010-NEXT: s_or_b32 exec_lo, exec_lo, s4
+; GFX1010-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1010-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX1030-LABEL: divergent_cmp_no_metadata:
+; GFX1030: ; %bb.0: ; %entry
+; GFX1030-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1030-NEXT: v_and_b32_e32 v0, 0x3ff, v31
+; GFX1030-NEXT: s_mov_b32 s4, exec_lo
+; GFX1030-NEXT: v_cmpx_gt_i32_e64 s21, v0
+; GFX1030-NEXT: s_cbranch_execz .LBB3_2
+; GFX1030-NEXT: ; %bb.1: ; %if.then
+; GFX1030-NEXT: v_mov_b32_e32 v0, s6
+; GFX1030-NEXT: v_mov_b32_e32 v1, s19
+; GFX1030-NEXT: s_mov_b32 s11, s18
+; GFX1030-NEXT: s_mov_b32 s10, s17
+; GFX1030-NEXT: s_mov_b32 s9, s16
+; GFX1030-NEXT: s_mov_b32 s8, s7
+; GFX1030-NEXT: buffer_store_dword v0, v1, s[8:11], 0 offen
+; GFX1030-NEXT: .LBB3_2: ; %if.end
+; GFX1030-NEXT: s_or_b32 exec_lo, exec_lo, s4
+; GFX1030-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1030-NEXT: s_setpc_b64 s[30:31]
+entry:
+ %id = call i32 @llvm.amdgcn.workitem.id.x()
+ %cmp = icmp sgt i32 %flag, %id
+ br i1 %cmp, label %if.then, label %if.end
+
+if.then:
+ tail call void @llvm.amdgcn.raw.ptr.buffer.store.i32(i32 %value, ptr addrspace(8) %res, i32 %v_offset, i32 0, i32 0)
+ br label %if.end
+
+if.end:
+ call void @llvm.amdgcn.s.waitcnt(i32 0)
+ ret void
+}
+
+define void @divergent_cmp_unprofitable(i32 noundef inreg %value, ptr addrspace(8) nocapture writeonly inreg %res, i32 noundef inreg %v_offset, i32 noundef inreg %0, i32 noundef inreg %flag) {
+; GFX1010-LABEL: divergent_cmp_unprofitable:
+; GFX1010: ; %bb.0: ; %entry
+; GFX1010-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1010-NEXT: v_and_b32_e32 v0, 0x3ff, v31
+; GFX1010-NEXT: v_cmp_gt_i32_e32 vcc_lo, s21, v0
+; GFX1010-NEXT: s_and_saveexec_b32 s4, vcc_lo
+; GFX1010-NEXT: s_cbranch_execz .LBB4_2
+; GFX1010-NEXT: ; %bb.1: ; %if.then
+; GFX1010-NEXT: v_mov_b32_e32 v0, s6
+; GFX1010-NEXT: v_mov_b32_e32 v1, s19
+; GFX1010-NEXT: s_mov_b32 s11, s18
+; GFX1010-NEXT: s_mov_b32 s10, s17
+; GFX1010-NEXT: s_mov_b32 s9, s16
+; GFX1010-NEXT: s_mov_b32 s8, s7
+; GFX1010-NEXT: buffer_store_dword v0, v1, s[8:11], 0 offen
+; GFX1010-NEXT: .LBB4_2: ; %if.end
+; GFX1010-NEXT: s_waitcnt_depctr 0xffe3
+; GFX1010-NEXT: s_or_b32 exec_lo, exec_lo, s4
+; GFX1010-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1010-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX1030-LABEL: divergent_cmp_unprofitable:
+; GFX1030: ; %bb.0: ; %entry
+; GFX1030-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1030-NEXT: v_and_b32_e32 v0, 0x3ff, v31
+; GFX1030-NEXT: s_mov_b32 s4, exec_lo
+; GFX1030-NEXT: v_cmpx_gt_i32_e64 s21, v0
+; GFX1030-NEXT: s_cbranch_execz .LBB4_2
+; GFX1030-NEXT: ; %bb.1: ; %if.then
+; GFX1030-NEXT: v_mov_b32_e32 v0, s6
+; GFX1030-NEXT: v_mov_b32_e32 v1, s19
+; GFX1030-NEXT: s_mov_b32 s11, s18
+; GFX1030-NEXT: s_mov_b32 s10, s17
+; GFX1030-NEXT: s_mov_b32 s9, s16
+; GFX1030-NEXT: s_mov_b32 s8, s7
+; GFX1030-NEXT: buffer_store_dword v0, v1, s[8:11], 0 offen
+; GFX1030-NEXT: .LBB4_2: ; %if.end
+; GFX1030-NEXT: s_or_b32 exec_lo, exec_lo, s4
+; GFX1030-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1030-NEXT: s_setpc_b64 s[30:31]
+entry:
+ %id = call i32 @llvm.amdgcn.workitem.id.x()
+ %cmp = icmp sgt i32 %flag, %id
+ br i1 %cmp, label %if.then, label %if.end, !prof !0
+
+if.then:
+ tail call void @llvm.amdgcn.raw.ptr.buffer.store.i32(i32 %value, ptr addrspace(8) %res, i32 %v_offset, i32 0, i32 0)
+ br label %if.end
+
+if.end:
+ call void @llvm.amdgcn.s.waitcnt(i32 0)
+ ret void
+}
+
+define void @divergent_cmp_profitable(i32 noundef inreg %value, ptr addrspace(8) nocapture writeonly inreg %res, i32 noundef inreg %v_offset, i32 noundef inreg %0, i32 noundef inreg %flag) {
+; GFX1010-LABEL: divergent_cmp_profitable:
+; GFX1010: ; %bb.0: ; %entry
+; GFX1010-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1010-NEXT: v_and_b32_e32 v0, 0x3ff, v31
+; GFX1010-NEXT: v_cmp_gt_i32_e32 vcc_lo, s21, v0
+; GFX1010-NEXT: s_and_saveexec_b32 s4, vcc_lo
+; GFX1010-NEXT: s_cbranch_execz .LBB5_2
+; GFX1010-NEXT: ; %bb.1: ; %if.then
+; GFX1010-NEXT: v_mov_b32_e32 v0, s6
+; GFX1010-NEXT: v_mov_b32_e32 v1, s19
+; GFX1010-NEXT: s_mov_b32 s11, s18
+; GFX1010-NEXT: s_mov_b32 s10, s17
+; GFX1010-NEXT: s_mov_b32 s9, s16
+; GFX1010-NEXT: s_mov_b32 s8, s7
+; GFX1010-NEXT: buffer_store_dword v0, v1, s[8:11], 0 offen
+; GFX1010-NEXT: .LBB5_2: ; %if.end
+; GFX1010-NEXT: s_waitcnt_depctr 0xffe3
+; GFX1010-NEXT: s_or_b32 exec_lo, exec_lo, s4
+; GFX1010-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1010-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX1030-LABEL: divergent_cmp_profitable:
+; GFX1030: ; %bb.0: ; %entry
+; GFX1030-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1030-NEXT: v_and_b32_e32 v0, 0x3ff, v31
+; GFX1030-NEXT: s_mov_b32 s4, exec_lo
+; GFX1030-NEXT: v_cmpx_gt_i32_e64 s21, v0
+; GFX1030-NEXT: s_cbranch_execz .LBB5_2
+; GFX1030-NEXT: ; %bb.1: ; %if.then
+; GFX1030-NEXT: v_mov_b32_e32 v0, s6
+; GFX1030-NEXT: v_mov_b32_e32 v1, s19
+; GFX1030-NEXT: s_mov_b32 s11, s18
+; GFX1030-NEXT: s_mov_b32 s10, s17
+; GFX1030-NEXT: s_mov_b32 s9, s16
+; GFX1030-NEXT: s_mov_b32 s8, s7
+; GFX1030-NEXT: buffer_store_dword v0, v1, s[8:11], 0 offen
+; GFX1030-NEXT: .LBB5_2: ; %if.end
+; GFX1030-NEXT: s_or_b32 exec_lo, exec_lo, s4
+; GFX1030-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1030-NEXT: s_setpc_b64 s[30:31]
+entry:
+ %id = call i32 @llvm.amdgcn.workitem.id.x()
+ %cmp = icmp sgt i32 %flag, %id
+ br i1 %cmp, label %if.then, label %if.end, !prof !1
+
+if.then:
+ tail call void @llvm.amdgcn.raw.ptr.buffer.store.i32(i32 %value, ptr addrspace(8) %res, i32 %v_offset, i32 0, i32 0)
+ br label %if.end
+
+if.end:
+ call void @llvm.amdgcn.s.waitcnt(i32 0)
+ ret void
+}
+
+declare void @llvm.amdgcn.raw.ptr.buffer.store.i32(i32, ptr addrspace(8) nocapture writeonly, i32, i32, i32 immarg)
+declare void @llvm.amdgcn.s.waitcnt(i32)
+declare i32 @llvm.amdgcn.workitem.id.x()
+
+!0 = !{!"branch_weights", i32 1000, i32 1000}
+!1 = !{!"branch_weights", i32 2000, i32 1}
diff --git a/llvm/test/CodeGen/AMDGPU/structurizer-keep-perf-md.ll b/llvm/test/CodeGen/AMDGPU/structurizer-keep-perf-md.ll
new file mode 100644
index 00000000000000..d036d6cbca7b9d
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/structurizer-keep-perf-md.ll
@@ -0,0 +1,76 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
+; RUN: opt -mtriple=amdgcn-- -S -structurizecfg %s | FileCheck -check-prefix=OPT %s
+
+define amdgpu_ps i32 @if_else(i32 %0) {
+; OPT-LABEL: define amdgpu_ps i32 @if_else(
+; OPT-SAME: i32 [[TMP0:%.*]]) {
+; OPT-NEXT: [[C:%.*]] = icmp ne i32 [[TMP0]], 0
+; OPT-NEXT: br i1 [[C]], label %[[FALSE:.*]], label %[[FLOW:.*]], !prof [[PROF0:![0-9]+]]
+; OPT: [[FLOW]]:
+; OPT-NEXT: [[TMP2:%.*]] = phi i32 [ 33, %[[FALSE]] ], [ undef, [[TMP1:%.*]] ]
+; OPT-NEXT: [[TMP3:%.*]] = phi i1 [ false, %[[FALSE]] ], [ true, [[TMP1]] ]
+; OPT-NEXT: br i1 [[TMP3]], label %[[TRUE:.*]], label %[[EXIT:.*]]
+; OPT: [[TRUE]]:
+; OPT-NEXT: br label %[[EXIT]]
+; OPT: [[FALSE]]:
+; OPT-NEXT: br label %[[FLOW]]
+; OPT: [[EXIT]]:
+; OPT-NEXT: [[RET:%.*]] = phi i32 [ [[TMP2]], %[[FLOW]] ], [ 42, %[[TRUE]] ]
+; OPT-NEXT: ret i32 [[RET]]
+;
+ %c = icmp eq i32 %0, 0
+ br i1 %c, label %true, label %false, !prof !0
+
+true: ; preds = %1
+ br label %exit
+
+false: ; preds = %1
+ br label %exit
+
+exit: ; preds = %false, %true
+ %ret = phi i32 [ 42, %true ], [ 33, %false ]
+ ret i32 %ret
+}
+
+define amdgpu_ps void @loop_if_break(i32 %n) {
+; OPT-LABEL: define amdgpu_ps void @loop_if_break(
+; OPT-SAME: i32 [[N:%.*]]) {
+; OPT-NEXT: [[ENTRY:.*]]:
+; OPT-NEXT: br label %[[LOOP:.*]]
+; OPT: [[LOOP]]:
+; OPT-NEXT: [[I:%.*]] = phi i32 [ [[N]], %[[ENTRY]] ], [ [[TMP0:%.*]], %[[FLOW:.*]] ]
+; OPT-NEXT: [[C:%.*]] = icmp ugt i32 [[I]], 0
+; OPT-NEXT: br i1 [[C]], label %[[LOOP_BODY:.*]], label %[[FLOW]], !prof [[PROF1:![0-9]+]]
+; OPT: [[LOOP_BODY]]:
+; OPT-NEXT: [[I_NEXT:%.*]] = sub i32 [[I]], 1
+; OPT-NEXT: br label %[[FLOW]]
+; OPT: [[FLOW]]:
+; OPT-NEXT: [[TMP0]] = phi i32 [ [[I_NEXT]], %[[LOOP_BODY]] ], [ undef, %[[LOOP]] ]
+; OPT-NEXT: [[TMP1:%.*]] = phi i1 [ false, %[[LOOP_BODY]] ], [ true, %[[LOOP]] ]
+; OPT-NEXT: br i1 [[TMP1]], label %[[EXIT:.*]], label %[[LOOP]]
+; OPT: [[EXIT]]:
+; OPT-NEXT: ret void
+;
+entry:
+ br label %loop
+
+loop: ; preds = %loop_body, %entry
+ %i = phi i32 [ %n, %entry ], [ %i.next, %loop_body ]
+ %...
[truncated]
|
Only look at the last commit. The PRs for the first commits are: pre-commit tests and implementation |
3422bab
to
2454c68
Compare
2454c68
to
1a442e1
Compare
LLVM Buildbot has detected a new failure on a builder. Full details are available at: https://lab.llvm.org/buildbot/#/builders/177/builds/5642 Here is the relevant piece of the build log for reference:
|
…use a cost model (llvm#109816)
No description provided.