Skip to content

Commit f10dc76

Browse files
authored
[AMDGPU][NPM] Port SIInsertWaitcnts to NPM (#130061)
1 parent 0237216 commit f10dc76

7 files changed

+78
-34
lines changed

llvm/lib/Target/AMDGPU/AMDGPU.h

Lines changed: 8 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -378,6 +378,13 @@ class AMDGPUMarkLastScratchLoadPass
378378
MachineFunctionAnalysisManager &AM);
379379
};
380380

381+
/// New-pass-manager wrapper for the SI insert-waitcnts machine function pass.
/// It is registered under the name "si-insert-waitcnts" in
/// AMDGPUPassRegistry.def.
class SIInsertWaitcntsPass : public PassInfoMixin<SIInsertWaitcntsPass> {
382+
public:
383+
/// Runs the pass on \p MF, pulling the analyses it needs from \p MFAM, and
/// returns the set of analyses that remain valid afterwards.
PreservedAnalyses run(MachineFunction &MF,
384+
MachineFunctionAnalysisManager &MFAM);
385+
/// Always reports the pass as required. NOTE(review): under the new pass
/// manager convention this should keep pass instrumentation (e.g. bisection
/// or optnone handling) from skipping it -- confirm against the LLVM docs.
static bool isRequired() { return true; }
386+
};
387+
381388
FunctionPass *createAMDGPUAnnotateUniformValuesLegacy();
382389

383390
ModulePass *createAMDGPUPrintfRuntimeBinding();
@@ -454,7 +461,7 @@ extern char &AMDGPUInsertDelayAluID;
454461
void initializeSIInsertHardClausesPass(PassRegistry &);
455462
extern char &SIInsertHardClausesID;
456463

457-
void initializeSIInsertWaitcntsPass(PassRegistry&);
464+
void initializeSIInsertWaitcntsLegacyPass(PassRegistry &);
458465
extern char &SIInsertWaitcntsID;
459466

460467
void initializeSIFormMemoryClausesLegacyPass(PassRegistry &);

llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -111,6 +111,7 @@ MACHINE_FUNCTION_PASS("si-fix-vgpr-copies", SIFixVGPRCopiesPass())
111111
MACHINE_FUNCTION_PASS("si-fold-operands", SIFoldOperandsPass());
112112
MACHINE_FUNCTION_PASS("si-form-memory-clauses", SIFormMemoryClausesPass())
113113
MACHINE_FUNCTION_PASS("si-i1-copies", SILowerI1CopiesPass())
114+
MACHINE_FUNCTION_PASS("si-insert-waitcnts", SIInsertWaitcntsPass())
114115
MACHINE_FUNCTION_PASS("si-load-store-opt", SILoadStoreOptimizerPass())
115116
MACHINE_FUNCTION_PASS("si-lower-control-flow", SILowerControlFlowPass())
116117
MACHINE_FUNCTION_PASS("si-lower-sgpr-spills", SILowerSGPRSpillsPass())
@@ -133,7 +134,6 @@ DUMMY_MACHINE_FUNCTION_PASS("amdgpu-rewrite-partial-reg-uses", GCNRewritePartial
133134
DUMMY_MACHINE_FUNCTION_PASS("amdgpu-set-wave-priority", AMDGPUSetWavePriorityPass())
134135

135136
DUMMY_MACHINE_FUNCTION_PASS("si-insert-hard-clauses", SIInsertHardClausesPass())
136-
DUMMY_MACHINE_FUNCTION_PASS("si-insert-waitcnts", SIInsertWaitcntsPass())
137137
DUMMY_MACHINE_FUNCTION_PASS("si-late-branch-lowering", SILateBranchLoweringPass())
138138
DUMMY_MACHINE_FUNCTION_PASS("si-pre-emit-peephole", SIPreEmitPeepholePass())
139139
// TODO: Move amdgpu-preload-kern-arg-prolog to MACHINE_FUNCTION_PASS since it

llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -536,7 +536,7 @@ extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeAMDGPUTarget() {
536536
initializeSIAnnotateControlFlowLegacyPass(*PR);
537537
initializeAMDGPUInsertDelayAluLegacyPass(*PR);
538538
initializeSIInsertHardClausesPass(*PR);
539-
initializeSIInsertWaitcntsPass(*PR);
539+
initializeSIInsertWaitcntsLegacyPass(*PR);
540540
initializeSIModeRegisterLegacyPass(*PR);
541541
initializeSIWholeQuadModeLegacyPass(*PR);
542542
initializeSILowerControlFlowLegacyPass(*PR);
@@ -2158,7 +2158,7 @@ void AMDGPUCodeGenPassBuilder::addPreEmitPass(AddMachinePass &addPass) const {
21582158
}
21592159

21602160
addPass(SIMemoryLegalizerPass());
2161-
// TODO: addPass(SIInsertWaitcntsPass());
2161+
addPass(SIInsertWaitcntsPass());
21622162

21632163
// TODO: addPass(SIModeRegisterPass());
21642164

llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp

Lines changed: 63 additions & 30 deletions
Original file line numberDiff line numberDiff line change
@@ -33,6 +33,7 @@
3333
#include "llvm/ADT/Sequence.h"
3434
#include "llvm/Analysis/AliasAnalysis.h"
3535
#include "llvm/CodeGen/MachineLoopInfo.h"
36+
#include "llvm/CodeGen/MachinePassManager.h"
3637
#include "llvm/CodeGen/MachinePostDominators.h"
3738
#include "llvm/Support/DebugCounter.h"
3839
#include "llvm/TargetParser/TargetParser.h"
@@ -597,7 +598,7 @@ class WaitcntGeneratorGFX12Plus : public WaitcntGenerator {
597598
AMDGPU::Waitcnt getAllZeroWaitcnt(bool IncludeVSCnt) const override;
598599
};
599600

600-
class SIInsertWaitcnts : public MachineFunctionPass {
601+
class SIInsertWaitcnts {
601602
private:
602603
const GCNSubtarget *ST = nullptr;
603604
const SIInstrInfo *TII = nullptr;
@@ -636,9 +637,9 @@ class SIInsertWaitcnts : public MachineFunctionPass {
636637
InstCounterType MaxCounter = NUM_NORMAL_INST_CNTS;
637638

638639
public:
639-
static char ID;
640-
641-
SIInsertWaitcnts() : MachineFunctionPass(ID) {
640+
SIInsertWaitcnts(MachineLoopInfo *MLI, MachinePostDominatorTree *PDT,
641+
AliasAnalysis *AA)
642+
: MLI(MLI), PDT(PDT), AA(AA) {
642643
(void)ForceExpCounter;
643644
(void)ForceLgkmCounter;
644645
(void)ForceVMCounter;
@@ -648,20 +649,7 @@ class SIInsertWaitcnts : public MachineFunctionPass {
648649
bool isPreheaderToFlush(MachineBasicBlock &MBB,
649650
WaitcntBrackets &ScoreBrackets);
650651
bool isVMEMOrFlatVMEM(const MachineInstr &MI) const;
651-
bool runOnMachineFunction(MachineFunction &MF) override;
652-
653-
StringRef getPassName() const override {
654-
return "SI insert wait instructions";
655-
}
656-
657-
void getAnalysisUsage(AnalysisUsage &AU) const override {
658-
AU.setPreservesCFG();
659-
AU.addRequired<MachineLoopInfoWrapperPass>();
660-
AU.addRequired<MachinePostDominatorTreeWrapperPass>();
661-
AU.addUsedIfAvailable<AAResultsWrapperPass>();
662-
AU.addPreserved<AAResultsWrapperPass>();
663-
MachineFunctionPass::getAnalysisUsage(AU);
664-
}
652+
bool run(MachineFunction &MF);
665653

666654
bool isForceEmitWaitcnt() const {
667655
for (auto T : inst_counter_types())
@@ -749,6 +737,27 @@ class SIInsertWaitcnts : public MachineFunctionPass {
749737
WaitcntBrackets &ScoreBrackets);
750738
};
751739

740+
/// Legacy-pass-manager wrapper around SIInsertWaitcnts. The pass logic itself
/// lives in the SIInsertWaitcnts class; this wrapper only declares the
/// analyses it needs and hosts the legacy pass identity.
class SIInsertWaitcntsLegacy : public MachineFunctionPass {
741+
public:
742+
// Pass identification; its address is exported as llvm::SIInsertWaitcntsID.
static char ID;
743+
SIInsertWaitcntsLegacy() : MachineFunctionPass(ID) {}
744+
745+
/// Entry point invoked by the legacy pass manager for each machine function.
bool runOnMachineFunction(MachineFunction &MF) override;
746+
747+
/// Human-readable pass name.
StringRef getPassName() const override {
748+
return "SI insert wait instructions";
749+
}
750+
751+
/// Declares analysis requirements: the pass preserves the CFG, requires
/// machine loop info and the machine post-dominator tree, and uses alias
/// analysis only if it is already available (and preserves it).
void getAnalysisUsage(AnalysisUsage &AU) const override {
752+
AU.setPreservesCFG();
753+
AU.addRequired<MachineLoopInfoWrapperPass>();
754+
AU.addRequired<MachinePostDominatorTreeWrapperPass>();
755+
// Optional dependency: AA is consumed when present but never forced.
AU.addUsedIfAvailable<AAResultsWrapperPass>();
756+
AU.addPreserved<AAResultsWrapperPass>();
757+
MachineFunctionPass::getAnalysisUsage(AU);
758+
}
759+
};
760+
752761
} // end anonymous namespace
753762

754763
RegInterval WaitcntBrackets::getRegInterval(const MachineInstr *MI,
@@ -1133,19 +1142,19 @@ bool WaitcntBrackets::counterOutOfOrder(InstCounterType T) const {
11331142
return hasMixedPendingEvents(T);
11341143
}
11351144

1136-
INITIALIZE_PASS_BEGIN(SIInsertWaitcnts, DEBUG_TYPE, "SI Insert Waitcnts", false,
1137-
false)
1145+
INITIALIZE_PASS_BEGIN(SIInsertWaitcntsLegacy, DEBUG_TYPE, "SI Insert Waitcnts",
1146+
false, false)
11381147
INITIALIZE_PASS_DEPENDENCY(MachineLoopInfoWrapperPass)
11391148
INITIALIZE_PASS_DEPENDENCY(MachinePostDominatorTreeWrapperPass)
1140-
INITIALIZE_PASS_END(SIInsertWaitcnts, DEBUG_TYPE, "SI Insert Waitcnts", false,
1141-
false)
1149+
INITIALIZE_PASS_END(SIInsertWaitcntsLegacy, DEBUG_TYPE, "SI Insert Waitcnts",
1150+
false, false)
11421151

1143-
char SIInsertWaitcnts::ID = 0;
1152+
char SIInsertWaitcntsLegacy::ID = 0;
11441153

1145-
char &llvm::SIInsertWaitcntsID = SIInsertWaitcnts::ID;
1154+
char &llvm::SIInsertWaitcntsID = SIInsertWaitcntsLegacy::ID;
11461155

11471156
FunctionPass *llvm::createSIInsertWaitcntsPass() {
1148-
return new SIInsertWaitcnts();
1157+
return new SIInsertWaitcntsLegacy();
11491158
}
11501159

11511160
static bool updateOperandIfDifferent(MachineInstr &MI, AMDGPU::OpName OpName,
@@ -2481,16 +2490,40 @@ bool SIInsertWaitcnts::shouldFlushVmCnt(MachineLoop *ML,
24812490
return HasVMemLoad && UsesVgprLoadedOutside && ST->hasVmemWriteVgprInOrder();
24822491
}
24832492

2484-
bool SIInsertWaitcnts::runOnMachineFunction(MachineFunction &MF) {
2493+
/// Legacy pass manager entry point: gathers the analyses declared in
/// getAnalysisUsage and forwards to the shared SIInsertWaitcnts
/// implementation. Returns whatever SIInsertWaitcnts::run returns for \p MF.
bool SIInsertWaitcntsLegacy::runOnMachineFunction(MachineFunction &MF) {
2494+
// Required analyses.
auto *MLI = &getAnalysis<MachineLoopInfoWrapperPass>().getLI();
2495+
auto *PDT =
2496+
&getAnalysis<MachinePostDominatorTreeWrapperPass>().getPostDomTree();
2497+
// Alias analysis is optional: AA stays null unless the wrapper pass is
// already available.
AliasAnalysis *AA = nullptr;
2498+
if (auto *AAR = getAnalysisIfAvailable<AAResultsWrapperPass>())
2499+
AA = &AAR->getAAResults();
2500+
2501+
// A fresh implementation object is built for every function.
return SIInsertWaitcnts(MLI, PDT, AA).run(MF);
2502+
}
2503+
2504+
/// New pass manager entry point: fetches the analyses from \p MFAM, runs the
/// shared SIInsertWaitcnts implementation on \p MF, and reports which
/// analyses are preserved.
PreservedAnalyses
2505+
SIInsertWaitcntsPass::run(MachineFunction &MF,
2506+
MachineFunctionAnalysisManager &MFAM) {
2507+
// Loop info and the post-dominator tree are requested via getResult.
auto *MLI = &MFAM.getResult<MachineLoopAnalysis>(MF);
2508+
auto *PDT = &MFAM.getResult<MachinePostDominatorTreeAnalysis>(MF);
2509+
// Alias analysis is only looked up as a cached result on the IR function via
// the function-analysis-manager proxy. NOTE(review): AA is presumably null
// when no cached AAManager result exists -- confirm.
auto *AA = MFAM.getResult<FunctionAnalysisManagerMachineFunctionProxy>(MF)
2510+
.getManager()
2511+
.getCachedResult<AAManager>(MF.getFunction());
2512+
2513+
// If run() returns false, every analysis is reported as preserved.
if (!SIInsertWaitcnts(MLI, PDT, AA).run(MF))
2514+
return PreservedAnalyses::all();
2515+
2516+
// Otherwise: the standard machine-function-pass preserved set, plus the CFG
// analyses and alias analysis, matching the legacy getAnalysisUsage.
return getMachineFunctionPassPreservedAnalyses()
2517+
.preserveSet<CFGAnalyses>()
2518+
.preserve<AAManager>();
2519+
}
2520+
2521+
bool SIInsertWaitcnts::run(MachineFunction &MF) {
24852522
ST = &MF.getSubtarget<GCNSubtarget>();
24862523
TII = ST->getInstrInfo();
24872524
TRI = &TII->getRegisterInfo();
24882525
MRI = &MF.getRegInfo();
24892526
const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
2490-
MLI = &getAnalysis<MachineLoopInfoWrapperPass>().getLI();
2491-
PDT = &getAnalysis<MachinePostDominatorTreeWrapperPass>().getPostDomTree();
2492-
if (auto *AAR = getAnalysisIfAvailable<AAResultsWrapperPass>())
2493-
AA = &AAR->getAAResults();
24942527

24952528
AMDGPU::IsaVersion IV = AMDGPU::getIsaVersion(ST->getCPU());
24962529

llvm/test/CodeGen/AMDGPU/call-waw-waitcnt.mir

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
11
# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
22
# RUN: llc -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs -run-pass=si-insert-waitcnts %s -o - | FileCheck -check-prefix=GCN %s
3+
# RUN: llc -mtriple=amdgcn -mcpu=gfx900 -passes=si-insert-waitcnts %s -o - | FileCheck -check-prefix=GCN %s
34

45
# $sgpr30_sgpr31 will hold the return address. We need a waitcnt before SI_CALL so
56
# that the return address is not clobbered in the callee by the outstanding load.

llvm/test/CodeGen/AMDGPU/insert-waitcnts-hang.mir

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
11
# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 4
22
# RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -run-pass si-insert-waitcnts %s -o - | FileCheck %s
3+
# RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -passes si-insert-waitcnts %s -o - | FileCheck %s
34

45
---
56
name: test

llvm/test/CodeGen/AMDGPU/vccz-corrupt-bug-workaround.mir

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,8 @@
22
# RUN: llc -run-pass=si-insert-waitcnts -mtriple=amdgcn -mcpu=gfx900 -o - %s | FileCheck %s -check-prefixes=CHECK,GFX9
33
# RUN: llc -run-pass=si-insert-waitcnts -mtriple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize64 -o - %s | FileCheck %s
44
# RUN: llc -run-pass=si-insert-waitcnts -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize64 -o - %s | FileCheck %s
5+
6+
# RUN: llc -passes=si-insert-waitcnts -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize64 -o - %s | FileCheck %s
57
---
68
# CHECK-LABEL: name: vccz_corrupt_workaround
79
# CHECK: $vcc = V_CMP_EQ_F32

0 commit comments

Comments
 (0)