-
Notifications
You must be signed in to change notification settings - Fork 13.5k
[AMDGPU][NPM] Support -regalloc-npm options #129035
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Conversation
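@llvm/pr-subscribers-llvm-regalloc @llvm/pr-subscribers-backend-amdgpu

Author: Akshat Oke (optimisan)

Changes: Regalloc options for AMDGPU are `-{phase}-regalloc-npm={type}`, where `phase=sgpr|vgpr|wwm` and `type=greedy|fast...` (common regalloc pass types).

Full diff: https://github.com/llvm/llvm-project/pull/129035.diff

3 Files Affected: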
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
index 7bf6e8f671db8..100f3e26a368e 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
@@ -77,6 +77,7 @@
#include "llvm/CodeGen/MachineLICM.h"
#include "llvm/CodeGen/MachineScheduler.h"
#include "llvm/CodeGen/Passes.h"
+#include "llvm/CodeGen/RegAllocFast.h"
#include "llvm/CodeGen/RegAllocRegistry.h"
#include "llvm/CodeGen/TargetPassConfig.h"
#include "llvm/IR/IntrinsicsAMDGPU.h"
@@ -188,6 +189,24 @@ static cl::opt<WWMRegisterRegAlloc::FunctionPassCtor, false,
cl::init(&useDefaultRegisterAllocator),
cl::desc("Register allocator to use for WWM registers"));
+// Per-phase register allocator selection for the new pass manager. Each
+// option is hidden and starts out as RegAllocType::Default.
+static cl::opt<RegAllocType, false, RegAllocTypeParser> SGPRRegAllocTypeNPM(
+    "sgpr-regalloc-npm", cl::Hidden,
+    cl::desc("Register allocator to use for SGPRs in new pass manager"),
+    cl::init(RegAllocType::Default));
+
+static cl::opt<RegAllocType, false, RegAllocTypeParser> VGPRRegAllocTypeNPM(
+    "vgpr-regalloc-npm", cl::Hidden,
+    cl::desc("Register allocator to use for VGPRs in new pass manager"),
+    cl::init(RegAllocType::Default));
+
+static cl::opt<RegAllocType, false, RegAllocTypeParser> WWMRegAllocTypeNPM(
+    "wwm-regalloc-npm", cl::Hidden,
+    cl::desc("Register allocator to use for WWM registers in new pass manager"),
+    cl::init(RegAllocType::Default));
+
+
static void initializeDefaultSGPRRegisterAllocatorOnce() {
RegisterRegAlloc::FunctionPassCtor Ctor = SGPRRegisterRegAlloc::getDefault();
@@ -2140,6 +2159,215 @@ void AMDGPUCodeGenPassBuilder::addMachineSSAOptimization(
addPass(SIShrinkInstructionsPass());
}
+// Error text used when Opt.RegAlloc is set: AMDGPU only honors the per-phase
+// options named in the message.
+static const char NPMRegAllocOptNotSupportedMessage[] =
+    "-regalloc-npm not supported with amdgcn. "
+    "Use -sgpr-regalloc-npm, -wwm-regalloc-npm, and -vgpr-regalloc-npm";
+
+// void AMDGPUCodeGenPassBuilder::addSGPRRegAlloc(AddMachinePass &addPass,
+// RegAllocType RAType, RegAllocFilterFunc FilterFunc, bool Optimized) const {
+// RegAllocType RAType = RegAllocTypeNPM;
+// if (RAType == RegAllocType::Default) {
+// RAType = Optimized ? RegAllocType::Greedy : RegAllocType::Fast;
+// }
+
+// if (RAType == RegAllocType::Greedy) {
+// addPass(RAGreedyPass({onlyAllocateSGPRs, "sgpr"}));
+// return;
+// }
+
+// if (RAType == RegAllocType::Fast) {
+// addPass(RegAllocFastPass({onlyAllocateSGPRs, "sgpr", false}));
+// return;
+// }
+// report_fatal_error("Unsupported SGPR regalloc type", false);
+// }
+
+// template<typename RegAllocPass>
+// void AMDGPUCodeGenPassBuilder::addRegAllocOfType(AddMachinePass &addPass,
+// RegAllocPass::Options Options) {
+// addPass(RegAllocPass(Options));
+// }
+
+// this is the final method
+// template<typename RegAllocPass>
+// void AMDGPUCodeGenPassBuilder::addRegAllocOfType(AddMachinePass &addPass,
+// RegAllocPhase Phase) {
+// #define RA_OPTIONS(FilterFunc, Name, ClearVirtRegs) \
+// [&]() { \
+// if constexpr (std::is_same_v<RegAllocPass, RegAllocFastPass>) { \
+// return RegAllocFastPass::Options{FilterFunc, Name, ClearVirtRegs}; \
+// } else { \
+// return typename RegAllocPass::Options{FilterFunc, Name}; \
+// } \
+// }()
+
+// typename RegAllocPass::Options Options;
+// RegAllocType RAType;
+
+// switch (Phase) {
+// case RegAllocPhase::SGPR:
+// Options = RA_OPTIONS(onlyAllocateSGPRs, "sgpr", false);
+// RAType = SGPRRegAllocTypeNPM;
+// break;
+// case RegAllocPhase::WWM:
+// Options = RA_OPTIONS(onlyAllocateWWMRegs, "wwm", false);
+// RAType = WWMRegAllocTypeNPM;
+// break;
+// case RegAllocPhase::VGPR:
+// Options = RA_OPTIONS(onlyAllocateVGPRs, "vgpr", true);
+// RAType = VGPRRegAllocTypeNPM;
+// break;
+// };
+
+// switch(RAType) {
+// case RegAllocType::Greedy:
+// addPass(RAGreedyPass(Options));
+// return;
+// case RegAllocType::Fast:
+// addPass(RegAllocFastPass(Options));
+// return;
+// case RegAllocType::Unset:
+// addPass(RegAllocPass(Options));
+// }
+// #undef RA_OPTIONS
+// }
+
+// template<typename RegAllocPass>
+// void AMDGPUCodeGenPassBuilder::addRegAlloc(AddMachinePass &addPass,
+// RegAllocPhase Phase) {
+// RegAllocType RAType;
+// switch(Phase) {
+// case RegAllocPhase::SGPR:
+// RAType = SGPRRegAllocTypeNPM;
+// break;
+// case RegAllocPhase::WWM:
+// RAType = WWMRegAllocTypeNPM;
+// break;
+// case RegAllocPhase::VGPR:
+// RAType = VGPRRegAllocTypeNPM;
+// break;
+// }
+// switch (RAType) {
+// case RegAllocType::Greedy:
+// addRegAllocOfType(addPass, Phase);
+// }
+// addRegAllocOfType<RegAllocPass>(addPass, Phase);
+// }
+
+/// Build the pass options for register allocator \p RegAllocPassT, restricted
+/// to the registers handled in \p Phase.
+///
+/// Every phase supplies a filter function and a filter name. The options of
+/// RegAllocFastPass additionally take a clear-virtual-registers flag, which is
+/// set only for the VGPR phase.
+///
+/// \tparam RegAllocPassT The register allocator pass whose Options are built.
+/// \param Phase The register allocation phase to build options for.
+/// \returns The options to construct \p RegAllocPassT with.
+template <typename RegAllocPassT>
+typename RegAllocPassT::Options
+AMDGPUCodeGenPassBuilder::getRAOptionsForPhase(RegAllocPhase Phase) const {
+// RegAllocFastPass::Options takes a third initializer that the other
+// allocators' options do not, so the initializer shape is picked at compile
+// time.
+#define RA_OPTIONS(FilterFunc, Name, ClearVirtRegs)                            \
+  [&]() {                                                                      \
+    if constexpr (std::is_same_v<RegAllocPassT, RegAllocFastPass>) {           \
+      return RegAllocFastPass::Options{FilterFunc, Name, ClearVirtRegs};       \
+    } else {                                                                   \
+      return typename RegAllocPassT::Options{FilterFunc, Name};                \
+    }                                                                          \
+  }()
+
+  switch (Phase) {
+  case RegAllocPhase::SGPR:
+    return RA_OPTIONS(onlyAllocateSGPRs, "sgpr", false);
+  case RegAllocPhase::WWM:
+    return RA_OPTIONS(onlyAllocateWWMRegs, "wwm", false);
+  case RegAllocPhase::VGPR:
+    return RA_OPTIONS(onlyAllocateVGPRs, "vgpr", true);
+  }
+  // The switch covers every enumerator; without this, control could reach the
+  // end of a non-void function for an out-of-range value.
+  llvm_unreachable("Unhandled register allocation phase");
+
+#undef RA_OPTIONS
+}
+
+/// Add the register allocator for \p Phase to the pipeline.
+///
+/// The allocator is chosen from that phase's -{phase}-regalloc-npm option:
+/// greedy and fast are honored explicitly, an unset or default option falls
+/// back to \p PreferredRegAllocPassT, and any other value is a fatal error.
+///
+/// \tparam PreferredRegAllocPassT The allocator pass to use when the option
+/// does not name one.
+/// \param addPass Receives the constructed allocator pass.
+/// \param Phase The register allocation phase to add.
+template <typename PreferredRegAllocPassT>
+void AMDGPUCodeGenPassBuilder::addRegAlloc(AddMachinePass &addPass,
+                                           RegAllocPhase Phase) const {
+  // Read the appropriate phase's regalloc type. Initializing through a lambda
+  // keeps RAType const and leaves no path on which it is read uninitialized.
+  const RegAllocType RAType = [Phase]() -> RegAllocType {
+    switch (Phase) {
+    case RegAllocPhase::SGPR:
+      return SGPRRegAllocTypeNPM;
+    case RegAllocPhase::WWM:
+      return WWMRegAllocTypeNPM;
+    case RegAllocPhase::VGPR:
+      return VGPRRegAllocTypeNPM;
+    }
+    llvm_unreachable("Unhandled register allocation phase");
+  }();
+
+  // Construct the pass with the appropriate options.
+  switch (RAType) {
+  case RegAllocType::Greedy:
+    addPass(RAGreedyPass(getRAOptionsForPhase<RAGreedyPass>(Phase)));
+    return;
+  case RegAllocType::Fast:
+    addPass(RegAllocFastPass(getRAOptionsForPhase<RegAllocFastPass>(Phase)));
+    return;
+  case RegAllocType::Unset:
+  case RegAllocType::Default:
+    addPass(PreferredRegAllocPassT(
+        getRAOptionsForPhase<PreferredRegAllocPassT>(Phase)));
+    return;
+  default:
+    report_fatal_error("Unsupported regalloc type for AMDGPU", false);
+  }
+}
+
+/// Populate the optimized register assignment pipeline: SGPRs first, then WWM
+/// registers, then per-thread VGPRs, each with its own allocator run.
+///
+/// \returns An error when Opt.RegAlloc is set, success otherwise.
+Error AMDGPUCodeGenPassBuilder::addRegAssignmentOptimized(
+    AddMachinePass &addPass) const {
+  const bool GenericRegAllocRequested = Opt.RegAlloc != RegAllocType::Unset;
+  if (GenericRegAllocRequested)
+    return make_error<StringError>(NPMRegAllocOptNotSupportedMessage,
+                                   inconvertibleErrorCode());
+
+  // Every phase prefers the greedy allocator unless its option says otherwise.
+  const auto AllocatePhase = [&](RegAllocPhase Phase) {
+    addRegAlloc<RAGreedyPass>(addPass, Phase);
+  };
+
+  addPass(GCNPreRALongBranchRegPass());
+
+  AllocatePhase(RegAllocPhase::SGPR);
+
+  // The allocated registers should be committed here, mostly because too many
+  // things (the verifier among them) rely on the use lists of the physical
+  // registers. Only allocators that use LiveIntervals need this, since
+  // FastRegAlloc does the replacements itself.
+  // TODO: addPass(VirtRegRewriterPass(false));
+
+  // SGPR allocation is done at this point, so let stack slot coloring try to
+  // optimize the SGPR spill stack indices before the custom SGPR spill
+  // lowering is attempted.
+  addPass(StackSlotColoringPass());
+
+  // Equivalent of PEI for SGPRs.
+  addPass(SILowerSGPRSpillsPass());
+
+  // Allocate the WWM registers used in whole quad mode operations (for
+  // shaders).
+  addPass(SIPreAllocateWWMRegsPass());
+
+  // Allocate the other WWM register operands.
+  AllocatePhase(RegAllocPhase::WWM);
+  addPass(SILowerWWMCopiesPass());
+  // TODO: addPass(VirtRegRewriterPass(false));
+  // TODO: addPass(AMDGPUReserveWWMRegsPass());
+
+  // Allocate the per-thread VGPRs.
+  AllocatePhase(RegAllocPhase::VGPR);
+
+  // TODO: addPreRewrite();
+  addPass(VirtRegRewriterPass(false));
+
+  // TODO: addPass(AMDGPUMarkLastScratchLoadPass());
+  return Error::success();
+}
+
void AMDGPUCodeGenPassBuilder::addPostRegAlloc(AddMachinePass &addPass) const {
addPass(SIFixVGPRCopiesPass());
if (TM.getOptLevel() > CodeGenOptLevel::None)
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.h b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.h
index 3df4115324ac2..dffb53be44d4d 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.h
@@ -16,9 +16,11 @@
#include "GCNSubtarget.h"
#include "llvm/CodeGen/CodeGenTargetMachineImpl.h"
+#include "llvm/CodeGen/RegAllocCommon.h"
#include "llvm/CodeGen/TargetPassConfig.h"
#include "llvm/MC/MCStreamer.h"
#include "llvm/Passes/CodeGenPassBuilder.h"
+#include "llvm/Target/CGPassBuilderOption.h"
#include <optional>
#include <utility>
@@ -179,6 +181,7 @@ class AMDGPUCodeGenPassBuilder
Error addInstSelector(AddMachinePass &) const;
void addPreRewrite(AddMachinePass &) const;
void addMachineSSAOptimization(AddMachinePass &) const;
+ Error addRegAssignmentOptimized(AddMachinePass &) const;
void addPostRegAlloc(AddMachinePass &) const;
void addPreEmitPass(AddMachinePass &) const;
@@ -189,6 +192,38 @@ class AMDGPUCodeGenPassBuilder
CodeGenOptLevel Level = CodeGenOptLevel::Default) const;
void addEarlyCSEOrGVNPass(AddIRPass &) const;
void addStraightLineScalarOptimizationPasses(AddIRPass &) const;
+
+private:
+ /// Phases of register allocation. Each phase reads its own
+ /// -{phase}-regalloc-npm option and uses its own register filter.
+ enum class RegAllocPhase { SGPR, VGPR, WWM };
+
+ /// Build the options used to construct a \p RegAllocPassT for a phase.
+ ///
+ /// \tparam RegAllocPassT The register allocator pass type whose Options are
+ /// returned.
+ template <typename RegAllocPassT>
+ typename RegAllocPassT::Options getRAOptionsForPhase(RegAllocPhase) const;
+
+ /// Add register allocation pass to the pass manager.
+ /// This checks for the regalloc type given through the
+ /// -{phase}-regalloc-npm={type} cl option. If the option is not specified, it
+ /// uses the preferred regalloc pass type.
+ ///
+ /// \tparam PreferredRegAllocPassT The fallback reg alloc pass type to use if
+ /// cl::opt is unspecified.
+ /// \param Phase The phase of register allocation to add.
+ template <typename PreferredRegAllocPassT>
+ void addRegAlloc(AddMachinePass &, RegAllocPhase Phase) const;
+
+ // Per-phase entry points that take an Optimized flag.
+ // NOTE(review): no definition of these three functions is visible next to
+ // this declaration; the only trace is a commented-out addSGPRRegAlloc draft
+ // with a different parameter list. Confirm they are defined elsewhere, or
+ // remove the declarations.
+ void addSGPRRegAlloc(AddMachinePass &, bool Optimized) const;
+ void addWWMRegAlloc(AddMachinePass &, bool Optimized) const;
+ void addVGPRRegAlloc(AddMachinePass &, bool Optimized) const;
};
} // end namespace llvm
diff --git a/llvm/test/tools/llc/new-pm/regalloc-amdgpu.mir b/llvm/test/tools/llc/new-pm/regalloc-amdgpu.mir
index 07f2d350ffd9c..2398dc816acc3 100644
--- a/llvm/test/tools/llc/new-pm/regalloc-amdgpu.mir
+++ b/llvm/test/tools/llc/new-pm/regalloc-amdgpu.mir
@@ -2,11 +2,16 @@
# RUN: llc -mtriple=amdgcn --passes='regallocfast<filter=sgpr>,regallocfast<filter=wwm>,regallocfast<filter=vgpr>' --print-pipeline-passes --filetype=null %s | FileCheck %s --check-prefix=PASS
# RUN: not llc -mtriple=amdgcn --passes='regallocfast<filter=bad-filter>' --print-pipeline-passes --filetype=null %s 2>&1 | FileCheck %s --check-prefix=BAD-FILTER
+# RUN: llc -mtriple=amdgcn -enable-new-pm -sgpr-regalloc-npm=greedy -vgpr-regalloc-npm=fast -print-pipeline-passes %s | FileCheck %s --check-prefix=NPM-PASS
+
+
# PASS: regallocfast<filter=sgpr>
# PASS: regallocfast<filter=wwm>
# PASS: regallocfast<filter=vgpr>
# BAD-FILTER: invalid regallocfast register filter 'bad-filter'
+# NPM-PASS: greedy<sgpr>
+# NPM-PASS: regallocfast<filter=vgpr>
---
name: f
...
|
537f807
to
592423d
Compare
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
LGTM.
# PASS: regallocfast<filter=sgpr> | ||
# PASS: regallocfast<filter=wwm> | ||
# PASS: regallocfast<filter=vgpr> | ||
# BAD-FILTER: invalid regallocfast register filter 'bad-filter' | ||
|
||
# NPM-PASS: greedy<sgpr> | ||
# NPM-PASS: regallocfast<filter=wwm;no-clear-vregs> |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Why is the `no-clear-vregs` option exposed to the command line? It was originally an internal flag to control the vreg clearing for targets requiring multiple regalloc pipelines.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
How else would you test this part of the pipeline standalone?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I don't remember seeing a command line option for doing it in the legacy path. So it's something new we're introducing in the NPM?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
The only way you can do it now is with -start-before / -stop-after. This would allow you to have the explicit passes in the RUN line.
This comment was marked as off-topic.
This comment was marked as off-topic.
I would prefer generic regalloc pipeline tuning options; the X86 backend also has the same feature request, but that is related to the generic builder design. I'm fine with this change if the AMDGPU maintainers approve it.
I suppose we would need a TargetMachine hook to populate the filter functions and names to do that |
Is there any generic design proposal for the regalloc pipeline? I remember something like …

We could wait until the final design of the TargetPassBuilder is determined, but here is my understanding of this:

Adding regalloc passes would be done with a generic method:

    void CodeGenPassBuilder::addRegAssignmentAndRewriteOptimized() {
      addPass(...);
      ...
      addRegAllocPass<RAGreedyPass>();
      ...
      addRegAllocPass<RAGreedyPass>();
      ...
      addRegAllocPass<RAGreedyPass>();
      ...
    }

Targets have to specify which filters (and options) to use for which phase (the phases are now numbered):

    void CodeGenPassBuilder::addRegAllocPass<SelectedRAPassType>( &MFPM ) {
      static unsigned phase = 1;
      if(cliRAPipeline)
        // add whatever is in -regalloc-npm option
      else
        MFPM.addPass(SelectedRAPassType(getTargetRAPhaseFilter(phase)));
      phase++;
    }
19c5f91
to
b85a8ef
Compare
This stack of pull requests is managed by Graphite. Learn more about stacking. |
f20b44e
to
f69e5df
Compare
@optimisan, support the generic style |
Regalloc options for AMDGPU are
`-{phase}-regalloc-npm={type}`,
where `phase=sgpr|vgpr|wwm`
and `type=greedy|fast...`
(common regalloc pass types).