Commit b33c807

[AMDGPU] Add MaxMemoryClauseSchedStrategy (#114957)
Also expose an option to choose a custom scheduler strategy: amdgpu-sched-strategy={max-ilp|max-memory-clause}. This can be set through either a function attribute or the command-line option.

The major behaviors of the max-memory-clause scheduling strategy are:

1. Try to cluster memory instructions more aggressively.
2. Try to schedule long-latency loads earlier than short-latency instructions.

I tested locally against about 470 real shaders and got the following perf changes (counting only changes over +/-10%): about 15 shaders improved 10%~40%, while only 3 shaders dropped ~10%. (This was tested together with another change that increases the maximum clustered dwords from 8 to 32.) I will make another change to make that threshold configurable.
1 parent: a4506bb · commit: b33c807
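For reference, a minimal usage sketch. The flag and attribute names come from this patch; the triple, input file, and kernel name are made-up placeholders:

$ llc -mtriple=amdgcn-amd-amdhsa -amdgpu-sched-strategy=max-memory-clause kernel.ll

; Or per function, via the IR function attribute:
define amdgpu_kernel void @kern() #0 {
  ret void
}
attributes #0 = { "amdgpu-sched-strategy"="max-memory-clause" }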

File tree: 6 files changed (+654, −9 lines)
llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp

Lines changed: 30 additions & 5 deletions
@@ -428,10 +428,10 @@ static cl::opt<bool>
                        cl::desc("Enable loop data prefetch on AMDGPU"),
                        cl::Hidden, cl::init(false));
 
-static cl::opt<bool> EnableMaxIlpSchedStrategy(
-    "amdgpu-enable-max-ilp-scheduling-strategy",
-    cl::desc("Enable scheduling strategy to maximize ILP for a single wave."),
-    cl::Hidden, cl::init(false));
+static cl::opt<std::string>
+    AMDGPUSchedStrategy("amdgpu-sched-strategy",
+                        cl::desc("Select custom AMDGPU scheduling strategy."),
+                        cl::Hidden, cl::init(""));
 
 static cl::opt<bool> EnableRewritePartialRegUses(
     "amdgpu-enable-rewrite-partial-reg-uses",
@@ -567,6 +567,18 @@ createGCNMaxILPMachineScheduler(MachineSchedContext *C) {
   return DAG;
 }
 
+static ScheduleDAGInstrs *
+createGCNMaxMemoryClauseMachineScheduler(MachineSchedContext *C) {
+  const GCNSubtarget &ST = C->MF->getSubtarget<GCNSubtarget>();
+  ScheduleDAGMILive *DAG = new GCNScheduleDAGMILive(
+      C, std::make_unique<GCNMaxMemoryClauseSchedStrategy>(C));
+  DAG->addMutation(createLoadClusterDAGMutation(DAG->TII, DAG->TRI));
+  if (ST.shouldClusterStores())
+    DAG->addMutation(createStoreClusterDAGMutation(DAG->TII, DAG->TRI));
+  DAG->addMutation(createAMDGPUExportClusteringDAGMutation());
+  return DAG;
+}
+
 static ScheduleDAGInstrs *
 createIterativeGCNMaxOccupancyMachineScheduler(MachineSchedContext *C) {
   const GCNSubtarget &ST = C->MF->getSubtarget<GCNSubtarget>();
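The load/store clustering mutations installed here are what create the cluster edges that the strategy's cluster heuristic later consumes via getNextClusterPred()/getNextClusterSucc() (see the tryCandidate implementation in GCNSchedStrategy.cpp below).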
@@ -607,6 +619,10 @@ static MachineSchedRegistry
     GCNMaxILPSchedRegistry("gcn-max-ilp", "Run GCN scheduler to maximize ilp",
                            createGCNMaxILPMachineScheduler);
 
+static MachineSchedRegistry GCNMaxMemoryClauseSchedRegistry(
+    "gcn-max-memory-clause", "Run GCN scheduler to maximize memory clause",
+    createGCNMaxMemoryClauseMachineScheduler);
+
 static MachineSchedRegistry IterativeGCNMaxOccupancySchedRegistry(
     "gcn-iterative-max-occupancy-experimental",
     "Run GCN scheduler to maximize occupancy (experimental)",
@@ -1294,9 +1310,18 @@ ScheduleDAGInstrs *GCNPassConfig::createMachineScheduler(
   if (ST.enableSIScheduler())
     return createSIMachineScheduler(C);
 
-  if (EnableMaxIlpSchedStrategy)
+  Attribute SchedStrategyAttr =
+      C->MF->getFunction().getFnAttribute("amdgpu-sched-strategy");
+  StringRef SchedStrategy = SchedStrategyAttr.isValid()
+                                ? SchedStrategyAttr.getValueAsString()
+                                : AMDGPUSchedStrategy;
+
+  if (SchedStrategy == "max-ilp")
     return createGCNMaxILPMachineScheduler(C);
 
+  if (SchedStrategy == "max-memory-clause")
+    return createGCNMaxMemoryClauseMachineScheduler(C);
+
   return createGCNMaxOccupancyMachineScheduler(C);
 }
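Note the precedence encoded here: a valid per-function "amdgpu-sched-strategy" attribute overrides the global command-line option, which acts only as the default.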

llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp

Lines changed: 145 additions & 1 deletion
@@ -615,6 +615,138 @@ bool GCNMaxILPSchedStrategy::tryCandidate(SchedCandidate &Cand,
   return false;
 }
 
+GCNMaxMemoryClauseSchedStrategy::GCNMaxMemoryClauseSchedStrategy(
+    const MachineSchedContext *C)
+    : GCNSchedStrategy(C) {
+  SchedStages.push_back(GCNSchedStageID::MemoryClauseInitialSchedule);
+}
+
+/// GCNMaxMemoryClauseSchedStrategy tries its best to clause memory
+/// instructions as much as possible. This is achieved by:
+///   1. Prioritizing clustered operations before the stall latency heuristic.
+///   2. Prioritizing long-latency loads before the stall latency heuristic.
+///
+/// \param Cand provides the policy and current best candidate.
+/// \param TryCand refers to the next SUnit candidate, otherwise uninitialized.
+/// \param Zone describes the scheduled zone that we are extending, or nullptr
+///             if Cand is from a different zone than TryCand.
+/// \return \c true if TryCand is better than Cand (Reason is NOT NoCand)
+bool GCNMaxMemoryClauseSchedStrategy::tryCandidate(SchedCandidate &Cand,
+                                                   SchedCandidate &TryCand,
+                                                   SchedBoundary *Zone) const {
+  // Initialize the candidate if needed.
+  if (!Cand.isValid()) {
+    TryCand.Reason = NodeOrder;
+    return true;
+  }
+
+  // Bias PhysReg defs and copies toward their uses and defs, respectively.
+  if (tryGreater(biasPhysReg(TryCand.SU, TryCand.AtTop),
+                 biasPhysReg(Cand.SU, Cand.AtTop), TryCand, Cand, PhysReg))
+    return TryCand.Reason != NoCand;
+
+  if (DAG->isTrackingPressure()) {
+    // Avoid exceeding the target's limit.
+    if (tryPressure(TryCand.RPDelta.Excess, Cand.RPDelta.Excess, TryCand, Cand,
+                    RegExcess, TRI, DAG->MF))
+      return TryCand.Reason != NoCand;
+
+    // Avoid increasing the max critical pressure in the scheduled region.
+    if (tryPressure(TryCand.RPDelta.CriticalMax, Cand.RPDelta.CriticalMax,
+                    TryCand, Cand, RegCritical, TRI, DAG->MF))
+      return TryCand.Reason != NoCand;
+  }
+
+  // MaxMemoryClause-specific: We prioritize clustered instructions as we would
+  // get more benefit from clausing these memory instructions.
+  const SUnit *CandNextClusterSU =
+      Cand.AtTop ? DAG->getNextClusterSucc() : DAG->getNextClusterPred();
+  const SUnit *TryCandNextClusterSU =
+      TryCand.AtTop ? DAG->getNextClusterSucc() : DAG->getNextClusterPred();
+  if (tryGreater(TryCand.SU == TryCandNextClusterSU,
+                 Cand.SU == CandNextClusterSU, TryCand, Cand, Cluster))
+    return TryCand.Reason != NoCand;
+
+  // We only compare a subset of features when comparing nodes between the
+  // Top and Bottom boundaries. Some properties are simply incomparable; in
+  // many other instances we should only override the other boundary if
+  // something is a clear good pick on one boundary. Skip heuristics that are
+  // more "tie-breaking" in nature.
+  bool SameBoundary = Zone != nullptr;
+  if (SameBoundary) {
+    // For loops that are acyclic path limited, aggressively schedule for
+    // latency. Within a single cycle, whenever CurrMOps > 0, allow normal
+    // heuristics to take precedence.
+    if (Rem.IsAcyclicLatencyLimited && !Zone->getCurrMOps() &&
+        tryLatency(TryCand, Cand, *Zone))
+      return TryCand.Reason != NoCand;
+
+    // MaxMemoryClause-specific: Prioritize long-latency memory load
+    // instructions in top-bottom order to hide more latency. The mayLoad check
+    // is used to exclude store-like instructions, which we do not want to
+    // schedule too early.
+    bool TryMayLoad =
+        TryCand.SU->isInstr() && TryCand.SU->getInstr()->mayLoad();
+    bool CandMayLoad = Cand.SU->isInstr() && Cand.SU->getInstr()->mayLoad();
+
+    if (TryMayLoad || CandMayLoad) {
+      bool TryLongLatency =
+          TryCand.SU->Latency > 10 * Cand.SU->Latency && TryMayLoad;
+      bool CandLongLatency =
+          10 * TryCand.SU->Latency < Cand.SU->Latency && CandMayLoad;
+
+      if (tryGreater(Zone->isTop() ? TryLongLatency : CandLongLatency,
+                     Zone->isTop() ? CandLongLatency : TryLongLatency, TryCand,
+                     Cand, Stall))
+        return TryCand.Reason != NoCand;
+    }
+    // Prioritize instructions that read unbuffered resources by stall cycles.
+    if (tryLess(Zone->getLatencyStallCycles(TryCand.SU),
+                Zone->getLatencyStallCycles(Cand.SU), TryCand, Cand, Stall))
+      return TryCand.Reason != NoCand;
+  }
+
+  if (SameBoundary) {
+    // Weak edges are for clustering and other constraints.
+    if (tryLess(getWeakLeft(TryCand.SU, TryCand.AtTop),
+                getWeakLeft(Cand.SU, Cand.AtTop), TryCand, Cand, Weak))
+      return TryCand.Reason != NoCand;
+  }
+
+  // Avoid increasing the max pressure of the entire region.
+  if (DAG->isTrackingPressure() &&
+      tryPressure(TryCand.RPDelta.CurrentMax, Cand.RPDelta.CurrentMax, TryCand,
+                  Cand, RegMax, TRI, DAG->MF))
+    return TryCand.Reason != NoCand;
+
+  if (SameBoundary) {
+    // Avoid critical resource consumption and balance the schedule.
+    TryCand.initResourceDelta(DAG, SchedModel);
+    if (tryLess(TryCand.ResDelta.CritResources, Cand.ResDelta.CritResources,
+                TryCand, Cand, ResourceReduce))
+      return TryCand.Reason != NoCand;
+    if (tryGreater(TryCand.ResDelta.DemandedResources,
+                   Cand.ResDelta.DemandedResources, TryCand, Cand,
+                   ResourceDemand))
+      return TryCand.Reason != NoCand;
+
+    // Avoid serializing long latency dependence chains.
+    // For acyclic path limited loops, latency was already checked above.
+    if (!RegionPolicy.DisableLatencyHeuristic && TryCand.Policy.ReduceLatency &&
+        !Rem.IsAcyclicLatencyLimited && tryLatency(TryCand, Cand, *Zone))
+      return TryCand.Reason != NoCand;
+
+    // Fall through to original instruction order.
+    if (Zone->isTop() == (TryCand.SU->NodeNum < Cand.SU->NodeNum)) {
+      assert(TryCand.SU->NodeNum != Cand.SU->NodeNum);
+      TryCand.Reason = NodeOrder;
+      return true;
+    }
+  }
+
+  return false;
+}
+
 GCNScheduleDAGMILive::GCNScheduleDAGMILive(
     MachineSchedContext *C, std::unique_ptr<MachineSchedStrategy> S)
     : ScheduleDAGMILive(C, std::move(S)), ST(MF.getSubtarget<GCNSubtarget>()),
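To make the asymmetric 10x latency test above concrete, here is a small self-contained sketch; the latency numbers are made up and do not come from the patch:

#include <cassert>

int main() {
  // Hypothetical values: TryCand is a load with latency 50, Cand is a short
  // ALU op with latency 4.
  unsigned TryLatency = 50, CandLatency = 4;
  bool TryMayLoad = true, CandMayLoad = false;

  // Mirrors the test in tryCandidate: a candidate counts as "long latency"
  // only if it is a load AND more than 10x slower than the other candidate.
  bool TryLongLatency = TryLatency > 10 * CandLatency && TryMayLoad;   // 50 > 40 -> true
  bool CandLongLatency = 10 * TryLatency < CandLatency && CandMayLoad; // 500 < 4 -> false

  assert(TryLongLatency && !CandLongLatency);
  return 0;
}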
@@ -644,6 +776,9 @@ GCNScheduleDAGMILive::createSchedStage(GCNSchedStageID SchedStageID) {
     return std::make_unique<PreRARematStage>(SchedStageID, *this);
   case GCNSchedStageID::ILPInitialSchedule:
     return std::make_unique<ILPInitialScheduleStage>(SchedStageID, *this);
+  case GCNSchedStageID::MemoryClauseInitialSchedule:
+    return std::make_unique<MemoryClauseInitialScheduleStage>(SchedStageID,
+                                                              *this);
   }
 
   llvm_unreachable("Unknown SchedStageID.");
@@ -869,6 +1004,9 @@ raw_ostream &llvm::operator<<(raw_ostream &OS, const GCNSchedStageID &StageID) {
   case GCNSchedStageID::ILPInitialSchedule:
     OS << "Max ILP Initial Schedule";
     break;
+  case GCNSchedStageID::MemoryClauseInitialSchedule:
+    OS << "Max memory clause Initial Schedule";
+    break;
   }
 
   return OS;
@@ -1088,7 +1226,8 @@ void GCNSchedStage::setupNewBlock() {
   // Get real RP for the region if it hasn't been calculated before. After the
   // initial schedule stage, real RP will be collected after scheduling.
   if (StageID == GCNSchedStageID::OccInitialSchedule ||
-      StageID == GCNSchedStageID::ILPInitialSchedule)
+      StageID == GCNSchedStageID::ILPInitialSchedule ||
+      StageID == GCNSchedStageID::MemoryClauseInitialSchedule)
     DAG.computeBlockPressure(RegionIdx, CurrentMBB);
 }

@@ -1389,6 +1528,11 @@ bool ILPInitialScheduleStage::shouldRevertScheduling(unsigned WavesAfter) {
   return false;
 }
 
+bool MemoryClauseInitialScheduleStage::shouldRevertScheduling(
+    unsigned WavesAfter) {
+  return mayCauseSpilling(WavesAfter);
+}
+
 bool GCNSchedStage::mayCauseSpilling(unsigned WavesAfter) {
   if (WavesAfter <= MFI.getMinWavesPerEU() && isRegionWithExcessRP() &&
       !PressureAfter.less(MF, PressureBefore)) {
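Like the ILP stage directly above, the new stage reverts its schedule only when it may cause spilling; a drop in achievable occupancy alone is not treated as a reason to revert.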

llvm/lib/Target/AMDGPU/GCNSchedStrategy.h

Lines changed: 22 additions & 1 deletion
@@ -29,7 +29,8 @@ enum class GCNSchedStageID : unsigned {
   UnclusteredHighRPReschedule = 1,
   ClusteredLowOccupancyReschedule = 2,
   PreRARematerialize = 3,
-  ILPInitialSchedule = 4
+  ILPInitialSchedule = 4,
+  MemoryClauseInitialSchedule = 5
 };
 
 #ifndef NDEBUG
@@ -149,6 +150,17 @@ class GCNMaxILPSchedStrategy final : public GCNSchedStrategy {
   GCNMaxILPSchedStrategy(const MachineSchedContext *C);
 };
 
+/// The goal of this scheduling strategy is to maximize memory clauses for a
+/// single wave.
+class GCNMaxMemoryClauseSchedStrategy final : public GCNSchedStrategy {
+protected:
+  bool tryCandidate(SchedCandidate &Cand, SchedCandidate &TryCand,
+                    SchedBoundary *Zone) const override;
+
+public:
+  GCNMaxMemoryClauseSchedStrategy(const MachineSchedContext *C);
+};
+
 class ScheduleMetrics {
   unsigned ScheduleLength;
   unsigned BubbleCycles;
@@ -463,6 +475,15 @@ class ILPInitialScheduleStage : public GCNSchedStage {
       : GCNSchedStage(StageID, DAG) {}
 };
 
+class MemoryClauseInitialScheduleStage : public GCNSchedStage {
+public:
+  bool shouldRevertScheduling(unsigned WavesAfter) override;
+
+  MemoryClauseInitialScheduleStage(GCNSchedStageID StageID,
+                                   GCNScheduleDAGMILive &DAG)
+      : GCNSchedStage(StageID, DAG) {}
+};
+
 class GCNPostScheduleDAGMILive final : public ScheduleDAGMI {
 private:
   std::vector<std::unique_ptr<ScheduleDAGMutation>> SavedMutations;
