Skip to content

Commit 53eb0f8

Browse files
committed
[AMDGPU] Attempt to reschedule without clustering
We want to have more load/store clustering, but we also want to maintain low register pressure; these are opposing goals. Allow the scheduler to reschedule regions without mutations applied if we hit a register limit. Differential Revision: https://reviews.llvm.org/D73386
1 parent 9771122 commit 53eb0f8

File tree

3 files changed

+99
-18
lines changed

3 files changed

+99
-18
lines changed

llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp

Lines changed: 51 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -316,13 +316,13 @@ GCNScheduleDAGMILive::GCNScheduleDAGMILive(MachineSchedContext *C,
316316
ST(MF.getSubtarget<GCNSubtarget>()),
317317
MFI(*MF.getInfo<SIMachineFunctionInfo>()),
318318
StartingOccupancy(MFI.getOccupancy()),
319-
MinOccupancy(StartingOccupancy), Stage(0), RegionIdx(0) {
319+
MinOccupancy(StartingOccupancy), Stage(Collect), RegionIdx(0) {
320320

321321
LLVM_DEBUG(dbgs() << "Starting occupancy is " << StartingOccupancy << ".\n");
322322
}
323323

324324
void GCNScheduleDAGMILive::schedule() {
325-
if (Stage == 0) {
325+
if (Stage == Collect) {
326326
// Just record regions at the first pass.
327327
Regions.push_back(std::make_pair(RegionBegin, RegionEnd));
328328
return;
@@ -348,6 +348,7 @@ void GCNScheduleDAGMILive::schedule() {
348348

349349
ScheduleDAGMILive::schedule();
350350
Regions[RegionIdx] = std::make_pair(RegionBegin, RegionEnd);
351+
RescheduleRegions[RegionIdx] = false;
351352

352353
if (!LIS)
353354
return;
@@ -389,20 +390,28 @@ void GCNScheduleDAGMILive::schedule() {
389390
<< MinOccupancy << ".\n");
390391
}
391392

393+
unsigned MaxVGPRs = ST.getMaxNumVGPRs(MF);
394+
unsigned MaxSGPRs = ST.getMaxNumSGPRs(MF);
395+
if (PressureAfter.getVGPRNum() > MaxVGPRs ||
396+
PressureAfter.getSGPRNum() > MaxSGPRs)
397+
RescheduleRegions[RegionIdx] = true;
398+
392399
if (WavesAfter >= MinOccupancy) {
393-
unsigned TotalVGPRs = AMDGPU::IsaInfo::getAddressableNumVGPRs(&ST);
394-
unsigned TotalSGPRs = AMDGPU::IsaInfo::getAddressableNumSGPRs(&ST);
395-
if (WavesAfter > MFI.getMinWavesPerEU() ||
400+
if (Stage == UnclusteredReschedule &&
401+
!PressureAfter.less(ST, PressureBefore)) {
402+
LLVM_DEBUG(dbgs() << "Unclustered reschedule did not help.\n");
403+
} else if (WavesAfter > MFI.getMinWavesPerEU() ||
396404
PressureAfter.less(ST, PressureBefore) ||
397-
(TotalVGPRs >= PressureAfter.getVGPRNum() &&
398-
TotalSGPRs >= PressureAfter.getSGPRNum())) {
405+
!RescheduleRegions[RegionIdx]) {
399406
Pressure[RegionIdx] = PressureAfter;
400407
return;
408+
} else {
409+
LLVM_DEBUG(dbgs() << "New pressure will result in more spilling.\n");
401410
}
402-
LLVM_DEBUG(dbgs() << "New pressure will result in more spilling.\n");
403411
}
404412

405413
LLVM_DEBUG(dbgs() << "Attempting to revert scheduling.\n");
414+
RescheduleRegions[RegionIdx] = true;
406415
RegionEnd = RegionBegin;
407416
for (MachineInstr *MI : Unsched) {
408417
if (MI->isDebugInstr())
@@ -532,41 +541,63 @@ void GCNScheduleDAGMILive::finalizeSchedule() {
532541

533542
LiveIns.resize(Regions.size());
534543
Pressure.resize(Regions.size());
544+
RescheduleRegions.resize(Regions.size());
545+
RescheduleRegions.set();
535546

536547
if (!Regions.empty())
537548
BBLiveInMap = getBBLiveInMap();
538549

550+
std::vector<std::unique_ptr<ScheduleDAGMutation>> SavedMutations;
551+
539552
do {
540553
Stage++;
541554
RegionIdx = 0;
542555
MachineBasicBlock *MBB = nullptr;
543556

544-
if (Stage > 1) {
557+
if (Stage > InitialSchedule) {
558+
if (!LIS)
559+
break;
560+
545561
// Retry function scheduling if we found resulting occupancy and it is
546562
// lower than used for first pass scheduling. This will give more freedom
547563
// to schedule low register pressure blocks.
548564
// Code is partially copied from MachineSchedulerBase::scheduleRegions().
549565

550-
if (!LIS || StartingOccupancy <= MinOccupancy)
551-
break;
566+
if (Stage == UnclusteredReschedule) {
567+
if (RescheduleRegions.none())
568+
continue;
569+
LLVM_DEBUG(dbgs() <<
570+
"Retrying function scheduling without clustering.\n");
571+
}
572+
573+
if (Stage == ClusteredLowOccupancyReschedule) {
574+
if (StartingOccupancy <= MinOccupancy)
575+
break;
552576

553-
LLVM_DEBUG(
554-
dbgs()
555-
<< "Retrying function scheduling with lowest recorded occupancy "
556-
<< MinOccupancy << ".\n");
577+
LLVM_DEBUG(
578+
dbgs()
579+
<< "Retrying function scheduling with lowest recorded occupancy "
580+
<< MinOccupancy << ".\n");
557581

558-
S.setTargetOccupancy(MinOccupancy);
582+
S.setTargetOccupancy(MinOccupancy);
583+
}
559584
}
560585

586+
if (Stage == UnclusteredReschedule)
587+
SavedMutations.swap(Mutations);
588+
561589
for (auto Region : Regions) {
590+
if (Stage == UnclusteredReschedule && !RescheduleRegions[RegionIdx])
591+
continue;
592+
562593
RegionBegin = Region.first;
563594
RegionEnd = Region.second;
564595

565596
if (RegionBegin->getParent() != MBB) {
566597
if (MBB) finishBlock();
567598
MBB = RegionBegin->getParent();
568599
startBlock(MBB);
569-
if (Stage == 1)
600+
if (Stage == InitialSchedule)
570601
computeBlockPressure(MBB);
571602
}
572603

@@ -594,5 +625,7 @@ void GCNScheduleDAGMILive::finalizeSchedule() {
594625
}
595626
finishBlock();
596627

597-
} while (Stage < 2);
628+
if (Stage == UnclusteredReschedule)
629+
SavedMutations.swap(Mutations);
630+
} while (Stage != LastStage);
598631
}

llvm/lib/Target/AMDGPU/GCNSchedStrategy.h

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -64,6 +64,14 @@ class GCNMaxOccupancySchedStrategy final : public GenericScheduler {
6464

6565
class GCNScheduleDAGMILive final : public ScheduleDAGMILive {
6666

67+
enum : unsigned {
68+
Collect,
69+
InitialSchedule,
70+
UnclusteredReschedule,
71+
ClusteredLowOccupancyReschedule,
72+
LastStage = ClusteredLowOccupancyReschedule
73+
};
74+
6775
const GCNSubtarget &ST;
6876

6977
SIMachineFunctionInfo &MFI;
@@ -84,6 +92,10 @@ class GCNScheduleDAGMILive final : public ScheduleDAGMILive {
8492
SmallVector<std::pair<MachineBasicBlock::iterator,
8593
MachineBasicBlock::iterator>, 32> Regions;
8694

95+
// Records if a region is not yet scheduled, or schedule has been reverted,
96+
// or we generally desire to reschedule it.
97+
BitVector RescheduleRegions;
98+
8799
// Region live-in cache.
88100
SmallVector<GCNRPTracker::LiveRegSet, 32> LiveIns;
89101

Lines changed: 36 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,36 @@
1+
; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
2+
3+
; Interleave loads and stores to fit into 9 VGPR limit.
4+
; This requires to avoid load/store clustering.
5+
6+
; GCN: global_load_dwordx4
7+
; GCN: global_store_dwordx4
8+
; GCN: global_load_dwordx4
9+
; GCN: global_store_dwordx4
10+
; GCN: global_load_dwordx4
11+
; GCN: global_store_dwordx4
12+
; GCN: NumVgprs: {{[0-9]$}}
13+
; GCN: ScratchSize: 0{{$}}
14+
15+
define amdgpu_kernel void @load_store_max_9vgprs(<4 x i32> addrspace(1)* nocapture noalias readonly %arg, <4 x i32> addrspace(1)* nocapture noalias %arg1) #1 {
16+
bb:
17+
%id = call i32 @llvm.amdgcn.workitem.id.x()
18+
%base = getelementptr inbounds <4 x i32>, <4 x i32> addrspace(1)* %arg, i32 %id
19+
%tmp = getelementptr inbounds <4 x i32>, <4 x i32> addrspace(1)* %base, i32 1
20+
%tmp2 = load <4 x i32>, <4 x i32> addrspace(1)* %tmp, align 4
21+
%tmp3 = getelementptr inbounds <4 x i32>, <4 x i32> addrspace(1)* %base, i32 3
22+
%tmp4 = load <4 x i32>, <4 x i32> addrspace(1)* %tmp3, align 4
23+
%tmp5 = getelementptr inbounds <4 x i32>, <4 x i32> addrspace(1)* %base, i32 5
24+
%tmp6 = load <4 x i32>, <4 x i32> addrspace(1)* %tmp5, align 4
25+
store <4 x i32> %tmp2, <4 x i32> addrspace(1)* %arg1, align 4
26+
%tmp7 = getelementptr inbounds <4 x i32>, <4 x i32> addrspace(1)* %arg1, i64 3
27+
store <4 x i32> %tmp4, <4 x i32> addrspace(1)* %tmp7, align 4
28+
%tmp8 = getelementptr inbounds <4 x i32>, <4 x i32> addrspace(1)* %arg1, i64 5
29+
store <4 x i32> %tmp6, <4 x i32> addrspace(1)* %tmp8, align 4
30+
ret void
31+
}
32+
33+
declare i32 @llvm.amdgcn.workitem.id.x() #0
34+
35+
attributes #0 = { nounwind readnone }
36+
attributes #1 = { "amdgpu-num-vgpr"="9" }

0 commit comments

Comments
 (0)