[AMDGPU] Insert before and after instructions that always use GDS #131338

Merged
4 changes: 3 additions & 1 deletion llvm/lib/Target/AMDGPU/MCA/AMDGPUCustomBehaviour.cpp
@@ -328,7 +328,9 @@ bool AMDGPUCustomBehaviour::isGWS(uint16_t Opcode) const {

// taken from SIInstrInfo::isAlwaysGDS()
bool AMDGPUCustomBehaviour::isAlwaysGDS(uint16_t Opcode) const {
-  return Opcode == AMDGPU::DS_ORDERED_COUNT || isGWS(Opcode);
+  return Opcode == AMDGPU::DS_ORDERED_COUNT ||
+         Opcode == AMDGPU::DS_ADD_GS_REG_RTN ||
+         Opcode == AMDGPU::DS_SUB_GS_REG_RTN || isGWS(Opcode);
}

} // namespace llvm::mca
91 changes: 84 additions & 7 deletions llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
@@ -349,6 +349,16 @@ class WaitcntBrackets {
LastFlat[DS_CNT] = ScoreUBs[DS_CNT];
}

bool hasPendingGDS() const {
return LastGDS > ScoreLBs[DS_CNT] && LastGDS <= ScoreUBs[DS_CNT];
}

unsigned getPendingGDSWait() const {
return std::min(getScoreUB(DS_CNT) - LastGDS, getWaitCountMax(DS_CNT) - 1);
}

void setPendingGDS() { LastGDS = ScoreUBs[DS_CNT]; }

// Return true if there might be pending writes to the vgpr-interval by VMEM
// instructions with types different from V.
bool hasOtherPendingVmemTypes(RegInterval Interval, VmemType V) const {
@@ -427,6 +437,8 @@ class WaitcntBrackets {
unsigned PendingEvents = 0;
// Remember the last flat memory operation.
unsigned LastFlat[NUM_INST_CNTS] = {0};
// Remember the last GDS operation.
unsigned LastGDS = 0;
// wait_cnt scores for every vgpr.
// Keep track of the VgprUB and SgprUB to make merge at join efficient.
int VgprUB = -1;
@@ -729,6 +741,10 @@ class SIInsertWaitcnts : public MachineFunctionPass {
MachineInstr *OldWaitcntInstr);
void updateEventWaitcntAfter(MachineInstr &Inst,
WaitcntBrackets *ScoreBrackets);
bool isNextENDPGM(MachineBasicBlock::instr_iterator It,
MachineBasicBlock *Block) const;
bool insertForcedWaitAfter(MachineInstr &Inst, MachineBasicBlock &Block,
WaitcntBrackets &ScoreBrackets);
bool insertWaitcntInBlock(MachineFunction &MF, MachineBasicBlock &Block,
WaitcntBrackets &ScoreBrackets);
};
@@ -1678,6 +1694,11 @@ bool SIInsertWaitcnts::generateWaitcntInstBefore(MachineInstr &MI,
}
}

// Wait for any pending GDS instruction to complete before any
// "Always GDS" instruction.
if (TII->isAlwaysGDS(MI.getOpcode()) && ScoreBrackets.hasPendingGDS())
addWait(Wait, DS_CNT, ScoreBrackets.getPendingGDSWait());

if (MI.isCall() && callWaitsOnFunctionEntry(MI)) {
// The function is going to insert a wait on everything in its prolog.
// This still needs to be careful if the call target is a load (e.g. a GOT
@@ -1982,6 +2003,64 @@ static bool isCacheInvOrWBInst(MachineInstr &Inst) {
Opc == AMDGPU::GLOBAL_WBINV;
}

// Return true if the next instruction is S_ENDPGM, following fallthrough
// blocks if necessary.
bool SIInsertWaitcnts::isNextENDPGM(MachineBasicBlock::instr_iterator It,
MachineBasicBlock *Block) const {
auto BlockEnd = Block->getParent()->end();
auto BlockIter = Block->getIterator();

while (true) {
if (It.isEnd()) {
if (++BlockIter != BlockEnd) {
It = BlockIter->instr_begin();
continue;
}

return false;
}

if (!It->isMetaInstruction())
break;

It++;
}

assert(!It.isEnd());

return It->getOpcode() == AMDGPU::S_ENDPGM;
}

// Add a wait after an instruction if architecture requirements mandate one.
bool SIInsertWaitcnts::insertForcedWaitAfter(MachineInstr &Inst,
MachineBasicBlock &Block,
WaitcntBrackets &ScoreBrackets) {
AMDGPU::Waitcnt Wait;
bool NeedsEndPGMCheck = false;

if (ST->isPreciseMemoryEnabled() && Inst.mayLoadOrStore())
Wait = WCG->getAllZeroWaitcnt(Inst.mayStore() &&
!SIInstrInfo::isAtomicRet(Inst));

if (TII->isAlwaysGDS(Inst.getOpcode())) {
Wait.DsCnt = 0;
NeedsEndPGMCheck = true;
}

ScoreBrackets.simplifyWaitcnt(Wait);

auto SuccessorIt = std::next(Inst.getIterator());
bool Result = generateWaitcnt(Wait, SuccessorIt, Block, ScoreBrackets,
/*OldWaitcntInstr=*/nullptr);

if (Result && NeedsEndPGMCheck && isNextENDPGM(SuccessorIt, &Block)) {
BuildMI(Block, SuccessorIt, Inst.getDebugLoc(), TII->get(AMDGPU::S_NOP))
.addImm(0);
}

return Result;
}

void SIInsertWaitcnts::updateEventWaitcntAfter(MachineInstr &Inst,
WaitcntBrackets *ScoreBrackets) {
// Now look at the instruction opcode. If it is a memory access
@@ -1994,6 +2073,7 @@ void SIInsertWaitcnts::updateEventWaitcntAfter(MachineInstr &Inst,
TII->hasModifiersSet(Inst, AMDGPU::OpName::gds)) {
ScoreBrackets->updateByEvent(TII, TRI, MRI, GDS_ACCESS, Inst);
ScoreBrackets->updateByEvent(TII, TRI, MRI, GDS_GPR_LOCK, Inst);
ScoreBrackets->setPendingGDS();
} else {
ScoreBrackets->updateByEvent(TII, TRI, MRI, LDS_ACCESS, Inst);
}
@@ -2124,6 +2204,9 @@ bool WaitcntBrackets::merge(const WaitcntBrackets &Other) {

StrictDom |= mergeScore(M, LastFlat[T], Other.LastFlat[T]);

if (T == DS_CNT)
StrictDom |= mergeScore(M, LastGDS, Other.LastGDS);

for (int J = 0; J <= VgprUB; J++)
StrictDom |= mergeScore(M, VgprScores[T][J], Other.VgprScores[T][J]);

@@ -2249,13 +2332,7 @@ bool SIInsertWaitcnts::insertWaitcntInBlock(MachineFunction &MF,

updateEventWaitcntAfter(Inst, &ScoreBrackets);

-    if (ST->isPreciseMemoryEnabled() && Inst.mayLoadOrStore()) {
-      AMDGPU::Waitcnt Wait = WCG->getAllZeroWaitcnt(
-          Inst.mayStore() && !SIInstrInfo::isAtomicRet(Inst));
-      ScoreBrackets.simplifyWaitcnt(Wait);
-      Modified |= generateWaitcnt(Wait, std::next(Inst.getIterator()), Block,
-                                  ScoreBrackets, /*OldWaitcntInstr=*/nullptr);
-    }
+    Modified |= insertForcedWaitAfter(Inst, Block, ScoreBrackets);

LLVM_DEBUG({
Inst.print(dbgs());
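
As a side note for readers unfamiliar with the waitcnt scoreboard, the following standalone C++ sketch (not part of the patch) illustrates the clamped wait computed by getPendingGDSWait() above. The helper name pendingGDSWait and the constant IllustrativeWaitCountMax are invented for this example; the real bound comes from getWaitCountMax(DS_CNT) and is target dependent.

    #include <algorithm>
    #include <cstdio>

    // Assumed stand-in for getWaitCountMax(DS_CNT); the real value is target
    // dependent.
    constexpr unsigned IllustrativeWaitCountMax = 63;

    // ScoreUB is the current upper bound of the DS_CNT score bracket; LastGDS
    // is the score recorded when the most recent GDS operation was issued, so
    // their difference is the number of DS events issued after that GDS op.
    unsigned pendingGDSWait(unsigned ScoreUB, unsigned LastGDS) {
      // Waiting until at most (ScoreUB - LastGDS) DS operations remain
      // outstanding guarantees the GDS operation has completed; the result is
      // clamped so it never exceeds what the counter field can express.
      return std::min(ScoreUB - LastGDS, IllustrativeWaitCountMax - 1);
    }

    int main() {
      // Two LDS operations were issued after the last GDS operation, so a
      // wait for a DS count of 2 is enough to cover the GDS operation itself.
      std::printf("%u\n", pendingGDSWait(/*ScoreUB=*/10, /*LastGDS=*/8)); // 2
    }

This mirrors how generateWaitcntInstBefore() uses getPendingGDSWait() to request only as much of a DS_CNT wait as is actually needed before an always-GDS instruction.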
4 changes: 3 additions & 1 deletion llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -4239,7 +4239,9 @@ bool SIInstrInfo::isSchedulingBoundary(const MachineInstr &MI,
}

bool SIInstrInfo::isAlwaysGDS(uint16_t Opcode) const {
-  return Opcode == AMDGPU::DS_ORDERED_COUNT || isGWS(Opcode);
+  return Opcode == AMDGPU::DS_ORDERED_COUNT ||
+         Opcode == AMDGPU::DS_ADD_GS_REG_RTN ||
+         Opcode == AMDGPU::DS_SUB_GS_REG_RTN || isGWS(Opcode);
}

bool SIInstrInfo::modifiesModeRegister(const MachineInstr &MI) {
26 changes: 26 additions & 0 deletions llvm/test/CodeGen/AMDGPU/force-wait-after-always-gds.mir
@@ -0,0 +1,26 @@
# RUN: llc -march=amdgcn -mcpu=gfx1100 -verify-machineinstrs -run-pass si-insert-waitcnts %s -o - | FileCheck -check-prefix=GCN %s

---
# GCN-LABEL: name: test_ordered_count
# GCN: bb.0
# GCN: DS_ADD_U32
# GCN: DS_SUB_U32
# GCN-NEXT: S_WAITCNT 64535
# GCN-NEXT: $vgpr3 = DS_ORDERED_COUNT
# GCN-NEXT: S_WAITCNT 64519
# GCN-NEXT: $vgpr4_vgpr5 = DS_ADD_GS_REG_RTN
# GCN-NEXT: S_WAITCNT 64519
# GCN-NEXT: S_NOP 0

name: test_ordered_count
body: |
bb.0:
liveins: $vgpr0, $vgpr1, $vgpr2

DS_ADD_U32 $vgpr1, $vgpr2, 12, -1, implicit $m0, implicit $exec :: (load store (s32), addrspace 3)
DS_SUB_U32 $vgpr1, $vgpr2, 12, 0, implicit $m0, implicit $exec :: (load store (s32), addrspace 2)
$vgpr3 = DS_ORDERED_COUNT $vgpr0, 772, implicit $m0, implicit $exec :: (load store (s32), addrspace 3)
$vgpr4_vgpr5 = DS_ADD_GS_REG_RTN $vgpr0, 32, implicit $m0, implicit $exec :: (load store (s32), addrspace 3)
S_ENDPGM 0

...
4 changes: 4 additions & 0 deletions llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ds.add.gs.reg.rtn.ll
@@ -9,6 +9,8 @@ define amdgpu_gs void @test_add_32(i32 %arg) {
; CHECK-LABEL: test_add_32:
; CHECK: ; %bb.0:
; CHECK-NEXT: ds_add_gs_reg_rtn v[0:1], v0 offset:16 gds
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-NEXT: s_nop 0
; CHECK-NEXT: s_endpgm
%unused = call i32 @llvm.amdgcn.ds.add.gs.reg.rtn.i32(i32 %arg, i32 16)
ret void
@@ -30,6 +32,8 @@ define amdgpu_gs void @test_add_64(i32 %arg) {
; CHECK-LABEL: test_add_64:
; CHECK: ; %bb.0:
; CHECK-NEXT: ds_add_gs_reg_rtn v[0:1], v0 offset:32 gds
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-NEXT: s_nop 0
; CHECK-NEXT: s_endpgm
%unused = call i64 @llvm.amdgcn.ds.add.gs.reg.rtn.i64(i32 %arg, i32 32)
ret void
2 changes: 1 addition & 1 deletion llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ds.ordered.swap.ll
@@ -26,11 +26,11 @@ define amdgpu_cs float @ds_ordered_swap(ptr addrspace(2) inreg %gds, i32 %value)
; GCN: s_mov_b32 m0, s0
; VIGFX9-NEXT: s_nop 0
; GCN-NEXT: ds_ordered_count v{{[0-9]+}}, v[[VALUE]] offset:4868 gds
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: [[BB]]:
; // Wait for expcnt(0) before modifying EXEC
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: s_or_b64 exec, exec, s[[SAVED]]
-; GCN-NEXT: s_waitcnt lgkmcnt(0)
define amdgpu_cs float @ds_ordered_swap_conditional(ptr addrspace(2) inreg %gds, i32 %value) {
entry:
%c = icmp ne i32 %value, 0
4 changes: 4 additions & 0 deletions llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ds.sub.gs.reg.rtn.ll
@@ -9,6 +9,8 @@ define amdgpu_gs void @test_sub_32(i32 %arg) {
; CHECK-LABEL: test_sub_32:
; CHECK: ; %bb.0:
; CHECK-NEXT: ds_sub_gs_reg_rtn v[0:1], v0 offset:16 gds
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-NEXT: s_nop 0
; CHECK-NEXT: s_endpgm
%unused = call i32 @llvm.amdgcn.ds.sub.gs.reg.rtn.i32(i32 %arg, i32 16)
ret void
@@ -30,6 +32,8 @@ define amdgpu_gs void @test_sub_64(i32 %arg) {
; CHECK-LABEL: test_sub_64:
; CHECK: ; %bb.0:
; CHECK-NEXT: ds_sub_gs_reg_rtn v[0:1], v0 offset:32 gds
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-NEXT: s_nop 0
; CHECK-NEXT: s_endpgm
%unused = call i64 @llvm.amdgcn.ds.sub.gs.reg.rtn.i64(i32 %arg, i32 32)
ret void