Skip to content

Commit df6750e

Browse files
authored
[AMDGPU] Fix interaction between WQM and llvm.amdgcn.init.exec (#93680)
Whole quad mode requires inserting a copy of the initial EXEC mask. In a function that also uses llvm.amdgcn.init.exec, insert that COPY after the instruction that initializes EXEC, so that the copy captures the initialized mask.
1 parent 5a0181f commit df6750e

File tree

2 files changed

+66
-9
lines changed

2 files changed

+66
-9
lines changed

llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp

Lines changed: 20 additions & 9 deletions
Original file line number | Diff line number | Diff line change
@@ -225,7 +225,7 @@ class SIWholeQuadMode : public MachineFunctionPass {
225225
void lowerCopyInstrs();
226226
void lowerKillInstrs(bool IsWQM);
227227
void lowerInitExec(MachineInstr &MI);
228-
void lowerInitExecInstrs();
228+
MachineBasicBlock::iterator lowerInitExecInstrs(MachineBasicBlock &Entry);
229229

230230
public:
231231
static char ID;
@@ -1648,9 +1648,23 @@ void SIWholeQuadMode::lowerInitExec(MachineInstr &MI) {
16481648
LIS->createAndComputeVirtRegInterval(CountReg);
16491649
}
16501650

1651-
void SIWholeQuadMode::lowerInitExecInstrs() {
1652-
for (MachineInstr *MI : InitExecInstrs)
1651+
/// Lower INIT_EXEC instructions. Return a suitable insert point in \p Entry
1652+
/// for instructions that depend on EXEC.
1653+
MachineBasicBlock::iterator
1654+
SIWholeQuadMode::lowerInitExecInstrs(MachineBasicBlock &Entry) {
1655+
MachineBasicBlock::iterator InsertPt = Entry.getFirstNonPHI();
1656+
1657+
for (MachineInstr *MI : InitExecInstrs) {
1658+
// Try to handle undefined cases gracefully:
1659+
// - multiple INIT_EXEC instructions
1660+
// - INIT_EXEC instructions not in the entry block
1661+
if (MI->getParent() == &Entry)
1662+
InsertPt = std::next(MI->getIterator());
1663+
16531664
lowerInitExec(*MI);
1665+
}
1666+
1667+
return InsertPt;
16541668
}
16551669

16561670
bool SIWholeQuadMode::runOnMachineFunction(MachineFunction &MF) {
@@ -1701,19 +1715,16 @@ bool SIWholeQuadMode::runOnMachineFunction(MachineFunction &MF) {
17011715

17021716
LiveMaskReg = Exec;
17031717

1718+
MachineBasicBlock &Entry = MF.front();
1719+
MachineBasicBlock::iterator EntryMI = lowerInitExecInstrs(Entry);
1720+
17041721
// Shader is simple and does not need any state changes or any complex lowering
17051722
if (!(GlobalFlags & (StateWQM | StateStrict)) && LowerToCopyInstrs.empty() &&
17061723
LowerToMovInstrs.empty() && KillInstrs.empty()) {
1707-
lowerInitExecInstrs();
17081724
lowerLiveMaskQueries();
17091725
return !InitExecInstrs.empty() || !LiveMaskQueries.empty();
17101726
}
17111727

1712-
lowerInitExecInstrs();
1713-
1714-
MachineBasicBlock &Entry = MF.front();
1715-
MachineBasicBlock::iterator EntryMI = Entry.getFirstNonPHI();
1716-
17171728
// Store a copy of the original live mask when required
17181729
if (NeedsLiveMask || (GlobalFlags & StateWQM)) {
17191730
LiveMaskReg = MRI->createVirtualRegister(TRI->getBoolRC());

llvm/test/CodeGen/AMDGPU/wqm.ll

Lines changed: 46 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -3395,6 +3395,52 @@ main_body:
33953395
ret void
33963396
}
33973397

3398+
; Test the interaction between wqm and llvm.amdgcn.init.exec.
3399+
define amdgpu_gs void @wqm_init_exec() {
3400+
; GFX9-W64-LABEL: wqm_init_exec:
3401+
; GFX9-W64: ; %bb.0: ; %bb
3402+
; GFX9-W64-NEXT: s_mov_b64 exec, -1
3403+
; GFX9-W64-NEXT: s_mov_b32 s0, 0
3404+
; GFX9-W64-NEXT: v_mov_b32_e32 v0, 0
3405+
; GFX9-W64-NEXT: s_mov_b32 s1, s0
3406+
; GFX9-W64-NEXT: s_mov_b32 s2, s0
3407+
; GFX9-W64-NEXT: s_mov_b32 s3, s0
3408+
; GFX9-W64-NEXT: v_mov_b32_e32 v1, v0
3409+
; GFX9-W64-NEXT: v_mov_b32_e32 v2, v0
3410+
; GFX9-W64-NEXT: v_mov_b32_e32 v3, v0
3411+
; GFX9-W64-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
3412+
; GFX9-W64-NEXT: s_wqm_b64 exec, exec
3413+
; GFX9-W64-NEXT: ; kill: def $sgpr0 killed $sgpr0 killed $exec
3414+
; GFX9-W64-NEXT: v_mov_b32_e32 v1, s0
3415+
; GFX9-W64-NEXT: ds_write_b32 v0, v1
3416+
; GFX9-W64-NEXT: s_endpgm
3417+
;
3418+
; GFX10-W32-LABEL: wqm_init_exec:
3419+
; GFX10-W32: ; %bb.0: ; %bb
3420+
; GFX10-W32-NEXT: s_mov_b32 exec_lo, -1
3421+
; GFX10-W32-NEXT: s_mov_b32 s1, exec_lo
3422+
; GFX10-W32-NEXT: v_mov_b32_e32 v0, 0
3423+
; GFX10-W32-NEXT: s_mov_b32 s0, 0
3424+
; GFX10-W32-NEXT: s_wqm_b32 exec_lo, exec_lo
3425+
; GFX10-W32-NEXT: s_mov_b32 s2, s0
3426+
; GFX10-W32-NEXT: s_and_b32 exec_lo, exec_lo, s1
3427+
; GFX10-W32-NEXT: v_mov_b32_e32 v1, v0
3428+
; GFX10-W32-NEXT: v_mov_b32_e32 v2, v0
3429+
; GFX10-W32-NEXT: v_mov_b32_e32 v3, v0
3430+
; GFX10-W32-NEXT: v_mov_b32_e32 v4, s0
3431+
; GFX10-W32-NEXT: s_mov_b32 s1, s0
3432+
; GFX10-W32-NEXT: s_mov_b32 s3, s0
3433+
; GFX10-W32-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
3434+
; GFX10-W32-NEXT: ds_write_b32 v0, v4
3435+
; GFX10-W32-NEXT: s_endpgm
3436+
bb:
3437+
call void @llvm.amdgcn.init.exec(i64 -1)
3438+
call void @llvm.amdgcn.raw.buffer.store.v4f32(<4 x float> zeroinitializer, <4 x i32> zeroinitializer, i32 0, i32 0, i32 0)
3439+
%i = call i32 @llvm.amdgcn.wqm.i32(i32 0)
3440+
store i32 %i, i32 addrspace(3)* null, align 4
3441+
ret void
3442+
}
3443+
33983444
declare void @llvm.amdgcn.exp.f32(i32, i32, float, float, float, float, i1, i1) #1
33993445
declare void @llvm.amdgcn.image.store.1d.v4f32.i32(<4 x float>, i32, i32, <8 x i32>, i32, i32) #1
34003446

0 commit comments

Comments
 (0)