Skip to content

Commit 5634e7e

Browse files
authored
[AMDGCN][SIWholeQuadMode] Rework splitBlock/lowerKillI1/lowerKillF32 to handle case when SI_KILL_I1_TERMINATOR -1 0 is not the unique terminator
The lowerKillI1 method mishandled the case where the kill was not the only terminator in its block: it inserted a new S_BRANCH instruction and then tried to split the block. `SI_KILL_I1_TERMINATOR -1, 0` has no effect, so instead of always lowering it to an unconditional branch, we now remove the instruction and insert an unconditional branch only if the instruction is the last terminator. No split is needed in this case (if the last terminator has been reached, then the whole block has been processed). Also stop generating an unconditional branch in splitBlock: this branch was redundant, since TermMI is promoted to a terminator that already falls through to the next block. Solves SWDEV-508819.
1 parent 02cf97f commit 5634e7e

File tree

4 files changed

+66
-30
lines changed

4 files changed

+66
-30
lines changed

llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp

Lines changed: 19 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -777,11 +777,19 @@ void SIWholeQuadMode::splitBlock(MachineInstr *TermMI) {
777777
case AMDGPU::S_MOV_B64:
778778
NewOpcode = AMDGPU::S_MOV_B64_term;
779779
break;
780-
default:
780+
case AMDGPU::S_ANDN2_B32:
781+
NewOpcode = AMDGPU::S_ANDN2_B32_term;
782+
break;
783+
case AMDGPU::S_ANDN2_B64:
784+
NewOpcode = AMDGPU::S_ANDN2_B64_term;
781785
break;
786+
default:
787+
llvm_unreachable("Unexpected instruction");
782788
}
783-
if (NewOpcode)
784-
TermMI->setDesc(TII->get(NewOpcode));
789+
790+
// These terminators fallthrough to the next block, no need to add an
791+
// unconditional branch to the next block (SplitBB).
792+
TermMI->setDesc(TII->get(NewOpcode));
785793

786794
if (SplitBB != BB) {
787795
// Update dominator trees
@@ -796,12 +804,6 @@ void SIWholeQuadMode::splitBlock(MachineInstr *TermMI) {
796804
MDT->applyUpdates(DTUpdates);
797805
if (PDT)
798806
PDT->applyUpdates(DTUpdates);
799-
800-
// Link blocks
801-
MachineInstr *MI =
802-
BuildMI(*BB, BB->end(), DebugLoc(), TII->get(AMDGPU::S_BRANCH))
803-
.addMBB(SplitBB);
804-
LIS->InsertMachineInstrInMaps(*MI);
805807
}
806808
}
807809

@@ -910,19 +912,16 @@ MachineInstr *SIWholeQuadMode::lowerKillF32(MachineInstr &MI) {
910912
BuildMI(MBB, MI, DL, TII->get(AndN2Opc), Exec).addReg(Exec).addReg(VCC);
911913

912914
assert(MBB.succ_size() == 1);
913-
MachineInstr *NewTerm = BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_BRANCH))
914-
.addMBB(*MBB.succ_begin());
915915

916916
// Update live intervals
917917
LIS->ReplaceMachineInstrInMaps(MI, *VcmpMI);
918918
MBB.remove(&MI);
919919

920920
LIS->InsertMachineInstrInMaps(*MaskUpdateMI);
921-
LIS->InsertMachineInstrInMaps(*ExecMaskMI);
922921
LIS->InsertMachineInstrInMaps(*EarlyTermMI);
923-
LIS->InsertMachineInstrInMaps(*NewTerm);
922+
LIS->InsertMachineInstrInMaps(*ExecMaskMI);
924923

925-
return NewTerm;
924+
return ExecMaskMI;
926925
}
927926

928927
MachineInstr *SIWholeQuadMode::lowerKillI1(MachineInstr &MI, bool IsWQM) {
@@ -949,17 +948,17 @@ MachineInstr *SIWholeQuadMode::lowerKillI1(MachineInstr &MI, bool IsWQM) {
949948
.addReg(Exec);
950949
} else {
951950
// Static: kill does nothing
952-
MachineInstr *NewTerm = nullptr;
953-
if (MI.getOpcode() == AMDGPU::SI_DEMOTE_I1) {
951+
bool IsLastTerminator = std::next(MI.getIterator()) == MBB.end();
952+
if (!IsLastTerminator) {
954953
LIS->RemoveMachineInstrFromMaps(MI);
955954
} else {
956-
assert(MBB.succ_size() == 1);
957-
NewTerm = BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_BRANCH))
958-
.addMBB(*MBB.succ_begin());
955+
assert(MBB.succ_size() == 1 && MI.getOpcode() != AMDGPU::SI_DEMOTE_I1);
956+
MachineInstr *NewTerm = BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_BRANCH))
957+
.addMBB(*MBB.succ_begin());
959958
LIS->ReplaceMachineInstrInMaps(MI, *NewTerm);
960959
}
961960
MBB.remove(&MI);
962-
return NewTerm;
961+
return nullptr;
963962
}
964963
} else {
965964
if (!KillVal) {
Lines changed: 41 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,41 @@
1+
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
2+
; RUN: llc -mtriple=amdgcn -mcpu=gfx90a %s -o - | FileCheck %s
3+
4+
define amdgpu_ps float @kill_true(i1 %.not) {
5+
; CHECK-LABEL: kill_true:
6+
; CHECK: ; %bb.0: ; %entry
7+
; CHECK-NEXT: s_mov_b64 s[0:1], exec
8+
; CHECK-NEXT: s_wqm_b64 exec, exec
9+
; CHECK-NEXT: v_and_b32_e32 v0, 1, v0
10+
; CHECK-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
11+
; CHECK-NEXT: s_xor_b64 s[4:5], vcc, -1
12+
; CHECK-NEXT: s_and_saveexec_b64 s[2:3], s[4:5]
13+
; CHECK-NEXT: s_cbranch_execz .LBB0_2
14+
; CHECK-NEXT: ; %bb.1: ; %if1
15+
; CHECK-NEXT: s_mov_b32 s4, 0
16+
; CHECK-NEXT: ; kill: def $sgpr4 killed $sgpr4 killed $exec
17+
; CHECK-NEXT: v_pk_mov_b32 v[0:1], 0, 0
18+
; CHECK-NEXT: v_mov_b32_e32 v2, s4
19+
; CHECK-NEXT: flat_store_dword v[0:1], v2
20+
; CHECK-NEXT: .LBB0_2: ; %endif1
21+
; CHECK-NEXT: s_or_b64 exec, exec, s[2:3]
22+
; CHECK-NEXT: s_and_b64 exec, exec, s[0:1]
23+
; CHECK-NEXT: v_mov_b32_e32 v0, 0
24+
; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
25+
; CHECK-NEXT: ; return to shader part epilog
26+
entry:
27+
br i1 %.not, label %endif1, label %if1
28+
29+
if1:
30+
%C = call float @llvm.amdgcn.wqm.f32(float 0.000000e+00)
31+
store float %C, ptr null, align 4
32+
br label %endif1
33+
34+
endif1:
35+
call void @llvm.amdgcn.kill(i1 true)
36+
ret float 0.000000e+00
37+
}
38+
39+
declare void @llvm.amdgcn.kill(i1)
40+
41+
declare float @llvm.amdgcn.wqm.f32(float)

llvm/test/CodeGen/AMDGPU/skip-if-dead.ll

Lines changed: 4 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -1139,10 +1139,9 @@ define amdgpu_ps void @phi_use_def_before_kill(float inreg %x, i32 inreg %y) #0
11391139
; SI: ; %bb.0: ; %bb
11401140
; SI-NEXT: v_add_f32_e64 v1, s0, 1.0
11411141
; SI-NEXT: v_cmp_lt_f32_e32 vcc, 0, v1
1142-
; SI-NEXT: s_mov_b64 s[2:3], exec
11431142
; SI-NEXT: v_cndmask_b32_e64 v0, 0, -1.0, vcc
11441143
; SI-NEXT: v_cmp_nlt_f32_e32 vcc, 0, v1
1145-
; SI-NEXT: s_andn2_b64 s[2:3], s[2:3], vcc
1144+
; SI-NEXT: s_andn2_b64 exec, exec, vcc
11461145
; SI-NEXT: s_cbranch_scc0 .LBB11_6
11471146
; SI-NEXT: ; %bb.1: ; %bb
11481147
; SI-NEXT: s_andn2_b64 exec, exec, vcc
@@ -1174,11 +1173,10 @@ define amdgpu_ps void @phi_use_def_before_kill(float inreg %x, i32 inreg %y) #0
11741173
; GFX10-WAVE64-LABEL: phi_use_def_before_kill:
11751174
; GFX10-WAVE64: ; %bb.0: ; %bb
11761175
; GFX10-WAVE64-NEXT: v_add_f32_e64 v1, s0, 1.0
1177-
; GFX10-WAVE64-NEXT: s_mov_b64 s[2:3], exec
11781176
; GFX10-WAVE64-NEXT: v_cmp_lt_f32_e32 vcc, 0, v1
11791177
; GFX10-WAVE64-NEXT: v_cndmask_b32_e64 v0, 0, -1.0, vcc
11801178
; GFX10-WAVE64-NEXT: v_cmp_nlt_f32_e32 vcc, 0, v1
1181-
; GFX10-WAVE64-NEXT: s_andn2_b64 s[2:3], s[2:3], vcc
1179+
; GFX10-WAVE64-NEXT: s_andn2_b64 exec, exec, vcc
11821180
; GFX10-WAVE64-NEXT: s_cbranch_scc0 .LBB11_6
11831181
; GFX10-WAVE64-NEXT: ; %bb.1: ; %bb
11841182
; GFX10-WAVE64-NEXT: s_andn2_b64 exec, exec, vcc
@@ -1206,11 +1204,10 @@ define amdgpu_ps void @phi_use_def_before_kill(float inreg %x, i32 inreg %y) #0
12061204
; GFX10-WAVE32-LABEL: phi_use_def_before_kill:
12071205
; GFX10-WAVE32: ; %bb.0: ; %bb
12081206
; GFX10-WAVE32-NEXT: v_add_f32_e64 v1, s0, 1.0
1209-
; GFX10-WAVE32-NEXT: s_mov_b32 s2, exec_lo
12101207
; GFX10-WAVE32-NEXT: v_cmp_lt_f32_e32 vcc_lo, 0, v1
12111208
; GFX10-WAVE32-NEXT: v_cndmask_b32_e64 v0, 0, -1.0, vcc_lo
12121209
; GFX10-WAVE32-NEXT: v_cmp_nlt_f32_e32 vcc_lo, 0, v1
1213-
; GFX10-WAVE32-NEXT: s_andn2_b32 s2, s2, vcc_lo
1210+
; GFX10-WAVE32-NEXT: s_andn2_b32 exec_lo, exec_lo, vcc_lo
12141211
; GFX10-WAVE32-NEXT: s_cbranch_scc0 .LBB11_6
12151212
; GFX10-WAVE32-NEXT: ; %bb.1: ; %bb
12161213
; GFX10-WAVE32-NEXT: s_andn2_b32 exec_lo, exec_lo, vcc_lo
@@ -1238,12 +1235,11 @@ define amdgpu_ps void @phi_use_def_before_kill(float inreg %x, i32 inreg %y) #0
12381235
; GFX11-LABEL: phi_use_def_before_kill:
12391236
; GFX11: ; %bb.0: ; %bb
12401237
; GFX11-NEXT: v_add_f32_e64 v1, s0, 1.0
1241-
; GFX11-NEXT: s_mov_b64 s[2:3], exec
12421238
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
12431239
; GFX11-NEXT: v_cmp_lt_f32_e32 vcc, 0, v1
12441240
; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, -1.0, vcc
12451241
; GFX11-NEXT: v_cmp_nlt_f32_e32 vcc, 0, v1
1246-
; GFX11-NEXT: s_and_not1_b64 s[2:3], s[2:3], vcc
1242+
; GFX11-NEXT: s_and_not1_b64 exec, exec, vcc
12471243
; GFX11-NEXT: s_cbranch_scc0 .LBB11_6
12481244
; GFX11-NEXT: ; %bb.1: ; %bb
12491245
; GFX11-NEXT: s_and_not1_b64 exec, exec, vcc

llvm/test/CodeGen/AMDGPU/wqm.ll

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -3361,7 +3361,7 @@ define amdgpu_ps void @test_for_deactivating_lanes_in_wave32(ptr addrspace(6) in
33613361
; GFX9-W64-NEXT: s_buffer_load_dword s0, s[0:3], 0x0
33623362
; GFX9-W64-NEXT: s_waitcnt lgkmcnt(0)
33633363
; GFX9-W64-NEXT: v_cmp_le_f32_e64 vcc, s0, 0
3364-
; GFX9-W64-NEXT: s_andn2_b64 s[4:5], exec, vcc
3364+
; GFX9-W64-NEXT: s_andn2_b64 exec, exec, vcc
33653365
; GFX9-W64-NEXT: s_cbranch_scc0 .LBB55_1
33663366
; GFX9-W64-NEXT: s_endpgm
33673367
; GFX9-W64-NEXT: .LBB55_1:
@@ -3377,7 +3377,7 @@ define amdgpu_ps void @test_for_deactivating_lanes_in_wave32(ptr addrspace(6) in
33773377
; GFX10-W32-NEXT: s_buffer_load_dword s0, s[0:3], 0x0
33783378
; GFX10-W32-NEXT: s_waitcnt lgkmcnt(0)
33793379
; GFX10-W32-NEXT: v_cmp_le_f32_e64 vcc_lo, s0, 0
3380-
; GFX10-W32-NEXT: s_andn2_b32 s4, exec_lo, vcc_lo
3380+
; GFX10-W32-NEXT: s_andn2_b32 exec_lo, exec_lo, vcc_lo
33813381
; GFX10-W32-NEXT: s_cbranch_scc0 .LBB55_1
33823382
; GFX10-W32-NEXT: s_endpgm
33833383
; GFX10-W32-NEXT: .LBB55_1:

0 commit comments

Comments
 (0)