[AMDGPU] Codegen support for constrained multi-dword sloads #96163
Conversation
@llvm/pr-subscribers-llvm-globalisel @llvm/pr-subscribers-backend-amdgpu

Author: Christudasan Devadasan (cdevadas)

Changes: For targets that support the xnack replay feature (gfx8+), multi-dword scalar loads shouldn't clobber any register that holds the src address. The constrained versions of the scalar loads have the early-clobber flag attached to the dst operand to restrict RA from re-allocating any of the src regs for its dst operand.

Patch is 7.42 MiB, truncated to 20.00 KiB below; full version: https://github.com/llvm/llvm-project/pull/96163.diff

265 Files Affected:
diff --git a/llvm/lib/Target/AMDGPU/SMInstructions.td b/llvm/lib/Target/AMDGPU/SMInstructions.td
index 4551a3a615b15..9fbedce554a53 100644
--- a/llvm/lib/Target/AMDGPU/SMInstructions.td
+++ b/llvm/lib/Target/AMDGPU/SMInstructions.td
@@ -867,13 +867,104 @@ def SMRDBufferImm : ComplexPattern<iPTR, 1, "SelectSMRDBufferImm">;
def SMRDBufferImm32 : ComplexPattern<iPTR, 1, "SelectSMRDBufferImm32">;
def SMRDBufferSgprImm : ComplexPattern<iPTR, 2, "SelectSMRDBufferSgprImm">;
+class SMRDAlignedLoadPat<PatFrag Op> : PatFrag <(ops node:$ptr), (Op node:$ptr), [{
+ // Returns true for single-dword loads and naturally aligned multi-dword loads.
+ LoadSDNode *Ld = cast<LoadSDNode>(N);
+ unsigned Size = Ld->getMemoryVT().getStoreSize();
+ return (Size <= 4) || (Ld->getAlign().value() >= PowerOf2Ceil(Size));
+}]> {
+ let GISelPredicateCode = [{
+ auto &Ld = cast<GLoad>(MI);
+ TypeSize Size = Ld.getMMO().getSize().getValue();
+ return (Size <= 4) || (Ld.getMMO().getAlign().value() >= PowerOf2Ceil(Size));
+ }];
+}
+
+class SMRDUnalignedLoadPat<PatFrag Op> : PatFrag <(ops node:$ptr), (Op node:$ptr), [{
+ // Returns true if it is an under-aligned multi-dword load.
+ LoadSDNode *Ld = cast<LoadSDNode>(N);
+ unsigned Size = Ld->getMemoryVT().getStoreSize();
+ return (Size > 4) && (Ld->getAlign().value() < PowerOf2Ceil(Size));
+}]> {
+ let GISelPredicateCode = [{
+ auto &Ld = cast<GLoad>(MI);
+ TypeSize Size = Ld.getMMO().getSize().getValue();
+ return (Size > 4) && (Ld.getMMO().getAlign().value() < PowerOf2Ceil(Size));
+ }];
+}
+
+def alignedmultidwordload : SMRDAlignedLoadPat<smrd_load>;
+def unalignedmultidwordload : SMRDUnalignedLoadPat<smrd_load>;
+
+multiclass SMRD_Align_Pattern <string Instr, ValueType vt> {
+
+ // 1. IMM offset
+ def : GCNPat <
+ (alignedmultidwordload (SMRDImm i64:$sbase, i32:$offset)),
+ (vt (!cast<SM_Pseudo>(Instr#"_IMM") $sbase, $offset, 0))> {
+ let OtherPredicates = [isGFX8Plus];
+ }
+ def : GCNPat <
+ (unalignedmultidwordload (SMRDImm i64:$sbase, i32:$offset)),
+ (vt (!cast<SM_Pseudo>(Instr#"_IMM_ec") $sbase, $offset, 0))> {
+ let OtherPredicates = [isGFX8Plus];
+ }
+
+ // 2. SGPR offset
+ def : GCNPat <
+ (alignedmultidwordload (SMRDSgpr i64:$sbase, i32:$soffset)),
+ (vt (!cast<SM_Pseudo>(Instr#"_SGPR") $sbase, $soffset, 0))> {
+ let OtherPredicates = [isGFX8Only];
+ }
+ def : GCNPat <
+ (unalignedmultidwordload (SMRDSgpr i64:$sbase, i32:$soffset)),
+ (vt (!cast<SM_Pseudo>(Instr#"_SGPR_ec") $sbase, $soffset, 0))> {
+ let OtherPredicates = [isGFX8Only];
+ }
+ def : GCNPat <
+ (alignedmultidwordload (SMRDSgpr i64:$sbase, i32:$soffset)),
+ (vt (!cast<SM_Pseudo>(Instr#"_SGPR_IMM") $sbase, $soffset, 0, 0))> {
+ let OtherPredicates = [isGFX9Plus];
+ }
+ def : GCNPat <
+ (unalignedmultidwordload (SMRDSgpr i64:$sbase, i32:$soffset)),
+ (vt (!cast<SM_Pseudo>(Instr#"_SGPR_IMM_ec") $sbase, $soffset, 0, 0))> {
+ let OtherPredicates = [isGFX9Plus];
+ }
+
+ // 3. SGPR+IMM offset
+ def : GCNPat <
+ (alignedmultidwordload (SMRDSgprImm i64:$sbase, i32:$soffset, i32:$offset)),
+ (vt (!cast<SM_Pseudo>(Instr#"_SGPR_IMM") $sbase, $soffset, $offset, 0))> {
+ let OtherPredicates = [isGFX9Plus];
+ }
+ def : GCNPat <
+ (unalignedmultidwordload (SMRDSgprImm i64:$sbase, i32:$soffset, i32:$offset)),
+ (vt (!cast<SM_Pseudo>(Instr#"_SGPR_IMM_ec") $sbase, $soffset, $offset, 0))> {
+ let OtherPredicates = [isGFX9Plus];
+ }
+
+ // 4. No offset
+ def : GCNPat <
+ (vt (alignedmultidwordload (i64 SReg_64:$sbase))),
+ (vt (!cast<SM_Pseudo>(Instr#"_IMM") i64:$sbase, 0, 0))> {
+ let OtherPredicates = [isGFX8Plus];
+ }
+ def : GCNPat <
+ (vt (unalignedmultidwordload (i64 SReg_64:$sbase))),
+ (vt (!cast<SM_Pseudo>(Instr#"_IMM_ec") i64:$sbase, 0, 0))> {
+ let OtherPredicates = [isGFX8Plus];
+ }
+}
+
multiclass SMRD_Pattern <string Instr, ValueType vt, bit immci = true> {
// 1. IMM offset
def : GCNPat <
(smrd_load (SMRDImm i64:$sbase, i32:$offset)),
- (vt (!cast<SM_Pseudo>(Instr#"_IMM") $sbase, $offset, 0))
- >;
+ (vt (!cast<SM_Pseudo>(Instr#"_IMM") $sbase, $offset, 0))> {
+ let OtherPredicates = [isGFX6GFX7];
+ }
// 2. 32-bit IMM offset on CI
if immci then def : GCNPat <
@@ -886,26 +977,17 @@ multiclass SMRD_Pattern <string Instr, ValueType vt, bit immci = true> {
def : GCNPat <
(smrd_load (SMRDSgpr i64:$sbase, i32:$soffset)),
(vt (!cast<SM_Pseudo>(Instr#"_SGPR") $sbase, $soffset, 0))> {
- let OtherPredicates = [isNotGFX9Plus];
- }
- def : GCNPat <
- (smrd_load (SMRDSgpr i64:$sbase, i32:$soffset)),
- (vt (!cast<SM_Pseudo>(Instr#"_SGPR_IMM") $sbase, $soffset, 0, 0))> {
- let OtherPredicates = [isGFX9Plus];
+ let OtherPredicates = [isGFX6GFX7];
}
- // 4. SGPR+IMM offset
+ // 4. No offset
def : GCNPat <
- (smrd_load (SMRDSgprImm i64:$sbase, i32:$soffset, i32:$offset)),
- (vt (!cast<SM_Pseudo>(Instr#"_SGPR_IMM") $sbase, $soffset, $offset, 0))> {
- let OtherPredicates = [isGFX9Plus];
+ (vt (smrd_load (i64 SReg_64:$sbase))),
+ (vt (!cast<SM_Pseudo>(Instr#"_IMM") i64:$sbase, 0, 0))> {
+ let OtherPredicates = [isGFX6GFX7];
}
- // 5. No offset
- def : GCNPat <
- (vt (smrd_load (i64 SReg_64:$sbase))),
- (vt (!cast<SM_Pseudo>(Instr#"_IMM") i64:$sbase, 0, 0))
- >;
+ defm : SMRD_Align_Pattern<Instr, vt>;
}
multiclass SMLoad_Pattern <string Instr, ValueType vt, bit immci = true> {
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/addsubu64.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/addsubu64.ll
index a38b6e3263882..9a8672dba5357 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/addsubu64.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/addsubu64.ll
@@ -7,11 +7,11 @@ define amdgpu_kernel void @s_add_u64(ptr addrspace(1) %out, i64 %a, i64 %b) {
; GFX11: ; %bb.0: ; %entry
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
; GFX11-NEXT: v_mov_b32_e32 v2, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_add_u32 s0, s6, s0
-; GFX11-NEXT: s_addc_u32 s1, s7, s1
+; GFX11-NEXT: s_add_u32 s0, s6, s2
+; GFX11-NEXT: s_addc_u32 s1, s7, s3
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
; GFX11-NEXT: global_store_b64 v2, v[0:1], s[4:5]
@@ -23,10 +23,10 @@ define amdgpu_kernel void @s_add_u64(ptr addrspace(1) %out, i64 %a, i64 %b) {
; GFX12: ; %bb.0: ; %entry
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
-; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
+; GFX12-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
; GFX12-NEXT: v_mov_b32_e32 v2, 0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: s_add_nc_u64 s[0:1], s[6:7], s[0:1]
+; GFX12-NEXT: s_add_nc_u64 s[0:1], s[6:7], s[2:3]
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
; GFX12-NEXT: global_store_b64 v2, v[0:1], s[4:5]
@@ -59,11 +59,11 @@ define amdgpu_kernel void @s_sub_u64(ptr addrspace(1) %out, i64 %a, i64 %b) {
; GFX11: ; %bb.0: ; %entry
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
; GFX11-NEXT: v_mov_b32_e32 v2, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_sub_u32 s0, s6, s0
-; GFX11-NEXT: s_subb_u32 s1, s7, s1
+; GFX11-NEXT: s_sub_u32 s0, s6, s2
+; GFX11-NEXT: s_subb_u32 s1, s7, s3
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
; GFX11-NEXT: global_store_b64 v2, v[0:1], s[4:5]
@@ -75,10 +75,10 @@ define amdgpu_kernel void @s_sub_u64(ptr addrspace(1) %out, i64 %a, i64 %b) {
; GFX12: ; %bb.0: ; %entry
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
-; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
+; GFX12-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
; GFX12-NEXT: v_mov_b32_e32 v2, 0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: s_sub_nc_u64 s[0:1], s[6:7], s[0:1]
+; GFX12-NEXT: s_sub_nc_u64 s[0:1], s[6:7], s[2:3]
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
; GFX12-NEXT: global_store_b64 v2, v[0:1], s[4:5]
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/bool-legalization.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/bool-legalization.ll
index bb5ccc3657dc4..57a8bbbb7d185 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/bool-legalization.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/bool-legalization.ll
@@ -113,9 +113,9 @@ bb1:
define amdgpu_kernel void @brcond_sgpr_trunc_and(i32 %cond0, i32 %cond1) {
; WAVE64-LABEL: brcond_sgpr_trunc_and:
; WAVE64: ; %bb.0: ; %entry
-; WAVE64-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; WAVE64-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; WAVE64-NEXT: s_waitcnt lgkmcnt(0)
-; WAVE64-NEXT: s_and_b32 s0, s0, s1
+; WAVE64-NEXT: s_and_b32 s0, s2, s3
; WAVE64-NEXT: s_xor_b32 s0, s0, 1
; WAVE64-NEXT: s_and_b32 s0, s0, 1
; WAVE64-NEXT: s_cmp_lg_u32 s0, 0
@@ -131,9 +131,9 @@ define amdgpu_kernel void @brcond_sgpr_trunc_and(i32 %cond0, i32 %cond1) {
;
; WAVE32-LABEL: brcond_sgpr_trunc_and:
; WAVE32: ; %bb.0: ; %entry
-; WAVE32-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; WAVE32-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; WAVE32-NEXT: s_waitcnt lgkmcnt(0)
-; WAVE32-NEXT: s_and_b32 s0, s0, s1
+; WAVE32-NEXT: s_and_b32 s0, s2, s3
; WAVE32-NEXT: s_xor_b32 s0, s0, 1
; WAVE32-NEXT: s_and_b32 s0, s0, 1
; WAVE32-NEXT: s_cmp_lg_u32 s0, 0
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/cvt_f32_ubyte.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/cvt_f32_ubyte.ll
index 3f034eaca4997..9cabe0c0ae9de 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/cvt_f32_ubyte.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/cvt_f32_ubyte.ll
@@ -1400,11 +1400,11 @@ define amdgpu_kernel void @cvt_ubyte0_or_multiuse(ptr addrspace(1) %in, ptr addr
;
; VI-LABEL: cvt_ubyte0_or_multiuse:
; VI: ; %bb.0: ; %bb
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: v_mov_b32_e32 v1, s1
+; VI-NEXT: v_mov_b32_e32 v0, s4
+; VI-NEXT: v_mov_b32_e32 v1, s5
; VI-NEXT: v_add_u32_e32 v0, vcc, v0, v2
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: flat_load_dword v0, v[0:1]
@@ -1412,8 +1412,8 @@ define amdgpu_kernel void @cvt_ubyte0_or_multiuse(ptr addrspace(1) %in, ptr addr
; VI-NEXT: v_or_b32_e32 v0, 0x80000001, v0
; VI-NEXT: v_cvt_f32_ubyte0_sdwa v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0
; VI-NEXT: v_add_f32_e32 v2, v0, v1
-; VI-NEXT: v_mov_b32_e32 v0, s2
-; VI-NEXT: v_mov_b32_e32 v1, s3
+; VI-NEXT: v_mov_b32_e32 v0, s6
+; VI-NEXT: v_mov_b32_e32 v1, s7
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
bb:
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/fp-atomics-gfx940.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/fp-atomics-gfx940.ll
index a018ea5bf18f1..ce0d9c3c5365e 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/fp-atomics-gfx940.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/fp-atomics-gfx940.ll
@@ -27,10 +27,10 @@ define amdgpu_kernel void @flat_atomic_fadd_f32_noret(ptr %ptr, float %data) {
define amdgpu_kernel void @flat_atomic_fadd_f32_noret_pat(ptr %ptr) {
; GFX940-LABEL: flat_atomic_fadd_f32_noret_pat:
; GFX940: ; %bb.0:
-; GFX940-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX940-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX940-NEXT: v_mov_b32_e32 v2, 4.0
; GFX940-NEXT: s_waitcnt lgkmcnt(0)
-; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
+; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
; GFX940-NEXT: buffer_wbl2 sc0 sc1
; GFX940-NEXT: flat_atomic_add_f32 v[0:1], v2 sc1
; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -43,10 +43,10 @@ define amdgpu_kernel void @flat_atomic_fadd_f32_noret_pat(ptr %ptr) {
define amdgpu_kernel void @flat_atomic_fadd_f32_noret_pat_ieee(ptr %ptr) #0 {
; GFX940-LABEL: flat_atomic_fadd_f32_noret_pat_ieee:
; GFX940: ; %bb.0:
-; GFX940-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX940-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX940-NEXT: v_mov_b32_e32 v2, 4.0
; GFX940-NEXT: s_waitcnt lgkmcnt(0)
-; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
+; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
; GFX940-NEXT: buffer_wbl2 sc0 sc1
; GFX940-NEXT: flat_atomic_add_f32 v[0:1], v2 sc1
; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/fp64-atomics-gfx90a.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/fp64-atomics-gfx90a.ll
index 4e94a646f6da5..081e25708c067 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/fp64-atomics-gfx90a.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/fp64-atomics-gfx90a.ll
@@ -1021,20 +1021,20 @@ main_body:
define amdgpu_kernel void @global_atomic_fadd_f64_noret(ptr addrspace(1) %ptr, double %data) {
; GFX90A-LABEL: global_atomic_fadd_f64_noret:
; GFX90A: ; %bb.0: ; %main_body
-; GFX90A-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX90A-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX90A-NEXT: v_mov_b32_e32 v2, 0
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
-; GFX90A-NEXT: global_atomic_add_f64 v2, v[0:1], s[0:1]
+; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
+; GFX90A-NEXT: global_atomic_add_f64 v2, v[0:1], s[4:5]
; GFX90A-NEXT: s_endpgm
;
; GFX940-LABEL: global_atomic_fadd_f64_noret:
; GFX940: ; %bb.0: ; %main_body
-; GFX940-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX940-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX940-NEXT: v_mov_b32_e32 v2, 0
; GFX940-NEXT: s_waitcnt lgkmcnt(0)
-; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
-; GFX940-NEXT: global_atomic_add_f64 v2, v[0:1], s[0:1]
+; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[6:7]
+; GFX940-NEXT: global_atomic_add_f64 v2, v[0:1], s[4:5]
; GFX940-NEXT: s_endpgm
main_body:
%ret = call double @llvm.amdgcn.global.atomic.fadd.f64.p1.f64(ptr addrspace(1) %ptr, double %data)
@@ -1044,20 +1044,20 @@ main_body:
define amdgpu_kernel void @global_atomic_fmin_f64_noret(ptr addrspace(1) %ptr, double %data) {
; GFX90A-LABEL: global_atomic_fmin_f64_noret:
; GFX90A: ; %bb.0: ; %main_body
-; GFX90A-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX90A-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX90A-NEXT: v_mov_b32_e32 v2, 0
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
-; GFX90A-NEXT: global_atomic_min_f64 v2, v[0:1], s[0:1]
+; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
+; GFX90A-NEXT: global_atomic_min_f64 v2, v[0:1], s[4:5]
; GFX90A-NEXT: s_endpgm
;
; GFX940-LABEL: global_atomic_fmin_f64_noret:
; GFX940: ; %bb.0: ; %main_body
-; GFX940-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX940-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX940-NEXT: v_mov_b32_e32 v2, 0
; GFX940-NEXT: s_waitcnt lgkmcnt(0)
-; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
-; GFX940-NEXT: global_atomic_min_f64 v2, v[0:1], s[0:1]
+; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[6:7]
+; GFX940-NEXT: global_atomic_min_f64 v2, v[0:1], s[4:5]
; GFX940-NEXT: s_endpgm
main_body:
%ret = call double @llvm.amdgcn.global.atomic.fmin.f64.p1.f64(ptr addrspace(1) %ptr, double %data)
@@ -1067,20 +1067,20 @@ main_body:
define amdgpu_kernel void @global_atomic_fmax_f64_noret(ptr addrspace(1) %ptr, double %data) {
; GFX90A-LABEL: global_atomic_fmax_f64_noret:
; GFX90A: ; %bb.0: ; %main_body
-; GFX90A-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX90A-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX90A-NEXT: v_mov_b32_e32 v2, 0
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
-; GFX90A-NEXT: global_atomic_max_f64 v2, v[0:1], s[0:1]
+; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
+; GFX90A-NEXT: global_atomic_max_f64 v2, v[0:1], s[4:5]
; GFX90A-NEXT: s_endpgm
;
; GFX940-LABEL: global_atomic_fmax_f64_noret:
; GFX940: ; %bb.0: ; %main_body
-; GFX940-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX940-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX940-NEXT: v_mov_b32_e32 v2, 0
; GFX940-NEXT: s_waitcnt lgkmcnt(0)
-; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
-; GFX940-NEXT: global_atomic_max_f64 v2, v[0:1], s[0:1]
+; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[6:7]
+; GFX940-NEXT: global_atomic_max_f64 v2, v[0:1], s[4:5]
; GFX940-NEXT: s_endpgm
main_body:
%ret = call double @llvm.amdgcn.global.atomic.fmax.f64.p1.f64(ptr addrspace(1) %ptr, double %data)
@@ -1090,21 +1090,21 @@ main_body:
define amdgpu_kernel void @global_atomic_fadd_f64_noret_pat(ptr addrspace(1) %ptr) #1 {
; GFX90A-LABEL: global_atomic_fadd_f64_noret_pat:
; GFX90A: ; %bb.0: ; %main_body
-; GFX90A-NEXT: s_mov_b64 s[2:3], exec
-; GFX90A-NEXT: s_mov_b32 s4, s3
-; GFX90A-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0
-; GFX90A-NEXT: v_mbcnt_hi_u32_b32 v0, s4, v0
+; GFX90A-NEXT: s_mov_b64 s[4:5], exec
+; GFX90A-NEXT: s_mov_b32 s2, s5
+; GFX90A-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0
+; GFX90A-NEXT: v_mbcnt_hi_u32_b32 v0, s2, v0
; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GFX90A-NEXT: s_and_saveexec_b64 s[2:3], vcc
; GFX90A-NEXT: s_cbranch_execz .LBB39_3
; GFX90A-NEXT: ; %bb.1:
-; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX90A-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
-; GFX90A-NEXT: v_cvt_f64_u32_e32 v[0:1], s2
+; GFX90A-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
+; GFX90A-NEXT: s_bcnt1_i32_b64 s0, s[4:5]
+; GFX90A-NEXT: v_cvt_f64_u32_e32 v[0:1], s0
; GFX90A-NEXT: v_mul_f64 v[4:5], v[0:1], 4.0
-; GFX90A-NEXT: s_mov_b64 s[2:3], 0
+; GFX90A-NEXT: s_mov_b64 s[0:1], 0
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX90A-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x0
; GFX90A-NEXT: v_mov_b32_e32 v6, 0
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[4:5], s[4:5] op_sel:[0,1]
@@ -1112,14 +1112,14 @@ define amdgpu_kernel void @global_atomic_fadd_f64_noret_pat(ptr addrspace(1) %pt
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: v_add_f64 v[0:1], v[2:3], v[4:5]
; GFX90A-NEXT: buffer_wbl2
-; GFX90A-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[0:1] glc
+; GFX90A-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[2:3] glc
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: buffer_invl2
; GFX90A-NEXT: buffer_wbinvl1_vol
; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
-; GFX90A-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
+; GFX90A-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[0:1] op_sel:[0,1]
-; GFX90A-NEXT: s_andn2_b64 exec, exec, s[2:3]
+; GFX90A-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX90A-NEXT: s_cbranch_execnz .LBB39_2
; GFX90A-NEXT: .LBB39_3:
; GFX90A-NEXT: s_endpgm
@@ -1134,14 +1134,14 @@ define amdgpu_kernel void @global_atomic_fadd_f64_noret_pat(ptr addrspace(1) %pt
; GFX940-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GFX940-NEXT: s_cbranch_execz .LBB39_2
; GFX940-NEXT: ; %bb.1:
-; GFX940-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX940-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
-; GFX940-NEXT: v_cvt_f64_u32_e32 v[0:1], s2
+; GFX940-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; GFX940-NEXT: s_bcnt1_i32_b64 s0, s[2:3]
+; GFX940-NEXT: v_cvt_f64_u32_e32 v[0:1], s0
; GFX940-NEXT: v_mul_f64 v[0:1], v[0:1], 4.0
; GFX940-NEXT: v_mov_b32_e32 v2, 0
; GFX940-NEXT: buffer_wbl2 sc0 sc1
; GFX940-NEXT: s_waitcnt lgkmcnt(0)
-; GFX940-NEXT: global_atomic_add_f64 v2, v[0:1], s[0:1] sc1
+; GFX940-NEXT: global_atomic_add_f64 v2, v[0:1], s[4:5] sc1
; GFX940-NEXT: s_waitcnt vmcnt(0)
; G...
[truncated]
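The two PatFrag predicates added in the SMInstructions.td hunk above share one decision: a multi-dword scalar load selects the constrained (_ec) pseudo exactly when its alignment is below its power-of-two-rounded size. A minimal, standalone C++ sketch of that logic; the helper names are illustrative (powerOf2Ceil stands in for LLVM's PowerOf2Ceil):

#include <cassert>
#include <cstdint>

// Stand-in for llvm::PowerOf2Ceil: round up to the next power of two.
static uint64_t powerOf2Ceil(uint64_t V) {
  uint64_t P = 1;
  while (P < V)
    P <<= 1;
  return P;
}

// Mirrors SMRDAlignedLoadPat: single-dword loads, and multi-dword loads
// whose alignment covers their power-of-two-rounded size, keep the
// ordinary pseudo.
static bool isNaturallyAligned(uint64_t SizeInBytes, uint64_t AlignInBytes) {
  return SizeInBytes <= 4 || AlignInBytes >= powerOf2Ceil(SizeInBytes);
}

// Mirrors SMRDUnalignedLoadPat: under-aligned multi-dword loads select the
// early-clobber (_ec) pseudo instead.
static bool needsEarlyClobberVariant(uint64_t SizeInBytes,
                                     uint64_t AlignInBytes) {
  return SizeInBytes > 4 && AlignInBytes < powerOf2Ceil(SizeInBytes);
}

int main() {
  assert(isNaturallyAligned(4, 4));        // s_load_dword: never constrained
  assert(isNaturallyAligned(8, 8));        // dwordx2 at align 8
  assert(needsEarlyClobberVariant(8, 4));  // dwordx2 at align 4 -> _ec
  assert(needsEarlyClobberVariant(12, 8)); // dwordx3 rounds up to 16 -> _ec
}

Since the two predicates are exact complements for multi-dword loads, exactly one pattern family applies to any given load.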
In a separate patch, we should add a verifier check that you used the correct tied version depending on whether xnack is enabled or not.
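A hedged sketch of what such a check could look like, reduced to plain standalone C++; the struct and the xnack flag are stand-ins for the real MachineInstr and subtarget queries, not code from this PR:

#include <cassert>

// Stand-in for the properties a machine verifier could inspect on a
// scalar memory load (illustrative, not LLVM's MachineInstr API).
struct SMEMLoadInfo {
  bool IsMultiDword;      // loads more than one dword
  bool DstIsEarlyClobber; // true for the constrained (_ec) pseudos
};

// Suggested rule: with xnack replay enabled, multi-dword scalar loads must
// be the constrained variant; without it, the plain variant is expected.
static bool variantMatchesXnack(const SMEMLoadInfo &Ld, bool XnackEnabled) {
  if (!Ld.IsMultiDword)
    return true; // single-dword loads have no _ec form to check
  return Ld.DstIsEarlyClobber == XnackEnabled;
}

int main() {
  assert(variantMatchesXnack({/*IsMultiDword=*/true,
                              /*DstIsEarlyClobber=*/true},
                             /*XnackEnabled=*/true));
  assert(!variantMatchesXnack({true, false}, true)); // would be flagged
}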
Another strategy that might simplify the patterns is to always select the _ec versions, and then later swap to the non-ec versions if xnack is disabled.
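A rough, standalone illustration of that alternative; the opcode names follow the patch's _ec suffix convention, but the mapping table and pass hook are hypothetical:

#include <optional>
#include <string>
#include <unordered_map>

// Hypothetical _ec -> plain opcode mapping, keyed by pseudo name.
static std::optional<std::string> unconstrainedTwin(const std::string &Op) {
  static const std::unordered_map<std::string, std::string> Map = {
      {"S_LOAD_DWORDX2_IMM_ec", "S_LOAD_DWORDX2_IMM"},
      {"S_LOAD_DWORDX4_IMM_ec", "S_LOAD_DWORDX4_IMM"},
  };
  auto It = Map.find(Op);
  if (It == Map.end())
    return std::nullopt;
  return It->second;
}

// Selection always emits _ec; a late pass relaxes it when xnack replay is
// off, dropping the early-clobber restriction on the dst.
static std::string finalOpcode(const std::string &Op, bool XnackEnabled) {
  if (!XnackEnabled)
    if (auto Plain = unconstrainedTwin(Op))
      return *Plain;
  return Op;
}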
Ping

Ping
LGTM.
The latest patch further optimizes the PatFrag and the patterns by using OtherPredicates. The lit test changes in the latest patch come from a missed optimization I incorrectly introduced earlier in this PR for GFX7; it is now fixed and matches the default behavior of the current compiler.
Merge activity
For targets that support the xnack replay feature (gfx8+), multi-dword scalar loads shouldn't clobber any register that holds the src address. The constrained versions of the scalar loads have the early-clobber flag attached to the dst operand to restrict RA from re-allocating any of the src regs for its dst operand.

Differential Revision: https://phabricator.intern.facebook.com/D60251360
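The register shuffling in the lit tests above (e.g. a dwordx2 result moving from s[0:1] to s[2:3] when the base address lives in s[0:1]) is the visible effect of that early-clobber constraint. A standalone sketch of the overlap rule the allocator must now honor, with made-up range types:

#include <cassert>

struct RegRange {
  unsigned First, Last; // inclusive SGPR indices, e.g. {0, 1} for s[0:1]
};

static bool overlaps(RegRange A, RegRange B) {
  return A.First <= B.Last && B.First <= A.Last;
}

// With an early-clobber dst, the allocator may not give the destination
// any register that overlaps a source: the base address must survive an
// xnack replay of the load.
static bool earlyClobberDstIsLegal(RegRange Dst, RegRange SrcBase) {
  return !overlaps(Dst, SrcBase);
}

int main() {
  RegRange Base{0, 1}; // address held in s[0:1]
  assert(!earlyClobberDstIsLegal({0, 1}, Base)); // s_load_dwordx2 s[0:1], s[0:1] rejected
  assert(earlyClobberDstIsLegal({2, 3}, Base));  // s_load_dwordx2 s[2:3], s[0:1] fine
}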