Skip to content

Commit 229e118

Browse files
authored
[AMDGPU] Codegen support for constrained multi-dword sloads (#96163)
For targets that support the XNACK replay feature (gfx8+), multi-dword scalar loads must not clobber any register that holds the source address. The constrained versions of the scalar loads have the early-clobber flag attached to the dst operand, which prevents the register allocator from reusing any of the src registers for the dst operand.
1 parent 7d0a584 commit 229e118

File tree

108 files changed

+7267
-6490
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

108 files changed

+7267
-6490
lines changed

llvm/lib/Target/AMDGPU/AMDGPU.td

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2026,6 +2026,8 @@ def HasAddNoCarryInsts : Predicate<"Subtarget->hasAddNoCarry()">,
20262026

20272027
def NotHasAddNoCarryInsts : Predicate<"!Subtarget->hasAddNoCarry()">;
20282028

2029+
def HasXNACKEnabled : Predicate<"Subtarget->isXNACKEnabled()">;
2030+
20292031
def Has16BitInsts : Predicate<"Subtarget->has16BitInsts()">,
20302032
AssemblerPredicate<(all_of Feature16BitInsts)>;
20312033

llvm/lib/Target/AMDGPU/SMInstructions.td

Lines changed: 50 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -866,45 +866,74 @@ def SMRDBufferImm : ComplexPattern<iPTR, 1, "SelectSMRDBufferImm">;
866866
def SMRDBufferImm32 : ComplexPattern<iPTR, 1, "SelectSMRDBufferImm32">;
867867
def SMRDBufferSgprImm : ComplexPattern<iPTR, 2, "SelectSMRDBufferSgprImm">;
868868

869-
multiclass SMRD_Pattern <string Instr, ValueType vt, bit immci = true> {
869+
class SMRDAlignedLoadPat<PatFrag Op> : PatFrag <(ops node:$ptr), (Op node:$ptr), [{
870+
// Returns true if it is a single dword load or naturally aligned multi-dword load.
871+
LoadSDNode *Ld = cast<LoadSDNode>(N);
872+
unsigned Size = Ld->getMemoryVT().getStoreSize();
873+
return Size <= 4 || Ld->getAlign().value() >= Size;
874+
}]> {
875+
let GISelPredicateCode = [{
876+
auto &Ld = cast<GLoad>(MI);
877+
TypeSize Size = Ld.getMMO().getSize().getValue();
878+
return Size <= 4 || Ld.getMMO().getAlign().value() >= Size;
879+
}];
880+
}
881+
882+
def aligned_smrd_load : SMRDAlignedLoadPat<smrd_load>;
870883

884+
multiclass SMRD_Patterns <string Instr, ValueType vt, PatFrag frag,
885+
bit immci = true, string suffix = ""> {
871886
// 1. IMM offset
872887
def : GCNPat <
873-
(smrd_load (SMRDImm i64:$sbase, i32:$offset)),
874-
(vt (!cast<SM_Pseudo>(Instr#"_IMM") $sbase, $offset, 0))
875-
>;
888+
(frag (SMRDImm i64:$sbase, i32:$offset)),
889+
(vt (!cast<SM_Pseudo>(Instr#"_IMM"#suffix) $sbase, $offset, 0))>;
876890

877891
// 2. 32-bit IMM offset on CI
878892
if immci then def : GCNPat <
879-
(smrd_load (SMRDImm32 i64:$sbase, i32:$offset)),
880-
(vt (!cast<InstSI>(Instr#"_IMM_ci") $sbase, $offset, 0))> {
881-
let OtherPredicates = [isGFX7Only];
893+
(frag (SMRDImm32 i64:$sbase, i32:$offset)),
894+
(vt (!cast<InstSI>(Instr#"_IMM_ci"#suffix) $sbase, $offset, 0))> {
895+
let SubtargetPredicate = isGFX7Only;
882896
}
883897

884898
// 3. SGPR offset
885899
def : GCNPat <
886-
(smrd_load (SMRDSgpr i64:$sbase, i32:$soffset)),
887-
(vt (!cast<SM_Pseudo>(Instr#"_SGPR") $sbase, $soffset, 0))> {
888-
let OtherPredicates = [isNotGFX9Plus];
900+
(frag (SMRDSgpr i64:$sbase, i32:$soffset)),
901+
(vt (!cast<SM_Pseudo>(Instr#"_SGPR"#suffix) $sbase, $soffset, 0))> {
902+
let SubtargetPredicate = isNotGFX9Plus;
889903
}
890904
def : GCNPat <
891-
(smrd_load (SMRDSgpr i64:$sbase, i32:$soffset)),
892-
(vt (!cast<SM_Pseudo>(Instr#"_SGPR_IMM") $sbase, $soffset, 0, 0))> {
893-
let OtherPredicates = [isGFX9Plus];
905+
(frag (SMRDSgpr i64:$sbase, i32:$soffset)),
906+
(vt (!cast<SM_Pseudo>(Instr#"_SGPR_IMM"#suffix) $sbase, $soffset, 0, 0))> {
907+
let SubtargetPredicate = isGFX9Plus;
894908
}
895909

896910
// 4. SGPR+IMM offset
897911
def : GCNPat <
898-
(smrd_load (SMRDSgprImm i64:$sbase, i32:$soffset, i32:$offset)),
899-
(vt (!cast<SM_Pseudo>(Instr#"_SGPR_IMM") $sbase, $soffset, $offset, 0))> {
900-
let OtherPredicates = [isGFX9Plus];
912+
(frag (SMRDSgprImm i64:$sbase, i32:$soffset, i32:$offset)),
913+
(vt (!cast<SM_Pseudo>(Instr#"_SGPR_IMM"#suffix) $sbase, $soffset, $offset, 0))> {
914+
let SubtargetPredicate = isGFX9Plus;
901915
}
902916

903917
// 5. No offset
904918
def : GCNPat <
905-
(vt (smrd_load (i64 SReg_64:$sbase))),
906-
(vt (!cast<SM_Pseudo>(Instr#"_IMM") i64:$sbase, 0, 0))
907-
>;
919+
(vt (frag (i64 SReg_64:$sbase))),
920+
(vt (!cast<SM_Pseudo>(Instr#"_IMM"#suffix) i64:$sbase, 0, 0))>;
921+
}
922+
923+
multiclass SMRD_Pattern <string Instr, ValueType vt, bit immci = true> {
924+
// High priority when XNACK is enabled and the load was naturally aligned.
925+
let OtherPredicates = [HasXNACKEnabled], AddedComplexity = 102 in
926+
defm: SMRD_Patterns <Instr, vt, aligned_smrd_load, immci>;
927+
928+
// XNACK is enabled and the load wasn't naturally aligned. The constrained sload variant.
929+
if !gt(vt.Size, 32) then {
930+
let OtherPredicates = [HasXNACKEnabled], AddedComplexity = 101 in
931+
defm: SMRD_Patterns <Instr, vt, smrd_load, /*immci=*/false, /*suffix=*/"_ec">;
932+
}
933+
934+
// XNACK is disabled.
935+
let AddedComplexity = 100 in
936+
defm: SMRD_Patterns <Instr, vt, smrd_load, immci>;
908937
}
909938

910939
multiclass SMLoad_Pattern <string Instr, ValueType vt, bit immci = true> {
@@ -1018,6 +1047,8 @@ defm : ScalarBufferLoadIntrinsicPat <SIsbuffer_load_ubyte, "S_BUFFER_LOAD_U8">;
10181047
defm : ScalarBufferLoadIntrinsicPat <SIsbuffer_load_short, "S_BUFFER_LOAD_I16">;
10191048
defm : ScalarBufferLoadIntrinsicPat <SIsbuffer_load_ushort, "S_BUFFER_LOAD_U16">;
10201049

1050+
} // End let AddedComplexity = 100
1051+
10211052
foreach vt = Reg32Types.types in {
10221053
defm : SMRD_Pattern <"S_LOAD_DWORD", vt>;
10231054
}
@@ -1042,7 +1073,6 @@ foreach vt = SReg_512.RegTypes in {
10421073
defm : SMRD_Pattern <"S_LOAD_DWORDX16", vt>;
10431074
}
10441075

1045-
} // End let AddedComplexity = 100
10461076

10471077
defm : SMLoad_Pattern <"S_BUFFER_LOAD_DWORD", i32>;
10481078
defm : SMLoad_Pattern <"S_BUFFER_LOAD_DWORDX2", v2i32>;

llvm/test/CodeGen/AMDGPU/GlobalISel/fp64-atomics-gfx90a.ll

Lines changed: 50 additions & 50 deletions
Original file line numberDiff line numberDiff line change
@@ -1021,20 +1021,20 @@ main_body:
10211021
define amdgpu_kernel void @global_atomic_fadd_f64_noret(ptr addrspace(1) %ptr, double %data) {
10221022
; GFX90A-LABEL: global_atomic_fadd_f64_noret:
10231023
; GFX90A: ; %bb.0: ; %main_body
1024-
; GFX90A-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24
1024+
; GFX90A-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
10251025
; GFX90A-NEXT: v_mov_b32_e32 v2, 0
10261026
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
1027-
; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
1028-
; GFX90A-NEXT: global_atomic_add_f64 v2, v[0:1], s[0:1]
1027+
; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
1028+
; GFX90A-NEXT: global_atomic_add_f64 v2, v[0:1], s[4:5]
10291029
; GFX90A-NEXT: s_endpgm
10301030
;
10311031
; GFX940-LABEL: global_atomic_fadd_f64_noret:
10321032
; GFX940: ; %bb.0: ; %main_body
1033-
; GFX940-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24
1033+
; GFX940-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
10341034
; GFX940-NEXT: v_mov_b32_e32 v2, 0
10351035
; GFX940-NEXT: s_waitcnt lgkmcnt(0)
1036-
; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
1037-
; GFX940-NEXT: global_atomic_add_f64 v2, v[0:1], s[0:1]
1036+
; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[6:7]
1037+
; GFX940-NEXT: global_atomic_add_f64 v2, v[0:1], s[4:5]
10381038
; GFX940-NEXT: s_endpgm
10391039
main_body:
10401040
%ret = call double @llvm.amdgcn.global.atomic.fadd.f64.p1.f64(ptr addrspace(1) %ptr, double %data)
@@ -1044,20 +1044,20 @@ main_body:
10441044
define amdgpu_kernel void @global_atomic_fmin_f64_noret(ptr addrspace(1) %ptr, double %data) {
10451045
; GFX90A-LABEL: global_atomic_fmin_f64_noret:
10461046
; GFX90A: ; %bb.0: ; %main_body
1047-
; GFX90A-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24
1047+
; GFX90A-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
10481048
; GFX90A-NEXT: v_mov_b32_e32 v2, 0
10491049
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
1050-
; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
1051-
; GFX90A-NEXT: global_atomic_min_f64 v2, v[0:1], s[0:1]
1050+
; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
1051+
; GFX90A-NEXT: global_atomic_min_f64 v2, v[0:1], s[4:5]
10521052
; GFX90A-NEXT: s_endpgm
10531053
;
10541054
; GFX940-LABEL: global_atomic_fmin_f64_noret:
10551055
; GFX940: ; %bb.0: ; %main_body
1056-
; GFX940-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24
1056+
; GFX940-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
10571057
; GFX940-NEXT: v_mov_b32_e32 v2, 0
10581058
; GFX940-NEXT: s_waitcnt lgkmcnt(0)
1059-
; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
1060-
; GFX940-NEXT: global_atomic_min_f64 v2, v[0:1], s[0:1]
1059+
; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[6:7]
1060+
; GFX940-NEXT: global_atomic_min_f64 v2, v[0:1], s[4:5]
10611061
; GFX940-NEXT: s_endpgm
10621062
main_body:
10631063
%ret = call double @llvm.amdgcn.global.atomic.fmin.f64.p1.f64(ptr addrspace(1) %ptr, double %data)
@@ -1067,20 +1067,20 @@ main_body:
10671067
define amdgpu_kernel void @global_atomic_fmax_f64_noret(ptr addrspace(1) %ptr, double %data) {
10681068
; GFX90A-LABEL: global_atomic_fmax_f64_noret:
10691069
; GFX90A: ; %bb.0: ; %main_body
1070-
; GFX90A-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24
1070+
; GFX90A-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
10711071
; GFX90A-NEXT: v_mov_b32_e32 v2, 0
10721072
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
1073-
; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
1074-
; GFX90A-NEXT: global_atomic_max_f64 v2, v[0:1], s[0:1]
1073+
; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
1074+
; GFX90A-NEXT: global_atomic_max_f64 v2, v[0:1], s[4:5]
10751075
; GFX90A-NEXT: s_endpgm
10761076
;
10771077
; GFX940-LABEL: global_atomic_fmax_f64_noret:
10781078
; GFX940: ; %bb.0: ; %main_body
1079-
; GFX940-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24
1079+
; GFX940-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
10801080
; GFX940-NEXT: v_mov_b32_e32 v2, 0
10811081
; GFX940-NEXT: s_waitcnt lgkmcnt(0)
1082-
; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
1083-
; GFX940-NEXT: global_atomic_max_f64 v2, v[0:1], s[0:1]
1082+
; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[6:7]
1083+
; GFX940-NEXT: global_atomic_max_f64 v2, v[0:1], s[4:5]
10841084
; GFX940-NEXT: s_endpgm
10851085
main_body:
10861086
%ret = call double @llvm.amdgcn.global.atomic.fmax.f64.p1.f64(ptr addrspace(1) %ptr, double %data)
@@ -1134,14 +1134,14 @@ define amdgpu_kernel void @global_atomic_fadd_f64_noret_pat(ptr addrspace(1) %pt
11341134
; GFX940-NEXT: s_and_saveexec_b64 s[4:5], vcc
11351135
; GFX940-NEXT: s_cbranch_execz .LBB39_2
11361136
; GFX940-NEXT: ; %bb.1:
1137-
; GFX940-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x24
1137+
; GFX940-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24
11381138
; GFX940-NEXT: s_bcnt1_i32_b64 s0, s[0:1]
11391139
; GFX940-NEXT: v_cvt_f64_u32_e32 v[0:1], s0
11401140
; GFX940-NEXT: v_mul_f64 v[0:1], v[0:1], 4.0
11411141
; GFX940-NEXT: v_mov_b32_e32 v2, 0
11421142
; GFX940-NEXT: buffer_wbl2 sc0 sc1
11431143
; GFX940-NEXT: s_waitcnt lgkmcnt(0)
1144-
; GFX940-NEXT: global_atomic_add_f64 v2, v[0:1], s[2:3] sc1
1144+
; GFX940-NEXT: global_atomic_add_f64 v2, v[0:1], s[4:5] sc1
11451145
; GFX940-NEXT: s_waitcnt vmcnt(0)
11461146
; GFX940-NEXT: buffer_inv sc0 sc1
11471147
; GFX940-NEXT: .LBB39_2:
@@ -1162,13 +1162,13 @@ define amdgpu_kernel void @global_atomic_fadd_f64_noret_pat_agent(ptr addrspace(
11621162
; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc
11631163
; GFX90A-NEXT: s_cbranch_execz .LBB40_2
11641164
; GFX90A-NEXT: ; %bb.1:
1165-
; GFX90A-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x24
1165+
; GFX90A-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24
11661166
; GFX90A-NEXT: s_bcnt1_i32_b64 s0, s[0:1]
11671167
; GFX90A-NEXT: v_cvt_f64_u32_e32 v[0:1], s0
11681168
; GFX90A-NEXT: v_mul_f64 v[0:1], v[0:1], 4.0
11691169
; GFX90A-NEXT: v_mov_b32_e32 v2, 0
11701170
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
1171-
; GFX90A-NEXT: global_atomic_add_f64 v2, v[0:1], s[2:3]
1171+
; GFX90A-NEXT: global_atomic_add_f64 v2, v[0:1], s[4:5]
11721172
; GFX90A-NEXT: s_waitcnt vmcnt(0)
11731173
; GFX90A-NEXT: buffer_wbinvl1_vol
11741174
; GFX90A-NEXT: .LBB40_2:
@@ -1184,14 +1184,14 @@ define amdgpu_kernel void @global_atomic_fadd_f64_noret_pat_agent(ptr addrspace(
11841184
; GFX940-NEXT: s_and_saveexec_b64 s[4:5], vcc
11851185
; GFX940-NEXT: s_cbranch_execz .LBB40_2
11861186
; GFX940-NEXT: ; %bb.1:
1187-
; GFX940-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x24
1187+
; GFX940-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24
11881188
; GFX940-NEXT: s_bcnt1_i32_b64 s0, s[0:1]
11891189
; GFX940-NEXT: v_cvt_f64_u32_e32 v[0:1], s0
11901190
; GFX940-NEXT: v_mul_f64 v[0:1], v[0:1], 4.0
11911191
; GFX940-NEXT: v_mov_b32_e32 v2, 0
11921192
; GFX940-NEXT: buffer_wbl2 sc1
11931193
; GFX940-NEXT: s_waitcnt lgkmcnt(0)
1194-
; GFX940-NEXT: global_atomic_add_f64 v2, v[0:1], s[2:3]
1194+
; GFX940-NEXT: global_atomic_add_f64 v2, v[0:1], s[4:5]
11951195
; GFX940-NEXT: s_waitcnt vmcnt(0)
11961196
; GFX940-NEXT: buffer_inv sc1
11971197
; GFX940-NEXT: .LBB40_2:
@@ -1248,14 +1248,14 @@ define amdgpu_kernel void @global_atomic_fadd_f64_noret_pat_system(ptr addrspace
12481248
; GFX940-NEXT: s_and_saveexec_b64 s[4:5], vcc
12491249
; GFX940-NEXT: s_cbranch_execz .LBB41_2
12501250
; GFX940-NEXT: ; %bb.1:
1251-
; GFX940-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x24
1251+
; GFX940-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24
12521252
; GFX940-NEXT: s_bcnt1_i32_b64 s0, s[0:1]
12531253
; GFX940-NEXT: v_cvt_f64_u32_e32 v[0:1], s0
12541254
; GFX940-NEXT: v_mul_f64 v[0:1], v[0:1], 4.0
12551255
; GFX940-NEXT: v_mov_b32_e32 v2, 0
12561256
; GFX940-NEXT: buffer_wbl2 sc0 sc1
12571257
; GFX940-NEXT: s_waitcnt lgkmcnt(0)
1258-
; GFX940-NEXT: global_atomic_add_f64 v2, v[0:1], s[2:3] sc1
1258+
; GFX940-NEXT: global_atomic_add_f64 v2, v[0:1], s[4:5] sc1
12591259
; GFX940-NEXT: s_waitcnt vmcnt(0)
12601260
; GFX940-NEXT: buffer_inv sc0 sc1
12611261
; GFX940-NEXT: .LBB41_2:
@@ -1276,13 +1276,13 @@ define amdgpu_kernel void @global_atomic_fadd_f64_noret_pat_flush(ptr addrspace(
12761276
; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc
12771277
; GFX90A-NEXT: s_cbranch_execz .LBB42_2
12781278
; GFX90A-NEXT: ; %bb.1:
1279-
; GFX90A-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x24
1279+
; GFX90A-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24
12801280
; GFX90A-NEXT: s_bcnt1_i32_b64 s0, s[0:1]
12811281
; GFX90A-NEXT: v_cvt_f64_u32_e32 v[0:1], s0
12821282
; GFX90A-NEXT: v_mul_f64 v[0:1], v[0:1], 4.0
12831283
; GFX90A-NEXT: v_mov_b32_e32 v2, 0
12841284
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
1285-
; GFX90A-NEXT: global_atomic_add_f64 v2, v[0:1], s[2:3]
1285+
; GFX90A-NEXT: global_atomic_add_f64 v2, v[0:1], s[4:5]
12861286
; GFX90A-NEXT: s_waitcnt vmcnt(0)
12871287
; GFX90A-NEXT: buffer_wbinvl1_vol
12881288
; GFX90A-NEXT: .LBB42_2:
@@ -1298,14 +1298,14 @@ define amdgpu_kernel void @global_atomic_fadd_f64_noret_pat_flush(ptr addrspace(
12981298
; GFX940-NEXT: s_and_saveexec_b64 s[4:5], vcc
12991299
; GFX940-NEXT: s_cbranch_execz .LBB42_2
13001300
; GFX940-NEXT: ; %bb.1:
1301-
; GFX940-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x24
1301+
; GFX940-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24
13021302
; GFX940-NEXT: s_bcnt1_i32_b64 s0, s[0:1]
13031303
; GFX940-NEXT: v_cvt_f64_u32_e32 v[0:1], s0
13041304
; GFX940-NEXT: v_mul_f64 v[0:1], v[0:1], 4.0
13051305
; GFX940-NEXT: v_mov_b32_e32 v2, 0
13061306
; GFX940-NEXT: buffer_wbl2 sc1
13071307
; GFX940-NEXT: s_waitcnt lgkmcnt(0)
1308-
; GFX940-NEXT: global_atomic_add_f64 v2, v[0:1], s[2:3]
1308+
; GFX940-NEXT: global_atomic_add_f64 v2, v[0:1], s[4:5]
13091309
; GFX940-NEXT: s_waitcnt vmcnt(0)
13101310
; GFX940-NEXT: buffer_inv sc1
13111311
; GFX940-NEXT: .LBB42_2:
@@ -1522,14 +1522,14 @@ define amdgpu_kernel void @global_atomic_fadd_f64_noret_pat_agent_safe(ptr addrs
15221522
; GFX940-NEXT: s_and_saveexec_b64 s[4:5], vcc
15231523
; GFX940-NEXT: s_cbranch_execz .LBB49_2
15241524
; GFX940-NEXT: ; %bb.1:
1525-
; GFX940-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x24
1525+
; GFX940-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24
15261526
; GFX940-NEXT: s_bcnt1_i32_b64 s0, s[0:1]
15271527
; GFX940-NEXT: v_cvt_f64_u32_e32 v[0:1], s0
15281528
; GFX940-NEXT: v_mul_f64 v[0:1], v[0:1], 4.0
15291529
; GFX940-NEXT: v_mov_b32_e32 v2, 0
15301530
; GFX940-NEXT: buffer_wbl2 sc1
15311531
; GFX940-NEXT: s_waitcnt lgkmcnt(0)
1532-
; GFX940-NEXT: global_atomic_add_f64 v2, v[0:1], s[2:3]
1532+
; GFX940-NEXT: global_atomic_add_f64 v2, v[0:1], s[4:5]
15331533
; GFX940-NEXT: s_waitcnt vmcnt(0)
15341534
; GFX940-NEXT: buffer_inv sc1
15351535
; GFX940-NEXT: .LBB49_2:
@@ -1761,19 +1761,19 @@ main_body:
17611761
define amdgpu_kernel void @flat_atomic_fadd_f64_noret(ptr %ptr, double %data) {
17621762
; GFX90A-LABEL: flat_atomic_fadd_f64_noret:
17631763
; GFX90A: ; %bb.0: ; %main_body
1764-
; GFX90A-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24
1764+
; GFX90A-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
17651765
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
1766-
; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
1767-
; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
1766+
; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
1767+
; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[6:7], s[6:7] op_sel:[0,1]
17681768
; GFX90A-NEXT: flat_atomic_add_f64 v[0:1], v[2:3]
17691769
; GFX90A-NEXT: s_endpgm
17701770
;
17711771
; GFX940-LABEL: flat_atomic_fadd_f64_noret:
17721772
; GFX940: ; %bb.0: ; %main_body
1773-
; GFX940-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24
1773+
; GFX940-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
17741774
; GFX940-NEXT: s_waitcnt lgkmcnt(0)
1775-
; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
1776-
; GFX940-NEXT: v_mov_b64_e32 v[2:3], s[2:3]
1775+
; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[4:5]
1776+
; GFX940-NEXT: v_mov_b64_e32 v[2:3], s[6:7]
17771777
; GFX940-NEXT: flat_atomic_add_f64 v[0:1], v[2:3]
17781778
; GFX940-NEXT: s_endpgm
17791779
main_body:
@@ -1842,19 +1842,19 @@ main_body:
18421842
define amdgpu_kernel void @flat_atomic_fmin_f64_noret(ptr %ptr, double %data) {
18431843
; GFX90A-LABEL: flat_atomic_fmin_f64_noret:
18441844
; GFX90A: ; %bb.0: ; %main_body
1845-
; GFX90A-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24
1845+
; GFX90A-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
18461846
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
1847-
; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
1848-
; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
1847+
; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
1848+
; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[6:7], s[6:7] op_sel:[0,1]
18491849
; GFX90A-NEXT: flat_atomic_min_f64 v[0:1], v[2:3]
18501850
; GFX90A-NEXT: s_endpgm
18511851
;
18521852
; GFX940-LABEL: flat_atomic_fmin_f64_noret:
18531853
; GFX940: ; %bb.0: ; %main_body
1854-
; GFX940-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24
1854+
; GFX940-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
18551855
; GFX940-NEXT: s_waitcnt lgkmcnt(0)
1856-
; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
1857-
; GFX940-NEXT: v_mov_b64_e32 v[2:3], s[2:3]
1856+
; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[4:5]
1857+
; GFX940-NEXT: v_mov_b64_e32 v[2:3], s[6:7]
18581858
; GFX940-NEXT: flat_atomic_min_f64 v[0:1], v[2:3]
18591859
; GFX940-NEXT: s_endpgm
18601860
main_body:
@@ -1884,19 +1884,19 @@ main_body:
18841884
define amdgpu_kernel void @flat_atomic_fmax_f64_noret(ptr %ptr, double %data) {
18851885
; GFX90A-LABEL: flat_atomic_fmax_f64_noret:
18861886
; GFX90A: ; %bb.0: ; %main_body
1887-
; GFX90A-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24
1887+
; GFX90A-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
18881888
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
1889-
; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
1890-
; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
1889+
; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
1890+
; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[6:7], s[6:7] op_sel:[0,1]
18911891
; GFX90A-NEXT: flat_atomic_max_f64 v[0:1], v[2:3]
18921892
; GFX90A-NEXT: s_endpgm
18931893
;
18941894
; GFX940-LABEL: flat_atomic_fmax_f64_noret:
18951895
; GFX940: ; %bb.0: ; %main_body
1896-
; GFX940-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24
1896+
; GFX940-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
18971897
; GFX940-NEXT: s_waitcnt lgkmcnt(0)
1898-
; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
1899-
; GFX940-NEXT: v_mov_b64_e32 v[2:3], s[2:3]
1898+
; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[4:5]
1899+
; GFX940-NEXT: v_mov_b64_e32 v[2:3], s[6:7]
19001900
; GFX940-NEXT: flat_atomic_max_f64 v[0:1], v[2:3]
19011901
; GFX940-NEXT: s_endpgm
19021902
main_body:

0 commit comments

Comments
 (0)