Skip to content

Commit e7e6cbc

Browse files
committed
Used byte width and simplified some more code.
1 parent e76cf18 commit e7e6cbc

Some content is hidden

Large commits have some content hidden by default. Use the search box below to find content that may be hidden.

56 files changed

+1920
-1916
lines changed

llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp

Lines changed: 12 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -1234,11 +1234,8 @@ void SILoadStoreOptimizer::copyToDestRegs(
12341234
// The constrained sload instructions in S_LOAD_IMM class will have
12351235
// `early-clobber` flag in the dst operand. Remove the flag before using the
12361236
// MOs in copies.
1237-
if (Dest0->isEarlyClobber())
1238-
Dest0->setIsEarlyClobber(false);
1239-
1240-
if (Dest1->isEarlyClobber())
1241-
Dest1->setIsEarlyClobber(false);
1237+
Dest0->setIsEarlyClobber(false);
1238+
Dest1->setIsEarlyClobber(false);
12421239

12431240
BuildMI(*MBB, InsertBefore, DL, CopyDesc)
12441241
.add(*Dest0) // Copy to same destination including flags and sub reg.
@@ -1729,24 +1726,23 @@ unsigned SILoadStoreOptimizer::getNewOpcode(const CombineInfo &CI,
17291726
// If XNACK is enabled, use the constrained opcodes when the first load is
17301727
// under-aligned.
17311728
const MachineMemOperand *MMO = *CI.I->memoperands_begin();
1732-
auto NeedsConstrainedOpc = [&MMO, Width](const GCNSubtarget &ST) {
1733-
return ST.isXNACKEnabled() && MMO->getAlign().value() < Width;
1734-
};
1729+
bool NeedsConstrainedOpc =
1730+
STM->isXNACKEnabled() && MMO->getAlign().value() < (Width << 2);
17351731
switch (Width) {
17361732
default:
17371733
return 0;
17381734
case 2:
1739-
return NeedsConstrainedOpc(*STM) ? AMDGPU::S_LOAD_DWORDX2_IMM_ec
1740-
: AMDGPU::S_LOAD_DWORDX2_IMM;
1735+
return NeedsConstrainedOpc ? AMDGPU::S_LOAD_DWORDX2_IMM_ec
1736+
: AMDGPU::S_LOAD_DWORDX2_IMM;
17411737
case 3:
1742-
return NeedsConstrainedOpc(*STM) ? AMDGPU::S_LOAD_DWORDX3_IMM_ec
1743-
: AMDGPU::S_LOAD_DWORDX3_IMM;
1738+
return NeedsConstrainedOpc ? AMDGPU::S_LOAD_DWORDX3_IMM_ec
1739+
: AMDGPU::S_LOAD_DWORDX3_IMM;
17441740
case 4:
1745-
return NeedsConstrainedOpc(*STM) ? AMDGPU::S_LOAD_DWORDX4_IMM_ec
1746-
: AMDGPU::S_LOAD_DWORDX4_IMM;
1741+
return NeedsConstrainedOpc ? AMDGPU::S_LOAD_DWORDX4_IMM_ec
1742+
: AMDGPU::S_LOAD_DWORDX4_IMM;
17471743
case 8:
1748-
return NeedsConstrainedOpc(*STM) ? AMDGPU::S_LOAD_DWORDX8_IMM_ec
1749-
: AMDGPU::S_LOAD_DWORDX8_IMM;
1744+
return NeedsConstrainedOpc ? AMDGPU::S_LOAD_DWORDX8_IMM_ec
1745+
: AMDGPU::S_LOAD_DWORDX8_IMM;
17501746
}
17511747
}
17521748
case GLOBAL_LOAD:

llvm/test/CodeGen/AMDGPU/GlobalISel/fp-atomics-gfx940.ll

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -159,10 +159,10 @@ define <2 x i16> @global_atomic_fadd_v2bf16_rtn(ptr addrspace(1) %ptr, <2 x i16>
159159
define amdgpu_kernel void @local_atomic_fadd_v2f16_noret(ptr addrspace(3) %ptr, <2 x half> %data) {
160160
; GFX940-LABEL: local_atomic_fadd_v2f16_noret:
161161
; GFX940: ; %bb.0:
162-
; GFX940-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
162+
; GFX940-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
163163
; GFX940-NEXT: s_waitcnt lgkmcnt(0)
164-
; GFX940-NEXT: v_mov_b32_e32 v0, s0
165-
; GFX940-NEXT: v_mov_b32_e32 v1, s1
164+
; GFX940-NEXT: v_mov_b32_e32 v0, s2
165+
; GFX940-NEXT: v_mov_b32_e32 v1, s3
166166
; GFX940-NEXT: ds_pk_add_f16 v0, v1
167167
; GFX940-NEXT: s_endpgm
168168
%ret = call <2 x half> @llvm.amdgcn.ds.fadd.v2f16(ptr addrspace(3) %ptr, <2 x half> %data, i32 0, i32 0, i1 0)
@@ -183,10 +183,10 @@ define <2 x half> @local_atomic_fadd_v2f16_rtn(ptr addrspace(3) %ptr, <2 x half>
183183
define amdgpu_kernel void @local_atomic_fadd_v2bf16_noret(ptr addrspace(3) %ptr, <2 x i16> %data) {
184184
; GFX940-LABEL: local_atomic_fadd_v2bf16_noret:
185185
; GFX940: ; %bb.0:
186-
; GFX940-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
186+
; GFX940-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
187187
; GFX940-NEXT: s_waitcnt lgkmcnt(0)
188-
; GFX940-NEXT: v_mov_b32_e32 v0, s1
189-
; GFX940-NEXT: v_mov_b32_e32 v1, s0
188+
; GFX940-NEXT: v_mov_b32_e32 v0, s3
189+
; GFX940-NEXT: v_mov_b32_e32 v1, s2
190190
; GFX940-NEXT: buffer_wbl2 sc0 sc1
191191
; GFX940-NEXT: ds_pk_add_bf16 v1, v0
192192
; GFX940-NEXT: s_waitcnt lgkmcnt(0)

llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.update.dpp.ll

Lines changed: 14 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -19,13 +19,13 @@ define amdgpu_kernel void @dpp_test(ptr addrspace(1) %out, i32 %in1, i32 %in2) {
1919
;
2020
; GFX10-LABEL: dpp_test:
2121
; GFX10: ; %bb.0:
22-
; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
22+
; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
2323
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
24-
; GFX10-NEXT: v_mov_b32_e32 v0, s2
25-
; GFX10-NEXT: v_mov_b32_e32 v1, s3
24+
; GFX10-NEXT: v_mov_b32_e32 v0, s6
25+
; GFX10-NEXT: v_mov_b32_e32 v1, s7
2626
; GFX10-NEXT: v_mov_b32_dpp v0, v1 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1
2727
; GFX10-NEXT: v_mov_b32_e32 v1, 0
28-
; GFX10-NEXT: global_store_dword v1, v0, s[0:1]
28+
; GFX10-NEXT: global_store_dword v1, v0, s[4:5]
2929
; GFX10-NEXT: s_endpgm
3030
;
3131
; GFX11-LABEL: dpp_test:
@@ -174,16 +174,16 @@ define amdgpu_kernel void @update_dppv2i32_test(ptr addrspace(1) %arg, <2 x i32>
174174
;
175175
; GFX10-LABEL: update_dppv2i32_test:
176176
; GFX10: ; %bb.0:
177-
; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
177+
; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
178178
; GFX10-NEXT: v_lshlrev_b32_e32 v4, 3, v0
179179
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
180-
; GFX10-NEXT: global_load_dwordx2 v[0:1], v4, s[0:1]
181-
; GFX10-NEXT: v_mov_b32_e32 v2, s2
182-
; GFX10-NEXT: v_mov_b32_e32 v3, s3
180+
; GFX10-NEXT: global_load_dwordx2 v[0:1], v4, s[4:5]
181+
; GFX10-NEXT: v_mov_b32_e32 v2, s6
182+
; GFX10-NEXT: v_mov_b32_e32 v3, s7
183183
; GFX10-NEXT: s_waitcnt vmcnt(0)
184184
; GFX10-NEXT: v_mov_b32_dpp v2, v0 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1
185185
; GFX10-NEXT: v_mov_b32_dpp v3, v1 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1
186-
; GFX10-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1]
186+
; GFX10-NEXT: global_store_dwordx2 v4, v[2:3], s[4:5]
187187
; GFX10-NEXT: s_endpgm
188188
;
189189
; GFX11-LABEL: update_dppv2i32_test:
@@ -229,16 +229,16 @@ define amdgpu_kernel void @update_dppv2f32_test(ptr addrspace(1) %arg, <2 x floa
229229
;
230230
; GFX10-LABEL: update_dppv2f32_test:
231231
; GFX10: ; %bb.0:
232-
; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
232+
; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
233233
; GFX10-NEXT: v_lshlrev_b32_e32 v4, 3, v0
234234
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
235-
; GFX10-NEXT: global_load_dwordx2 v[0:1], v4, s[0:1]
236-
; GFX10-NEXT: v_mov_b32_e32 v2, s2
237-
; GFX10-NEXT: v_mov_b32_e32 v3, s3
235+
; GFX10-NEXT: global_load_dwordx2 v[0:1], v4, s[4:5]
236+
; GFX10-NEXT: v_mov_b32_e32 v2, s6
237+
; GFX10-NEXT: v_mov_b32_e32 v3, s7
238238
; GFX10-NEXT: s_waitcnt vmcnt(0)
239239
; GFX10-NEXT: v_mov_b32_dpp v2, v0 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1
240240
; GFX10-NEXT: v_mov_b32_dpp v3, v1 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1
241-
; GFX10-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1]
241+
; GFX10-NEXT: global_store_dwordx2 v4, v[2:3], s[4:5]
242242
; GFX10-NEXT: s_endpgm
243243
;
244244
; GFX11-LABEL: update_dppv2f32_test:

llvm/test/CodeGen/AMDGPU/GlobalISel/sdivrem.ll

Lines changed: 76 additions & 76 deletions
Original file line numberDiff line numberDiff line change
@@ -692,148 +692,148 @@ define amdgpu_kernel void @sdivrem_v2i32(ptr addrspace(1) %out0, ptr addrspace(1
692692
;
693693
; GFX9-LABEL: sdivrem_v2i32:
694694
; GFX9: ; %bb.0:
695-
; GFX9-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x0
695+
; GFX9-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x0
696696
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
697-
; GFX9-NEXT: s_ashr_i32 s8, s6, 31
698-
; GFX9-NEXT: s_add_i32 s6, s6, s8
699-
; GFX9-NEXT: s_xor_b32 s6, s6, s8
700-
; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s6
701-
; GFX9-NEXT: s_ashr_i32 s9, s7, 31
702-
; GFX9-NEXT: s_add_i32 s7, s7, s9
703-
; GFX9-NEXT: s_xor_b32 s7, s7, s9
697+
; GFX9-NEXT: s_ashr_i32 s0, s14, 31
698+
; GFX9-NEXT: s_add_i32 s1, s14, s0
699+
; GFX9-NEXT: s_xor_b32 s1, s1, s0
700+
; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s1
701+
; GFX9-NEXT: s_ashr_i32 s2, s15, 31
702+
; GFX9-NEXT: s_add_i32 s3, s15, s2
703+
; GFX9-NEXT: s_xor_b32 s3, s3, s2
704704
; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0
705-
; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s7
706-
; GFX9-NEXT: s_sub_i32 s12, 0, s6
707-
; GFX9-NEXT: s_ashr_i32 s10, s4, 31
705+
; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s3
706+
; GFX9-NEXT: s_sub_i32 s6, 0, s1
707+
; GFX9-NEXT: s_ashr_i32 s4, s12, 31
708708
; GFX9-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0
709709
; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0
710710
; GFX9-NEXT: v_rcp_iflag_f32_e32 v1, v1
711-
; GFX9-NEXT: s_add_i32 s4, s4, s10
712-
; GFX9-NEXT: s_xor_b32 s4, s4, s10
713-
; GFX9-NEXT: v_mul_lo_u32 v2, s12, v0
711+
; GFX9-NEXT: s_sub_i32 s7, 0, s3
712+
; GFX9-NEXT: s_ashr_i32 s5, s13, 31
713+
; GFX9-NEXT: v_mul_lo_u32 v2, s6, v0
714714
; GFX9-NEXT: v_mul_f32_e32 v1, 0x4f7ffffe, v1
715715
; GFX9-NEXT: v_cvt_u32_f32_e32 v1, v1
716-
; GFX9-NEXT: s_sub_i32 s12, 0, s7
716+
; GFX9-NEXT: s_add_i32 s6, s12, s4
717717
; GFX9-NEXT: v_mul_hi_u32 v2, v0, v2
718-
; GFX9-NEXT: s_ashr_i32 s11, s5, 31
719-
; GFX9-NEXT: v_mul_lo_u32 v3, s12, v1
720-
; GFX9-NEXT: s_add_i32 s5, s5, s11
718+
; GFX9-NEXT: s_xor_b32 s6, s6, s4
719+
; GFX9-NEXT: v_mul_lo_u32 v3, s7, v1
720+
; GFX9-NEXT: s_add_i32 s7, s13, s5
721721
; GFX9-NEXT: v_add_u32_e32 v0, v0, v2
722-
; GFX9-NEXT: v_mul_hi_u32 v0, s4, v0
722+
; GFX9-NEXT: v_mul_hi_u32 v0, s6, v0
723723
; GFX9-NEXT: v_mul_hi_u32 v2, v1, v3
724-
; GFX9-NEXT: s_xor_b32 s5, s5, s11
725-
; GFX9-NEXT: v_mul_lo_u32 v3, v0, s6
724+
; GFX9-NEXT: s_xor_b32 s7, s7, s5
725+
; GFX9-NEXT: s_xor_b32 s0, s4, s0
726+
; GFX9-NEXT: v_mul_lo_u32 v3, v0, s1
726727
; GFX9-NEXT: v_add_u32_e32 v1, v1, v2
727728
; GFX9-NEXT: v_add_u32_e32 v2, 1, v0
728-
; GFX9-NEXT: v_mul_hi_u32 v1, s5, v1
729-
; GFX9-NEXT: v_sub_u32_e32 v3, s4, v3
730-
; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s6, v3
729+
; GFX9-NEXT: v_mul_hi_u32 v1, s7, v1
730+
; GFX9-NEXT: v_sub_u32_e32 v3, s6, v3
731+
; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s1, v3
731732
; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
732-
; GFX9-NEXT: v_subrev_u32_e32 v2, s6, v3
733+
; GFX9-NEXT: v_subrev_u32_e32 v2, s1, v3
733734
; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v2, vcc
734735
; GFX9-NEXT: v_add_u32_e32 v3, 1, v0
735-
; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s6, v2
736+
; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s1, v2
736737
; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc
737-
; GFX9-NEXT: v_subrev_u32_e32 v3, s6, v2
738+
; GFX9-NEXT: v_subrev_u32_e32 v3, s1, v2
738739
; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc
739-
; GFX9-NEXT: v_mul_lo_u32 v3, v1, s7
740+
; GFX9-NEXT: v_mul_lo_u32 v3, v1, s3
740741
; GFX9-NEXT: v_add_u32_e32 v4, 1, v1
741-
; GFX9-NEXT: s_xor_b32 s4, s10, s8
742-
; GFX9-NEXT: v_xor_b32_e32 v0, s4, v0
743-
; GFX9-NEXT: v_sub_u32_e32 v3, s5, v3
744-
; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s7, v3
742+
; GFX9-NEXT: v_xor_b32_e32 v0, s0, v0
743+
; GFX9-NEXT: v_subrev_u32_e32 v0, s0, v0
744+
; GFX9-NEXT: v_sub_u32_e32 v3, s7, v3
745+
; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s3, v3
745746
; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc
746-
; GFX9-NEXT: v_subrev_u32_e32 v4, s7, v3
747+
; GFX9-NEXT: v_subrev_u32_e32 v4, s3, v3
747748
; GFX9-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc
748749
; GFX9-NEXT: v_add_u32_e32 v4, 1, v1
749-
; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s7, v3
750-
; GFX9-NEXT: v_subrev_u32_e32 v0, s4, v0
750+
; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s3, v3
751751
; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc
752-
; GFX9-NEXT: v_subrev_u32_e32 v4, s7, v3
753-
; GFX9-NEXT: s_xor_b32 s4, s11, s9
752+
; GFX9-NEXT: v_subrev_u32_e32 v4, s3, v3
753+
; GFX9-NEXT: s_xor_b32 s0, s5, s2
754754
; GFX9-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc
755-
; GFX9-NEXT: v_xor_b32_e32 v1, s4, v1
756-
; GFX9-NEXT: v_xor_b32_e32 v2, s10, v2
757-
; GFX9-NEXT: v_subrev_u32_e32 v1, s4, v1
758-
; GFX9-NEXT: v_xor_b32_e32 v3, s11, v3
755+
; GFX9-NEXT: v_xor_b32_e32 v1, s0, v1
756+
; GFX9-NEXT: v_xor_b32_e32 v2, s4, v2
757+
; GFX9-NEXT: v_subrev_u32_e32 v1, s0, v1
758+
; GFX9-NEXT: v_xor_b32_e32 v3, s5, v3
759759
; GFX9-NEXT: v_mov_b32_e32 v4, 0
760-
; GFX9-NEXT: v_subrev_u32_e32 v2, s10, v2
761-
; GFX9-NEXT: v_subrev_u32_e32 v3, s11, v3
762-
; GFX9-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1]
763-
; GFX9-NEXT: global_store_dwordx2 v4, v[2:3], s[2:3]
760+
; GFX9-NEXT: v_subrev_u32_e32 v2, s4, v2
761+
; GFX9-NEXT: v_subrev_u32_e32 v3, s5, v3
762+
; GFX9-NEXT: global_store_dwordx2 v4, v[0:1], s[8:9]
763+
; GFX9-NEXT: global_store_dwordx2 v4, v[2:3], s[10:11]
764764
; GFX9-NEXT: s_endpgm
765765
;
766766
; GFX10-LABEL: sdivrem_v2i32:
767767
; GFX10: ; %bb.0:
768-
; GFX10-NEXT: s_load_dwordx8 s[4:11], s[4:5], 0x0
768+
; GFX10-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x0
769769
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
770-
; GFX10-NEXT: s_ashr_i32 s1, s10, 31
771-
; GFX10-NEXT: s_ashr_i32 s2, s11, 31
772-
; GFX10-NEXT: s_add_i32 s0, s10, s1
773-
; GFX10-NEXT: s_add_i32 s3, s11, s2
774-
; GFX10-NEXT: s_xor_b32 s10, s0, s1
770+
; GFX10-NEXT: s_ashr_i32 s1, s14, 31
771+
; GFX10-NEXT: s_ashr_i32 s2, s15, 31
772+
; GFX10-NEXT: s_add_i32 s0, s14, s1
773+
; GFX10-NEXT: s_add_i32 s3, s15, s2
774+
; GFX10-NEXT: s_xor_b32 s4, s0, s1
775775
; GFX10-NEXT: s_xor_b32 s3, s3, s2
776-
; GFX10-NEXT: v_cvt_f32_u32_e32 v0, s10
776+
; GFX10-NEXT: v_cvt_f32_u32_e32 v0, s4
777777
; GFX10-NEXT: v_cvt_f32_u32_e32 v1, s3
778-
; GFX10-NEXT: s_sub_i32 s0, 0, s10
779-
; GFX10-NEXT: s_sub_i32 s11, 0, s3
780-
; GFX10-NEXT: s_ashr_i32 s12, s9, 31
778+
; GFX10-NEXT: s_sub_i32 s0, 0, s4
779+
; GFX10-NEXT: s_sub_i32 s5, 0, s3
780+
; GFX10-NEXT: s_ashr_i32 s6, s13, 31
781781
; GFX10-NEXT: v_rcp_iflag_f32_e32 v0, v0
782782
; GFX10-NEXT: v_rcp_iflag_f32_e32 v1, v1
783+
; GFX10-NEXT: s_add_i32 s7, s13, s6
784+
; GFX10-NEXT: s_xor_b32 s7, s7, s6
783785
; GFX10-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0
784786
; GFX10-NEXT: v_mul_f32_e32 v1, 0x4f7ffffe, v1
785787
; GFX10-NEXT: v_cvt_u32_f32_e32 v0, v0
786788
; GFX10-NEXT: v_cvt_u32_f32_e32 v1, v1
787789
; GFX10-NEXT: v_mul_lo_u32 v2, s0, v0
788-
; GFX10-NEXT: v_mul_lo_u32 v3, s11, v1
789-
; GFX10-NEXT: s_ashr_i32 s11, s8, 31
790-
; GFX10-NEXT: s_add_i32 s0, s8, s11
791-
; GFX10-NEXT: s_add_i32 s8, s9, s12
792-
; GFX10-NEXT: s_xor_b32 s0, s0, s11
793-
; GFX10-NEXT: s_xor_b32 s8, s8, s12
790+
; GFX10-NEXT: v_mul_lo_u32 v3, s5, v1
791+
; GFX10-NEXT: s_ashr_i32 s5, s12, 31
792+
; GFX10-NEXT: s_add_i32 s0, s12, s5
793+
; GFX10-NEXT: s_xor_b32 s1, s5, s1
794+
; GFX10-NEXT: s_xor_b32 s0, s0, s5
794795
; GFX10-NEXT: v_mul_hi_u32 v2, v0, v2
795796
; GFX10-NEXT: v_mul_hi_u32 v3, v1, v3
796-
; GFX10-NEXT: s_xor_b32 s1, s11, s1
797797
; GFX10-NEXT: v_add_nc_u32_e32 v0, v0, v2
798798
; GFX10-NEXT: v_add_nc_u32_e32 v1, v1, v3
799799
; GFX10-NEXT: v_mul_hi_u32 v0, s0, v0
800-
; GFX10-NEXT: v_mul_hi_u32 v1, s8, v1
801-
; GFX10-NEXT: v_mul_lo_u32 v2, v0, s10
800+
; GFX10-NEXT: v_mul_hi_u32 v1, s7, v1
801+
; GFX10-NEXT: v_mul_lo_u32 v2, v0, s4
802802
; GFX10-NEXT: v_mul_lo_u32 v3, v1, s3
803803
; GFX10-NEXT: v_add_nc_u32_e32 v4, 1, v0
804804
; GFX10-NEXT: v_add_nc_u32_e32 v5, 1, v1
805805
; GFX10-NEXT: v_sub_nc_u32_e32 v2, s0, v2
806-
; GFX10-NEXT: v_sub_nc_u32_e32 v3, s8, v3
807-
; GFX10-NEXT: v_subrev_nc_u32_e32 v6, s10, v2
806+
; GFX10-NEXT: v_sub_nc_u32_e32 v3, s7, v3
807+
; GFX10-NEXT: v_subrev_nc_u32_e32 v6, s4, v2
808808
; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, s3, v3
809-
; GFX10-NEXT: v_cmp_le_u32_e64 s0, s10, v2
809+
; GFX10-NEXT: v_cmp_le_u32_e64 s0, s4, v2
810810
; GFX10-NEXT: v_subrev_nc_u32_e32 v7, s3, v3
811811
; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc_lo
812812
; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, v4, s0
813813
; GFX10-NEXT: v_cndmask_b32_e64 v2, v2, v6, s0
814814
; GFX10-NEXT: v_cndmask_b32_e32 v3, v3, v7, vcc_lo
815815
; GFX10-NEXT: v_add_nc_u32_e32 v5, 1, v1
816816
; GFX10-NEXT: v_add_nc_u32_e32 v4, 1, v0
817-
; GFX10-NEXT: v_cmp_le_u32_e64 s0, s10, v2
817+
; GFX10-NEXT: v_cmp_le_u32_e64 s0, s4, v2
818818
; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, s3, v3
819-
; GFX10-NEXT: v_subrev_nc_u32_e32 v6, s10, v2
819+
; GFX10-NEXT: v_subrev_nc_u32_e32 v6, s4, v2
820820
; GFX10-NEXT: v_subrev_nc_u32_e32 v7, s3, v3
821821
; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, v4, s0
822822
; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc_lo
823823
; GFX10-NEXT: v_cndmask_b32_e64 v2, v2, v6, s0
824824
; GFX10-NEXT: v_cndmask_b32_e32 v3, v3, v7, vcc_lo
825-
; GFX10-NEXT: s_xor_b32 s0, s12, s2
825+
; GFX10-NEXT: s_xor_b32 s0, s6, s2
826826
; GFX10-NEXT: v_xor_b32_e32 v0, s1, v0
827827
; GFX10-NEXT: v_xor_b32_e32 v1, s0, v1
828-
; GFX10-NEXT: v_xor_b32_e32 v2, s11, v2
829-
; GFX10-NEXT: v_xor_b32_e32 v3, s12, v3
828+
; GFX10-NEXT: v_xor_b32_e32 v2, s5, v2
829+
; GFX10-NEXT: v_xor_b32_e32 v3, s6, v3
830830
; GFX10-NEXT: v_mov_b32_e32 v4, 0
831831
; GFX10-NEXT: v_subrev_nc_u32_e32 v0, s1, v0
832832
; GFX10-NEXT: v_subrev_nc_u32_e32 v1, s0, v1
833-
; GFX10-NEXT: v_subrev_nc_u32_e32 v2, s11, v2
834-
; GFX10-NEXT: v_subrev_nc_u32_e32 v3, s12, v3
835-
; GFX10-NEXT: global_store_dwordx2 v4, v[0:1], s[4:5]
836-
; GFX10-NEXT: global_store_dwordx2 v4, v[2:3], s[6:7]
833+
; GFX10-NEXT: v_subrev_nc_u32_e32 v2, s5, v2
834+
; GFX10-NEXT: v_subrev_nc_u32_e32 v3, s6, v3
835+
; GFX10-NEXT: global_store_dwordx2 v4, v[0:1], s[8:9]
836+
; GFX10-NEXT: global_store_dwordx2 v4, v[2:3], s[10:11]
837837
; GFX10-NEXT: s_endpgm
838838
%div = sdiv <2 x i32> %x, %y
839839
store <2 x i32> %div, ptr addrspace(1) %out0

0 commit comments

Comments
 (0)