Skip to content

Commit e1094dd

Browse files
authored
[AMDGPU][DAG] Enable ganging up of memcpy loads/stores for AMDGPU (#96185)
In the SelectionDAG lowering of the memcpy intrinsic, this optimization introduces additional chains between fixed-size groups of loads and the corresponding stores. While initially introduced to ensure that wider load/store-pair instructions are generated on AArch64, this optimization also improves code generation for AMDGPU: ganged loads are scheduled into a clause, and stores only await completion of their corresponding load. The chosen value of 16 performed well in microbenchmarks; values of 8, 32, or 64 would perform similarly. The testcase updates are autogenerated by utils/update_llc_test_checks.py. See also the PR introducing this optimization: https://reviews.llvm.org/D46477 Part of SWDEV-455845.
1 parent 4e78d3a commit e1094dd

8 files changed

+20193
-2228
lines changed

llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -67,6 +67,9 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(const TargetMachine &TM,
6767
MaxStoresPerMemcpy = MaxStoresPerMemcpyOptSize = ~0U;
6868
MaxStoresPerMemmove = MaxStoresPerMemmoveOptSize = ~0U;
6969

70+
// Enable ganging up loads and stores in the memcpy DAG lowering.
71+
MaxGluedStoresPerMemcpy = 16;
72+
7073
// Lower floating point store/load to integer store/load to reduce the number
7174
// of patterns in tablegen.
7275
setOperationAction(ISD::LOAD, MVT::f32, Promote);

llvm/test/CodeGen/AMDGPU/gfx-callable-argument-types.ll

Lines changed: 8 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -9074,8 +9074,8 @@ define void @tail_call_byval_align16(<32 x i32> %val, double %tmp) #0 {
90749074
; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1
90759075
; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:24 ; 4-byte Folded Spill
90769076
; GFX9-NEXT: s_mov_b64 exec, s[4:5]
9077-
; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s33 offset:20
9078-
; GFX9-NEXT: buffer_load_dword v33, off, s[0:3], s33 offset:16
9077+
; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s33 offset:16
9078+
; GFX9-NEXT: buffer_load_dword v33, off, s[0:3], s33 offset:20
90799079
; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s33
90809080
; GFX9-NEXT: v_writelane_b32 v40, s30, 0
90819081
; GFX9-NEXT: v_writelane_b32 v40, s31, 1
@@ -9113,9 +9113,9 @@ define void @tail_call_byval_align16(<32 x i32> %val, double %tmp) #0 {
91139113
; GFX9-NEXT: s_mov_b32 s4, byval_align16_f64_arg@abs32@lo
91149114
; GFX9-NEXT: v_writelane_b32 v40, s63, 31
91159115
; GFX9-NEXT: s_waitcnt vmcnt(2)
9116-
; GFX9-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:4
9116+
; GFX9-NEXT: buffer_store_dword v32, off, s[0:3], s32
91179117
; GFX9-NEXT: s_waitcnt vmcnt(2)
9118-
; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32
9118+
; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:4
91199119
; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5]
91209120
; GFX9-NEXT: v_readlane_b32 s63, v40, 31
91219121
; GFX9-NEXT: v_readlane_b32 s62, v40, 30
@@ -9167,17 +9167,17 @@ define void @tail_call_byval_align16(<32 x i32> %val, double %tmp) #0 {
91679167
; GFX10-NEXT: s_waitcnt_depctr 0xffe3
91689168
; GFX10-NEXT: s_mov_b32 exec_lo, s4
91699169
; GFX10-NEXT: s_clause 0x2
9170-
; GFX10-NEXT: buffer_load_dword v32, off, s[0:3], s33 offset:20
9171-
; GFX10-NEXT: buffer_load_dword v33, off, s[0:3], s33 offset:16
9170+
; GFX10-NEXT: buffer_load_dword v32, off, s[0:3], s33 offset:16
9171+
; GFX10-NEXT: buffer_load_dword v33, off, s[0:3], s33 offset:20
91729172
; GFX10-NEXT: buffer_load_dword v31, off, s[0:3], s33
91739173
; GFX10-NEXT: v_writelane_b32 v40, s30, 0
91749174
; GFX10-NEXT: s_addk_i32 s32, 0x400
91759175
; GFX10-NEXT: s_mov_b32 s5, byval_align16_f64_arg@abs32@hi
91769176
; GFX10-NEXT: s_mov_b32 s4, byval_align16_f64_arg@abs32@lo
91779177
; GFX10-NEXT: s_waitcnt vmcnt(2)
9178-
; GFX10-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:4
9178+
; GFX10-NEXT: buffer_store_dword v32, off, s[0:3], s32
91799179
; GFX10-NEXT: s_waitcnt vmcnt(1)
9180-
; GFX10-NEXT: buffer_store_dword v33, off, s[0:3], s32
9180+
; GFX10-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:4
91819181
; GFX10-NEXT: v_writelane_b32 v40, s31, 1
91829182
; GFX10-NEXT: v_writelane_b32 v40, s34, 2
91839183
; GFX10-NEXT: v_writelane_b32 v40, s35, 3

llvm/test/CodeGen/AMDGPU/memcpy-fixed-align.ll

Lines changed: 16 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -8,22 +8,22 @@ define void @memcpy_fixed_align(ptr addrspace(5) %dst, ptr addrspace(1) %src) {
88
; MUBUF: ; %bb.0:
99
; MUBUF-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1010
; MUBUF-NEXT: global_load_dwordx2 v[11:12], v[1:2], off offset:32
11-
; MUBUF-NEXT: global_load_dwordx4 v[3:6], v[1:2], off offset:16
12-
; MUBUF-NEXT: global_load_dwordx4 v[7:10], v[1:2], off
11+
; MUBUF-NEXT: global_load_dwordx4 v[3:6], v[1:2], off
12+
; MUBUF-NEXT: global_load_dwordx4 v[7:10], v[1:2], off offset:16
1313
; MUBUF-NEXT: v_lshrrev_b32_e64 v0, 6, s32
1414
; MUBUF-NEXT: s_waitcnt vmcnt(2)
15-
; MUBUF-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:36
1615
; MUBUF-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:32
16+
; MUBUF-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:36
1717
; MUBUF-NEXT: s_waitcnt vmcnt(3)
18-
; MUBUF-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:28
19-
; MUBUF-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:24
20-
; MUBUF-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:20
21-
; MUBUF-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:16
18+
; MUBUF-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:12
19+
; MUBUF-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:8
20+
; MUBUF-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:4
21+
; MUBUF-NEXT: buffer_store_dword v3, off, s[0:3], s32
2222
; MUBUF-NEXT: s_waitcnt vmcnt(6)
23-
; MUBUF-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:12
24-
; MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:8
25-
; MUBUF-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:4
26-
; MUBUF-NEXT: buffer_store_dword v7, off, s[0:3], s32
23+
; MUBUF-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:28
24+
; MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:24
25+
; MUBUF-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:20
26+
; MUBUF-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:16
2727
; MUBUF-NEXT: ;;#ASMSTART
2828
; MUBUF-NEXT: ; use v0
2929
; MUBUF-NEXT: ;;#ASMEND
@@ -33,16 +33,16 @@ define void @memcpy_fixed_align(ptr addrspace(5) %dst, ptr addrspace(1) %src) {
3333
; FLATSCR-LABEL: memcpy_fixed_align:
3434
; FLATSCR: ; %bb.0:
3535
; FLATSCR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
36+
; FLATSCR-NEXT: global_load_dwordx4 v[3:6], v[1:2], off
37+
; FLATSCR-NEXT: global_load_dwordx4 v[7:10], v[1:2], off offset:16
3638
; FLATSCR-NEXT: global_load_dwordx2 v[11:12], v[1:2], off offset:32
37-
; FLATSCR-NEXT: global_load_dwordx4 v[3:6], v[1:2], off offset:16
38-
; FLATSCR-NEXT: global_load_dwordx4 v[7:10], v[1:2], off
3939
; FLATSCR-NEXT: v_mov_b32_e32 v0, s32
4040
; FLATSCR-NEXT: s_waitcnt vmcnt(2)
41-
; FLATSCR-NEXT: scratch_store_dwordx2 off, v[11:12], s32 offset:32
41+
; FLATSCR-NEXT: scratch_store_dwordx4 off, v[3:6], s32
4242
; FLATSCR-NEXT: s_waitcnt vmcnt(2)
43-
; FLATSCR-NEXT: scratch_store_dwordx4 off, v[3:6], s32 offset:16
43+
; FLATSCR-NEXT: scratch_store_dwordx4 off, v[7:10], s32 offset:16
4444
; FLATSCR-NEXT: s_waitcnt vmcnt(2)
45-
; FLATSCR-NEXT: scratch_store_dwordx4 off, v[7:10], s32
45+
; FLATSCR-NEXT: scratch_store_dwordx2 off, v[11:12], s32 offset:32
4646
; FLATSCR-NEXT: ;;#ASMSTART
4747
; FLATSCR-NEXT: ; use v0
4848
; FLATSCR-NEXT: ;;#ASMEND

0 commit comments

Comments
 (0)