Commit b3995aa
[AMDGPU] Decrease default NSA threshold from 3 to 2 (llvm#116624)
In graphics shaders it is better overall to use NSA encoding for IMAGE instructions, because the benefit of less constrained register allocation outweighs the cost of larger encoding. In particular NSA form often avoids the need for extra V_MOV_B32 instructions between IMAGE instructions, which can allow the IMAGE instructions to be claused. Note that in GFX12 there is no longer a bit in the encoding to choose between NSA and non-NSA forms, so this only affects GFX10 and GFX11.
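For illustration, a sketch of the difference the threshold makes, using register assignments lifted from the GFX10 test update below: the non-NSA encoding requires the address operands to occupy one contiguous VGPR range, which the register allocator frequently has to assemble with copies, whereas the NSA encoding names each address VGPR individually (the same register may even appear twice).

; Non-NSA form: the two address registers must form the contiguous
; pair v[4:5], which costs extra moves to build:
v_mov_b32_e32 v4, s10
v_mov_b32_e32 v5, s11
image_store   v[0:3], v[4:5], s[4:11] dim:SQ_RSRC_IMG_2D unorm

; NSA form: each address register is encoded separately, so arbitrary
; VGPRs (here the same one twice) can be named with no copies:
image_store   v[0:3], [v1, v1], s[4:11] dim:SQ_RSRC_IMG_2D unorm

The NSA form is one or more dwords longer to encode; that extra size is the cost the commit message weighs against the saved moves.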
1 parent: 21fc36b

21 files changed: +230 −301 lines

llvm/lib/Target/AMDGPU/GCNSubtarget.cpp

Lines changed: 1 addition & 1 deletion
@@ -54,7 +54,7 @@ static cl::opt<bool> UseAA("amdgpu-use-aa-in-codegen",
 static cl::opt<unsigned>
     NSAThreshold("amdgpu-nsa-threshold",
                  cl::desc("Number of addresses from which to enable MIMG NSA."),
-                 cl::init(3), cl::Hidden);
+                 cl::init(2), cl::Hidden);
 
 GCNSubtarget::~GCNSubtarget() = default;

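Because the knob stays a `cl::Hidden` option rather than disappearing, the old behavior can still be requested per invocation. Below is a minimal lit-style sketch of doing so; the gfx1030 target, the @sample_2d function, and the CHECK line are my own illustration, not part of this commit:

; RUN: llc -mtriple=amdgcn -mcpu=gfx1030 -amdgpu-nsa-threshold=3 < %s | FileCheck %s

; Illustrative only (not from the commit): with the threshold forced back
; to 3, this two-address sample should keep the non-NSA contiguous form
; v[0:1] rather than the new default NSA form [v0, v1].
; CHECK: image_sample {{.*}}, v[0:1],
define amdgpu_ps <4 x float> @sample_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %s, float %t) {
  %v = call <4 x float> @llvm.amdgcn.image.sample.2d.v4f32.f32(i32 15, float %s, float %t, <8 x i32> %rsrc, <4 x i32> %samp, i1 false, i32 0, i32 0)
  ret <4 x float> %v
}
declare <4 x float> @llvm.amdgcn.image.sample.2d.v4f32.f32(i32, float, float, <8 x i32>, <4 x i32>, i1, i32, i32)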
llvm/test/CodeGen/AMDGPU/GlobalISel/bitcast_38_i16.ll

Lines changed: 11 additions & 14 deletions
@@ -31,42 +31,39 @@ define void @main(<19 x i32> %arg) {
 ; GFX10-LABEL: main:
 ; GFX10:       ; %bb.0: ; %bb
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    s_mov_b32 s4, 0
 ; GFX10-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX10-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v0
-; GFX10-NEXT:    s_mov_b32 s10, s4
-; GFX10-NEXT:    s_mov_b32 s11, s4
-; GFX10-NEXT:    v_mov_b32_e32 v4, s10
+; GFX10-NEXT:    s_mov_b32 s4, 0
+; GFX10-NEXT:    s_mov_b32 s5, s4
 ; GFX10-NEXT:    v_mov_b32_e32 v2, v1
 ; GFX10-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
 ; GFX10-NEXT:    v_mov_b32_e32 v3, v1
-; GFX10-NEXT:    v_mov_b32_e32 v5, s11
-; GFX10-NEXT:    s_mov_b32 s5, s4
 ; GFX10-NEXT:    s_mov_b32 s6, s4
 ; GFX10-NEXT:    s_mov_b32 s7, s4
 ; GFX10-NEXT:    s_mov_b32 s8, s4
 ; GFX10-NEXT:    s_mov_b32 s9, s4
-; GFX10-NEXT:    image_store v[0:3], v[4:5], s[4:11] dim:SQ_RSRC_IMG_2D unorm
+; GFX10-NEXT:    s_mov_b32 s10, s4
+; GFX10-NEXT:    s_mov_b32 s11, s4
+; GFX10-NEXT:    image_store v[0:3], [v1, v1], s[4:11] dim:SQ_RSRC_IMG_2D unorm
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: main:
 ; GFX11:       ; %bb.0: ; %bb
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    s_mov_b32 s0, 0
+; GFX11-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX11-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v0
-; GFX11-NEXT:    s_mov_b32 s6, s0
-; GFX11-NEXT:    s_mov_b32 s7, s0
-; GFX11-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v4, s6
-; GFX11-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
-; GFX11-NEXT:    v_mov_b32_e32 v5, s7
+; GFX11-NEXT:    s_mov_b32 s0, 0
 ; GFX11-NEXT:    s_mov_b32 s1, s0
 ; GFX11-NEXT:    v_mov_b32_e32 v2, v1
+; GFX11-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
 ; GFX11-NEXT:    v_mov_b32_e32 v3, v1
 ; GFX11-NEXT:    s_mov_b32 s2, s0
 ; GFX11-NEXT:    s_mov_b32 s3, s0
 ; GFX11-NEXT:    s_mov_b32 s4, s0
 ; GFX11-NEXT:    s_mov_b32 s5, s0
-; GFX11-NEXT:    image_store v[0:3], v[4:5], s[0:7] dim:SQ_RSRC_IMG_2D unorm
+; GFX11-NEXT:    s_mov_b32 s6, s0
+; GFX11-NEXT:    s_mov_b32 s7, s0
+; GFX11-NEXT:    image_store v[0:3], [v1, v1], s[0:7] dim:SQ_RSRC_IMG_2D unorm
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 bb:
   %i = bitcast <19 x i32> %arg to <38 x i16>

llvm/test/CodeGen/AMDGPU/GlobalISel/image-waterfall-loop-O0.ll

Lines changed: 6 additions & 8 deletions
@@ -55,13 +55,11 @@ define <4 x float> @waterfall_loop(<8 x i32> %vgpr_srd) {
 ; CHECK-NEXT:    v_writelane_b32 v16, s5, 1
 ; CHECK-NEXT:    v_writelane_b32 v16, s6, 2
 ; CHECK-NEXT:    v_writelane_b32 v16, s7, 3
-; CHECK-NEXT:    s_mov_b32 s6, 0
-; CHECK-NEXT:    s_mov_b32 s4, s6
-; CHECK-NEXT:    s_mov_b32 s5, s6
+; CHECK-NEXT:    s_mov_b32 s4, 0
+; CHECK-NEXT:    v_mov_b32_e32 v0, s4
+; CHECK-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
 ; CHECK-NEXT:    v_mov_b32_e32 v0, s4
-; CHECK-NEXT:    v_mov_b32_e32 v1, s5
 ; CHECK-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
-; CHECK-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
 ; CHECK-NEXT:    s_mov_b32 s4, exec_lo
 ; CHECK-NEXT:    v_writelane_b32 v16, s4, 4
 ; CHECK-NEXT:    s_or_saveexec_b32 s21, -1
@@ -154,10 +152,10 @@ define <4 x float> @waterfall_loop(<8 x i32> %vgpr_srd) {
 ; CHECK-NEXT:    v_readlane_b32 s17, v16, 1
 ; CHECK-NEXT:    v_readlane_b32 s18, v16, 2
 ; CHECK-NEXT:    v_readlane_b32 s19, v16, 3
-; CHECK-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
-; CHECK-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
+; CHECK-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
+; CHECK-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
 ; CHECK-NEXT:    s_waitcnt vmcnt(0)
-; CHECK-NEXT:    image_sample v0, v[0:1], s[8:15], s[16:19] dmask:0x1 dim:SQ_RSRC_IMG_2D
+; CHECK-NEXT:    image_sample v0, [v0, v1], s[8:15], s[16:19] dmask:0x1 dim:SQ_RSRC_IMG_2D
 ; CHECK-NEXT:    s_waitcnt vmcnt(0)
 ; CHECK-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill
 ; CHECK-NEXT:    s_xor_b32 exec_lo, exec_lo, s4

llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-llvm.amdgcn.image.atomic.dim.a16.ll

Lines changed: 7 additions & 14 deletions
@@ -1074,8 +1074,7 @@ define amdgpu_ps float @atomic_add_3d(<8 x i32> inreg %rsrc, i32 %data, i16 %s,
 ; GFX10NSA-NEXT:   [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[TRUNC1]](s16)
 ; GFX10NSA-NEXT:   [[DEF:%[0-9]+]]:_(s16) = G_IMPLICIT_DEF
 ; GFX10NSA-NEXT:   [[BUILD_VECTOR2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC2]](s16), [[DEF]](s16)
-; GFX10NSA-NEXT:   [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR1]](<2 x s16>), [[BUILD_VECTOR2]](<2 x s16>)
-; GFX10NSA-NEXT:   [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(s32) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.atomic.add.3d), [[COPY8]](s32), [[CONCAT_VECTORS]](<4 x s16>), $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0, 1 :: (volatile dereferenceable load store (s32), addrspace 8)
+; GFX10NSA-NEXT:   [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(s32) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.atomic.add.3d), [[COPY8]](s32), [[BUILD_VECTOR1]](<2 x s16>), [[BUILD_VECTOR2]](<2 x s16>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0, 1 :: (volatile dereferenceable load store (s32), addrspace 8)
 ; GFX10NSA-NEXT:   $vgpr0 = COPY [[AMDGPU_INTRIN_IMAGE_LOAD]](s32)
 ; GFX10NSA-NEXT:   SI_RETURN_TO_EPILOG implicit $vgpr0
 ;
@@ -1163,8 +1162,7 @@ define amdgpu_ps float @atomic_add_cube(<8 x i32> inreg %rsrc, i32 %data, i16 %s
 ; GFX10NSA-NEXT:   [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[TRUNC1]](s16)
 ; GFX10NSA-NEXT:   [[DEF:%[0-9]+]]:_(s16) = G_IMPLICIT_DEF
 ; GFX10NSA-NEXT:   [[BUILD_VECTOR2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC2]](s16), [[DEF]](s16)
-; GFX10NSA-NEXT:   [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR1]](<2 x s16>), [[BUILD_VECTOR2]](<2 x s16>)
-; GFX10NSA-NEXT:   [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(s32) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.atomic.add.cube), [[COPY8]](s32), [[CONCAT_VECTORS]](<4 x s16>), $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0, 1 :: (volatile dereferenceable load store (s32), addrspace 8)
+; GFX10NSA-NEXT:   [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(s32) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.atomic.add.cube), [[COPY8]](s32), [[BUILD_VECTOR1]](<2 x s16>), [[BUILD_VECTOR2]](<2 x s16>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0, 1 :: (volatile dereferenceable load store (s32), addrspace 8)
 ; GFX10NSA-NEXT:   $vgpr0 = COPY [[AMDGPU_INTRIN_IMAGE_LOAD]](s32)
 ; GFX10NSA-NEXT:   SI_RETURN_TO_EPILOG implicit $vgpr0
 ;
@@ -1327,8 +1325,7 @@ define amdgpu_ps float @atomic_add_2darray(<8 x i32> inreg %rsrc, i32 %data, i16
 ; GFX10NSA-NEXT:   [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[TRUNC1]](s16)
 ; GFX10NSA-NEXT:   [[DEF:%[0-9]+]]:_(s16) = G_IMPLICIT_DEF
 ; GFX10NSA-NEXT:   [[BUILD_VECTOR2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC2]](s16), [[DEF]](s16)
-; GFX10NSA-NEXT:   [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR1]](<2 x s16>), [[BUILD_VECTOR2]](<2 x s16>)
-; GFX10NSA-NEXT:   [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(s32) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.atomic.add.2darray), [[COPY8]](s32), [[CONCAT_VECTORS]](<4 x s16>), $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0, 1 :: (volatile dereferenceable load store (s32), addrspace 8)
+; GFX10NSA-NEXT:   [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(s32) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.atomic.add.2darray), [[COPY8]](s32), [[BUILD_VECTOR1]](<2 x s16>), [[BUILD_VECTOR2]](<2 x s16>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0, 1 :: (volatile dereferenceable load store (s32), addrspace 8)
 ; GFX10NSA-NEXT:   $vgpr0 = COPY [[AMDGPU_INTRIN_IMAGE_LOAD]](s32)
 ; GFX10NSA-NEXT:   SI_RETURN_TO_EPILOG implicit $vgpr0
 ;
@@ -1416,8 +1413,7 @@ define amdgpu_ps float @atomic_add_2dmsaa(<8 x i32> inreg %rsrc, i32 %data, i16
 ; GFX10NSA-NEXT:   [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[TRUNC1]](s16)
 ; GFX10NSA-NEXT:   [[DEF:%[0-9]+]]:_(s16) = G_IMPLICIT_DEF
 ; GFX10NSA-NEXT:   [[BUILD_VECTOR2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC2]](s16), [[DEF]](s16)
-; GFX10NSA-NEXT:   [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR1]](<2 x s16>), [[BUILD_VECTOR2]](<2 x s16>)
-; GFX10NSA-NEXT:   [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(s32) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.atomic.add.2dmsaa), [[COPY8]](s32), [[CONCAT_VECTORS]](<4 x s16>), $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0, 1 :: (volatile dereferenceable load store (s32), addrspace 8)
+; GFX10NSA-NEXT:   [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(s32) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.atomic.add.2dmsaa), [[COPY8]](s32), [[BUILD_VECTOR1]](<2 x s16>), [[BUILD_VECTOR2]](<2 x s16>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0, 1 :: (volatile dereferenceable load store (s32), addrspace 8)
 ; GFX10NSA-NEXT:   $vgpr0 = COPY [[AMDGPU_INTRIN_IMAGE_LOAD]](s32)
 ; GFX10NSA-NEXT:   SI_RETURN_TO_EPILOG implicit $vgpr0
 ;
@@ -1507,8 +1503,7 @@ define amdgpu_ps float @atomic_add_2darraymsaa(<8 x i32> inreg %rsrc, i32 %data,
 ; GFX10NSA-NEXT:   [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[COPY12]](s32)
 ; GFX10NSA-NEXT:   [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[TRUNC1]](s16)
 ; GFX10NSA-NEXT:   [[BUILD_VECTOR2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC2]](s16), [[TRUNC3]](s16)
-; GFX10NSA-NEXT:   [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR1]](<2 x s16>), [[BUILD_VECTOR2]](<2 x s16>)
-; GFX10NSA-NEXT:   [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(s32) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.atomic.add.2darraymsaa), [[COPY8]](s32), [[CONCAT_VECTORS]](<4 x s16>), $noreg, $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0, 1 :: (volatile dereferenceable load store (s32), addrspace 8)
+; GFX10NSA-NEXT:   [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(s32) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.atomic.add.2darraymsaa), [[COPY8]](s32), [[BUILD_VECTOR1]](<2 x s16>), [[BUILD_VECTOR2]](<2 x s16>), $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0, 1 :: (volatile dereferenceable load store (s32), addrspace 8)
 ; GFX10NSA-NEXT:   $vgpr0 = COPY [[AMDGPU_INTRIN_IMAGE_LOAD]](s32)
 ; GFX10NSA-NEXT:   SI_RETURN_TO_EPILOG implicit $vgpr0
 ;
@@ -1754,8 +1749,7 @@ define amdgpu_ps float @atomic_cmpswap_3d(<8 x i32> inreg %rsrc, i32 %cmp, i32 %
 ; GFX10NSA-NEXT:   [[BUILD_VECTOR2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[TRUNC1]](s16)
 ; GFX10NSA-NEXT:   [[DEF:%[0-9]+]]:_(s16) = G_IMPLICIT_DEF
 ; GFX10NSA-NEXT:   [[BUILD_VECTOR3:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC2]](s16), [[DEF]](s16)
-; GFX10NSA-NEXT:   [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>)
-; GFX10NSA-NEXT:   [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(s32) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.atomic.cmpswap.3d), [[BUILD_VECTOR1]](<2 x s32>), $noreg, [[CONCAT_VECTORS]](<4 x s16>), $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0, 1 :: (volatile dereferenceable load store (s32), addrspace 8)
+; GFX10NSA-NEXT:   [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(s32) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.atomic.cmpswap.3d), [[BUILD_VECTOR1]](<2 x s32>), $noreg, [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0, 1 :: (volatile dereferenceable load store (s32), addrspace 8)
 ; GFX10NSA-NEXT:   $vgpr0 = COPY [[AMDGPU_INTRIN_IMAGE_LOAD]](s32)
 ; GFX10NSA-NEXT:   SI_RETURN_TO_EPILOG implicit $vgpr0
 ;
@@ -1851,8 +1845,7 @@ define amdgpu_ps float @atomic_cmpswap_2darraymsaa(<8 x i32> inreg %rsrc, i32 %c
 ; GFX10NSA-NEXT:   [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32)
 ; GFX10NSA-NEXT:   [[BUILD_VECTOR2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[TRUNC1]](s16)
 ; GFX10NSA-NEXT:   [[BUILD_VECTOR3:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC2]](s16), [[TRUNC3]](s16)
-; GFX10NSA-NEXT:   [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>)
-; GFX10NSA-NEXT:   [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(s32) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.atomic.cmpswap.2darraymsaa), [[BUILD_VECTOR1]](<2 x s32>), $noreg, [[CONCAT_VECTORS]](<4 x s16>), $noreg, $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0, 1 :: (volatile dereferenceable load store (s32), addrspace 8)
+; GFX10NSA-NEXT:   [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(s32) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.atomic.cmpswap.2darraymsaa), [[BUILD_VECTOR1]](<2 x s32>), $noreg, [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>), $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0, 1 :: (volatile dereferenceable load store (s32), addrspace 8)
 ; GFX10NSA-NEXT:   $vgpr0 = COPY [[AMDGPU_INTRIN_IMAGE_LOAD]](s32)
 ; GFX10NSA-NEXT:   SI_RETURN_TO_EPILOG implicit $vgpr0
 ;
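Every hunk above makes the same substitution: previously the legalizer had to G_CONCAT_VECTORS the packed 16-bit coordinates into a single <4 x s16> operand, matching the non-NSA requirement of one contiguous register sequence; with NSA the two <2 x s16> halves are passed as independent operands. A minimal a16 source that reaches this path, modeled on the atomic_add_3d test above (the exact IR is a sketch, not copied from the test file):

; Sketch: packed-16-bit (a16) 3D atomic add. %s and %t share one VGPR and
; %r occupies another; with NSA those two halves no longer need to be
; adjacent registers.
define amdgpu_ps float @atomic_add_3d(<8 x i32> inreg %rsrc, i32 %data, i16 %s, i16 %t, i16 %r) {
  %v = call i32 @llvm.amdgcn.image.atomic.add.3d.i32.i16(i32 %data, i16 %s, i16 %t, i16 %r, <8 x i32> %rsrc, i32 0, i32 0)
  %out = bitcast i32 %v to float
  ret float %out
}
declare i32 @llvm.amdgcn.image.atomic.add.3d.i32.i16(i32, i16, i16, i16, <8 x i32>, i32, i32)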
