Skip to content

Commit 86d8f39

Browse files
arsenm authored and EthanLuisMcDonough committed
AMDGPU: Fix buffer intrinsic handling for various 16-bit elements. (llvm#95376)
Mostly fixes handling of bfloat vectors, but also some missing i16 cases.
1 parent b6ac330 commit 86d8f39

File tree

6 files changed

+439
-15
lines changed

6 files changed

+439
-15
lines changed

llvm/lib/Target/AMDGPU/BUFInstructions.td

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1425,19 +1425,23 @@ defm : MUBUF_LoadIntrinsicPat<SIbuffer_load, f32, "BUFFER_LOAD_DWORD">;
14251425
defm : MUBUF_LoadIntrinsicPat<SIbuffer_load, i32, "BUFFER_LOAD_DWORD">;
14261426
defm : MUBUF_LoadIntrinsicPat<SIbuffer_load, v2i16, "BUFFER_LOAD_DWORD">;
14271427
defm : MUBUF_LoadIntrinsicPat<SIbuffer_load, v2f16, "BUFFER_LOAD_DWORD">;
1428+
defm : MUBUF_LoadIntrinsicPat<SIbuffer_load, v2bf16, "BUFFER_LOAD_DWORD">;
14281429
defm : MUBUF_LoadIntrinsicPat<SIbuffer_load, v2f32, "BUFFER_LOAD_DWORDX2">;
14291430
defm : MUBUF_LoadIntrinsicPat<SIbuffer_load, v2i32, "BUFFER_LOAD_DWORDX2">;
14301431
defm : MUBUF_LoadIntrinsicPat<SIbuffer_load, v4i16, "BUFFER_LOAD_DWORDX2">;
14311432
defm : MUBUF_LoadIntrinsicPat<SIbuffer_load, v4f16, "BUFFER_LOAD_DWORDX2">;
14321433
defm : MUBUF_LoadIntrinsicPat<SIbuffer_load, i64, "BUFFER_LOAD_DWORDX2">;
14331434
defm : MUBUF_LoadIntrinsicPat<SIbuffer_load, f64, "BUFFER_LOAD_DWORDX2">;
1435+
defm : MUBUF_LoadIntrinsicPat<SIbuffer_load, v4bf16, "BUFFER_LOAD_DWORDX2">;
14341436
defm : MUBUF_LoadIntrinsicPat<SIbuffer_load, v3f32, "BUFFER_LOAD_DWORDX3">;
14351437
defm : MUBUF_LoadIntrinsicPat<SIbuffer_load, v3i32, "BUFFER_LOAD_DWORDX3">;
14361438
defm : MUBUF_LoadIntrinsicPat<SIbuffer_load, v4f32, "BUFFER_LOAD_DWORDX4">;
14371439
defm : MUBUF_LoadIntrinsicPat<SIbuffer_load, v4i32, "BUFFER_LOAD_DWORDX4">;
14381440
defm : MUBUF_LoadIntrinsicPat<SIbuffer_load, v2i64, "BUFFER_LOAD_DWORDX4">;
14391441
defm : MUBUF_LoadIntrinsicPat<SIbuffer_load, v2f64, "BUFFER_LOAD_DWORDX4">;
1442+
defm : MUBUF_LoadIntrinsicPat<SIbuffer_load, v8i16, "BUFFER_LOAD_DWORDX4">;
14401443
defm : MUBUF_LoadIntrinsicPat<SIbuffer_load, v8f16, "BUFFER_LOAD_DWORDX4">;
1444+
defm : MUBUF_LoadIntrinsicPat<SIbuffer_load, v8bf16, "BUFFER_LOAD_DWORDX4">;
14411445

14421446
defm : MUBUF_LoadIntrinsicPat<SIbuffer_load_byte, i32, "BUFFER_LOAD_SBYTE">;
14431447
defm : MUBUF_LoadIntrinsicPat<SIbuffer_load_short, i32, "BUFFER_LOAD_SSHORT">;
@@ -1532,19 +1536,23 @@ defm : MUBUF_StoreIntrinsicPat<SIbuffer_store, f32, "BUFFER_STORE_DWORD">;
15321536
defm : MUBUF_StoreIntrinsicPat<SIbuffer_store, i32, "BUFFER_STORE_DWORD">;
15331537
defm : MUBUF_StoreIntrinsicPat<SIbuffer_store, v2i16, "BUFFER_STORE_DWORD">;
15341538
defm : MUBUF_StoreIntrinsicPat<SIbuffer_store, v2f16, "BUFFER_STORE_DWORD">;
1539+
defm : MUBUF_StoreIntrinsicPat<SIbuffer_store, v2bf16, "BUFFER_STORE_DWORD">;
15351540
defm : MUBUF_StoreIntrinsicPat<SIbuffer_store, v2f32, "BUFFER_STORE_DWORDX2">;
15361541
defm : MUBUF_StoreIntrinsicPat<SIbuffer_store, v2i32, "BUFFER_STORE_DWORDX2">;
15371542
defm : MUBUF_StoreIntrinsicPat<SIbuffer_store, i64, "BUFFER_STORE_DWORDX2">;
15381543
defm : MUBUF_StoreIntrinsicPat<SIbuffer_store, f64, "BUFFER_STORE_DWORDX2">;
15391544
defm : MUBUF_StoreIntrinsicPat<SIbuffer_store, v4i16, "BUFFER_STORE_DWORDX2">;
15401545
defm : MUBUF_StoreIntrinsicPat<SIbuffer_store, v4f16, "BUFFER_STORE_DWORDX2">;
1546+
defm : MUBUF_StoreIntrinsicPat<SIbuffer_store, v4bf16, "BUFFER_STORE_DWORDX2">;
15411547
defm : MUBUF_StoreIntrinsicPat<SIbuffer_store, v3f32, "BUFFER_STORE_DWORDX3">;
15421548
defm : MUBUF_StoreIntrinsicPat<SIbuffer_store, v3i32, "BUFFER_STORE_DWORDX3">;
15431549
defm : MUBUF_StoreIntrinsicPat<SIbuffer_store, v4f32, "BUFFER_STORE_DWORDX4">;
15441550
defm : MUBUF_StoreIntrinsicPat<SIbuffer_store, v4i32, "BUFFER_STORE_DWORDX4">;
15451551
defm : MUBUF_StoreIntrinsicPat<SIbuffer_store, v2i64, "BUFFER_STORE_DWORDX4">;
15461552
defm : MUBUF_StoreIntrinsicPat<SIbuffer_store, v2f64, "BUFFER_STORE_DWORDX4">;
15471553
defm : MUBUF_StoreIntrinsicPat<SIbuffer_store, v8f16, "BUFFER_STORE_DWORDX4">;
1554+
defm : MUBUF_StoreIntrinsicPat<SIbuffer_store, v8i16, "BUFFER_STORE_DWORDX4">;
1555+
defm : MUBUF_StoreIntrinsicPat<SIbuffer_store, v8bf16, "BUFFER_STORE_DWORDX4">;
15481556

15491557
defm : MUBUF_StoreIntrinsicPat<SIbuffer_store_byte, i32, "BUFFER_STORE_BYTE">;
15501558
defm : MUBUF_StoreIntrinsicPat<SIbuffer_store_short, i32, "BUFFER_STORE_SHORT">;

llvm/lib/Target/AMDGPU/SIISelLowering.cpp

Lines changed: 10 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -859,19 +859,22 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
859859

860860
setOperationAction(ISD::INTRINSIC_WO_CHAIN,
861861
{MVT::Other, MVT::f32, MVT::v4f32, MVT::i16, MVT::f16,
862-
MVT::v2i16, MVT::v2f16, MVT::i128, MVT::i8},
862+
MVT::bf16, MVT::v2i16, MVT::v2f16, MVT::v2bf16, MVT::i128,
863+
MVT::i8},
863864
Custom);
864865

865866
setOperationAction(ISD::INTRINSIC_W_CHAIN,
866-
{MVT::v2f16, MVT::v2i16, MVT::v3f16, MVT::v3i16,
867-
MVT::v4f16, MVT::v4i16, MVT::v8f16, MVT::Other, MVT::f16,
868-
MVT::i16, MVT::i8, MVT::i128},
867+
{MVT::v2f16, MVT::v2i16, MVT::v2bf16, MVT::v3f16,
868+
MVT::v3i16, MVT::v4f16, MVT::v4i16, MVT::v4bf16,
869+
MVT::v8i16, MVT::v8f16, MVT::v8bf16, MVT::Other, MVT::f16,
870+
MVT::i16, MVT::bf16, MVT::i8, MVT::i128},
869871
Custom);
870872

871873
setOperationAction(ISD::INTRINSIC_VOID,
872-
{MVT::Other, MVT::v2i16, MVT::v2f16, MVT::v3i16,
873-
MVT::v3f16, MVT::v4f16, MVT::v4i16, MVT::f16, MVT::i16,
874-
MVT::i8, MVT::i128},
874+
{MVT::Other, MVT::v2i16, MVT::v2f16, MVT::v2bf16,
875+
MVT::v3i16, MVT::v3f16, MVT::v4f16, MVT::v4i16,
876+
MVT::v4bf16, MVT::v8i16, MVT::v8f16, MVT::v8bf16,
877+
MVT::f16, MVT::i16, MVT::i8, MVT::i128},
875878
Custom);
876879

877880
setOperationAction(ISD::STACKSAVE, MVT::Other, Custom);
Lines changed: 183 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,183 @@
1+
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
2+
; RUN: llc -mtriple=amdgcn -mcpu=hawaii < %s | FileCheck --check-prefix=GFX7 %s
3+
; RUN: llc -mtriple=amdgcn -mcpu=tonga < %s | FileCheck --check-prefix=GFX8 %s
4+
; RUN: llc -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck --check-prefix=GFX9 %s
5+
; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 < %s | FileCheck --check-prefix=GFX10 %s
6+
; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -amdgpu-enable-delay-alu=0 < %s | FileCheck --check-prefixes=GFX11 %s
7+
8+
define bfloat @raw_ptr_buffer_load_bf16(ptr addrspace(8) inreg %rsrc) {
9+
; GFX7-LABEL: raw_ptr_buffer_load_bf16:
10+
; GFX7: ; %bb.0:
11+
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
12+
; GFX7-NEXT: buffer_load_ushort v0, off, s[4:7], 0
13+
; GFX7-NEXT: s_waitcnt vmcnt(0)
14+
; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0
15+
; GFX7-NEXT: s_setpc_b64 s[30:31]
16+
;
17+
; GFX8-LABEL: raw_ptr_buffer_load_bf16:
18+
; GFX8: ; %bb.0:
19+
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
20+
; GFX8-NEXT: buffer_load_ushort v0, off, s[4:7], 0
21+
; GFX8-NEXT: s_waitcnt vmcnt(0)
22+
; GFX8-NEXT: s_setpc_b64 s[30:31]
23+
;
24+
; GFX9-LABEL: raw_ptr_buffer_load_bf16:
25+
; GFX9: ; %bb.0:
26+
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
27+
; GFX9-NEXT: buffer_load_ushort v0, off, s[4:7], 0
28+
; GFX9-NEXT: s_waitcnt vmcnt(0)
29+
; GFX9-NEXT: s_setpc_b64 s[30:31]
30+
;
31+
; GFX10-LABEL: raw_ptr_buffer_load_bf16:
32+
; GFX10: ; %bb.0:
33+
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
34+
; GFX10-NEXT: buffer_load_ushort v0, off, s[4:7], 0
35+
; GFX10-NEXT: s_waitcnt vmcnt(0)
36+
; GFX10-NEXT: s_setpc_b64 s[30:31]
37+
;
38+
; GFX11-LABEL: raw_ptr_buffer_load_bf16:
39+
; GFX11: ; %bb.0:
40+
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
41+
; GFX11-NEXT: buffer_load_u16 v0, off, s[0:3], 0
42+
; GFX11-NEXT: s_waitcnt vmcnt(0)
43+
; GFX11-NEXT: s_setpc_b64 s[30:31]
44+
%val = call bfloat @llvm.amdgcn.raw.ptr.buffer.load.bf16(ptr addrspace(8) %rsrc, i32 0, i32 0, i32 0)
45+
ret bfloat %val
46+
}
47+
48+
define <2 x bfloat> @raw_ptr_buffer_load_v2bf16(ptr addrspace(8) inreg %rsrc) {
49+
; GFX7-LABEL: raw_ptr_buffer_load_v2bf16:
50+
; GFX7: ; %bb.0:
51+
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
52+
; GFX7-NEXT: buffer_load_dword v1, off, s[4:7], 0
53+
; GFX7-NEXT: s_waitcnt vmcnt(0)
54+
; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v1
55+
; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
56+
; GFX7-NEXT: s_setpc_b64 s[30:31]
57+
;
58+
; GFX8-LABEL: raw_ptr_buffer_load_v2bf16:
59+
; GFX8: ; %bb.0:
60+
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
61+
; GFX8-NEXT: buffer_load_dword v0, off, s[4:7], 0
62+
; GFX8-NEXT: s_waitcnt vmcnt(0)
63+
; GFX8-NEXT: s_setpc_b64 s[30:31]
64+
;
65+
; GFX9-LABEL: raw_ptr_buffer_load_v2bf16:
66+
; GFX9: ; %bb.0:
67+
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
68+
; GFX9-NEXT: buffer_load_dword v0, off, s[4:7], 0
69+
; GFX9-NEXT: s_waitcnt vmcnt(0)
70+
; GFX9-NEXT: s_setpc_b64 s[30:31]
71+
;
72+
; GFX10-LABEL: raw_ptr_buffer_load_v2bf16:
73+
; GFX10: ; %bb.0:
74+
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
75+
; GFX10-NEXT: buffer_load_dword v0, off, s[4:7], 0
76+
; GFX10-NEXT: s_waitcnt vmcnt(0)
77+
; GFX10-NEXT: s_setpc_b64 s[30:31]
78+
;
79+
; GFX11-LABEL: raw_ptr_buffer_load_v2bf16:
80+
; GFX11: ; %bb.0:
81+
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
82+
; GFX11-NEXT: buffer_load_b32 v0, off, s[0:3], 0
83+
; GFX11-NEXT: s_waitcnt vmcnt(0)
84+
; GFX11-NEXT: s_setpc_b64 s[30:31]
85+
%val = call <2 x bfloat> @llvm.amdgcn.raw.ptr.buffer.load.v2bf16(ptr addrspace(8) %rsrc, i32 0, i32 0, i32 0)
86+
ret <2 x bfloat> %val
87+
}
88+
89+
define <4 x bfloat> @raw_ptr_buffer_load_v4bf16(ptr addrspace(8) inreg %rsrc) {
90+
; GFX7-LABEL: raw_ptr_buffer_load_v4bf16:
91+
; GFX7: ; %bb.0:
92+
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
93+
; GFX7-NEXT: buffer_load_dwordx2 v[2:3], off, s[4:7], 0
94+
; GFX7-NEXT: s_waitcnt vmcnt(0)
95+
; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v2
96+
; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v2
97+
; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v3
98+
; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
99+
; GFX7-NEXT: s_setpc_b64 s[30:31]
100+
;
101+
; GFX8-LABEL: raw_ptr_buffer_load_v4bf16:
102+
; GFX8: ; %bb.0:
103+
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
104+
; GFX8-NEXT: buffer_load_dwordx2 v[0:1], off, s[4:7], 0
105+
; GFX8-NEXT: s_waitcnt vmcnt(0)
106+
; GFX8-NEXT: s_setpc_b64 s[30:31]
107+
;
108+
; GFX9-LABEL: raw_ptr_buffer_load_v4bf16:
109+
; GFX9: ; %bb.0:
110+
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
111+
; GFX9-NEXT: buffer_load_dwordx2 v[0:1], off, s[4:7], 0
112+
; GFX9-NEXT: s_waitcnt vmcnt(0)
113+
; GFX9-NEXT: s_setpc_b64 s[30:31]
114+
;
115+
; GFX10-LABEL: raw_ptr_buffer_load_v4bf16:
116+
; GFX10: ; %bb.0:
117+
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
118+
; GFX10-NEXT: buffer_load_dwordx2 v[0:1], off, s[4:7], 0
119+
; GFX10-NEXT: s_waitcnt vmcnt(0)
120+
; GFX10-NEXT: s_setpc_b64 s[30:31]
121+
;
122+
; GFX11-LABEL: raw_ptr_buffer_load_v4bf16:
123+
; GFX11: ; %bb.0:
124+
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
125+
; GFX11-NEXT: buffer_load_b64 v[0:1], off, s[0:3], 0
126+
; GFX11-NEXT: s_waitcnt vmcnt(0)
127+
; GFX11-NEXT: s_setpc_b64 s[30:31]
128+
%val = call <4 x bfloat> @llvm.amdgcn.raw.ptr.buffer.load.v4bf16(ptr addrspace(8) %rsrc, i32 0, i32 0, i32 0)
129+
ret <4 x bfloat> %val
130+
}
131+
132+
; FIXME
133+
; define <6 x bfloat> @raw_ptr_buffer_load_v6bf16(ptr addrspace(8) inreg %rsrc) {
134+
; %val = call <6 x bfloat> @llvm.amdgcn.raw.ptr.buffer.load.v6bf16(ptr addrspace(8) %rsrc, i32 0, i32 0, i32 0)
135+
; ret <6 x bfloat> %val
136+
; }
137+
138+
define <8 x bfloat> @raw_ptr_buffer_load_v8bf16(ptr addrspace(8) inreg %rsrc) {
139+
; GFX7-LABEL: raw_ptr_buffer_load_v8bf16:
140+
; GFX7: ; %bb.0:
141+
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
142+
; GFX7-NEXT: buffer_load_dwordx4 v[4:7], off, s[4:7], 0
143+
; GFX7-NEXT: s_waitcnt vmcnt(0)
144+
; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v4
145+
; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v4
146+
; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v5
147+
; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v5
148+
; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v6
149+
; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
150+
; GFX7-NEXT: v_lshlrev_b32_e32 v6, 16, v7
151+
; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
152+
; GFX7-NEXT: s_setpc_b64 s[30:31]
153+
;
154+
; GFX8-LABEL: raw_ptr_buffer_load_v8bf16:
155+
; GFX8: ; %bb.0:
156+
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
157+
; GFX8-NEXT: buffer_load_dwordx4 v[0:3], off, s[4:7], 0
158+
; GFX8-NEXT: s_waitcnt vmcnt(0)
159+
; GFX8-NEXT: s_setpc_b64 s[30:31]
160+
;
161+
; GFX9-LABEL: raw_ptr_buffer_load_v8bf16:
162+
; GFX9: ; %bb.0:
163+
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
164+
; GFX9-NEXT: buffer_load_dwordx4 v[0:3], off, s[4:7], 0
165+
; GFX9-NEXT: s_waitcnt vmcnt(0)
166+
; GFX9-NEXT: s_setpc_b64 s[30:31]
167+
;
168+
; GFX10-LABEL: raw_ptr_buffer_load_v8bf16:
169+
; GFX10: ; %bb.0:
170+
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
171+
; GFX10-NEXT: buffer_load_dwordx4 v[0:3], off, s[4:7], 0
172+
; GFX10-NEXT: s_waitcnt vmcnt(0)
173+
; GFX10-NEXT: s_setpc_b64 s[30:31]
174+
;
175+
; GFX11-LABEL: raw_ptr_buffer_load_v8bf16:
176+
; GFX11: ; %bb.0:
177+
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
178+
; GFX11-NEXT: buffer_load_b128 v[0:3], off, s[0:3], 0
179+
; GFX11-NEXT: s_waitcnt vmcnt(0)
180+
; GFX11-NEXT: s_setpc_b64 s[30:31]
181+
%val = call <8 x bfloat> @llvm.amdgcn.raw.ptr.buffer.load.v8bf16(ptr addrspace(8) %rsrc, i32 0, i32 0, i32 0)
182+
ret <8 x bfloat> %val
183+
}

llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.ptr.buffer.load.ll

Lines changed: 58 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -944,32 +944,57 @@ main_body:
944944

945945
define amdgpu_ps void @raw_ptr_buffer_load_v4f16(ptr addrspace(8) inreg %rsrc, ptr addrspace(3) %ptr) {
946946
; PREGFX10-LABEL: raw_ptr_buffer_load_v4f16:
947-
; PREGFX10: ; %bb.0: ; %main_body
947+
; PREGFX10: ; %bb.0:
948948
; PREGFX10-NEXT: buffer_load_dwordx2 v[1:2], off, s[0:3], 0
949949
; PREGFX10-NEXT: s_mov_b32 m0, -1
950950
; PREGFX10-NEXT: s_waitcnt vmcnt(0)
951951
; PREGFX10-NEXT: ds_write_b64 v0, v[1:2]
952952
; PREGFX10-NEXT: s_endpgm
953953
;
954954
; GFX10-LABEL: raw_ptr_buffer_load_v4f16:
955-
; GFX10: ; %bb.0: ; %main_body
955+
; GFX10: ; %bb.0:
956956
; GFX10-NEXT: buffer_load_dwordx2 v[1:2], off, s[0:3], 0
957957
; GFX10-NEXT: s_waitcnt vmcnt(0)
958958
; GFX10-NEXT: ds_write_b64 v0, v[1:2]
959959
; GFX10-NEXT: s_endpgm
960960
;
961961
; GFX11-LABEL: raw_ptr_buffer_load_v4f16:
962-
; GFX11: ; %bb.0: ; %main_body
962+
; GFX11: ; %bb.0:
963963
; GFX11-NEXT: buffer_load_b64 v[1:2], off, s[0:3], 0
964964
; GFX11-NEXT: s_waitcnt vmcnt(0)
965965
; GFX11-NEXT: ds_store_b64 v0, v[1:2]
966966
; GFX11-NEXT: s_endpgm
967-
main_body:
968967
%val = call <4 x half> @llvm.amdgcn.raw.ptr.buffer.load.v4f16(ptr addrspace(8) %rsrc, i32 0, i32 0, i32 0)
969968
store <4 x half> %val, ptr addrspace(3) %ptr
970969
ret void
971970
}
972971

972+
; FIXME
973+
; define amdgpu_ps void @raw_ptr_buffer_load_v6f16(ptr addrspace(8) inreg %rsrc, ptr addrspace(3) %ptr) {
974+
; %val = call <6 x half> @llvm.amdgcn.raw.ptr.buffer.load.v6f16(ptr addrspace(8) %rsrc, i32 0, i32 0, i32 0)
975+
; store <6 x half> %val, ptr addrspace(3) %ptr
976+
; ret void
977+
; }
978+
979+
define amdgpu_ps void @raw_ptr_buffer_load_v8f16(ptr addrspace(8) inreg %rsrc, ptr addrspace(3) %ptr) {
980+
; GFX10-LABEL: raw_ptr_buffer_load_v8f16:
981+
; GFX10: ; %bb.0:
982+
; GFX10-NEXT: buffer_load_dwordx4 v[1:4], off, s[0:3], 0
983+
; GFX10-NEXT: s_waitcnt vmcnt(0)
984+
; GFX10-NEXT: ds_write_b128 v0, v[1:4]
985+
; GFX10-NEXT: s_endpgm
986+
;
987+
; GFX11-LABEL: raw_ptr_buffer_load_v8f16:
988+
; GFX11: ; %bb.0:
989+
; GFX11-NEXT: buffer_load_b128 v[1:4], off, s[0:3], 0
990+
; GFX11-NEXT: s_waitcnt vmcnt(0)
991+
; GFX11-NEXT: ds_store_b128 v0, v[1:4]
992+
; GFX11-NEXT: s_endpgm
993+
%val = call <8 x half> @llvm.amdgcn.raw.ptr.buffer.load.v8f16(ptr addrspace(8) %rsrc, i32 0, i32 0, i32 0)
994+
store <8 x half> %val, ptr addrspace(3) %ptr
995+
ret void
996+
}
997+
973998
define amdgpu_ps void @raw_ptr_buffer_load_v2i16(ptr addrspace(8) inreg %rsrc, ptr addrspace(3) %ptr) {
974999
; PREGFX10-LABEL: raw_ptr_buffer_load_v2i16:
9751000
; PREGFX10: ; %bb.0: ; %main_body
@@ -1000,32 +1025,57 @@ main_body:
10001025

10011026
define amdgpu_ps void @raw_ptr_buffer_load_v4i16(ptr addrspace(8) inreg %rsrc, ptr addrspace(3) %ptr) {
10021027
; PREGFX10-LABEL: raw_ptr_buffer_load_v4i16:
1003-
; PREGFX10: ; %bb.0: ; %main_body
1028+
; PREGFX10: ; %bb.0:
10041029
; PREGFX10-NEXT: buffer_load_dwordx2 v[1:2], off, s[0:3], 0
10051030
; PREGFX10-NEXT: s_mov_b32 m0, -1
10061031
; PREGFX10-NEXT: s_waitcnt vmcnt(0)
10071032
; PREGFX10-NEXT: ds_write_b64 v0, v[1:2]
10081033
; PREGFX10-NEXT: s_endpgm
10091034
;
10101035
; GFX10-LABEL: raw_ptr_buffer_load_v4i16:
1011-
; GFX10: ; %bb.0: ; %main_body
1036+
; GFX10: ; %bb.0:
10121037
; GFX10-NEXT: buffer_load_dwordx2 v[1:2], off, s[0:3], 0
10131038
; GFX10-NEXT: s_waitcnt vmcnt(0)
10141039
; GFX10-NEXT: ds_write_b64 v0, v[1:2]
10151040
; GFX10-NEXT: s_endpgm
10161041
;
10171042
; GFX11-LABEL: raw_ptr_buffer_load_v4i16:
1018-
; GFX11: ; %bb.0: ; %main_body
1043+
; GFX11: ; %bb.0:
10191044
; GFX11-NEXT: buffer_load_b64 v[1:2], off, s[0:3], 0
10201045
; GFX11-NEXT: s_waitcnt vmcnt(0)
10211046
; GFX11-NEXT: ds_store_b64 v0, v[1:2]
10221047
; GFX11-NEXT: s_endpgm
1023-
main_body:
10241048
%val = call <4 x i16> @llvm.amdgcn.raw.ptr.buffer.load.v4i16(ptr addrspace(8) %rsrc, i32 0, i32 0, i32 0)
10251049
store <4 x i16> %val, ptr addrspace(3) %ptr
10261050
ret void
10271051
}
10281052

1053+
; FIXME
1054+
; define amdgpu_ps void @raw_ptr_buffer_load_v6i16(ptr addrspace(8) inreg %rsrc, ptr addrspace(3) %ptr) {
1055+
; %val = call <6 x i16> @llvm.amdgcn.raw.ptr.buffer.load.v6i16(ptr addrspace(8) %rsrc, i32 0, i32 0, i32 0)
1056+
; store <6 x i16> %val, ptr addrspace(3) %ptr
1057+
; ret void
1058+
; }
1059+
1060+
define amdgpu_ps void @raw_ptr_buffer_load_v8i16(ptr addrspace(8) inreg %rsrc, ptr addrspace(3) %ptr) {
1061+
; GFX10-LABEL: raw_ptr_buffer_load_v8i16:
1062+
; GFX10: ; %bb.0:
1063+
; GFX10-NEXT: buffer_load_dwordx4 v[1:4], off, s[0:3], 0
1064+
; GFX10-NEXT: s_waitcnt vmcnt(0)
1065+
; GFX10-NEXT: ds_write_b128 v0, v[1:4]
1066+
; GFX10-NEXT: s_endpgm
1067+
;
1068+
; GFX11-LABEL: raw_ptr_buffer_load_v8i16:
1069+
; GFX11: ; %bb.0:
1070+
; GFX11-NEXT: buffer_load_b128 v[1:4], off, s[0:3], 0
1071+
; GFX11-NEXT: s_waitcnt vmcnt(0)
1072+
; GFX11-NEXT: ds_store_b128 v0, v[1:4]
1073+
; GFX11-NEXT: s_endpgm
1074+
%val = call <8 x i16> @llvm.amdgcn.raw.ptr.buffer.load.v8i16(ptr addrspace(8) %rsrc, i32 0, i32 0, i32 0)
1075+
store <8 x i16> %val, ptr addrspace(3) %ptr
1076+
ret void
1077+
}
1078+
10291079
define amdgpu_ps void @raw_ptr_buffer_load_x1_offset_merged(ptr addrspace(8) inreg %rsrc) {
10301080
; PREGFX10-LABEL: raw_ptr_buffer_load_x1_offset_merged:
10311081
; PREGFX10: ; %bb.0: ; %main_body

0 commit comments

Comments
 (0)