@@ -112,85 +112,85 @@ bb:
; CHECK: DIVERGENT: %tmp0 = call <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.f16.v8f32.v8f16.v16f16.i16(<8 x half> %A, <16 x half> %B, <8 x float> %C, i16 %Index)
define amdgpu_kernel void @swmmac_f32_16x16x32_f16(<8 x half> %A, <16 x half> %B, <8 x float> %C, i16 %Index, ptr addrspace(1) %out) {
bb:
- %tmp0 = call <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.f16(<8 x half> %A, <16 x half> %B, <8 x float> %C, i16 %Index)
+ %tmp0 = call <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.f16.v8f32.v8f16.v16f16.i16(<8 x half> %A, <16 x half> %B, <8 x float> %C, i16 %Index)
store <8 x float> %tmp0, ptr addrspace(1) %out, align 32
ret void
}

; CHECK: DIVERGENT: %tmp0 = call <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf16.v8f32.v8i16.v16i16.i16(<8 x i16> %A, <16 x i16> %B, <8 x float> %C, i16 %Index)
define amdgpu_kernel void @swmmac_f32_16x16x32_bf16(<8 x i16> %A, <16 x i16> %B, <8 x float> %C, i16 %Index, ptr addrspace(1) %out) {
bb:
- %tmp0 = call <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf16(<8 x i16> %A, <16 x i16> %B, <8 x float> %C, i16 %Index)
+ %tmp0 = call <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf16.v8f32.v8i16.v16i16.i16(<8 x i16> %A, <16 x i16> %B, <8 x float> %C, i16 %Index)
store <8 x float> %tmp0, ptr addrspace(1) %out, align 32
ret void
}

; CHECK: DIVERGENT: %tmp0 = call <8 x half> @llvm.amdgcn.swmmac.f16.16x16x32.f16.v8f16.v8f16.v16f16.i16(<8 x half> %A, <16 x half> %B, <8 x half> %C, i16 %Index)
define amdgpu_kernel void @swmmac_f16_16x16x32_f16(<8 x half> %A, <16 x half> %B, <8 x half> %C, i16 %Index, ptr addrspace(1) %out) {
bb:
- %tmp0 = call <8 x half> @llvm.amdgcn.swmmac.f16.16x16x32.f16(<8 x half> %A, <16 x half> %B, <8 x half> %C, i16 %Index)
+ %tmp0 = call <8 x half> @llvm.amdgcn.swmmac.f16.16x16x32.f16.v8f16.v8f16.v16f16.i16(<8 x half> %A, <16 x half> %B, <8 x half> %C, i16 %Index)
store <8 x half> %tmp0, ptr addrspace(1) %out, align 32
ret void
}

; CHECK: DIVERGENT: %tmp0 = call <8 x i16> @llvm.amdgcn.swmmac.bf16.16x16x32.bf16.v8i16.v8i16.v16i16.i16(<8 x i16> %A, <16 x i16> %B, <8 x i16> %C, i16 %Index)
define amdgpu_kernel void @swmmac_bf16_16x16x32_bf16(<8 x i16> %A, <16 x i16> %B, <8 x i16> %C, i16 %Index, ptr addrspace(1) %out) {
bb:
- %tmp0 = call <8 x i16> @llvm.amdgcn.swmmac.bf16.16x16x32.bf16(<8 x i16> %A, <16 x i16> %B, <8 x i16> %C, i16 %Index)
+ %tmp0 = call <8 x i16> @llvm.amdgcn.swmmac.bf16.16x16x32.bf16.v8i16.v8i16.v16i16.i16(<8 x i16> %A, <16 x i16> %B, <8 x i16> %C, i16 %Index)
store <8 x i16> %tmp0, ptr addrspace(1) %out, align 32
ret void
}

; CHECK: DIVERGENT: %tmp0 = call <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu8.v8i32.v2i32.v4i32.i16(i1 false, <2 x i32> %A, i1 false, <4 x i32> %B, <8 x i32> %C, i16 %Index, i1 false)
define amdgpu_kernel void @swmmac_i32_16x16x32_iu8(<2 x i32> %A, <4 x i32> %B, <8 x i32> %C, i16 %Index, ptr addrspace(1) %out) {
bb:
- %tmp0 = call <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu8(i1 false, <2 x i32> %A, i1 false, <4 x i32> %B, <8 x i32> %C, i16 %Index, i1 false)
+ %tmp0 = call <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu8.v8i32.v2i32.v4i32.i16(i1 false, <2 x i32> %A, i1 false, <4 x i32> %B, <8 x i32> %C, i16 %Index, i1 false)
store <8 x i32> %tmp0, ptr addrspace(1) %out, align 32
ret void
}

; CHECK: DIVERGENT: %tmp0 = call <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu4.v8i32.i32.v2i32.i16(i1 false, i32 %A, i1 false, <2 x i32> %B, <8 x i32> %C, i16 %Index, i1 false)
define amdgpu_kernel void @swmmac_i32_16x16x32_iu4(i32 %A, <2 x i32> %B, <8 x i32> %C, i16 %Index, ptr addrspace(1) %out) {
bb:
- %tmp0 = call <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu4(i1 false, i32 %A, i1 false, <2 x i32> %B, <8 x i32> %C, i16 %Index, i1 false)
+ %tmp0 = call <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu4.v8i32.i32.v2i32.i16(i1 false, i32 %A, i1 false, <2 x i32> %B, <8 x i32> %C, i16 %Index, i1 false)
store <8 x i32> %tmp0, ptr addrspace(1) %out, align 32
ret void
}

; CHECK: DIVERGENT: %tmp0 = call <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x64.iu4.v8i32.v2i32.v4i32.i16(i1 false, <2 x i32> %A, i1 false, <4 x i32> %B, <8 x i32> %C, i16 %Index, i1 false)
define amdgpu_kernel void @swmmac_i32_16x16x64_iu4(<2 x i32> %A, <4 x i32> %B, <8 x i32> %C, i16 %Index, ptr addrspace(1) %out) {
bb:
- %tmp0 = call <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x64.iu4(i1 false, <2 x i32> %A, i1 false, <4 x i32> %B, <8 x i32> %C, i16 %Index, i1 false)
+ %tmp0 = call <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x64.iu4.v8i32.v2i32.v4i32.i16(i1 false, <2 x i32> %A, i1 false, <4 x i32> %B, <8 x i32> %C, i16 %Index, i1 false)
store <8 x i32> %tmp0, ptr addrspace(1) %out, align 32
ret void
}

; CHECK: DIVERGENT: %tmp0 = call <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.fp8.fp8.v8f32.v2i32.v4i32.i16(<2 x i32> %A, <4 x i32> %B, <8 x float> %C, i16 %Index)
define amdgpu_kernel void @swmmac_f32_16x16x32_fp8.fp8(<2 x i32> %A, <4 x i32> %B, <8 x float> %C, i16 %Index, ptr addrspace(1) %out) {
bb:
- %tmp0 = call <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.fp8.fp8(<2 x i32> %A, <4 x i32> %B, <8 x float> %C, i16 %Index)
+ %tmp0 = call <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.fp8.fp8.v8f32.v2i32.v4i32.i16(<2 x i32> %A, <4 x i32> %B, <8 x float> %C, i16 %Index)
store <8 x float> %tmp0, ptr addrspace(1) %out, align 32
ret void
}

; CHECK: DIVERGENT: %tmp0 = call <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.fp8.bf8.v8f32.v2i32.v4i32.i16(<2 x i32> %A, <4 x i32> %B, <8 x float> %C, i16 %Index)
define amdgpu_kernel void @swmmac_f32_16x16x32_fp8.bf8(<2 x i32> %A, <4 x i32> %B, <8 x float> %C, i16 %Index, ptr addrspace(1) %out) {
bb:
- %tmp0 = call <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.fp8.bf8(<2 x i32> %A, <4 x i32> %B, <8 x float> %C, i16 %Index)
+ %tmp0 = call <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.fp8.bf8.v8f32.v2i32.v4i32.i16(<2 x i32> %A, <4 x i32> %B, <8 x float> %C, i16 %Index)
store <8 x float> %tmp0, ptr addrspace(1) %out, align 32
ret void
}

; CHECK: DIVERGENT: %tmp0 = call <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf8.fp8.v8f32.v2i32.v4i32.i16(<2 x i32> %A, <4 x i32> %B, <8 x float> %C, i16 %Index)
- define amdgpu_kernel void @swmmac_f32_16x16x32_bf8.fp8(<2 x i32> %A, <4 x i32> %B, <8 x float> %C, i16 %Index, ptr addrspace(1) %out) {
+ define amdgpu_kernel void @swmmac_f32_16x16x32_bf8.fp8.v8f32.v2i32.v4i32.i16(<2 x i32> %A, <4 x i32> %B, <8 x float> %C, i16 %Index, ptr addrspace(1) %out) {
bb:
%tmp0 = call <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf8.fp8(<2 x i32> %A, <4 x i32> %B, <8 x float> %C, i16 %Index)
store <8 x float> %tmp0, ptr addrspace(1) %out, align 32
ret void
}

; CHECK: DIVERGENT: %tmp0 = call <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf8.bf8.v8f32.v2i32.v4i32.i16(<2 x i32> %A, <4 x i32> %B, <8 x float> %C, i16 %Index)
- define amdgpu_kernel void @swmmac_f32_16x16x32_bf8.bf8(<2 x i32> %A, <4 x i32> %B, <8 x float> %C, i16 %Index, ptr addrspace(1) %out) {
+ define amdgpu_kernel void @swmmac_f32_16x16x32_bf8.bf8.v8f32.v2i32.v4i32.i16(<2 x i32> %A, <4 x i32> %B, <8 x float> %C, i16 %Index, ptr addrspace(1) %out) {
bb:
%tmp0 = call <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf8.bf8(<2 x i32> %A, <4 x i32> %B, <8 x float> %C, i16 %Index)
store <8 x float> %tmp0, ptr addrspace(1) %out, align 32
@@ -284,13 +284,13 @@ declare <16 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v16f16.v16f16(<16 x half>
declare <16 x i16> @llvm.amdgcn.wmma.bf16.16x16x16.bf16.v16i16.v16i16(<16 x i16>, <16 x i16>, <16 x i16>, i1 immarg) #1
declare <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8.v8i32.v4i32(i1 immarg, <4 x i32>, i1 immarg, <4 x i32>, <8 x i32>, i1 immarg) #1
declare <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4.v8i32.v2i32(i1 immarg, <2 x i32>, i1 immarg, <2 x i32>, <8 x i32>, i1 immarg) #1
- declare <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.f16(<8 x half>, <16 x half>, <8 x float>, i16)
- declare <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf16(<8 x i16>, <16 x i16>, <8 x float>, i16)
- declare <8 x half> @llvm.amdgcn.swmmac.f16.16x16x32.f16(<8 x half>, <16 x half>, <8 x half>, i16)
- declare <8 x i16> @llvm.amdgcn.swmmac.bf16.16x16x32.bf16(<8 x i16>, <16 x i16>, <8 x i16>, i16)
- declare <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu8(i1 immarg, <2 x i32>, i1 immarg, <4 x i32>, <8 x i32>, i16, i1)
- declare <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu4(i1 immarg, i32, i1 immarg, <2 x i32>, <8 x i32>, i16, i1)
- declare <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x64.iu4(i1 immarg, <2 x i32>, i1 immarg, <4 x i32>, <8 x i32>, i16, i1)
+ declare <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.f16.v8f32.v8f16.v16f16.i16(<8 x half>, <16 x half>, <8 x float>, i16)
+ declare <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf16.v8f32.v8i16.v16i16.i16(<8 x i16>, <16 x i16>, <8 x float>, i16)
+ declare <8 x half> @llvm.amdgcn.swmmac.f16.16x16x32.f16.v8f16.v8f16.v16f16.i16(<8 x half>, <16 x half>, <8 x half>, i16)
+ declare <8 x i16> @llvm.amdgcn.swmmac.bf16.16x16x32.bf16.v8i16.v8i16.v16i16.i16(<8 x i16>, <16 x i16>, <8 x i16>, i16)
+ declare <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu8.v8i32.v2i32.v4i32.i16(i1 immarg, <2 x i32>, i1 immarg, <4 x i32>, <8 x i32>, i16, i1)
+ declare <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu4.v8i32.i32.v2i32.i16(i1 immarg, i32, i1 immarg, <2 x i32>, <8 x i32>, i16, i1)
+ declare <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x64.iu4.v8i32.v2i32.v4i32.i16(i1 immarg, <2 x i32>, i1 immarg, <4 x i32>, <8 x i32>, i16, i1)
declare <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.fp8.fp8(<2 x i32>, <4 x i32>, <8 x float>, i16)
declare <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.fp8.bf8(<2 x i32>, <4 x i32>, <8 x float>, i16)
declare <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf8.fp8(<2 x i32>, <4 x i32>, <8 x float>, i16)