@@ -2110,6 +2110,221 @@ define <16 x i32> @test_smfmac_i32_32x32x64_i8__sgpr(<4 x i32> inreg %arg0, <8 x
2110
2110
ret <16 x i32 > %result
2111
2111
}
2112
2112
2113
+ ; --------------------------------------------------------------------
2114
+ ; llvm.amdgcn.smfmac.f32.16x16x128.bf8.bf8
2115
+ ; --------------------------------------------------------------------
2116
+
2117
+ ; Operand order as exercised by the tests in this section: a <4 x i32> and an <8 x i32> source, a <4 x float> input of the same type as the result, an i32 passed as %idx / %arg3, then two immediates.
+ ; When non-zero, the two immediates appear in the expected instruction text as the cbsz and abid modifiers, in that order.
+ declare <4 x float > @llvm.amdgcn.smfmac.f32.16x16x128.bf8.bf8 (<4 x i32 >, <8 x i32 >, <4 x float >, i32 , i32 immarg, i32 immarg)
2118
+
2119
+ ; Kernel test: the <4 x float> input is loaded from %arg at the element selected by the workitem id x, while %a, %b and %idx are kernel arguments.
+ ; The result is stored to the base of %arg (not to %gep); the immediates 1 and 2 are expected to print as cbsz:1 abid:2.
+ define amdgpu_kernel void @test_smfmac_f32_16x16x128_bf8_bf8__vgpr (ptr addrspace (1 ) %arg , <4 x i32 > %a , <8 x i32 > %b , i32 %idx ) #0 {
2120
+ ; SDAG-LABEL: test_smfmac_f32_16x16x128_bf8_bf8__vgpr:
2121
+ ; SDAG: ; %bb.0: ; %bb
2122
+ ; SDAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
2123
+ ; SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0
2124
+ ; SDAG-NEXT: v_lshlrev_b32_e32 v0, 4, v0
2125
+ ; SDAG-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34
2126
+ ; SDAG-NEXT: v_mov_b32_e32 v16, 0
2127
+ ; SDAG-NEXT: s_waitcnt lgkmcnt(0)
2128
+ ; SDAG-NEXT: global_load_dwordx4 v[8:11], v0, s[6:7]
2129
+ ; SDAG-NEXT: s_load_dword s16, s[4:5], 0x64
2130
+ ; SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x54
2131
+ ; SDAG-NEXT: v_mov_b32_e32 v12, s8
2132
+ ; SDAG-NEXT: v_mov_b32_e32 v13, s9
2133
+ ; SDAG-NEXT: v_mov_b32_e32 v14, s10
2134
+ ; SDAG-NEXT: v_mov_b32_e32 v15, s11
2135
+ ; SDAG-NEXT: v_mov_b32_e32 v0, s12
2136
+ ; SDAG-NEXT: v_mov_b32_e32 v1, s13
2137
+ ; SDAG-NEXT: v_mov_b32_e32 v2, s14
2138
+ ; SDAG-NEXT: v_mov_b32_e32 v3, s15
2139
+ ; SDAG-NEXT: s_waitcnt lgkmcnt(0)
2140
+ ; SDAG-NEXT: v_mov_b32_e32 v4, s0
2141
+ ; SDAG-NEXT: v_mov_b32_e32 v5, s1
2142
+ ; SDAG-NEXT: v_mov_b32_e32 v6, s2
2143
+ ; SDAG-NEXT: v_mov_b32_e32 v7, s3
2144
+ ; SDAG-NEXT: v_mov_b32_e32 v17, s16
2145
+ ; SDAG-NEXT: s_waitcnt vmcnt(0)
2146
+ ; SDAG-NEXT: s_nop 0
2147
+ ; SDAG-NEXT: v_smfmac_f32_16x16x128_bf8_bf8 v[8:11], v[12:15], v[0:7], v17 cbsz:1 abid:2
2148
+ ; SDAG-NEXT: s_nop 6
2149
+ ; SDAG-NEXT: global_store_dwordx4 v16, v[8:11], s[6:7]
2150
+ ; SDAG-NEXT: s_endpgm
2151
+ ;
2152
+ ; GISEL-LABEL: test_smfmac_f32_16x16x128_bf8_bf8__vgpr:
2153
+ ; GISEL: ; %bb.0: ; %bb
2154
+ ; GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
2155
+ ; GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0
2156
+ ; GISEL-NEXT: v_lshlrev_b32_e32 v0, 4, v0
2157
+ ; GISEL-NEXT: s_waitcnt lgkmcnt(0)
2158
+ ; GISEL-NEXT: global_load_dwordx4 v[8:11], v0, s[0:1]
2159
+ ; GISEL-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34
2160
+ ; GISEL-NEXT: s_load_dwordx4 s[16:19], s[4:5], 0x54
2161
+ ; GISEL-NEXT: s_load_dword s2, s[4:5], 0x64
2162
+ ; GISEL-NEXT: s_waitcnt lgkmcnt(0)
2163
+ ; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[10:11]
2164
+ ; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[8:9]
2165
+ ; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[12:13]
2166
+ ; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[14:15]
2167
+ ; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[16:17]
2168
+ ; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[18:19]
2169
+ ; GISEL-NEXT: v_mov_b32_e32 v16, s2
2170
+ ; GISEL-NEXT: s_waitcnt vmcnt(0)
2171
+ ; GISEL-NEXT: s_nop 0
2172
+ ; GISEL-NEXT: v_smfmac_f32_16x16x128_bf8_bf8 v[8:11], v[12:15], v[0:7], v16 cbsz:1 abid:2
2173
+ ; GISEL-NEXT: v_mov_b32_e32 v0, 0
2174
+ ; GISEL-NEXT: s_nop 5
2175
+ ; GISEL-NEXT: global_store_dwordx4 v0, v[8:11], s[0:1]
2176
+ ; GISEL-NEXT: s_endpgm
2177
+ bb:
2178
+ %id = call i32 @llvm.amdgcn.workitem.id.x ()
2179
+ %gep = getelementptr <4 x float >, ptr addrspace (1 ) %arg , i32 %id
2180
+ %in.1 = load <4 x float >, ptr addrspace (1 ) %gep
2181
+ %mai.1 = tail call <4 x float > @llvm.amdgcn.smfmac.f32.16x16x128.bf8.bf8 (<4 x i32 > %a , <8 x i32 > %b , <4 x float > %in.1 , i32 %idx , i32 1 , i32 2 )
2182
+ store <4 x float > %mai.1 , ptr addrspace (1 ) %arg
2183
+ ret void
2184
+ }
2185
+
2186
+ ; Non-kernel test with every operand passed as an ordinary (non-inreg) argument and both immediates set to 0; the expected instruction carries no cbsz/abid modifiers.
+ ; The first expected sequence moves %arg2 through a[0:3] around the instruction; the second keeps it in v[12:15] and copies the result to v[0:3].
+ define <4 x float > @test_smfmac_f32_16x16x128_bf8_bf8 (<4 x i32 > %arg0 , <8 x i32 > %arg1 , <4 x float > %arg2 , i32 %arg3 ) {
2187
+ ; SDAG-LABEL: test_smfmac_f32_16x16x128_bf8_bf8:
2188
+ ; SDAG: ; %bb.0:
2189
+ ; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2190
+ ; SDAG-NEXT: v_accvgpr_write_b32 a0, v12
2191
+ ; SDAG-NEXT: v_accvgpr_write_b32 a1, v13
2192
+ ; SDAG-NEXT: v_accvgpr_write_b32 a2, v14
2193
+ ; SDAG-NEXT: v_accvgpr_write_b32 a3, v15
2194
+ ; SDAG-NEXT: s_nop 1
2195
+ ; SDAG-NEXT: v_smfmac_f32_16x16x128_bf8_bf8 a[0:3], v[0:3], v[4:11], v16
2196
+ ; SDAG-NEXT: s_nop 6
2197
+ ; SDAG-NEXT: v_accvgpr_read_b32 v0, a0
2198
+ ; SDAG-NEXT: v_accvgpr_read_b32 v1, a1
2199
+ ; SDAG-NEXT: v_accvgpr_read_b32 v2, a2
2200
+ ; SDAG-NEXT: v_accvgpr_read_b32 v3, a3
2201
+ ; SDAG-NEXT: s_setpc_b64 s[30:31]
2202
+ ;
2203
+ ; GISEL-LABEL: test_smfmac_f32_16x16x128_bf8_bf8:
2204
+ ; GISEL: ; %bb.0:
2205
+ ; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2206
+ ; GISEL-NEXT: v_smfmac_f32_16x16x128_bf8_bf8 v[12:15], v[0:3], v[4:11], v16
2207
+ ; GISEL-NEXT: s_nop 6
2208
+ ; GISEL-NEXT: v_mov_b32_e32 v0, v12
2209
+ ; GISEL-NEXT: v_mov_b32_e32 v1, v13
2210
+ ; GISEL-NEXT: v_mov_b32_e32 v2, v14
2211
+ ; GISEL-NEXT: v_mov_b32_e32 v3, v15
2212
+ ; GISEL-NEXT: s_setpc_b64 s[30:31]
2213
+ %result = call <4 x float > @llvm.amdgcn.smfmac.f32.16x16x128.bf8.bf8 (<4 x i32 > %arg0 , <8 x i32 > %arg1 , <4 x float > %arg2 , i32 %arg3 , i32 immarg 0 , i32 immarg 0 )
2214
+ ret <4 x float > %result
2215
+ }
2216
+
2217
+ ; Same operands as the plain non-kernel test, but the two immediates are 1 and 3; the expected instruction text ends in cbsz:1 abid:3.
+ define <4 x float > @test_smfmac_f32_16x16x128_bf8_bf8__flags0 (<4 x i32 > %arg0 , <8 x i32 > %arg1 , <4 x float > %arg2 , i32 %arg3 ) {
2218
+ ; SDAG-LABEL: test_smfmac_f32_16x16x128_bf8_bf8__flags0:
2219
+ ; SDAG: ; %bb.0:
2220
+ ; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2221
+ ; SDAG-NEXT: v_accvgpr_write_b32 a0, v12
2222
+ ; SDAG-NEXT: v_accvgpr_write_b32 a1, v13
2223
+ ; SDAG-NEXT: v_accvgpr_write_b32 a2, v14
2224
+ ; SDAG-NEXT: v_accvgpr_write_b32 a3, v15
2225
+ ; SDAG-NEXT: s_nop 1
2226
+ ; SDAG-NEXT: v_smfmac_f32_16x16x128_bf8_bf8 a[0:3], v[0:3], v[4:11], v16 cbsz:1 abid:3
2227
+ ; SDAG-NEXT: s_nop 6
2228
+ ; SDAG-NEXT: v_accvgpr_read_b32 v0, a0
2229
+ ; SDAG-NEXT: v_accvgpr_read_b32 v1, a1
2230
+ ; SDAG-NEXT: v_accvgpr_read_b32 v2, a2
2231
+ ; SDAG-NEXT: v_accvgpr_read_b32 v3, a3
2232
+ ; SDAG-NEXT: s_setpc_b64 s[30:31]
2233
+ ;
2234
+ ; GISEL-LABEL: test_smfmac_f32_16x16x128_bf8_bf8__flags0:
2235
+ ; GISEL: ; %bb.0:
2236
+ ; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2237
+ ; GISEL-NEXT: v_smfmac_f32_16x16x128_bf8_bf8 v[12:15], v[0:3], v[4:11], v16 cbsz:1 abid:3
2238
+ ; GISEL-NEXT: s_nop 6
2239
+ ; GISEL-NEXT: v_mov_b32_e32 v0, v12
2240
+ ; GISEL-NEXT: v_mov_b32_e32 v1, v13
2241
+ ; GISEL-NEXT: v_mov_b32_e32 v2, v14
2242
+ ; GISEL-NEXT: v_mov_b32_e32 v3, v15
2243
+ ; GISEL-NEXT: s_setpc_b64 s[30:31]
2244
+ %result = call <4 x float > @llvm.amdgcn.smfmac.f32.16x16x128.bf8.bf8 (<4 x i32 > %arg0 , <8 x i32 > %arg1 , <4 x float > %arg2 , i32 %arg3 , i32 immarg 1 , i32 immarg 3 )
2245
+ ret <4 x float > %result
2246
+ }
2247
+
2248
+ ; Same operands as the plain non-kernel test, but the two immediates are 3 and 1 (swapped relative to __flags0); the expected instruction text ends in cbsz:3 abid:1.
+ define <4 x float > @test_smfmac_f32_16x16x128_bf8_bf8__flags1 (<4 x i32 > %arg0 , <8 x i32 > %arg1 , <4 x float > %arg2 , i32 %arg3 ) {
2249
+ ; SDAG-LABEL: test_smfmac_f32_16x16x128_bf8_bf8__flags1:
2250
+ ; SDAG: ; %bb.0:
2251
+ ; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2252
+ ; SDAG-NEXT: v_accvgpr_write_b32 a0, v12
2253
+ ; SDAG-NEXT: v_accvgpr_write_b32 a1, v13
2254
+ ; SDAG-NEXT: v_accvgpr_write_b32 a2, v14
2255
+ ; SDAG-NEXT: v_accvgpr_write_b32 a3, v15
2256
+ ; SDAG-NEXT: s_nop 1
2257
+ ; SDAG-NEXT: v_smfmac_f32_16x16x128_bf8_bf8 a[0:3], v[0:3], v[4:11], v16 cbsz:3 abid:1
2258
+ ; SDAG-NEXT: s_nop 6
2259
+ ; SDAG-NEXT: v_accvgpr_read_b32 v0, a0
2260
+ ; SDAG-NEXT: v_accvgpr_read_b32 v1, a1
2261
+ ; SDAG-NEXT: v_accvgpr_read_b32 v2, a2
2262
+ ; SDAG-NEXT: v_accvgpr_read_b32 v3, a3
2263
+ ; SDAG-NEXT: s_setpc_b64 s[30:31]
2264
+ ;
2265
+ ; GISEL-LABEL: test_smfmac_f32_16x16x128_bf8_bf8__flags1:
2266
+ ; GISEL: ; %bb.0:
2267
+ ; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2268
+ ; GISEL-NEXT: v_smfmac_f32_16x16x128_bf8_bf8 v[12:15], v[0:3], v[4:11], v16 cbsz:3 abid:1
2269
+ ; GISEL-NEXT: s_nop 6
2270
+ ; GISEL-NEXT: v_mov_b32_e32 v0, v12
2271
+ ; GISEL-NEXT: v_mov_b32_e32 v1, v13
2272
+ ; GISEL-NEXT: v_mov_b32_e32 v2, v14
2273
+ ; GISEL-NEXT: v_mov_b32_e32 v3, v15
2274
+ ; GISEL-NEXT: s_setpc_b64 s[30:31]
2275
+ %result = call <4 x float > @llvm.amdgcn.smfmac.f32.16x16x128.bf8.bf8 (<4 x i32 > %arg0 , <8 x i32 > %arg1 , <4 x float > %arg2 , i32 %arg3 , i32 immarg 3 , i32 immarg 1 )
2276
+ ret <4 x float > %result
2277
+ }
2278
+
2279
+ ; Variant with every argument marked inreg and both immediates 0: the expected output copies each operand out of s registers into v (or a) registers before the instruction.
+ ; In the second expected sequence the instruction writes v[0:3] directly, so no moves follow it before the return.
+ define <4 x float > @test_smfmac_f32_16x16x128_bf8_bf8__sgpr (<4 x i32 > inreg %arg0 , <8 x i32 > inreg %arg1 , <4 x float > inreg %arg2 , i32 inreg %arg3 ) {
2280
+ ; SDAG-LABEL: test_smfmac_f32_16x16x128_bf8_bf8__sgpr:
2281
+ ; SDAG: ; %bb.0:
2282
+ ; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2283
+ ; SDAG-NEXT: v_mov_b32_e32 v8, s0
2284
+ ; SDAG-NEXT: v_mov_b32_e32 v9, s1
2285
+ ; SDAG-NEXT: v_mov_b32_e32 v10, s2
2286
+ ; SDAG-NEXT: v_mov_b32_e32 v11, s3
2287
+ ; SDAG-NEXT: v_mov_b32_e32 v0, s16
2288
+ ; SDAG-NEXT: v_mov_b32_e32 v1, s17
2289
+ ; SDAG-NEXT: v_mov_b32_e32 v2, s18
2290
+ ; SDAG-NEXT: v_mov_b32_e32 v3, s19
2291
+ ; SDAG-NEXT: v_mov_b32_e32 v4, s20
2292
+ ; SDAG-NEXT: v_mov_b32_e32 v5, s21
2293
+ ; SDAG-NEXT: v_mov_b32_e32 v6, s22
2294
+ ; SDAG-NEXT: v_mov_b32_e32 v7, s23
2295
+ ; SDAG-NEXT: v_accvgpr_write_b32 a0, s24
2296
+ ; SDAG-NEXT: v_accvgpr_write_b32 a1, s25
2297
+ ; SDAG-NEXT: v_accvgpr_write_b32 a2, s26
2298
+ ; SDAG-NEXT: v_accvgpr_write_b32 a3, s27
2299
+ ; SDAG-NEXT: v_mov_b32_e32 v12, s28
2300
+ ; SDAG-NEXT: s_nop 1
2301
+ ; SDAG-NEXT: v_smfmac_f32_16x16x128_bf8_bf8 a[0:3], v[8:11], v[0:7], v12
2302
+ ; SDAG-NEXT: s_nop 6
2303
+ ; SDAG-NEXT: v_accvgpr_read_b32 v0, a0
2304
+ ; SDAG-NEXT: v_accvgpr_read_b32 v1, a1
2305
+ ; SDAG-NEXT: v_accvgpr_read_b32 v2, a2
2306
+ ; SDAG-NEXT: v_accvgpr_read_b32 v3, a3
2307
+ ; SDAG-NEXT: s_setpc_b64 s[30:31]
2308
+ ;
2309
+ ; GISEL-LABEL: test_smfmac_f32_16x16x128_bf8_bf8__sgpr:
2310
+ ; GISEL: ; %bb.0:
2311
+ ; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2312
+ ; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[2:3]
2313
+ ; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[0:1]
2314
+ ; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[16:17]
2315
+ ; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[24:25]
2316
+ ; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[18:19]
2317
+ ; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[20:21]
2318
+ ; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[22:23]
2319
+ ; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[26:27]
2320
+ ; GISEL-NEXT: v_mov_b32_e32 v16, s28
2321
+ ; GISEL-NEXT: s_nop 1
2322
+ ; GISEL-NEXT: v_smfmac_f32_16x16x128_bf8_bf8 v[0:3], v[12:15], v[4:11], v16
2323
+ ; GISEL-NEXT: s_setpc_b64 s[30:31]
2324
+ %result = call <4 x float > @llvm.amdgcn.smfmac.f32.16x16x128.bf8.bf8 (<4 x i32 > %arg0 , <8 x i32 > %arg1 , <4 x float > %arg2 , i32 %arg3 , i32 immarg 0 , i32 immarg 0 )
2325
+ ret <4 x float > %result
2326
+ }
2327
+
2113
2328
; Attribute group #0 is attached to the amdgpu_kernel test functions in this file; it sets the "amdgpu-flat-work-group-size" string attribute to "1,256".
attributes #0 = { "amdgpu-flat-work-group-size" ="1,256" }
2114
2329
;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
2115
2330
; GCN: {{.*}}
0 commit comments