@@ -2237,4 +2237,217 @@ define <4 x float> @test_smfmac_f32_16x16x128_bf8_bf8__sgpr(<4 x i32> inreg %arg
2237
2237
ret <4 x float > %result
2238
2238
}
2239
2239
2240
+ ; --------------------------------------------------------------------
2241
+ ; llvm.amdgcn.smfmac.f32.16x16x128.bf8.fp8
2242
+ ; --------------------------------------------------------------------
2243
+
2244
; Intrinsic under test. As used by the tests below, the operands are: a
; <4 x i32> source, an <8 x i32> source, the <4 x float> value that is
; accumulated into, an i32 (named %idx in the kernel test), and two immediates
; that the CHECK lines print as the cbsz and abid modifiers respectively.
+ declare <4 x float > @llvm.amdgcn.smfmac.f32.16x16x128.bf8.fp8 (<4 x i32 >, <8 x i32 >, <4 x float >, i32 , i32 immarg, i32 immarg)
2245
+
2246
; Kernel variant: the <4 x float> input is loaded from %arg at the offset given
; by the workitem id, fed to the intrinsic together with %a, %b and %idx, and
; the result is stored to the base of %arg. The immediates 1 and 2 are expected
; to appear on the instruction as cbsz:1 abid:2 for both SDAG and GISEL.
+ define amdgpu_kernel void @test_smfmac_f32_16x16x128_bf8_fp8__vgpr (ptr addrspace (1 ) %arg , <4 x i32 > %a , <8 x i32 > %b , i32 %idx ) #0 {
2247
+ ; SDAG-LABEL: test_smfmac_f32_16x16x128_bf8_fp8__vgpr:
2248
+ ; SDAG: ; %bb.0: ; %bb
2249
+ ; SDAG-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
2250
+ ; SDAG-NEXT: v_lshlrev_b32_e32 v0, 4, v0
2251
+ ; SDAG-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x34
2252
+ ; SDAG-NEXT: v_mov_b32_e32 v16, 0
2253
+ ; SDAG-NEXT: s_waitcnt lgkmcnt(0)
2254
+ ; SDAG-NEXT: global_load_dwordx4 v[8:11], v0, s[2:3]
2255
+ ; SDAG-NEXT: s_load_dword s16, s[0:1], 0x64
2256
+ ; SDAG-NEXT: s_load_dwordx4 s[12:15], s[0:1], 0x54
2257
+ ; SDAG-NEXT: v_mov_b32_e32 v12, s4
2258
+ ; SDAG-NEXT: v_mov_b32_e32 v13, s5
2259
+ ; SDAG-NEXT: v_mov_b32_e32 v14, s6
2260
+ ; SDAG-NEXT: v_mov_b32_e32 v15, s7
2261
+ ; SDAG-NEXT: v_mov_b32_e32 v0, s8
2262
+ ; SDAG-NEXT: v_mov_b32_e32 v1, s9
2263
+ ; SDAG-NEXT: v_mov_b32_e32 v2, s10
2264
+ ; SDAG-NEXT: v_mov_b32_e32 v3, s11
2265
+ ; SDAG-NEXT: s_waitcnt lgkmcnt(0)
2266
+ ; SDAG-NEXT: v_mov_b32_e32 v4, s12
2267
+ ; SDAG-NEXT: v_mov_b32_e32 v5, s13
2268
+ ; SDAG-NEXT: v_mov_b32_e32 v6, s14
2269
+ ; SDAG-NEXT: v_mov_b32_e32 v7, s15
2270
+ ; SDAG-NEXT: v_mov_b32_e32 v17, s16
2271
+ ; SDAG-NEXT: s_waitcnt vmcnt(0)
2272
+ ; SDAG-NEXT: s_nop 0
2273
+ ; SDAG-NEXT: v_smfmac_f32_16x16x128_bf8_fp8 v[8:11], v[12:15], v[0:7], v17 cbsz:1 abid:2
2274
+ ; SDAG-NEXT: s_nop 6
2275
+ ; SDAG-NEXT: global_store_dwordx4 v16, v[8:11], s[2:3]
2276
+ ; SDAG-NEXT: s_endpgm
2277
+ ;
2278
+ ; GISEL-LABEL: test_smfmac_f32_16x16x128_bf8_fp8__vgpr:
2279
+ ; GISEL: ; %bb.0: ; %bb
2280
+ ; GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
2281
+ ; GISEL-NEXT: v_lshlrev_b32_e32 v0, 4, v0
2282
+ ; GISEL-NEXT: s_waitcnt lgkmcnt(0)
2283
+ ; GISEL-NEXT: global_load_dwordx4 v[8:11], v0, s[2:3]
2284
+ ; GISEL-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x34
2285
+ ; GISEL-NEXT: s_load_dwordx4 s[12:15], s[0:1], 0x54
2286
+ ; GISEL-NEXT: s_load_dword s16, s[0:1], 0x64
2287
+ ; GISEL-NEXT: s_waitcnt lgkmcnt(0)
2288
+ ; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[6:7]
2289
+ ; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[4:5]
2290
+ ; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[8:9]
2291
+ ; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[10:11]
2292
+ ; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[12:13]
2293
+ ; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[14:15]
2294
+ ; GISEL-NEXT: v_mov_b32_e32 v16, s16
2295
+ ; GISEL-NEXT: s_waitcnt vmcnt(0)
2296
+ ; GISEL-NEXT: s_nop 0
2297
+ ; GISEL-NEXT: v_smfmac_f32_16x16x128_bf8_fp8 v[8:11], v[12:15], v[0:7], v16 cbsz:1 abid:2
2298
+ ; GISEL-NEXT: v_mov_b32_e32 v0, 0
2299
+ ; GISEL-NEXT: s_nop 5
2300
+ ; GISEL-NEXT: global_store_dwordx4 v0, v[8:11], s[2:3]
2301
+ ; GISEL-NEXT: s_endpgm
2302
+ bb:
2303
+ %id = call i32 @llvm.amdgcn.workitem.id.x ()
2304
+ %gep = getelementptr <4 x float >, ptr addrspace (1 ) %arg , i32 %id
2305
+ %in.1 = load <4 x float >, ptr addrspace (1 ) %gep
2306
+ %mai.1 = tail call <4 x float > @llvm.amdgcn.smfmac.f32.16x16x128.bf8.fp8 (<4 x i32 > %a , <8 x i32 > %b , <4 x float > %in.1 , i32 %idx , i32 1 , i32 2 )
2307
+ store <4 x float > %mai.1 , ptr addrspace (1 ) %arg
2308
+ ret void
2309
+ }
2310
+
2311
; Plain function variant: every operand arrives as an ordinary argument and both
; immediates are 0, so the expected instruction carries no cbsz/abid modifiers.
; The SDAG checks expect the <4 x float> operand to be moved into a[0:3] and
; read back afterwards; the GISEL checks expect it to stay in v[12:15].
+ define <4 x float > @test_smfmac_f32_16x16x128_bf8_fp8 (<4 x i32 > %arg0 , <8 x i32 > %arg1 , <4 x float > %arg2 , i32 %arg3 ) {
2312
+ ; SDAG-LABEL: test_smfmac_f32_16x16x128_bf8_fp8:
2313
+ ; SDAG: ; %bb.0:
2314
+ ; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2315
+ ; SDAG-NEXT: v_accvgpr_write_b32 a0, v12
2316
+ ; SDAG-NEXT: v_accvgpr_write_b32 a1, v13
2317
+ ; SDAG-NEXT: v_accvgpr_write_b32 a2, v14
2318
+ ; SDAG-NEXT: v_accvgpr_write_b32 a3, v15
2319
+ ; SDAG-NEXT: s_nop 1
2320
+ ; SDAG-NEXT: v_smfmac_f32_16x16x128_bf8_fp8 a[0:3], v[0:3], v[4:11], v16
2321
+ ; SDAG-NEXT: s_nop 6
2322
+ ; SDAG-NEXT: v_accvgpr_read_b32 v0, a0
2323
+ ; SDAG-NEXT: v_accvgpr_read_b32 v1, a1
2324
+ ; SDAG-NEXT: v_accvgpr_read_b32 v2, a2
2325
+ ; SDAG-NEXT: v_accvgpr_read_b32 v3, a3
2326
+ ; SDAG-NEXT: s_setpc_b64 s[30:31]
2327
+ ;
2328
+ ; GISEL-LABEL: test_smfmac_f32_16x16x128_bf8_fp8:
2329
+ ; GISEL: ; %bb.0:
2330
+ ; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2331
+ ; GISEL-NEXT: v_smfmac_f32_16x16x128_bf8_fp8 v[12:15], v[0:3], v[4:11], v16
2332
+ ; GISEL-NEXT: s_nop 6
2333
+ ; GISEL-NEXT: v_mov_b32_e32 v0, v12
2334
+ ; GISEL-NEXT: v_mov_b32_e32 v1, v13
2335
+ ; GISEL-NEXT: v_mov_b32_e32 v2, v14
2336
+ ; GISEL-NEXT: v_mov_b32_e32 v3, v15
2337
+ ; GISEL-NEXT: s_setpc_b64 s[30:31]
2338
+ %result = call <4 x float > @llvm.amdgcn.smfmac.f32.16x16x128.bf8.fp8 (<4 x i32 > %arg0 , <8 x i32 > %arg1 , <4 x float > %arg2 , i32 %arg3 , i32 immarg 0 , i32 immarg 0 )
2339
+ ret <4 x float > %result
2340
+ }
2341
+
2342
; Same shape as the plain function variant, but with non-zero immediates:
; the call passes 1 and 3, which the checks expect as cbsz:1 abid:3.
+ define <4 x float > @test_smfmac_f32_16x16x128_bf8_fp8__flags0 (<4 x i32 > %arg0 , <8 x i32 > %arg1 , <4 x float > %arg2 , i32 %arg3 ) {
2343
+ ; SDAG-LABEL: test_smfmac_f32_16x16x128_bf8_fp8__flags0:
2344
+ ; SDAG: ; %bb.0:
2345
+ ; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2346
+ ; SDAG-NEXT: v_accvgpr_write_b32 a0, v12
2347
+ ; SDAG-NEXT: v_accvgpr_write_b32 a1, v13
2348
+ ; SDAG-NEXT: v_accvgpr_write_b32 a2, v14
2349
+ ; SDAG-NEXT: v_accvgpr_write_b32 a3, v15
2350
+ ; SDAG-NEXT: s_nop 1
2351
+ ; SDAG-NEXT: v_smfmac_f32_16x16x128_bf8_fp8 a[0:3], v[0:3], v[4:11], v16 cbsz:1 abid:3
2352
+ ; SDAG-NEXT: s_nop 6
2353
+ ; SDAG-NEXT: v_accvgpr_read_b32 v0, a0
2354
+ ; SDAG-NEXT: v_accvgpr_read_b32 v1, a1
2355
+ ; SDAG-NEXT: v_accvgpr_read_b32 v2, a2
2356
+ ; SDAG-NEXT: v_accvgpr_read_b32 v3, a3
2357
+ ; SDAG-NEXT: s_setpc_b64 s[30:31]
2358
+ ;
2359
+ ; GISEL-LABEL: test_smfmac_f32_16x16x128_bf8_fp8__flags0:
2360
+ ; GISEL: ; %bb.0:
2361
+ ; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2362
+ ; GISEL-NEXT: v_smfmac_f32_16x16x128_bf8_fp8 v[12:15], v[0:3], v[4:11], v16 cbsz:1 abid:3
2363
+ ; GISEL-NEXT: s_nop 6
2364
+ ; GISEL-NEXT: v_mov_b32_e32 v0, v12
2365
+ ; GISEL-NEXT: v_mov_b32_e32 v1, v13
2366
+ ; GISEL-NEXT: v_mov_b32_e32 v2, v14
2367
+ ; GISEL-NEXT: v_mov_b32_e32 v3, v15
2368
+ ; GISEL-NEXT: s_setpc_b64 s[30:31]
2369
+ %result = call <4 x float > @llvm.amdgcn.smfmac.f32.16x16x128.bf8.fp8 (<4 x i32 > %arg0 , <8 x i32 > %arg1 , <4 x float > %arg2 , i32 %arg3 , i32 immarg 1 , i32 immarg 3 )
2370
+ ret <4 x float > %result
2371
+ }
2372
+
2373
; Counterpart to the __flags0 test with the immediate values swapped:
; the call passes 3 and 1, which the checks expect as cbsz:3 abid:1.
+ define <4 x float > @test_smfmac_f32_16x16x128_bf8_fp8__flags1 (<4 x i32 > %arg0 , <8 x i32 > %arg1 , <4 x float > %arg2 , i32 %arg3 ) {
2374
+ ; SDAG-LABEL: test_smfmac_f32_16x16x128_bf8_fp8__flags1:
2375
+ ; SDAG: ; %bb.0:
2376
+ ; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2377
+ ; SDAG-NEXT: v_accvgpr_write_b32 a0, v12
2378
+ ; SDAG-NEXT: v_accvgpr_write_b32 a1, v13
2379
+ ; SDAG-NEXT: v_accvgpr_write_b32 a2, v14
2380
+ ; SDAG-NEXT: v_accvgpr_write_b32 a3, v15
2381
+ ; SDAG-NEXT: s_nop 1
2382
+ ; SDAG-NEXT: v_smfmac_f32_16x16x128_bf8_fp8 a[0:3], v[0:3], v[4:11], v16 cbsz:3 abid:1
2383
+ ; SDAG-NEXT: s_nop 6
2384
+ ; SDAG-NEXT: v_accvgpr_read_b32 v0, a0
2385
+ ; SDAG-NEXT: v_accvgpr_read_b32 v1, a1
2386
+ ; SDAG-NEXT: v_accvgpr_read_b32 v2, a2
2387
+ ; SDAG-NEXT: v_accvgpr_read_b32 v3, a3
2388
+ ; SDAG-NEXT: s_setpc_b64 s[30:31]
2389
+ ;
2390
+ ; GISEL-LABEL: test_smfmac_f32_16x16x128_bf8_fp8__flags1:
2391
+ ; GISEL: ; %bb.0:
2392
+ ; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2393
+ ; GISEL-NEXT: v_smfmac_f32_16x16x128_bf8_fp8 v[12:15], v[0:3], v[4:11], v16 cbsz:3 abid:1
2394
+ ; GISEL-NEXT: s_nop 6
2395
+ ; GISEL-NEXT: v_mov_b32_e32 v0, v12
2396
+ ; GISEL-NEXT: v_mov_b32_e32 v1, v13
2397
+ ; GISEL-NEXT: v_mov_b32_e32 v2, v14
2398
+ ; GISEL-NEXT: v_mov_b32_e32 v3, v15
2399
+ ; GISEL-NEXT: s_setpc_b64 s[30:31]
2400
+ %result = call <4 x float > @llvm.amdgcn.smfmac.f32.16x16x128.bf8.fp8 (<4 x i32 > %arg0 , <8 x i32 > %arg1 , <4 x float > %arg2 , i32 %arg3 , i32 immarg 3 , i32 immarg 1 )
2401
+ ret <4 x float > %result
2402
+ }
2403
+
2404
; Variant where every argument is marked inreg. The checks expect all inputs to
; be copied out of s0..s16 before the instruction is issued: SDAG places the
; <4 x float> operand in a[0:3] and the rest in VGPRs, GISEL uses VGPRs only.
; Both immediates are 0, so no cbsz/abid modifiers are expected.
+ define <4 x float > @test_smfmac_f32_16x16x128_bf8_fp8__sgpr (<4 x i32 > inreg %arg0 , <8 x i32 > inreg %arg1 , <4 x float > inreg %arg2 , i32 inreg %arg3 ) {
2405
+ ; SDAG-LABEL: test_smfmac_f32_16x16x128_bf8_fp8__sgpr:
2406
+ ; SDAG: ; %bb.0:
2407
+ ; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2408
+ ; SDAG-NEXT: v_mov_b32_e32 v8, s0
2409
+ ; SDAG-NEXT: v_mov_b32_e32 v9, s1
2410
+ ; SDAG-NEXT: v_mov_b32_e32 v10, s2
2411
+ ; SDAG-NEXT: v_mov_b32_e32 v11, s3
2412
+ ; SDAG-NEXT: v_mov_b32_e32 v0, s4
2413
+ ; SDAG-NEXT: v_mov_b32_e32 v1, s5
2414
+ ; SDAG-NEXT: v_mov_b32_e32 v2, s6
2415
+ ; SDAG-NEXT: v_mov_b32_e32 v3, s7
2416
+ ; SDAG-NEXT: v_mov_b32_e32 v4, s8
2417
+ ; SDAG-NEXT: v_mov_b32_e32 v5, s9
2418
+ ; SDAG-NEXT: v_mov_b32_e32 v6, s10
2419
+ ; SDAG-NEXT: v_mov_b32_e32 v7, s11
2420
+ ; SDAG-NEXT: v_accvgpr_write_b32 a0, s12
2421
+ ; SDAG-NEXT: v_accvgpr_write_b32 a1, s13
2422
+ ; SDAG-NEXT: v_accvgpr_write_b32 a2, s14
2423
+ ; SDAG-NEXT: v_accvgpr_write_b32 a3, s15
2424
+ ; SDAG-NEXT: v_mov_b32_e32 v12, s16
2425
+ ; SDAG-NEXT: s_nop 1
2426
+ ; SDAG-NEXT: v_smfmac_f32_16x16x128_bf8_fp8 a[0:3], v[8:11], v[0:7], v12
2427
+ ; SDAG-NEXT: s_nop 6
2428
+ ; SDAG-NEXT: v_accvgpr_read_b32 v0, a0
2429
+ ; SDAG-NEXT: v_accvgpr_read_b32 v1, a1
2430
+ ; SDAG-NEXT: v_accvgpr_read_b32 v2, a2
2431
+ ; SDAG-NEXT: v_accvgpr_read_b32 v3, a3
2432
+ ; SDAG-NEXT: s_setpc_b64 s[30:31]
2433
+ ;
2434
+ ; GISEL-LABEL: test_smfmac_f32_16x16x128_bf8_fp8__sgpr:
2435
+ ; GISEL: ; %bb.0:
2436
+ ; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2437
+ ; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[2:3]
2438
+ ; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[0:1]
2439
+ ; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[4:5]
2440
+ ; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[12:13]
2441
+ ; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[6:7]
2442
+ ; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[8:9]
2443
+ ; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[10:11]
2444
+ ; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[14:15]
2445
+ ; GISEL-NEXT: v_mov_b32_e32 v16, s16
2446
+ ; GISEL-NEXT: s_nop 1
2447
+ ; GISEL-NEXT: v_smfmac_f32_16x16x128_bf8_fp8 v[0:3], v[12:15], v[4:11], v16
2448
+ ; GISEL-NEXT: s_setpc_b64 s[30:31]
2449
+ %result = call <4 x float > @llvm.amdgcn.smfmac.f32.16x16x128.bf8.fp8 (<4 x i32 > %arg0 , <8 x i32 > %arg1 , <4 x float > %arg2 , i32 %arg3 , i32 immarg 0 , i32 immarg 0 )
2450
+ ret <4 x float > %result
2451
+ }
2452
+
2240
2453
attributes #0 = { "amdgpu-flat-work-group-size" ="1,256" }
0 commit comments