@@ -2450,4 +2450,217 @@ define <4 x float> @test_smfmac_f32_16x16x128_bf8_fp8__sgpr(<4 x i32> inreg %arg
2450
2450
ret <4 x float > %result
2451
2451
}
2452
2452
2453
+ ; --------------------------------------------------------------------
2454
+ ; llvm.amdgcn.smfmac.f32.16x16x128.fp8.bf8
2455
+ ; --------------------------------------------------------------------
2456
+
2457
; Declaration of the intrinsic exercised by the tests that follow. In their
; expected output the plain i32 operand becomes the final register operand of
; the instruction, and the two immarg operands are printed, in order, as the
; cbsz and abid modifiers.
+ declare <4 x float > @llvm.amdgcn.smfmac.f32.16x16x128.fp8.bf8 (<4 x i32 >, <8 x i32 >, <4 x float >, i32 , i32 immarg, i32 immarg)
2458
+
2459
; Kernel variant. The accumulator is loaded from %arg at the work-item's x id,
; %a, %b and %idx arrive as kernel arguments, and the intrinsic is called with
; immediates 1 and 2 (expected below as cbsz:1 abid:2). The result is stored
; back through %arg itself, not through the indexed pointer.
+ define amdgpu_kernel void @test_smfmac_f32_16x16x128_fp8_bf8__vgpr (ptr addrspace (1 ) %arg , <4 x i32 > %a , <8 x i32 > %b , i32 %idx ) #0 {
2460
+ ; SDAG-LABEL: test_smfmac_f32_16x16x128_fp8_bf8__vgpr:
2461
+ ; SDAG: ; %bb.0: ; %bb
2462
+ ; SDAG-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
2463
+ ; SDAG-NEXT: v_lshlrev_b32_e32 v0, 4, v0
2464
+ ; SDAG-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x34
2465
+ ; SDAG-NEXT: v_mov_b32_e32 v16, 0
2466
+ ; SDAG-NEXT: s_waitcnt lgkmcnt(0)
2467
+ ; SDAG-NEXT: global_load_dwordx4 v[8:11], v0, s[2:3]
2468
+ ; SDAG-NEXT: s_load_dword s16, s[0:1], 0x64
2469
+ ; SDAG-NEXT: s_load_dwordx4 s[12:15], s[0:1], 0x54
2470
+ ; SDAG-NEXT: v_mov_b32_e32 v12, s4
2471
+ ; SDAG-NEXT: v_mov_b32_e32 v13, s5
2472
+ ; SDAG-NEXT: v_mov_b32_e32 v14, s6
2473
+ ; SDAG-NEXT: v_mov_b32_e32 v15, s7
2474
+ ; SDAG-NEXT: v_mov_b32_e32 v0, s8
2475
+ ; SDAG-NEXT: v_mov_b32_e32 v1, s9
2476
+ ; SDAG-NEXT: v_mov_b32_e32 v2, s10
2477
+ ; SDAG-NEXT: v_mov_b32_e32 v3, s11
2478
+ ; SDAG-NEXT: s_waitcnt lgkmcnt(0)
2479
+ ; SDAG-NEXT: v_mov_b32_e32 v4, s12
2480
+ ; SDAG-NEXT: v_mov_b32_e32 v5, s13
2481
+ ; SDAG-NEXT: v_mov_b32_e32 v6, s14
2482
+ ; SDAG-NEXT: v_mov_b32_e32 v7, s15
2483
+ ; SDAG-NEXT: v_mov_b32_e32 v17, s16
2484
+ ; SDAG-NEXT: s_waitcnt vmcnt(0)
2485
+ ; SDAG-NEXT: s_nop 0
2486
+ ; SDAG-NEXT: v_smfmac_f32_16x16x128_fp8_bf8 v[8:11], v[12:15], v[0:7], v17 cbsz:1 abid:2
2487
+ ; SDAG-NEXT: s_nop 6
2488
+ ; SDAG-NEXT: global_store_dwordx4 v16, v[8:11], s[2:3]
2489
+ ; SDAG-NEXT: s_endpgm
2490
+ ;
2491
+ ; GISEL-LABEL: test_smfmac_f32_16x16x128_fp8_bf8__vgpr:
2492
+ ; GISEL: ; %bb.0: ; %bb
2493
+ ; GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
2494
+ ; GISEL-NEXT: v_lshlrev_b32_e32 v0, 4, v0
2495
+ ; GISEL-NEXT: s_waitcnt lgkmcnt(0)
2496
+ ; GISEL-NEXT: global_load_dwordx4 v[8:11], v0, s[2:3]
2497
+ ; GISEL-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x34
2498
+ ; GISEL-NEXT: s_load_dwordx4 s[12:15], s[0:1], 0x54
2499
+ ; GISEL-NEXT: s_load_dword s16, s[0:1], 0x64
2500
+ ; GISEL-NEXT: s_waitcnt lgkmcnt(0)
2501
+ ; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[6:7]
2502
+ ; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[4:5]
2503
+ ; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[8:9]
2504
+ ; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[10:11]
2505
+ ; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[12:13]
2506
+ ; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[14:15]
2507
+ ; GISEL-NEXT: v_mov_b32_e32 v16, s16
2508
+ ; GISEL-NEXT: s_waitcnt vmcnt(0)
2509
+ ; GISEL-NEXT: s_nop 0
2510
+ ; GISEL-NEXT: v_smfmac_f32_16x16x128_fp8_bf8 v[8:11], v[12:15], v[0:7], v16 cbsz:1 abid:2
2511
+ ; GISEL-NEXT: v_mov_b32_e32 v0, 0
2512
+ ; GISEL-NEXT: s_nop 5
2513
+ ; GISEL-NEXT: global_store_dwordx4 v0, v[8:11], s[2:3]
2514
+ ; GISEL-NEXT: s_endpgm
2515
+ bb:
2516
+ %id = call i32 @llvm.amdgcn.workitem.id.x ()
2517
+ %gep = getelementptr <4 x float >, ptr addrspace (1 ) %arg , i32 %id
2518
+ %in.1 = load <4 x float >, ptr addrspace (1 ) %gep
2519
+ %mai.1 = tail call <4 x float > @llvm.amdgcn.smfmac.f32.16x16x128.fp8.bf8 (<4 x i32 > %a , <8 x i32 > %b , <4 x float > %in.1 , i32 %idx , i32 1 , i32 2 )
2520
+ store <4 x float > %mai.1 , ptr addrspace (1 ) %arg
2521
+ ret void
2522
+ }
2523
+
2524
; Baseline non-kernel variant: every operand is an ordinary function argument
; and both immediates are 0, so the expected instruction carries no cbsz or
; abid modifier. The intrinsic result is returned directly.
+ define <4 x float > @test_smfmac_f32_16x16x128_fp8_bf8 (<4 x i32 > %arg0 , <8 x i32 > %arg1 , <4 x float > %arg2 , i32 %arg3 ) {
2525
+ ; SDAG-LABEL: test_smfmac_f32_16x16x128_fp8_bf8:
2526
+ ; SDAG: ; %bb.0:
2527
+ ; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2528
+ ; SDAG-NEXT: v_accvgpr_write_b32 a0, v12
2529
+ ; SDAG-NEXT: v_accvgpr_write_b32 a1, v13
2530
+ ; SDAG-NEXT: v_accvgpr_write_b32 a2, v14
2531
+ ; SDAG-NEXT: v_accvgpr_write_b32 a3, v15
2532
+ ; SDAG-NEXT: s_nop 1
2533
+ ; SDAG-NEXT: v_smfmac_f32_16x16x128_fp8_bf8 a[0:3], v[0:3], v[4:11], v16
2534
+ ; SDAG-NEXT: s_nop 6
2535
+ ; SDAG-NEXT: v_accvgpr_read_b32 v0, a0
2536
+ ; SDAG-NEXT: v_accvgpr_read_b32 v1, a1
2537
+ ; SDAG-NEXT: v_accvgpr_read_b32 v2, a2
2538
+ ; SDAG-NEXT: v_accvgpr_read_b32 v3, a3
2539
+ ; SDAG-NEXT: s_setpc_b64 s[30:31]
2540
+ ;
2541
+ ; GISEL-LABEL: test_smfmac_f32_16x16x128_fp8_bf8:
2542
+ ; GISEL: ; %bb.0:
2543
+ ; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2544
+ ; GISEL-NEXT: v_smfmac_f32_16x16x128_fp8_bf8 v[12:15], v[0:3], v[4:11], v16
2545
+ ; GISEL-NEXT: s_nop 6
2546
+ ; GISEL-NEXT: v_mov_b32_e32 v0, v12
2547
+ ; GISEL-NEXT: v_mov_b32_e32 v1, v13
2548
+ ; GISEL-NEXT: v_mov_b32_e32 v2, v14
2549
+ ; GISEL-NEXT: v_mov_b32_e32 v3, v15
2550
+ ; GISEL-NEXT: s_setpc_b64 s[30:31]
2551
+ %result = call <4 x float > @llvm.amdgcn.smfmac.f32.16x16x128.fp8.bf8 (<4 x i32 > %arg0 , <8 x i32 > %arg1 , <4 x float > %arg2 , i32 %arg3 , i32 immarg 0 , i32 immarg 0 )
2552
+ ret <4 x float > %result
2553
+ }
2554
+
2555
; Modifier variant 0: same signature as the baseline test, but the call passes
; immediates 1 and 3, which the expected output prints as cbsz:1 abid:3.
+ define <4 x float > @test_smfmac_f32_16x16x128_fp8_bf8__flags0 (<4 x i32 > %arg0 , <8 x i32 > %arg1 , <4 x float > %arg2 , i32 %arg3 ) {
2556
+ ; SDAG-LABEL: test_smfmac_f32_16x16x128_fp8_bf8__flags0:
2557
+ ; SDAG: ; %bb.0:
2558
+ ; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2559
+ ; SDAG-NEXT: v_accvgpr_write_b32 a0, v12
2560
+ ; SDAG-NEXT: v_accvgpr_write_b32 a1, v13
2561
+ ; SDAG-NEXT: v_accvgpr_write_b32 a2, v14
2562
+ ; SDAG-NEXT: v_accvgpr_write_b32 a3, v15
2563
+ ; SDAG-NEXT: s_nop 1
2564
+ ; SDAG-NEXT: v_smfmac_f32_16x16x128_fp8_bf8 a[0:3], v[0:3], v[4:11], v16 cbsz:1 abid:3
2565
+ ; SDAG-NEXT: s_nop 6
2566
+ ; SDAG-NEXT: v_accvgpr_read_b32 v0, a0
2567
+ ; SDAG-NEXT: v_accvgpr_read_b32 v1, a1
2568
+ ; SDAG-NEXT: v_accvgpr_read_b32 v2, a2
2569
+ ; SDAG-NEXT: v_accvgpr_read_b32 v3, a3
2570
+ ; SDAG-NEXT: s_setpc_b64 s[30:31]
2571
+ ;
2572
+ ; GISEL-LABEL: test_smfmac_f32_16x16x128_fp8_bf8__flags0:
2573
+ ; GISEL: ; %bb.0:
2574
+ ; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2575
+ ; GISEL-NEXT: v_smfmac_f32_16x16x128_fp8_bf8 v[12:15], v[0:3], v[4:11], v16 cbsz:1 abid:3
2576
+ ; GISEL-NEXT: s_nop 6
2577
+ ; GISEL-NEXT: v_mov_b32_e32 v0, v12
2578
+ ; GISEL-NEXT: v_mov_b32_e32 v1, v13
2579
+ ; GISEL-NEXT: v_mov_b32_e32 v2, v14
2580
+ ; GISEL-NEXT: v_mov_b32_e32 v3, v15
2581
+ ; GISEL-NEXT: s_setpc_b64 s[30:31]
2582
+ %result = call <4 x float > @llvm.amdgcn.smfmac.f32.16x16x128.fp8.bf8 (<4 x i32 > %arg0 , <8 x i32 > %arg1 , <4 x float > %arg2 , i32 %arg3 , i32 immarg 1 , i32 immarg 3 )
2583
+ ret <4 x float > %result
2584
+ }
2585
+
2586
; Modifier variant 1: same signature as the baseline test, with the two
; immediates swapped relative to the __flags0 test (3 then 1); the expected
; output prints them as cbsz:3 abid:1.
+ define <4 x float > @test_smfmac_f32_16x16x128_fp8_bf8__flags1 (<4 x i32 > %arg0 , <8 x i32 > %arg1 , <4 x float > %arg2 , i32 %arg3 ) {
2587
+ ; SDAG-LABEL: test_smfmac_f32_16x16x128_fp8_bf8__flags1:
2588
+ ; SDAG: ; %bb.0:
2589
+ ; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2590
+ ; SDAG-NEXT: v_accvgpr_write_b32 a0, v12
2591
+ ; SDAG-NEXT: v_accvgpr_write_b32 a1, v13
2592
+ ; SDAG-NEXT: v_accvgpr_write_b32 a2, v14
2593
+ ; SDAG-NEXT: v_accvgpr_write_b32 a3, v15
2594
+ ; SDAG-NEXT: s_nop 1
2595
+ ; SDAG-NEXT: v_smfmac_f32_16x16x128_fp8_bf8 a[0:3], v[0:3], v[4:11], v16 cbsz:3 abid:1
2596
+ ; SDAG-NEXT: s_nop 6
2597
+ ; SDAG-NEXT: v_accvgpr_read_b32 v0, a0
2598
+ ; SDAG-NEXT: v_accvgpr_read_b32 v1, a1
2599
+ ; SDAG-NEXT: v_accvgpr_read_b32 v2, a2
2600
+ ; SDAG-NEXT: v_accvgpr_read_b32 v3, a3
2601
+ ; SDAG-NEXT: s_setpc_b64 s[30:31]
2602
+ ;
2603
+ ; GISEL-LABEL: test_smfmac_f32_16x16x128_fp8_bf8__flags1:
2604
+ ; GISEL: ; %bb.0:
2605
+ ; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2606
+ ; GISEL-NEXT: v_smfmac_f32_16x16x128_fp8_bf8 v[12:15], v[0:3], v[4:11], v16 cbsz:3 abid:1
2607
+ ; GISEL-NEXT: s_nop 6
2608
+ ; GISEL-NEXT: v_mov_b32_e32 v0, v12
2609
+ ; GISEL-NEXT: v_mov_b32_e32 v1, v13
2610
+ ; GISEL-NEXT: v_mov_b32_e32 v2, v14
2611
+ ; GISEL-NEXT: v_mov_b32_e32 v3, v15
2612
+ ; GISEL-NEXT: s_setpc_b64 s[30:31]
2613
+ %result = call <4 x float > @llvm.amdgcn.smfmac.f32.16x16x128.fp8.bf8 (<4 x i32 > %arg0 , <8 x i32 > %arg1 , <4 x float > %arg2 , i32 %arg3 , i32 immarg 3 , i32 immarg 1 )
2614
+ ret <4 x float > %result
2615
+ }
2616
+
2617
; inreg variant: every argument is marked inreg and both immediates are 0.
; The expected output first copies the incoming s registers into v registers
; (and, in the first set of checks, into a registers for the accumulator)
; before issuing the instruction with no cbsz or abid modifier.
+ define <4 x float > @test_smfmac_f32_16x16x128_fp8_bf8__sgpr (<4 x i32 > inreg %arg0 , <8 x i32 > inreg %arg1 , <4 x float > inreg %arg2 , i32 inreg %arg3 ) {
2618
+ ; SDAG-LABEL: test_smfmac_f32_16x16x128_fp8_bf8__sgpr:
2619
+ ; SDAG: ; %bb.0:
2620
+ ; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2621
+ ; SDAG-NEXT: v_mov_b32_e32 v8, s0
2622
+ ; SDAG-NEXT: v_mov_b32_e32 v9, s1
2623
+ ; SDAG-NEXT: v_mov_b32_e32 v10, s2
2624
+ ; SDAG-NEXT: v_mov_b32_e32 v11, s3
2625
+ ; SDAG-NEXT: v_mov_b32_e32 v0, s4
2626
+ ; SDAG-NEXT: v_mov_b32_e32 v1, s5
2627
+ ; SDAG-NEXT: v_mov_b32_e32 v2, s6
2628
+ ; SDAG-NEXT: v_mov_b32_e32 v3, s7
2629
+ ; SDAG-NEXT: v_mov_b32_e32 v4, s8
2630
+ ; SDAG-NEXT: v_mov_b32_e32 v5, s9
2631
+ ; SDAG-NEXT: v_mov_b32_e32 v6, s10
2632
+ ; SDAG-NEXT: v_mov_b32_e32 v7, s11
2633
+ ; SDAG-NEXT: v_accvgpr_write_b32 a0, s12
2634
+ ; SDAG-NEXT: v_accvgpr_write_b32 a1, s13
2635
+ ; SDAG-NEXT: v_accvgpr_write_b32 a2, s14
2636
+ ; SDAG-NEXT: v_accvgpr_write_b32 a3, s15
2637
+ ; SDAG-NEXT: v_mov_b32_e32 v12, s16
2638
+ ; SDAG-NEXT: s_nop 1
2639
+ ; SDAG-NEXT: v_smfmac_f32_16x16x128_fp8_bf8 a[0:3], v[8:11], v[0:7], v12
2640
+ ; SDAG-NEXT: s_nop 6
2641
+ ; SDAG-NEXT: v_accvgpr_read_b32 v0, a0
2642
+ ; SDAG-NEXT: v_accvgpr_read_b32 v1, a1
2643
+ ; SDAG-NEXT: v_accvgpr_read_b32 v2, a2
2644
+ ; SDAG-NEXT: v_accvgpr_read_b32 v3, a3
2645
+ ; SDAG-NEXT: s_setpc_b64 s[30:31]
2646
+ ;
2647
+ ; GISEL-LABEL: test_smfmac_f32_16x16x128_fp8_bf8__sgpr:
2648
+ ; GISEL: ; %bb.0:
2649
+ ; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2650
+ ; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[2:3]
2651
+ ; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[0:1]
2652
+ ; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[4:5]
2653
+ ; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[12:13]
2654
+ ; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[6:7]
2655
+ ; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[8:9]
2656
+ ; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[10:11]
2657
+ ; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[14:15]
2658
+ ; GISEL-NEXT: v_mov_b32_e32 v16, s16
2659
+ ; GISEL-NEXT: s_nop 1
2660
+ ; GISEL-NEXT: v_smfmac_f32_16x16x128_fp8_bf8 v[0:3], v[12:15], v[4:11], v16
2661
+ ; GISEL-NEXT: s_setpc_b64 s[30:31]
2662
+ %result = call <4 x float > @llvm.amdgcn.smfmac.f32.16x16x128.fp8.bf8 (<4 x i32 > %arg0 , <8 x i32 > %arg1 , <4 x float > %arg2 , i32 %arg3 , i32 immarg 0 , i32 immarg 0 )
2663
+ ret <4 x float > %result
2664
+ }
2665
+
2453
2666
; Attribute group #0, referenced by the amdgpu_kernel test definitions in this
; file; it sets the "amdgpu-flat-work-group-size" string attribute to "1,256".
attributes #0 = { "amdgpu-flat-work-group-size" ="1,256" }
0 commit comments