@@ -2663,4 +2663,217 @@ define <4 x float> @test_smfmac_f32_16x16x128_fp8_bf8__sgpr(<4 x i32> inreg %arg
2663
2663
ret <4 x float > %result
2664
2664
}
2665
2665
2666
+ ; --------------------------------------------------------------------
2667
+ ; llvm.amdgcn.smfmac.f32.16x16x128.fp8.fp8
2668
+ ; --------------------------------------------------------------------
2669
+
2670
+ declare <4 x float > @llvm.amdgcn.smfmac.f32.16x16x128.fp8.fp8 (<4 x i32 >, <8 x i32 >, <4 x float >, i32 , i32 immarg, i32 immarg) ; the two trailing immarg operands are printed as the cbsz and abid modifiers in the checks below
2671
+
2672
+ define amdgpu_kernel void @test_smfmac_f32_16x16x128_fp8_fp8__vgpr (ptr addrspace (1 ) %arg , <4 x i32 > %a , <8 x i32 > %b , i32 %idx ) #0 { ; kernel form: %a, %b and %idx are kernel arguments, the <4 x float> operand is loaded from %arg
2673
+ ; SDAG-LABEL: test_smfmac_f32_16x16x128_fp8_fp8__vgpr:
2674
+ ; SDAG: ; %bb.0: ; %bb
2675
+ ; SDAG-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
2676
+ ; SDAG-NEXT: v_lshlrev_b32_e32 v0, 4, v0
2677
+ ; SDAG-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x34
2678
+ ; SDAG-NEXT: v_mov_b32_e32 v16, 0
2679
+ ; SDAG-NEXT: s_waitcnt lgkmcnt(0)
2680
+ ; SDAG-NEXT: global_load_dwordx4 v[8:11], v0, s[2:3]
2681
+ ; SDAG-NEXT: s_load_dword s16, s[0:1], 0x64
2682
+ ; SDAG-NEXT: s_load_dwordx4 s[12:15], s[0:1], 0x54
2683
+ ; SDAG-NEXT: v_mov_b32_e32 v12, s4
2684
+ ; SDAG-NEXT: v_mov_b32_e32 v13, s5
2685
+ ; SDAG-NEXT: v_mov_b32_e32 v14, s6
2686
+ ; SDAG-NEXT: v_mov_b32_e32 v15, s7
2687
+ ; SDAG-NEXT: v_mov_b32_e32 v0, s8
2688
+ ; SDAG-NEXT: v_mov_b32_e32 v1, s9
2689
+ ; SDAG-NEXT: v_mov_b32_e32 v2, s10
2690
+ ; SDAG-NEXT: v_mov_b32_e32 v3, s11
2691
+ ; SDAG-NEXT: s_waitcnt lgkmcnt(0)
2692
+ ; SDAG-NEXT: v_mov_b32_e32 v4, s12
2693
+ ; SDAG-NEXT: v_mov_b32_e32 v5, s13
2694
+ ; SDAG-NEXT: v_mov_b32_e32 v6, s14
2695
+ ; SDAG-NEXT: v_mov_b32_e32 v7, s15
2696
+ ; SDAG-NEXT: v_mov_b32_e32 v17, s16
2697
+ ; SDAG-NEXT: s_waitcnt vmcnt(0)
2698
+ ; SDAG-NEXT: s_nop 0
2699
+ ; SDAG-NEXT: v_smfmac_f32_16x16x128_fp8_fp8 v[8:11], v[12:15], v[0:7], v17 cbsz:1 abid:2
2700
+ ; SDAG-NEXT: s_nop 6
2701
+ ; SDAG-NEXT: global_store_dwordx4 v16, v[8:11], s[2:3]
2702
+ ; SDAG-NEXT: s_endpgm
2703
+ ;
2704
+ ; GISEL-LABEL: test_smfmac_f32_16x16x128_fp8_fp8__vgpr:
2705
+ ; GISEL: ; %bb.0: ; %bb
2706
+ ; GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
2707
+ ; GISEL-NEXT: v_lshlrev_b32_e32 v0, 4, v0
2708
+ ; GISEL-NEXT: s_waitcnt lgkmcnt(0)
2709
+ ; GISEL-NEXT: global_load_dwordx4 v[8:11], v0, s[2:3]
2710
+ ; GISEL-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x34
2711
+ ; GISEL-NEXT: s_load_dwordx4 s[12:15], s[0:1], 0x54
2712
+ ; GISEL-NEXT: s_load_dword s16, s[0:1], 0x64
2713
+ ; GISEL-NEXT: s_waitcnt lgkmcnt(0)
2714
+ ; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[6:7]
2715
+ ; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[4:5]
2716
+ ; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[8:9]
2717
+ ; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[10:11]
2718
+ ; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[12:13]
2719
+ ; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[14:15]
2720
+ ; GISEL-NEXT: v_mov_b32_e32 v16, s16
2721
+ ; GISEL-NEXT: s_waitcnt vmcnt(0)
2722
+ ; GISEL-NEXT: s_nop 0
2723
+ ; GISEL-NEXT: v_smfmac_f32_16x16x128_fp8_fp8 v[8:11], v[12:15], v[0:7], v16 cbsz:1 abid:2
2724
+ ; GISEL-NEXT: v_mov_b32_e32 v0, 0
2725
+ ; GISEL-NEXT: s_nop 5
2726
+ ; GISEL-NEXT: global_store_dwordx4 v0, v[8:11], s[2:3]
2727
+ ; GISEL-NEXT: s_endpgm
2728
+ bb:
2729
+ %id = call i32 @llvm.amdgcn.workitem.id.x () ; index used to pick which <4 x float> element of %arg is loaded
2730
+ %gep = getelementptr <4 x float >, ptr addrspace (1 ) %arg , i32 %id
2731
+ %in.1 = load <4 x float >, ptr addrspace (1 ) %gep
2732
+ %mai.1 = tail call <4 x float > @llvm.amdgcn.smfmac.f32.16x16x128.fp8.fp8 (<4 x i32 > %a , <8 x i32 > %b , <4 x float > %in.1 , i32 %idx , i32 1 , i32 2 ) ; immediates 1 and 2 appear as cbsz:1 abid:2 in the checks above
2733
+ store <4 x float > %mai.1 , ptr addrspace (1 ) %arg ; the result is stored through %arg itself, not through %gep
2734
+ ret void
2735
+ }
2736
+
2737
+ define <4 x float > @test_smfmac_f32_16x16x128_fp8_fp8 (<4 x i32 > %arg0 , <8 x i32 > %arg1 , <4 x float > %arg2 , i32 %arg3 ) { ; baseline non-kernel form: arguments are forwarded unchanged and the intrinsic result is returned
2738
+ ; SDAG-LABEL: test_smfmac_f32_16x16x128_fp8_fp8:
2739
+ ; SDAG: ; %bb.0:
2740
+ ; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2741
+ ; SDAG-NEXT: v_accvgpr_write_b32 a0, v12
2742
+ ; SDAG-NEXT: v_accvgpr_write_b32 a1, v13
2743
+ ; SDAG-NEXT: v_accvgpr_write_b32 a2, v14
2744
+ ; SDAG-NEXT: v_accvgpr_write_b32 a3, v15
2745
+ ; SDAG-NEXT: s_nop 1
2746
+ ; SDAG-NEXT: v_smfmac_f32_16x16x128_fp8_fp8 a[0:3], v[0:3], v[4:11], v16
2747
+ ; SDAG-NEXT: s_nop 6
2748
+ ; SDAG-NEXT: v_accvgpr_read_b32 v0, a0
2749
+ ; SDAG-NEXT: v_accvgpr_read_b32 v1, a1
2750
+ ; SDAG-NEXT: v_accvgpr_read_b32 v2, a2
2751
+ ; SDAG-NEXT: v_accvgpr_read_b32 v3, a3
2752
+ ; SDAG-NEXT: s_setpc_b64 s[30:31]
2753
+ ;
2754
+ ; GISEL-LABEL: test_smfmac_f32_16x16x128_fp8_fp8:
2755
+ ; GISEL: ; %bb.0:
2756
+ ; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2757
+ ; GISEL-NEXT: v_smfmac_f32_16x16x128_fp8_fp8 v[12:15], v[0:3], v[4:11], v16
2758
+ ; GISEL-NEXT: s_nop 6
2759
+ ; GISEL-NEXT: v_mov_b32_e32 v0, v12
2760
+ ; GISEL-NEXT: v_mov_b32_e32 v1, v13
2761
+ ; GISEL-NEXT: v_mov_b32_e32 v2, v14
2762
+ ; GISEL-NEXT: v_mov_b32_e32 v3, v15
2763
+ ; GISEL-NEXT: s_setpc_b64 s[30:31]
2764
+ %result = call <4 x float > @llvm.amdgcn.smfmac.f32.16x16x128.fp8.fp8 (<4 x i32 > %arg0 , <8 x i32 > %arg1 , <4 x float > %arg2 , i32 %arg3 , i32 immarg 0 , i32 immarg 0 ) ; both immediates are 0: the checked instruction carries no cbsz/abid modifier
2765
+ ret <4 x float > %result
2766
+ }
2767
+
2768
+ define <4 x float > @test_smfmac_f32_16x16x128_fp8_fp8__flags0 (<4 x i32 > %arg0 , <8 x i32 > %arg1 , <4 x float > %arg2 , i32 %arg3 ) { ; same shape as the baseline test, but with non-zero immediates (1, 3)
2769
+ ; SDAG-LABEL: test_smfmac_f32_16x16x128_fp8_fp8__flags0:
2770
+ ; SDAG: ; %bb.0:
2771
+ ; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2772
+ ; SDAG-NEXT: v_accvgpr_write_b32 a0, v12
2773
+ ; SDAG-NEXT: v_accvgpr_write_b32 a1, v13
2774
+ ; SDAG-NEXT: v_accvgpr_write_b32 a2, v14
2775
+ ; SDAG-NEXT: v_accvgpr_write_b32 a3, v15
2776
+ ; SDAG-NEXT: s_nop 1
2777
+ ; SDAG-NEXT: v_smfmac_f32_16x16x128_fp8_fp8 a[0:3], v[0:3], v[4:11], v16 cbsz:1 abid:3
2778
+ ; SDAG-NEXT: s_nop 6
2779
+ ; SDAG-NEXT: v_accvgpr_read_b32 v0, a0
2780
+ ; SDAG-NEXT: v_accvgpr_read_b32 v1, a1
2781
+ ; SDAG-NEXT: v_accvgpr_read_b32 v2, a2
2782
+ ; SDAG-NEXT: v_accvgpr_read_b32 v3, a3
2783
+ ; SDAG-NEXT: s_setpc_b64 s[30:31]
2784
+ ;
2785
+ ; GISEL-LABEL: test_smfmac_f32_16x16x128_fp8_fp8__flags0:
2786
+ ; GISEL: ; %bb.0:
2787
+ ; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2788
+ ; GISEL-NEXT: v_smfmac_f32_16x16x128_fp8_fp8 v[12:15], v[0:3], v[4:11], v16 cbsz:1 abid:3
2789
+ ; GISEL-NEXT: s_nop 6
2790
+ ; GISEL-NEXT: v_mov_b32_e32 v0, v12
2791
+ ; GISEL-NEXT: v_mov_b32_e32 v1, v13
2792
+ ; GISEL-NEXT: v_mov_b32_e32 v2, v14
2793
+ ; GISEL-NEXT: v_mov_b32_e32 v3, v15
2794
+ ; GISEL-NEXT: s_setpc_b64 s[30:31]
2795
+ %result = call <4 x float > @llvm.amdgcn.smfmac.f32.16x16x128.fp8.fp8 (<4 x i32 > %arg0 , <8 x i32 > %arg1 , <4 x float > %arg2 , i32 %arg3 , i32 immarg 1 , i32 immarg 3 ) ; immediates 1 and 3 appear as cbsz:1 abid:3 in the checks above
2796
+ ret <4 x float > %result
2797
+ }
2798
+
2799
+ define <4 x float > @test_smfmac_f32_16x16x128_fp8_fp8__flags1 (<4 x i32 > %arg0 , <8 x i32 > %arg1 , <4 x float > %arg2 , i32 %arg3 ) { ; same shape as the baseline test, but with non-zero immediates (3, 1)
2800
+ ; SDAG-LABEL: test_smfmac_f32_16x16x128_fp8_fp8__flags1:
2801
+ ; SDAG: ; %bb.0:
2802
+ ; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2803
+ ; SDAG-NEXT: v_accvgpr_write_b32 a0, v12
2804
+ ; SDAG-NEXT: v_accvgpr_write_b32 a1, v13
2805
+ ; SDAG-NEXT: v_accvgpr_write_b32 a2, v14
2806
+ ; SDAG-NEXT: v_accvgpr_write_b32 a3, v15
2807
+ ; SDAG-NEXT: s_nop 1
2808
+ ; SDAG-NEXT: v_smfmac_f32_16x16x128_fp8_fp8 a[0:3], v[0:3], v[4:11], v16 cbsz:3 abid:1
2809
+ ; SDAG-NEXT: s_nop 6
2810
+ ; SDAG-NEXT: v_accvgpr_read_b32 v0, a0
2811
+ ; SDAG-NEXT: v_accvgpr_read_b32 v1, a1
2812
+ ; SDAG-NEXT: v_accvgpr_read_b32 v2, a2
2813
+ ; SDAG-NEXT: v_accvgpr_read_b32 v3, a3
2814
+ ; SDAG-NEXT: s_setpc_b64 s[30:31]
2815
+ ;
2816
+ ; GISEL-LABEL: test_smfmac_f32_16x16x128_fp8_fp8__flags1:
2817
+ ; GISEL: ; %bb.0:
2818
+ ; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2819
+ ; GISEL-NEXT: v_smfmac_f32_16x16x128_fp8_fp8 v[12:15], v[0:3], v[4:11], v16 cbsz:3 abid:1
2820
+ ; GISEL-NEXT: s_nop 6
2821
+ ; GISEL-NEXT: v_mov_b32_e32 v0, v12
2822
+ ; GISEL-NEXT: v_mov_b32_e32 v1, v13
2823
+ ; GISEL-NEXT: v_mov_b32_e32 v2, v14
2824
+ ; GISEL-NEXT: v_mov_b32_e32 v3, v15
2825
+ ; GISEL-NEXT: s_setpc_b64 s[30:31]
2826
+ %result = call <4 x float > @llvm.amdgcn.smfmac.f32.16x16x128.fp8.fp8 (<4 x i32 > %arg0 , <8 x i32 > %arg1 , <4 x float > %arg2 , i32 %arg3 , i32 immarg 3 , i32 immarg 1 ) ; immediates 3 and 1 appear as cbsz:3 abid:1 in the checks above
2827
+ ret <4 x float > %result
2828
+ }
2829
+
2830
+ define <4 x float > @test_smfmac_f32_16x16x128_fp8_fp8__sgpr (<4 x i32 > inreg %arg0 , <8 x i32 > inreg %arg1 , <4 x float > inreg %arg2 , i32 inreg %arg3 ) { ; every argument is inreg; the checks copy each one out of s registers before the instruction
2831
+ ; SDAG-LABEL: test_smfmac_f32_16x16x128_fp8_fp8__sgpr:
2832
+ ; SDAG: ; %bb.0:
2833
+ ; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2834
+ ; SDAG-NEXT: v_mov_b32_e32 v8, s0
2835
+ ; SDAG-NEXT: v_mov_b32_e32 v9, s1
2836
+ ; SDAG-NEXT: v_mov_b32_e32 v10, s2
2837
+ ; SDAG-NEXT: v_mov_b32_e32 v11, s3
2838
+ ; SDAG-NEXT: v_mov_b32_e32 v0, s4
2839
+ ; SDAG-NEXT: v_mov_b32_e32 v1, s5
2840
+ ; SDAG-NEXT: v_mov_b32_e32 v2, s6
2841
+ ; SDAG-NEXT: v_mov_b32_e32 v3, s7
2842
+ ; SDAG-NEXT: v_mov_b32_e32 v4, s8
2843
+ ; SDAG-NEXT: v_mov_b32_e32 v5, s9
2844
+ ; SDAG-NEXT: v_mov_b32_e32 v6, s10
2845
+ ; SDAG-NEXT: v_mov_b32_e32 v7, s11
2846
+ ; SDAG-NEXT: v_accvgpr_write_b32 a0, s12
2847
+ ; SDAG-NEXT: v_accvgpr_write_b32 a1, s13
2848
+ ; SDAG-NEXT: v_accvgpr_write_b32 a2, s14
2849
+ ; SDAG-NEXT: v_accvgpr_write_b32 a3, s15
2850
+ ; SDAG-NEXT: v_mov_b32_e32 v12, s16
2851
+ ; SDAG-NEXT: s_nop 1
2852
+ ; SDAG-NEXT: v_smfmac_f32_16x16x128_fp8_fp8 a[0:3], v[8:11], v[0:7], v12
2853
+ ; SDAG-NEXT: s_nop 6
2854
+ ; SDAG-NEXT: v_accvgpr_read_b32 v0, a0
2855
+ ; SDAG-NEXT: v_accvgpr_read_b32 v1, a1
2856
+ ; SDAG-NEXT: v_accvgpr_read_b32 v2, a2
2857
+ ; SDAG-NEXT: v_accvgpr_read_b32 v3, a3
2858
+ ; SDAG-NEXT: s_setpc_b64 s[30:31]
2859
+ ;
2860
+ ; GISEL-LABEL: test_smfmac_f32_16x16x128_fp8_fp8__sgpr:
2861
+ ; GISEL: ; %bb.0:
2862
+ ; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2863
+ ; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[2:3]
2864
+ ; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[0:1]
2865
+ ; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[4:5]
2866
+ ; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[12:13]
2867
+ ; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[6:7]
2868
+ ; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[8:9]
2869
+ ; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[10:11]
2870
+ ; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[14:15]
2871
+ ; GISEL-NEXT: v_mov_b32_e32 v16, s16
2872
+ ; GISEL-NEXT: s_nop 1
2873
+ ; GISEL-NEXT: v_smfmac_f32_16x16x128_fp8_fp8 v[0:3], v[12:15], v[4:11], v16
2874
+ ; GISEL-NEXT: s_setpc_b64 s[30:31]
2875
+ %result = call <4 x float > @llvm.amdgcn.smfmac.f32.16x16x128.fp8.fp8 (<4 x i32 > %arg0 , <8 x i32 > %arg1 , <4 x float > %arg2 , i32 %arg3 , i32 immarg 0 , i32 immarg 0 ) ; both immediates are 0: the checked instruction carries no cbsz/abid modifier
2876
+ ret <4 x float > %result
2877
+ }
2878
+
2666
2879
attributes #0 = { "amdgpu-flat-work-group-size" ="1,256" } ; attribute group #0, referenced by amdgpu_kernel tests such as @test_smfmac_f32_16x16x128_fp8_fp8__vgpr
0 commit comments