@@ -14,6 +14,8 @@ typedef __bf16 __attribute__((ext_vector_type(2))) bfloat2;
 typedef float __attribute__((ext_vector_type(16))) float16;
 typedef half __attribute__((ext_vector_type(2))) half2;
 typedef float __attribute__((ext_vector_type(2))) float2;
+typedef half __attribute__((ext_vector_type(2))) half2;
+typedef __bf16 __attribute__((ext_vector_type(2))) bfloat2;


 // CHECK-LABEL: @test_prng_b32(
 // CHECK-NEXT: entry:
@@ -619,3 +621,123 @@ void test_cvt_scalef32_pk_f32_bf8(global float2* out, unsigned int src, float scale)
   *out = __builtin_amdgcn_cvt_scalef32_pk_f32_bf8(src, scale, true);
   *out = __builtin_amdgcn_cvt_scalef32_pk_f32_bf8(src, scale, false);
 }
+
+// CHECK-LABEL: @test_cvt_scalef32_pk_fp8_f16(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[OUT_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5)
+// CHECK-NEXT: [[SRC_ADDR:%.*]] = alloca <2 x half>, align 4, addrspace(5)
+// CHECK-NEXT: [[SCALE_ADDR:%.*]] = alloca float, align 4, addrspace(5)
+// CHECK-NEXT: store ptr addrspace(1) [[OUT:%.*]], ptr addrspace(5) [[OUT_ADDR]], align 8
+// CHECK-NEXT: store <2 x half> [[SRC:%.*]], ptr addrspace(5) [[SRC_ADDR]], align 4
+// CHECK-NEXT: store float [[SCALE:%.*]], ptr addrspace(5) [[SCALE_ADDR]], align 4
+// CHECK-NEXT: [[TMP0:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
+// CHECK-NEXT: [[TMP1:%.*]] = load <2 x i16>, ptr addrspace(1) [[TMP0]], align 4
+// CHECK-NEXT: [[TMP2:%.*]] = load <2 x half>, ptr addrspace(5) [[SRC_ADDR]], align 4
+// CHECK-NEXT: [[TMP3:%.*]] = load float, ptr addrspace(5) [[SCALE_ADDR]], align 4
+// CHECK-NEXT: [[TMP4:%.*]] = call <2 x i16> @llvm.amdgcn.cvt.scalef32.pk.fp8.f16(<2 x i16> [[TMP1]], <2 x half> [[TMP2]], float [[TMP3]], i1 true)
+// CHECK-NEXT: [[TMP5:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
+// CHECK-NEXT: store <2 x i16> [[TMP4]], ptr addrspace(1) [[TMP5]], align 4
+// CHECK-NEXT: [[TMP6:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
+// CHECK-NEXT: [[TMP7:%.*]] = load <2 x i16>, ptr addrspace(1) [[TMP6]], align 4
+// CHECK-NEXT: [[TMP8:%.*]] = load <2 x half>, ptr addrspace(5) [[SRC_ADDR]], align 4
+// CHECK-NEXT: [[TMP9:%.*]] = load float, ptr addrspace(5) [[SCALE_ADDR]], align 4
+// CHECK-NEXT: [[TMP10:%.*]] = call <2 x i16> @llvm.amdgcn.cvt.scalef32.pk.fp8.f16(<2 x i16> [[TMP7]], <2 x half> [[TMP8]], float [[TMP9]], i1 false)
+// CHECK-NEXT: [[TMP11:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
+// CHECK-NEXT: store <2 x i16> [[TMP10]], ptr addrspace(1) [[TMP11]], align 4
+// CHECK-NEXT: ret void
+//
+void test_cvt_scalef32_pk_fp8_f16(global short2 *out, half2 src, float scale)
+{
+  *out = __builtin_amdgcn_cvt_scalef32_pk_fp8_f16(*out, src, scale, true);
+  *out = __builtin_amdgcn_cvt_scalef32_pk_fp8_f16(*out, src, scale, false);
+}
+
+// CHECK-LABEL: @test_cvt_scalef32_pk_fp8_bf16(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[OUT_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5)
+// CHECK-NEXT: [[SRC_ADDR:%.*]] = alloca <2 x bfloat>, align 4, addrspace(5)
+// CHECK-NEXT: [[SCALE_ADDR:%.*]] = alloca float, align 4, addrspace(5)
+// CHECK-NEXT: store ptr addrspace(1) [[OUT:%.*]], ptr addrspace(5) [[OUT_ADDR]], align 8
+// CHECK-NEXT: store <2 x bfloat> [[SRC:%.*]], ptr addrspace(5) [[SRC_ADDR]], align 4
+// CHECK-NEXT: store float [[SCALE:%.*]], ptr addrspace(5) [[SCALE_ADDR]], align 4
+// CHECK-NEXT: [[TMP0:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
+// CHECK-NEXT: [[TMP1:%.*]] = load <2 x i16>, ptr addrspace(1) [[TMP0]], align 4
+// CHECK-NEXT: [[TMP2:%.*]] = load <2 x bfloat>, ptr addrspace(5) [[SRC_ADDR]], align 4
+// CHECK-NEXT: [[TMP3:%.*]] = load float, ptr addrspace(5) [[SCALE_ADDR]], align 4
+// CHECK-NEXT: [[TMP4:%.*]] = call <2 x i16> @llvm.amdgcn.cvt.scalef32.pk.fp8.bf16(<2 x i16> [[TMP1]], <2 x bfloat> [[TMP2]], float [[TMP3]], i1 true)
+// CHECK-NEXT: [[TMP5:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
+// CHECK-NEXT: store <2 x i16> [[TMP4]], ptr addrspace(1) [[TMP5]], align 4
+// CHECK-NEXT: [[TMP6:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
+// CHECK-NEXT: [[TMP7:%.*]] = load <2 x i16>, ptr addrspace(1) [[TMP6]], align 4
+// CHECK-NEXT: [[TMP8:%.*]] = load <2 x bfloat>, ptr addrspace(5) [[SRC_ADDR]], align 4
+// CHECK-NEXT: [[TMP9:%.*]] = load float, ptr addrspace(5) [[SCALE_ADDR]], align 4
+// CHECK-NEXT: [[TMP10:%.*]] = call <2 x i16> @llvm.amdgcn.cvt.scalef32.pk.fp8.bf16(<2 x i16> [[TMP7]], <2 x bfloat> [[TMP8]], float [[TMP9]], i1 false)
+// CHECK-NEXT: [[TMP11:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
+// CHECK-NEXT: store <2 x i16> [[TMP10]], ptr addrspace(1) [[TMP11]], align 4
+// CHECK-NEXT: ret void
+//
+void test_cvt_scalef32_pk_fp8_bf16(global short2 *out, bfloat2 src, float scale)
+{
+  *out = __builtin_amdgcn_cvt_scalef32_pk_fp8_bf16(*out, src, scale, true);
+  *out = __builtin_amdgcn_cvt_scalef32_pk_fp8_bf16(*out, src, scale, false);
+}
+
+// CHECK-LABEL: @test_cvt_scalef32_pk_bf8_f16(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[OUT_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5)
+// CHECK-NEXT: [[SRC_ADDR:%.*]] = alloca <2 x half>, align 4, addrspace(5)
+// CHECK-NEXT: [[SCALE_ADDR:%.*]] = alloca float, align 4, addrspace(5)
+// CHECK-NEXT: store ptr addrspace(1) [[OUT:%.*]], ptr addrspace(5) [[OUT_ADDR]], align 8
+// CHECK-NEXT: store <2 x half> [[SRC:%.*]], ptr addrspace(5) [[SRC_ADDR]], align 4
+// CHECK-NEXT: store float [[SCALE:%.*]], ptr addrspace(5) [[SCALE_ADDR]], align 4
+// CHECK-NEXT: [[TMP0:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
+// CHECK-NEXT: [[TMP1:%.*]] = load <2 x i16>, ptr addrspace(1) [[TMP0]], align 4
+// CHECK-NEXT: [[TMP2:%.*]] = load <2 x half>, ptr addrspace(5) [[SRC_ADDR]], align 4
+// CHECK-NEXT: [[TMP3:%.*]] = load float, ptr addrspace(5) [[SCALE_ADDR]], align 4
+// CHECK-NEXT: [[TMP4:%.*]] = call <2 x i16> @llvm.amdgcn.cvt.scalef32.pk.bf8.f16(<2 x i16> [[TMP1]], <2 x half> [[TMP2]], float [[TMP3]], i1 true)
+// CHECK-NEXT: [[TMP5:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
+// CHECK-NEXT: store <2 x i16> [[TMP4]], ptr addrspace(1) [[TMP5]], align 4
+// CHECK-NEXT: [[TMP6:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
+// CHECK-NEXT: [[TMP7:%.*]] = load <2 x i16>, ptr addrspace(1) [[TMP6]], align 4
+// CHECK-NEXT: [[TMP8:%.*]] = load <2 x half>, ptr addrspace(5) [[SRC_ADDR]], align 4
+// CHECK-NEXT: [[TMP9:%.*]] = load float, ptr addrspace(5) [[SCALE_ADDR]], align 4
+// CHECK-NEXT: [[TMP10:%.*]] = call <2 x i16> @llvm.amdgcn.cvt.scalef32.pk.bf8.f16(<2 x i16> [[TMP7]], <2 x half> [[TMP8]], float [[TMP9]], i1 false)
+// CHECK-NEXT: [[TMP11:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
+// CHECK-NEXT: store <2 x i16> [[TMP10]], ptr addrspace(1) [[TMP11]], align 4
+// CHECK-NEXT: ret void
+//
+void test_cvt_scalef32_pk_bf8_f16(global short2 *out, half2 src, float scale)
+{
+  *out = __builtin_amdgcn_cvt_scalef32_pk_bf8_f16(*out, src, scale, true);
+  *out = __builtin_amdgcn_cvt_scalef32_pk_bf8_f16(*out, src, scale, false);
+}
+
+// CHECK-LABEL: @test_cvt_scalef32_pk_bf8_bf16(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[OUT_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5)
+// CHECK-NEXT: [[SRC_ADDR:%.*]] = alloca <2 x bfloat>, align 4, addrspace(5)
+// CHECK-NEXT: [[SCALE_ADDR:%.*]] = alloca float, align 4, addrspace(5)
+// CHECK-NEXT: store ptr addrspace(1) [[OUT:%.*]], ptr addrspace(5) [[OUT_ADDR]], align 8
+// CHECK-NEXT: store <2 x bfloat> [[SRC:%.*]], ptr addrspace(5) [[SRC_ADDR]], align 4
+// CHECK-NEXT: store float [[SCALE:%.*]], ptr addrspace(5) [[SCALE_ADDR]], align 4
+// CHECK-NEXT: [[TMP0:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
+// CHECK-NEXT: [[TMP1:%.*]] = load <2 x i16>, ptr addrspace(1) [[TMP0]], align 4
+// CHECK-NEXT: [[TMP2:%.*]] = load <2 x bfloat>, ptr addrspace(5) [[SRC_ADDR]], align 4
+// CHECK-NEXT: [[TMP3:%.*]] = load float, ptr addrspace(5) [[SCALE_ADDR]], align 4
+// CHECK-NEXT: [[TMP4:%.*]] = call <2 x i16> @llvm.amdgcn.cvt.scalef32.pk.bf8.bf16(<2 x i16> [[TMP1]], <2 x bfloat> [[TMP2]], float [[TMP3]], i1 true)
+// CHECK-NEXT: [[TMP5:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
+// CHECK-NEXT: store <2 x i16> [[TMP4]], ptr addrspace(1) [[TMP5]], align 4
+// CHECK-NEXT: [[TMP6:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
+// CHECK-NEXT: [[TMP7:%.*]] = load <2 x i16>, ptr addrspace(1) [[TMP6]], align 4
+// CHECK-NEXT: [[TMP8:%.*]] = load <2 x bfloat>, ptr addrspace(5) [[SRC_ADDR]], align 4
+// CHECK-NEXT: [[TMP9:%.*]] = load float, ptr addrspace(5) [[SCALE_ADDR]], align 4
+// CHECK-NEXT: [[TMP10:%.*]] = call <2 x i16> @llvm.amdgcn.cvt.scalef32.pk.bf8.bf16(<2 x i16> [[TMP7]], <2 x bfloat> [[TMP8]], float [[TMP9]], i1 false)
+// CHECK-NEXT: [[TMP11:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
+// CHECK-NEXT: store <2 x i16> [[TMP10]], ptr addrspace(1) [[TMP11]], align 4
+// CHECK-NEXT: ret void
+//
+void test_cvt_scalef32_pk_bf8_bf16(global short2 *out, bfloat2 src, float scale)
+{
+  *out = __builtin_amdgcn_cvt_scalef32_pk_bf8_bf16(*out, src, scale, true);
+  *out = __builtin_amdgcn_cvt_scalef32_pk_bf8_bf16(*out, src, scale, false);
+}
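
For reference, here is a minimal OpenCL sketch of how one of the new builtins could be driven from a kernel. It is an illustration under stated assumptions, not part of the diff: the kernel and variable names are hypothetical, and the reading of the final bool operand (selecting which 16-bit word of the packed <2 x i16> result is overwritten, with the other word passed through from the old value) is inferred from the old-value operand visible in the checks above.

#pragma OPENCL EXTENSION cl_khr_fp16 : enable

// Hypothetical usage sketch: convert two half pairs to packed FP8 with a
// common scale, filling both 16-bit words of one short2 destination.
kernel void pack_fp8(global short2 *out, global half2 *in, float scale) {
  short2 packed = *out;
  // Assumed: 'false' writes the converted FP8 pair into the low word...
  packed = __builtin_amdgcn_cvt_scalef32_pk_fp8_f16(packed, in[0], scale, false);
  // ...and 'true' into the high word, leaving the other word untouched.
  packed = __builtin_amdgcn_cvt_scalef32_pk_fp8_f16(packed, in[1], scale, true);
  *out = packed;
}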