@@ -984,3 +984,107 @@ void test_cvt_scalef32_pk32_bf16_fpbf6(global bfloat32 *out, uint6 src, float sc
984
984
* out = __builtin_amdgcn_cvt_scalef32_pk32_bf16_fp6 (src , scale );
985
985
* out = __builtin_amdgcn_cvt_scalef32_pk32_bf16_bf6 (src , scale );
986
986
}
987
+
988
+ // CHECK-LABEL: @test_cvt_scalef32_pk_f16_fp8(
989
+ // CHECK-NEXT: entry:
990
+ // CHECK-NEXT: [[OUT_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5)
991
+ // CHECK-NEXT: [[SRC_ADDR:%.*]] = alloca i32, align 4, addrspace(5)
992
+ // CHECK-NEXT: [[SCALE_ADDR:%.*]] = alloca float, align 4, addrspace(5)
993
+ // CHECK-NEXT: store ptr addrspace(1) [[OUT:%.*]], ptr addrspace(5) [[OUT_ADDR]], align 8
994
+ // CHECK-NEXT: store i32 [[SRC:%.*]], ptr addrspace(5) [[SRC_ADDR]], align 4
995
+ // CHECK-NEXT: store float [[SCALE:%.*]], ptr addrspace(5) [[SCALE_ADDR]], align 4
996
+ // CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr addrspace(5) [[SRC_ADDR]], align 4
997
+ // CHECK-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(5) [[SCALE_ADDR]], align 4
998
+ // CHECK-NEXT: [[TMP2:%.*]] = call <2 x half> @llvm.amdgcn.cvt.scalef32.pk.f16.fp8(i32 [[TMP0]], float [[TMP1]], i1 true)
999
+ // CHECK-NEXT: [[TMP3:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
1000
+ // CHECK-NEXT: store <2 x half> [[TMP2]], ptr addrspace(1) [[TMP3]], align 4
1001
+ // CHECK-NEXT: [[TMP4:%.*]] = load i32, ptr addrspace(5) [[SRC_ADDR]], align 4
1002
+ // CHECK-NEXT: [[TMP5:%.*]] = load float, ptr addrspace(5) [[SCALE_ADDR]], align 4
1003
+ // CHECK-NEXT: [[TMP6:%.*]] = call <2 x half> @llvm.amdgcn.cvt.scalef32.pk.f16.fp8(i32 [[TMP4]], float [[TMP5]], i1 false)
1004
+ // CHECK-NEXT: [[TMP7:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
1005
+ // CHECK-NEXT: store <2 x half> [[TMP6]], ptr addrspace(1) [[TMP7]], align 4
1006
+ // CHECK-NEXT: ret void
1007
+ //
1008
+ void test_cvt_scalef32_pk_f16_fp8 (global half2 * out , unsigned int src , float scale )
1009
+ {
1010
+ * out = __builtin_amdgcn_cvt_scalef32_pk_f16_fp8 (src , scale , true);
1011
+ * out = __builtin_amdgcn_cvt_scalef32_pk_f16_fp8 (src , scale , false);
1012
+ }
1013
+
1014
+ // CHECK-LABEL: @test_cvt_scalef32_pk_f16_bf8(
1015
+ // CHECK-NEXT: entry:
1016
+ // CHECK-NEXT: [[OUT_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5)
1017
+ // CHECK-NEXT: [[SRC_ADDR:%.*]] = alloca i32, align 4, addrspace(5)
1018
+ // CHECK-NEXT: [[SCALE_ADDR:%.*]] = alloca float, align 4, addrspace(5)
1019
+ // CHECK-NEXT: store ptr addrspace(1) [[OUT:%.*]], ptr addrspace(5) [[OUT_ADDR]], align 8
1020
+ // CHECK-NEXT: store i32 [[SRC:%.*]], ptr addrspace(5) [[SRC_ADDR]], align 4
1021
+ // CHECK-NEXT: store float [[SCALE:%.*]], ptr addrspace(5) [[SCALE_ADDR]], align 4
1022
+ // CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr addrspace(5) [[SRC_ADDR]], align 4
1023
+ // CHECK-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(5) [[SCALE_ADDR]], align 4
1024
+ // CHECK-NEXT: [[TMP2:%.*]] = call <2 x half> @llvm.amdgcn.cvt.scalef32.pk.f16.bf8(i32 [[TMP0]], float [[TMP1]], i1 true)
1025
+ // CHECK-NEXT: [[TMP3:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
1026
+ // CHECK-NEXT: store <2 x half> [[TMP2]], ptr addrspace(1) [[TMP3]], align 4
1027
+ // CHECK-NEXT: [[TMP4:%.*]] = load i32, ptr addrspace(5) [[SRC_ADDR]], align 4
1028
+ // CHECK-NEXT: [[TMP5:%.*]] = load float, ptr addrspace(5) [[SCALE_ADDR]], align 4
1029
+ // CHECK-NEXT: [[TMP6:%.*]] = call <2 x half> @llvm.amdgcn.cvt.scalef32.pk.f16.bf8(i32 [[TMP4]], float [[TMP5]], i1 false)
1030
+ // CHECK-NEXT: [[TMP7:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
1031
+ // CHECK-NEXT: store <2 x half> [[TMP6]], ptr addrspace(1) [[TMP7]], align 4
1032
+ // CHECK-NEXT: ret void
1033
+ //
1034
+ void test_cvt_scalef32_pk_f16_bf8 (global half2 * out , unsigned int src , float scale )
1035
+ {
1036
+ * out = __builtin_amdgcn_cvt_scalef32_pk_f16_bf8 (src , scale , true);
1037
+ * out = __builtin_amdgcn_cvt_scalef32_pk_f16_bf8 (src , scale , false);
1038
+ }
1039
+
1040
+ // CHECK-LABEL: @test_cvt_scalef32_pk_bf16_fp8(
1041
+ // CHECK-NEXT: entry:
1042
+ // CHECK-NEXT: [[OUT_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5)
1043
+ // CHECK-NEXT: [[SRC_ADDR:%.*]] = alloca i32, align 4, addrspace(5)
1044
+ // CHECK-NEXT: [[SCALE_ADDR:%.*]] = alloca float, align 4, addrspace(5)
1045
+ // CHECK-NEXT: store ptr addrspace(1) [[OUT:%.*]], ptr addrspace(5) [[OUT_ADDR]], align 8
1046
+ // CHECK-NEXT: store i32 [[SRC:%.*]], ptr addrspace(5) [[SRC_ADDR]], align 4
1047
+ // CHECK-NEXT: store float [[SCALE:%.*]], ptr addrspace(5) [[SCALE_ADDR]], align 4
1048
+ // CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr addrspace(5) [[SRC_ADDR]], align 4
1049
+ // CHECK-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(5) [[SCALE_ADDR]], align 4
1050
+ // CHECK-NEXT: [[TMP2:%.*]] = call <2 x bfloat> @llvm.amdgcn.cvt.scalef32.pk.bf16.fp8(i32 [[TMP0]], float [[TMP1]], i1 true)
1051
+ // CHECK-NEXT: [[TMP3:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
1052
+ // CHECK-NEXT: store <2 x bfloat> [[TMP2]], ptr addrspace(1) [[TMP3]], align 4
1053
+ // CHECK-NEXT: [[TMP4:%.*]] = load i32, ptr addrspace(5) [[SRC_ADDR]], align 4
1054
+ // CHECK-NEXT: [[TMP5:%.*]] = load float, ptr addrspace(5) [[SCALE_ADDR]], align 4
1055
+ // CHECK-NEXT: [[TMP6:%.*]] = call <2 x bfloat> @llvm.amdgcn.cvt.scalef32.pk.bf16.fp8(i32 [[TMP4]], float [[TMP5]], i1 false)
1056
+ // CHECK-NEXT: [[TMP7:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
1057
+ // CHECK-NEXT: store <2 x bfloat> [[TMP6]], ptr addrspace(1) [[TMP7]], align 4
1058
+ // CHECK-NEXT: ret void
1059
+ //
1060
+ void test_cvt_scalef32_pk_bf16_fp8 (global bfloat2 * out , unsigned int src , float scale )
1061
+ {
1062
+ * out = __builtin_amdgcn_cvt_scalef32_pk_bf16_fp8 (src , scale , true);
1063
+ * out = __builtin_amdgcn_cvt_scalef32_pk_bf16_fp8 (src , scale , false);
1064
+ }
1065
+
1066
+ // CHECK-LABEL: @test_cvt_scalef32_pk_bf16_bf8(
1067
+ // CHECK-NEXT: entry:
1068
+ // CHECK-NEXT: [[OUT_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5)
1069
+ // CHECK-NEXT: [[SRC_ADDR:%.*]] = alloca i32, align 4, addrspace(5)
1070
+ // CHECK-NEXT: [[SCALE_ADDR:%.*]] = alloca float, align 4, addrspace(5)
1071
+ // CHECK-NEXT: store ptr addrspace(1) [[OUT:%.*]], ptr addrspace(5) [[OUT_ADDR]], align 8
1072
+ // CHECK-NEXT: store i32 [[SRC:%.*]], ptr addrspace(5) [[SRC_ADDR]], align 4
1073
+ // CHECK-NEXT: store float [[SCALE:%.*]], ptr addrspace(5) [[SCALE_ADDR]], align 4
1074
+ // CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr addrspace(5) [[SRC_ADDR]], align 4
1075
+ // CHECK-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(5) [[SCALE_ADDR]], align 4
1076
+ // CHECK-NEXT: [[TMP2:%.*]] = call <2 x bfloat> @llvm.amdgcn.cvt.scalef32.pk.bf16.bf8(i32 [[TMP0]], float [[TMP1]], i1 true)
1077
+ // CHECK-NEXT: [[TMP3:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
1078
+ // CHECK-NEXT: store <2 x bfloat> [[TMP2]], ptr addrspace(1) [[TMP3]], align 4
1079
+ // CHECK-NEXT: [[TMP4:%.*]] = load i32, ptr addrspace(5) [[SRC_ADDR]], align 4
1080
+ // CHECK-NEXT: [[TMP5:%.*]] = load float, ptr addrspace(5) [[SCALE_ADDR]], align 4
1081
+ // CHECK-NEXT: [[TMP6:%.*]] = call <2 x bfloat> @llvm.amdgcn.cvt.scalef32.pk.bf16.bf8(i32 [[TMP4]], float [[TMP5]], i1 false)
1082
+ // CHECK-NEXT: [[TMP7:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
1083
+ // CHECK-NEXT: store <2 x bfloat> [[TMP6]], ptr addrspace(1) [[TMP7]], align 4
1084
+ // CHECK-NEXT: ret void
1085
+ //
1086
+ void test_cvt_scalef32_pk_bf16_bf8 (global bfloat2 * out , unsigned int src , float scale )
1087
+ {
1088
+ * out = __builtin_amdgcn_cvt_scalef32_pk_bf16_bf8 (src , scale , true);
1089
+ * out = __builtin_amdgcn_cvt_scalef32_pk_bf16_bf8 (src , scale , false);
1090
+ }
0 commit comments