@@ -1088,3 +1088,97 @@ void test_cvt_scalef32_pk_bf16_bf8(global bfloat2* out, unsigned int src, float
1088
1088
* out = __builtin_amdgcn_cvt_scalef32_pk_bf16_bf8 (src , scale , true);
1089
1089
* out = __builtin_amdgcn_cvt_scalef32_pk_bf16_bf8 (src , scale , false);
1090
1090
}
1091
+
1092
+ // CHECK-LABEL: @test_cvt_scalef32_pk_fp4_f16(
1093
+ // CHECK-NEXT: entry:
1094
+ // CHECK-NEXT: [[OUT_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5)
1095
+ // CHECK-NEXT: [[SRC_ADDR:%.*]] = alloca <2 x half>, align 4, addrspace(5)
1096
+ // CHECK-NEXT: [[SCALE_ADDR:%.*]] = alloca float, align 4, addrspace(5)
1097
+ // CHECK-NEXT: store ptr addrspace(1) [[OUT:%.*]], ptr addrspace(5) [[OUT_ADDR]], align 8
1098
+ // CHECK-NEXT: store <2 x half> [[SRC:%.*]], ptr addrspace(5) [[SRC_ADDR]], align 4
1099
+ // CHECK-NEXT: store float [[SCALE:%.*]], ptr addrspace(5) [[SCALE_ADDR]], align 4
1100
+ // CHECK-NEXT: [[TMP0:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
1101
+ // CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr addrspace(1) [[TMP0]], align 4
1102
+ // CHECK-NEXT: [[TMP2:%.*]] = load <2 x half>, ptr addrspace(5) [[SRC_ADDR]], align 4
1103
+ // CHECK-NEXT: [[TMP3:%.*]] = load float, ptr addrspace(5) [[SCALE_ADDR]], align 4
1104
+ // CHECK-NEXT: [[TMP4:%.*]] = call i32 @llvm.amdgcn.cvt.scalef32.pk.fp4.f16(i32 [[TMP1]], <2 x half> [[TMP2]], float [[TMP3]], i32 0)
1105
+ // CHECK-NEXT: [[TMP5:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
1106
+ // CHECK-NEXT: store i32 [[TMP4]], ptr addrspace(1) [[TMP5]], align 4
1107
+ // CHECK-NEXT: [[TMP6:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
1108
+ // CHECK-NEXT: [[TMP7:%.*]] = load i32, ptr addrspace(1) [[TMP6]], align 4
1109
+ // CHECK-NEXT: [[TMP8:%.*]] = load <2 x half>, ptr addrspace(5) [[SRC_ADDR]], align 4
1110
+ // CHECK-NEXT: [[TMP9:%.*]] = load float, ptr addrspace(5) [[SCALE_ADDR]], align 4
1111
+ // CHECK-NEXT: [[TMP10:%.*]] = call i32 @llvm.amdgcn.cvt.scalef32.pk.fp4.f16(i32 [[TMP7]], <2 x half> [[TMP8]], float [[TMP9]], i32 1)
1112
+ // CHECK-NEXT: [[TMP11:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
1113
+ // CHECK-NEXT: store i32 [[TMP10]], ptr addrspace(1) [[TMP11]], align 4
1114
+ // CHECK-NEXT: [[TMP12:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
1115
+ // CHECK-NEXT: [[TMP13:%.*]] = load i32, ptr addrspace(1) [[TMP12]], align 4
1116
+ // CHECK-NEXT: [[TMP14:%.*]] = load <2 x half>, ptr addrspace(5) [[SRC_ADDR]], align 4
1117
+ // CHECK-NEXT: [[TMP15:%.*]] = load float, ptr addrspace(5) [[SCALE_ADDR]], align 4
1118
+ // CHECK-NEXT: [[TMP16:%.*]] = call i32 @llvm.amdgcn.cvt.scalef32.pk.fp4.f16(i32 [[TMP13]], <2 x half> [[TMP14]], float [[TMP15]], i32 2)
1119
+ // CHECK-NEXT: [[TMP17:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
1120
+ // CHECK-NEXT: store i32 [[TMP16]], ptr addrspace(1) [[TMP17]], align 4
1121
+ // CHECK-NEXT: [[TMP18:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
1122
+ // CHECK-NEXT: [[TMP19:%.*]] = load i32, ptr addrspace(1) [[TMP18]], align 4
1123
+ // CHECK-NEXT: [[TMP20:%.*]] = load <2 x half>, ptr addrspace(5) [[SRC_ADDR]], align 4
1124
+ // CHECK-NEXT: [[TMP21:%.*]] = load float, ptr addrspace(5) [[SCALE_ADDR]], align 4
1125
+ // CHECK-NEXT: [[TMP22:%.*]] = call i32 @llvm.amdgcn.cvt.scalef32.pk.fp4.f16(i32 [[TMP19]], <2 x half> [[TMP20]], float [[TMP21]], i32 3)
1126
+ // CHECK-NEXT: [[TMP23:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
1127
+ // CHECK-NEXT: store i32 [[TMP22]], ptr addrspace(1) [[TMP23]], align 4
1128
+ // CHECK-NEXT: ret void
1129
+ //
1130
+ void test_cvt_scalef32_pk_fp4_f16 (global unsigned int * out , half2 src , float scale )
1131
+ {
1132
+ * out = __builtin_amdgcn_cvt_scalef32_pk_fp4_f16 (* out , src , scale , 0 );
1133
+ * out = __builtin_amdgcn_cvt_scalef32_pk_fp4_f16 (* out , src , scale , 1 );
1134
+ * out = __builtin_amdgcn_cvt_scalef32_pk_fp4_f16 (* out , src , scale , 2 );
1135
+ * out = __builtin_amdgcn_cvt_scalef32_pk_fp4_f16 (* out , src , scale , 3 );
1136
+ }
1137
+
1138
+ // CHECK-LABEL: @test_cvt_scalef32_pk_fp4_bf16(
1139
+ // CHECK-NEXT: entry:
1140
+ // CHECK-NEXT: [[OUT_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5)
1141
+ // CHECK-NEXT: [[SRC_ADDR:%.*]] = alloca <2 x bfloat>, align 4, addrspace(5)
1142
+ // CHECK-NEXT: [[SCALE_ADDR:%.*]] = alloca float, align 4, addrspace(5)
1143
+ // CHECK-NEXT: [[OLD_ADDR:%.*]] = alloca i32, align 4, addrspace(5)
1144
+ // CHECK-NEXT: store ptr addrspace(1) [[OUT:%.*]], ptr addrspace(5) [[OUT_ADDR]], align 8
1145
+ // CHECK-NEXT: store <2 x bfloat> [[SRC:%.*]], ptr addrspace(5) [[SRC_ADDR]], align 4
1146
+ // CHECK-NEXT: store float [[SCALE:%.*]], ptr addrspace(5) [[SCALE_ADDR]], align 4
1147
+ // CHECK-NEXT: store i32 [[OLD:%.*]], ptr addrspace(5) [[OLD_ADDR]], align 4
1148
+ // CHECK-NEXT: [[TMP0:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
1149
+ // CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr addrspace(1) [[TMP0]], align 4
1150
+ // CHECK-NEXT: [[TMP2:%.*]] = load <2 x bfloat>, ptr addrspace(5) [[SRC_ADDR]], align 4
1151
+ // CHECK-NEXT: [[TMP3:%.*]] = load float, ptr addrspace(5) [[SCALE_ADDR]], align 4
1152
+ // CHECK-NEXT: [[TMP4:%.*]] = call i32 @llvm.amdgcn.cvt.scalef32.pk.fp4.bf16(i32 [[TMP1]], <2 x bfloat> [[TMP2]], float [[TMP3]], i32 0)
1153
+ // CHECK-NEXT: [[TMP5:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
1154
+ // CHECK-NEXT: store i32 [[TMP4]], ptr addrspace(1) [[TMP5]], align 4
1155
+ // CHECK-NEXT: [[TMP6:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
1156
+ // CHECK-NEXT: [[TMP7:%.*]] = load i32, ptr addrspace(1) [[TMP6]], align 4
1157
+ // CHECK-NEXT: [[TMP8:%.*]] = load <2 x bfloat>, ptr addrspace(5) [[SRC_ADDR]], align 4
1158
+ // CHECK-NEXT: [[TMP9:%.*]] = load float, ptr addrspace(5) [[SCALE_ADDR]], align 4
1159
+ // CHECK-NEXT: [[TMP10:%.*]] = call i32 @llvm.amdgcn.cvt.scalef32.pk.fp4.bf16(i32 [[TMP7]], <2 x bfloat> [[TMP8]], float [[TMP9]], i32 1)
1160
+ // CHECK-NEXT: [[TMP11:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
1161
+ // CHECK-NEXT: store i32 [[TMP10]], ptr addrspace(1) [[TMP11]], align 4
1162
+ // CHECK-NEXT: [[TMP12:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
1163
+ // CHECK-NEXT: [[TMP13:%.*]] = load i32, ptr addrspace(1) [[TMP12]], align 4
1164
+ // CHECK-NEXT: [[TMP14:%.*]] = load <2 x bfloat>, ptr addrspace(5) [[SRC_ADDR]], align 4
1165
+ // CHECK-NEXT: [[TMP15:%.*]] = load float, ptr addrspace(5) [[SCALE_ADDR]], align 4
1166
+ // CHECK-NEXT: [[TMP16:%.*]] = call i32 @llvm.amdgcn.cvt.scalef32.pk.fp4.bf16(i32 [[TMP13]], <2 x bfloat> [[TMP14]], float [[TMP15]], i32 2)
1167
+ // CHECK-NEXT: [[TMP17:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
1168
+ // CHECK-NEXT: store i32 [[TMP16]], ptr addrspace(1) [[TMP17]], align 4
1169
+ // CHECK-NEXT: [[TMP18:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
1170
+ // CHECK-NEXT: [[TMP19:%.*]] = load i32, ptr addrspace(1) [[TMP18]], align 4
1171
+ // CHECK-NEXT: [[TMP20:%.*]] = load <2 x bfloat>, ptr addrspace(5) [[SRC_ADDR]], align 4
1172
+ // CHECK-NEXT: [[TMP21:%.*]] = load float, ptr addrspace(5) [[SCALE_ADDR]], align 4
1173
+ // CHECK-NEXT: [[TMP22:%.*]] = call i32 @llvm.amdgcn.cvt.scalef32.pk.fp4.bf16(i32 [[TMP19]], <2 x bfloat> [[TMP20]], float [[TMP21]], i32 3)
1174
+ // CHECK-NEXT: [[TMP23:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
1175
+ // CHECK-NEXT: store i32 [[TMP22]], ptr addrspace(1) [[TMP23]], align 4
1176
+ // CHECK-NEXT: ret void
1177
+ //
1178
+ void test_cvt_scalef32_pk_fp4_bf16 (global unsigned int * out , bfloat2 src , float scale , uint old )
1179
+ {
1180
+ * out = __builtin_amdgcn_cvt_scalef32_pk_fp4_bf16 (* out , src , scale , 0 );
1181
+ * out = __builtin_amdgcn_cvt_scalef32_pk_fp4_bf16 (* out , src , scale , 1 );
1182
+ * out = __builtin_amdgcn_cvt_scalef32_pk_fp4_bf16 (* out , src , scale , 2 );
1183
+ * out = __builtin_amdgcn_cvt_scalef32_pk_fp4_bf16 (* out , src , scale , 3 );
1184
+ }
0 commit comments