-
Notifications
You must be signed in to change notification settings - Fork 13.6k
AMDGPU: Remove FeatureCvtFP8VOP1Bug from gfx950 #117827
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Merged
arsenm
merged 1 commit into
main
from
users/arsenm/gfx950/remove-feature-cvt-fp8-vop1-bug
Nov 27, 2024
Merged
AMDGPU: Remove FeatureCvtFP8VOP1Bug from gfx950 #117827
arsenm
merged 1 commit into
main
from
users/arsenm/gfx950/remove-feature-cvt-fp8-vop1-bug
Nov 27, 2024
Conversation
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This was referenced Nov 27, 2024
Merged
6a5a2df
to
e9df017
Compare
@llvm/pr-subscribers-backend-amdgpu Author: Matt Arsenault (arsenm) ChangesPatch is 256.37 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/117827.diff 35 Files Affected:
diff --git a/clang/include/clang/Basic/BuiltinsAMDGPU.def b/clang/include/clang/Basic/BuiltinsAMDGPU.def
index fec0838823e9a1..752a751a25a0c7 100644
--- a/clang/include/clang/Basic/BuiltinsAMDGPU.def
+++ b/clang/include/clang/Basic/BuiltinsAMDGPU.def
@@ -598,6 +598,27 @@ TARGET_BUILTIN(__builtin_amdgcn_cvt_scalef32_pk_f16_bf8, "V2hUifIb", "nc", "bf8-
TARGET_BUILTIN(__builtin_amdgcn_cvt_scalef32_pk_bf16_bf8, "V2yUifIb", "nc", "bf8-cvt-scale-insts")
TARGET_BUILTIN(__builtin_amdgcn_cvt_scalef32_pk_fp4_f16, "UiUiV2hfIi", "nc", "fp4-cvt-scale-insts")
TARGET_BUILTIN(__builtin_amdgcn_cvt_scalef32_pk_fp4_bf16, "UiUiV2yfIi", "nc", "fp4-cvt-scale-insts")
+TARGET_BUILTIN(__builtin_amdgcn_cvt_scalef32_sr_pk_fp4_f16, "UiUiV2hUifIi", "nc", "fp4-cvt-scale-insts")
+TARGET_BUILTIN(__builtin_amdgcn_cvt_scalef32_sr_pk_fp4_bf16, "UiUiV2yUifIi", "nc", "fp4-cvt-scale-insts")
+TARGET_BUILTIN(__builtin_amdgcn_cvt_scalef32_sr_pk_fp4_f32, "UiUiV2fUifIi", "nc", "fp4-cvt-scale-insts")
+TARGET_BUILTIN(__builtin_amdgcn_cvt_scalef32_sr_bf8_bf16, "iiyUifIi", "nc", "bf8-cvt-scale-insts")
+TARGET_BUILTIN(__builtin_amdgcn_cvt_scalef32_sr_bf8_f16, "iihUifIi", "nc", "bf8-cvt-scale-insts")
+TARGET_BUILTIN(__builtin_amdgcn_cvt_scalef32_sr_bf8_f32, "iifUifIi", "nc", "bf8-cvt-scale-insts")
+TARGET_BUILTIN(__builtin_amdgcn_cvt_scalef32_sr_fp8_bf16, "iiyUifIi", "nc", "fp8-cvt-scale-insts")
+TARGET_BUILTIN(__builtin_amdgcn_cvt_scalef32_sr_fp8_f16, "iihUifIi", "nc", "fp8-cvt-scale-insts")
+TARGET_BUILTIN(__builtin_amdgcn_cvt_scalef32_sr_fp8_f32, "iifUifIi", "nc", "fp8-cvt-scale-insts")
+
+TARGET_BUILTIN(__builtin_amdgcn_cvt_scalef32_sr_pk32_bf6_bf16, "V6UiV32yUif", "nc", "f16bf16-to-fp6bf6-cvt-scale-insts")
+TARGET_BUILTIN(__builtin_amdgcn_cvt_scalef32_sr_pk32_bf6_f16, "V6UiV32hUif", "nc", "f16bf16-to-fp6bf6-cvt-scale-insts")
+TARGET_BUILTIN(__builtin_amdgcn_cvt_scalef32_sr_pk32_bf6_f32, "V6UiV32fUif", "nc", "f16bf16-to-fp6bf6-cvt-scale-insts")
+TARGET_BUILTIN(__builtin_amdgcn_cvt_scalef32_sr_pk32_fp6_bf16, "V6UiV32yUif", "nc", "f16bf16-to-fp6bf6-cvt-scale-insts")
+TARGET_BUILTIN(__builtin_amdgcn_cvt_scalef32_sr_pk32_fp6_f16, "V6UiV32hUif", "nc", "f16bf16-to-fp6bf6-cvt-scale-insts")
+TARGET_BUILTIN(__builtin_amdgcn_cvt_scalef32_sr_pk32_fp6_f32, "V6UiV32fUif", "nc", "f16bf16-to-fp6bf6-cvt-scale-insts")
+TARGET_BUILTIN(__builtin_amdgcn_bitop3_b32, "iiiiIUc", "nc", "bitop3-insts")
+TARGET_BUILTIN(__builtin_amdgcn_bitop3_b16, "ssssIUc", "nc", "bitop3-insts")
+
+TARGET_BUILTIN(__builtin_amdgcn_cvt_sr_bf16_f32, "V2yV2yfUiIb", "nc", "f32-to-f16bf16-cvt-sr-insts")
+TARGET_BUILTIN(__builtin_amdgcn_cvt_sr_f16_f32, "V2hV2hfUiIb", "nc", "f32-to-f16bf16-cvt-sr-insts")
#undef BUILTIN
#undef TARGET_BUILTIN
diff --git a/clang/include/clang/Basic/DiagnosticSemaKinds.td b/clang/include/clang/Basic/DiagnosticSemaKinds.td
index 834e588c18e376..3317c4e93199b0 100644
--- a/clang/include/clang/Basic/DiagnosticSemaKinds.td
+++ b/clang/include/clang/Basic/DiagnosticSemaKinds.td
@@ -12787,5 +12787,5 @@ def err_acc_loop_not_monotonic
// AMDGCN builtins diagnostics
def err_amdgcn_global_load_lds_size_invalid_value : Error<"invalid size value">;
-def note_amdgcn_global_load_lds_size_valid_value : Note<"size must be 1, 2, or 4">;
+def note_amdgcn_global_load_lds_size_valid_value : Note<"size must be %select{1, 2, or 4|1, 2, 4, 12 or 16}0">;
} // end of sema component.
diff --git a/clang/lib/CodeGen/CGBuiltin.cpp b/clang/lib/CodeGen/CGBuiltin.cpp
index f32d5a2f43559a..e0504c0e38b22a 100644
--- a/clang/lib/CodeGen/CGBuiltin.cpp
+++ b/clang/lib/CodeGen/CGBuiltin.cpp
@@ -748,6 +748,18 @@ static Value *emitBuiltinWithOneOverloadedType(CodeGenFunction &CGF,
return CGF.Builder.CreateCall(F, Args, Name);
}
+// Emit an intrinsic that has 4 operands of the same type as its result.
+static Value *emitQuaternaryBuiltin(CodeGenFunction &CGF, const CallExpr *E,
+ unsigned IntrinsicID) {
+ llvm::Value *Src0 = CGF.EmitScalarExpr(E->getArg(0));
+ llvm::Value *Src1 = CGF.EmitScalarExpr(E->getArg(1));
+ llvm::Value *Src2 = CGF.EmitScalarExpr(E->getArg(2));
+ llvm::Value *Src3 = CGF.EmitScalarExpr(E->getArg(3));
+
+ Function *F = CGF.CGM.getIntrinsic(IntrinsicID, Src0->getType());
+ return CGF.Builder.CreateCall(F, {Src0, Src1, Src2, Src3});
+}
+
// Emit an intrinsic that has 1 float or double operand, and 1 integer.
static Value *emitFPIntBuiltin(CodeGenFunction &CGF,
const CallExpr *E,
@@ -20250,6 +20262,9 @@ Value *CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned BuiltinID,
Builder.CreateInsertElement(Insert0, Elt1, UINT64_C(1));
return AsVector;
}
+ case AMDGPU::BI__builtin_amdgcn_bitop3_b32:
+ case AMDGPU::BI__builtin_amdgcn_bitop3_b16:
+ return emitQuaternaryBuiltin(*this, E, Intrinsic::amdgcn_bitop3);
case AMDGPU::BI__builtin_amdgcn_make_buffer_rsrc:
return emitBuiltinWithOneOverloadedType<4>(
*this, E, Intrinsic::amdgcn_make_buffer_rsrc);
diff --git a/clang/lib/Sema/SemaAMDGPU.cpp b/clang/lib/Sema/SemaAMDGPU.cpp
index 02710e51bc0d19..a4d075dfd0768c 100644
--- a/clang/lib/Sema/SemaAMDGPU.cpp
+++ b/clang/lib/Sema/SemaAMDGPU.cpp
@@ -26,6 +26,14 @@ bool SemaAMDGPU::CheckAMDGCNBuiltinFunctionCall(unsigned BuiltinID,
CallExpr *TheCall) {
// position of memory order and scope arguments in the builtin
unsigned OrderIndex, ScopeIndex;
+
+ const auto *FD = SemaRef.getCurFunctionDecl();
+ assert(FD && "AMDGPU builtins should not be used outside of a function");
+ llvm::StringMap<bool> CallerFeatureMap;
+ getASTContext().getFunctionFeatureMap(CallerFeatureMap, FD);
+ bool HasGFX950Insts =
+ Builtin::evaluateRequiredTargetFeatures("gfx950-insts", CallerFeatureMap);
+
switch (BuiltinID) {
case AMDGPU::BI__builtin_amdgcn_global_load_lds: {
constexpr const int SizeIdx = 2;
@@ -39,13 +47,19 @@ bool SemaAMDGPU::CheckAMDGCNBuiltinFunctionCall(unsigned BuiltinID,
case 2:
case 4:
return false;
+ case 12:
+ case 16: {
+ if (HasGFX950Insts)
+ return false;
+ [[fallthrough]];
+ }
default:
Diag(ArgExpr->getExprLoc(),
diag::err_amdgcn_global_load_lds_size_invalid_value)
<< ArgExpr->getSourceRange();
Diag(ArgExpr->getExprLoc(),
diag::note_amdgcn_global_load_lds_size_valid_value)
- << ArgExpr->getSourceRange();
+ << HasGFX950Insts << ArgExpr->getSourceRange();
return true;
}
}
diff --git a/clang/test/CodeGenOpenCL/amdgpu-features.cl b/clang/test/CodeGenOpenCL/amdgpu-features.cl
index d8f3025046d2e0..633f1dec5e3705 100644
--- a/clang/test/CodeGenOpenCL/amdgpu-features.cl
+++ b/clang/test/CodeGenOpenCL/amdgpu-features.cl
@@ -89,7 +89,7 @@
// GFX941: "target-features"="+16-bit-insts,+atomic-buffer-global-pk-add-f16-insts,+atomic-ds-pk-add-16-insts,+atomic-fadd-rtn-insts,+atomic-flat-pk-add-16-insts,+atomic-global-pk-add-bf16-inst,+ci-insts,+dl-insts,+dot1-insts,+dot10-insts,+dot2-insts,+dot3-insts,+dot4-insts,+dot5-insts,+dot6-insts,+dot7-insts,+dpp,+fp8-conversion-insts,+fp8-insts,+gfx8-insts,+gfx9-insts,+gfx90a-insts,+gfx940-insts,+mai-insts,+s-memrealtime,+s-memtime-inst,+wavefrontsize64,+xf32-insts"
// GFX942: "target-features"="+16-bit-insts,+atomic-buffer-global-pk-add-f16-insts,+atomic-ds-pk-add-16-insts,+atomic-fadd-rtn-insts,+atomic-flat-pk-add-16-insts,+atomic-global-pk-add-bf16-inst,+ci-insts,+dl-insts,+dot1-insts,+dot10-insts,+dot2-insts,+dot3-insts,+dot4-insts,+dot5-insts,+dot6-insts,+dot7-insts,+dpp,+fp8-conversion-insts,+fp8-insts,+gfx8-insts,+gfx9-insts,+gfx90a-insts,+gfx940-insts,+mai-insts,+s-memrealtime,+s-memtime-inst,+wavefrontsize64,+xf32-insts"
// GFX9_4_Generic: "target-features"="+16-bit-insts,+atomic-buffer-global-pk-add-f16-insts,+atomic-ds-pk-add-16-insts,+atomic-fadd-rtn-insts,+atomic-flat-pk-add-16-insts,+atomic-global-pk-add-bf16-inst,+ci-insts,+dl-insts,+dot1-insts,+dot10-insts,+dot2-insts,+dot3-insts,+dot4-insts,+dot5-insts,+dot6-insts,+dot7-insts,+dpp,+gfx8-insts,+gfx9-insts,+gfx90a-insts,+gfx940-insts,+mai-insts,+s-memrealtime,+s-memtime-inst,+wavefrontsize64"
-// GFX950: "target-features"="+16-bit-insts,+ashr-pk-insts,+atomic-buffer-global-pk-add-f16-insts,+atomic-buffer-pk-add-bf16-inst,+atomic-ds-pk-add-16-insts,+atomic-fadd-rtn-insts,+atomic-flat-pk-add-16-insts,+atomic-global-pk-add-bf16-inst,+bf8-cvt-scale-insts,+ci-insts,+dl-insts,+dot1-insts,+dot10-insts,+dot12-insts,+dot13-insts,+dot2-insts,+dot3-insts,+dot4-insts,+dot5-insts,+dot6-insts,+dot7-insts,+dpp,+f16bf16-to-fp6bf6-cvt-scale-insts,+fp4-cvt-scale-insts,+fp6bf6-cvt-scale-insts,+fp8-conversion-insts,+fp8-cvt-scale-insts,+fp8-insts,+gfx8-insts,+gfx9-insts,+gfx90a-insts,+gfx940-insts,+gfx950-insts,+mai-insts,+permlane16-swap,+permlane32-swap,+prng-inst,+s-memrealtime,+s-memtime-inst,+wavefrontsize64"
+// GFX950: "target-features"="+16-bit-insts,+ashr-pk-insts,+atomic-buffer-global-pk-add-f16-insts,+atomic-buffer-pk-add-bf16-inst,+atomic-ds-pk-add-16-insts,+atomic-fadd-rtn-insts,+atomic-flat-pk-add-16-insts,+atomic-global-pk-add-bf16-inst,+bf8-cvt-scale-insts,+bitop3-insts,+ci-insts,+dl-insts,+dot1-insts,+dot10-insts,+dot12-insts,+dot13-insts,+dot2-insts,+dot3-insts,+dot4-insts,+dot5-insts,+dot6-insts,+dot7-insts,+dpp,+f16bf16-to-fp6bf6-cvt-scale-insts,+f32-to-f16bf16-cvt-sr-insts,+fp4-cvt-scale-insts,+fp6bf6-cvt-scale-insts,+fp8-conversion-insts,+fp8-cvt-scale-insts,+fp8-insts,+gfx8-insts,+gfx9-insts,+gfx90a-insts,+gfx940-insts,+gfx950-insts,+mai-insts,+permlane16-swap,+permlane32-swap,+prng-inst,+s-memrealtime,+s-memtime-inst,+wavefrontsize64"
// GFX1010: "target-features"="+16-bit-insts,+ci-insts,+dl-insts,+dpp,+gfx10-insts,+gfx8-insts,+gfx9-insts,+s-memrealtime,+s-memtime-inst,+wavefrontsize32"
// GFX1011: "target-features"="+16-bit-insts,+ci-insts,+dl-insts,+dot1-insts,+dot10-insts,+dot2-insts,+dot5-insts,+dot6-insts,+dot7-insts,+dpp,+gfx10-insts,+gfx8-insts,+gfx9-insts,+s-memrealtime,+s-memtime-inst,+wavefrontsize32"
// GFX1012: "target-features"="+16-bit-insts,+ci-insts,+dl-insts,+dot1-insts,+dot10-insts,+dot2-insts,+dot5-insts,+dot6-insts,+dot7-insts,+dpp,+gfx10-insts,+gfx8-insts,+gfx9-insts,+s-memrealtime,+s-memtime-inst,+wavefrontsize32"
diff --git a/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx950-err.cl b/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx950-err.cl
index d91db0a4afa868..521121f5e7e543 100644
--- a/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx950-err.cl
+++ b/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx950-err.cl
@@ -11,6 +11,7 @@
// REQUIRES: amdgpu-registered-target
typedef unsigned int uint;
+typedef unsigned short ushort;
typedef unsigned int uint2 __attribute__((ext_vector_type(2)));
typedef half __attribute__((ext_vector_type(2))) half2;
typedef short __attribute__((ext_vector_type(2))) short2;
@@ -21,8 +22,8 @@ typedef unsigned int __attribute__((ext_vector_type(6))) uint6;
typedef half __attribute__((ext_vector_type(32))) half32;
typedef __bf16 __attribute__((ext_vector_type(32))) bfloat32;
-void test(global uint* out, global uint2* out_v2u32, uint a, uint b, global half2* out_v2f16, global float* out_f32, float scale, global short2* out_v2i16, float src0, float src1,
- global float2* out_v2f32, half2 src0_v2f16, bfloat2 src0_v2bf16, global bfloat2* out_v2bf16, global float32* out_v32f32, uint6 src_v6i32,
+void test(global uint* out, global uint2* out_v2u32, uint a, uint b, uint c, global half2* out_v2f16, global float* out_f32, float scale, unsigned seed, global short2* out_v2i16, float src0, float src1,
+ float2 src0_v2f32, global float2* out_v2f32, half2 src0_v2f16, bfloat2 src0_v2bf16, global bfloat2* out_v2bf16, global float32* out_v32f32, uint6 src_v6i32,
global half32 *out_v32f16, global bfloat32 *out_v32bf16) {
*out = __builtin_amdgcn_prng_b32(a); // expected-error{{'__builtin_amdgcn_prng_b32' needs target feature prng-inst}}
*out_v2u32 = __builtin_amdgcn_permlane16_swap(a, b, false, false); // expected-error{{'__builtin_amdgcn_permlane16_swap' needs target feature permlane16-swap}}
@@ -55,4 +56,11 @@ void test(global uint* out, global uint2* out_v2u32, uint a, uint b, global half
*out_v2bf16 = __builtin_amdgcn_cvt_scalef32_pk_bf16_bf8(a, scale, true); // expected-error{{'__builtin_amdgcn_cvt_scalef32_pk_bf16_bf8' needs target feature bf8-cvt-scale-insts}}
*out = __builtin_amdgcn_cvt_scalef32_pk_fp4_f16(*out, src0_v2f16, scale, 3); // expected-error{{'__builtin_amdgcn_cvt_scalef32_pk_fp4_f16' needs target feature fp4-cvt-scale-insts}}
*out = __builtin_amdgcn_cvt_scalef32_pk_fp4_bf16(*out, src0_v2bf16, scale, 3); // expected-error{{'__builtin_amdgcn_cvt_scalef32_pk_fp4_bf16' needs target feature fp4-cvt-scale-insts}}
+ *out = __builtin_amdgcn_cvt_scalef32_sr_pk_fp4_f16(*out, src0_v2f16, 0, scale, 0); // expected-error{{'__builtin_amdgcn_cvt_scalef32_sr_pk_fp4_f16' needs target feature fp4-cvt-scale-insts}}
+ *out = __builtin_amdgcn_cvt_scalef32_sr_pk_fp4_bf16(*out, src0_v2bf16, 0, scale, 0); // expected-error{{'__builtin_amdgcn_cvt_scalef32_sr_pk_fp4_bf16' needs target feature fp4-cvt-scale-insts}}
+ *out = __builtin_amdgcn_cvt_scalef32_sr_pk_fp4_f32(*out, src0_v2f32, 0, scale, 0); // expected-error{{'__builtin_amdgcn_cvt_scalef32_sr_pk_fp4_f32' needs target feature fp4-cvt-scale-insts}}
+ *out = __builtin_amdgcn_bitop3_b32(a, b, c, 1); // expected-error {{'__builtin_amdgcn_bitop3_b32' needs target feature bitop3-insts}}
+ *out = __builtin_amdgcn_bitop3_b16((ushort)a, (ushort)b, (ushort)c, 1); // expected-error {{'__builtin_amdgcn_bitop3_b16' needs target feature bitop3-insts}}
+ *out_v2bf16 = __builtin_amdgcn_cvt_sr_bf16_f32(*out_v2bf16, src0, seed, 0); // expected-error{{'__builtin_amdgcn_cvt_sr_bf16_f32' needs target feature f32-to-f16bf16-cvt-sr-insts}}
+ *out_v2f16 = __builtin_amdgcn_cvt_sr_f16_f32(*out_v2f16, src0, seed, 0); // expected-error{{'__builtin_amdgcn_cvt_sr_f16_f32' needs target feature f32-to-f16bf16-cvt-sr-insts}}
}
diff --git a/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx950.cl b/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx950.cl
index ac4c43e1db7bde..f1259ef678f1e0 100644
--- a/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx950.cl
+++ b/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx950.cl
@@ -5,6 +5,7 @@
#pragma OPENCL EXTENSION cl_khr_fp16 : enable
typedef unsigned int uint;
+typedef unsigned short ushort;
typedef unsigned int __attribute__((ext_vector_type(2))) uint2;
typedef unsigned int __attribute__((ext_vector_type(6))) uint6;
typedef __bf16 __attribute__((ext_vector_type(32))) bfloat32;
@@ -1182,3 +1183,615 @@ void test_cvt_scalef32_pk_fp4_bf16(global unsigned int* out, bfloat2 src, float
*out = __builtin_amdgcn_cvt_scalef32_pk_fp4_bf16(*out, src, scale, 2);
*out = __builtin_amdgcn_cvt_scalef32_pk_fp4_bf16(*out, src, scale, 3);
}
+
+// CHECK-LABEL: @test_cvt_scalef32_sr_pk_fp4_f16(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[OUT_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5)
+// CHECK-NEXT: [[SRC_ADDR:%.*]] = alloca <2 x half>, align 4, addrspace(5)
+// CHECK-NEXT: [[SEED_ADDR:%.*]] = alloca i32, align 4, addrspace(5)
+// CHECK-NEXT: [[SCALE_ADDR:%.*]] = alloca float, align 4, addrspace(5)
+// CHECK-NEXT: store ptr addrspace(1) [[OUT:%.*]], ptr addrspace(5) [[OUT_ADDR]], align 8
+// CHECK-NEXT: store <2 x half> [[SRC:%.*]], ptr addrspace(5) [[SRC_ADDR]], align 4
+// CHECK-NEXT: store i32 [[SEED:%.*]], ptr addrspace(5) [[SEED_ADDR]], align 4
+// CHECK-NEXT: store float [[SCALE:%.*]], ptr addrspace(5) [[SCALE_ADDR]], align 4
+// CHECK-NEXT: [[TMP0:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
+// CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr addrspace(1) [[TMP0]], align 4
+// CHECK-NEXT: [[TMP2:%.*]] = load <2 x half>, ptr addrspace(5) [[SRC_ADDR]], align 4
+// CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr addrspace(5) [[SEED_ADDR]], align 4
+// CHECK-NEXT: [[TMP4:%.*]] = load float, ptr addrspace(5) [[SCALE_ADDR]], align 4
+// CHECK-NEXT: [[TMP5:%.*]] = call i32 @llvm.amdgcn.cvt.scalef32.sr.pk.fp4.f16(i32 [[TMP1]], <2 x half> [[TMP2]], i32 [[TMP3]], float [[TMP4]], i32 0)
+// CHECK-NEXT: [[TMP6:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
+// CHECK-NEXT: store i32 [[TMP5]], ptr addrspace(1) [[TMP6]], align 4
+// CHECK-NEXT: [[TMP7:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
+// CHECK-NEXT: [[TMP8:%.*]] = load i32, ptr addrspace(1) [[TMP7]], align 4
+// CHECK-NEXT: [[TMP9:%.*]] = load <2 x half>, ptr addrspace(5) [[SRC_ADDR]], align 4
+// CHECK-NEXT: [[TMP10:%.*]] = load i32, ptr addrspace(5) [[SEED_ADDR]], align 4
+// CHECK-NEXT: [[TMP11:%.*]] = load float, ptr addrspace(5) [[SCALE_ADDR]], align 4
+// CHECK-NEXT: [[TMP12:%.*]] = call i32 @llvm.amdgcn.cvt.scalef32.sr.pk.fp4.f16(i32 [[TMP8]], <2 x half> [[TMP9]], i32 [[TMP10]], float [[TMP11]], i32 1)
+// CHECK-NEXT: [[TMP13:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
+// CHECK-NEXT: store i32 [[TMP12]], ptr addrspace(1) [[TMP13]], align 4
+// CHECK-NEXT: [[TMP14:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
+// CHECK-NEXT: [[TMP15:%.*]] = load i32, ptr addrspace(1) [[TMP14]], align 4
+// CHECK-NEXT: [[TMP16:%.*]] = load <2 x half>, ptr addrspace(5) [[SRC_ADDR]], align 4
+// CHECK-NEXT: [[TMP17:%.*]] = load i32, ptr addrspace(5) [[SEED_ADDR]], align 4
+// CHECK-NEXT: [[TMP18:%.*]] = load float, ptr addrspace(5) [[SCALE_ADDR]], align 4
+// CHECK-NEXT: [[TMP19:%.*]] = call i32 @llvm.amdgcn.cvt.scalef32.sr.pk.fp4.f16(i32 [[TMP15]], <2 x half> [[TMP16]], i32 [[TMP17]], float [[TMP18]], i32 2)
+// CHECK-NEXT: [[TMP20:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
+// CHECK-NEXT: store i32 [[TMP19]], ptr addrspace(1) [[TMP20]], align 4
+// CHECK-NEXT: [[TMP21:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
+// CHECK-NEXT: [[TMP22:%.*]] = load i32, ptr addrspace(1) [[TMP21]], align 4
+// CHECK-NEXT: [[TMP23:%.*]] = load <2 x half>, ptr addrspace(5) [[SRC_ADDR]], align 4
+// CHECK-NEXT: [[TMP24:%.*]] = load i32, ptr addrspace(5) [[SEED_ADDR]], align 4
+// CHECK-NEXT: [[TMP25:%.*]] = load float, ptr addrspace(5) [[SCALE_ADDR]], align 4
+// CHECK-NEXT: [[TMP26:%.*]] = call i32 @llvm.amdgcn.cvt.scalef32.sr.pk.fp4.f16(i32 [[TMP22]], <2 x half> [[TMP23]], i32 [[TMP24]], float [[TMP25]], i32 3)
+// CHECK-NEXT: [[TMP27:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
+// CHECK-NEXT: store i32 [[TMP26]], ptr addrspace(1) [[TMP27]], align 4
+// CHECK-NEXT: ret void
+//
+void test_cvt_scalef32_sr_pk_fp4_f16(global unsigned *out, half2 src, uint seed, float scale)
+{
+ *out = __builtin_amdgcn_cvt_scalef32_sr_pk_fp4_f16(*out, src, seed, scale, 0);
+ *out = __builtin_amdgcn_cvt_scalef32_sr_pk_fp4_f16(*out, src, seed, scale, 1);
+ *out = __builtin_amdgcn_cvt_scalef32_sr_pk_fp4_f16(*out, src, seed, scale, 2);
+ *out = __builtin_amdgcn_cvt_scalef32_sr_pk_fp4_f16(*out, src, seed, scale, 3);
+}
+
+// CHECK-LABEL: @test_cvt_scalef32_sr_pk_fp4_bf16(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[OUT_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5)
+// CHECK-NEXT: [[SRC_ADDR:%.*]] = alloca <2 x bfloat>, align 4, addrspace(5)
+// CHECK-NEXT: [[SEED_ADDR:%.*]] = alloca i32, align 4, addrspace(5)
+// CHECK-NEXT: [[SCALE_ADDR:%.*]] = alloca float, align 4, addrspace(5)
+// CHECK-NEXT: store ptr addrspace(1) [[OUT:%.*]], ptr addrspace(5) [[OUT_ADDR]], align 8
+// CHECK-NEXT: store <2 x bfloat> [[SRC:%.*]], ptr addrspace(5) [[SRC_ADDR]], align 4
+// CHECK-NEXT: store i32 [[SEED:%.*]], ptr addrspace(5) [[SEED_ADDR]], align 4
+// CHECK-NEXT: store float [[SCALE:%.*]], ptr addrspace(5) [[SCALE_ADDR]], align 4
+// CHECK-NEXT: [[TMP0:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
+// CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr addrspace(1) [[TMP0]], align 4
+// CHECK-NEXT: [[TMP2:%.*]] = load <2 x bfloat>, ptr addrspace(5) [[SRC_ADDR]], align 4
+// CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr addrspace(5) [[SEED_ADDR]], align 4
+// CHECK-NEXT: [[TMP4:%.*]] = load float, ptr addrspace(5) [[SCALE_ADDR]], align 4
+// CHECK-NEXT: [[TMP5:%.*]] = call i32 @llvm.amdgcn.cvt.scalef32.sr.pk.fp4.bf16(i32 [[TMP1]], <2 x bfloat> [[TMP2]], i32 [[TMP3]], float [[TMP4...
[truncated]
|
5f45ccd
to
5241f29
Compare
shiltian
approved these changes
Nov 27, 2024
e9df017
to
1d53ad5
Compare
5241f29
to
f83b18c
Compare
This was referenced Nov 27, 2024
1d53ad5
to
2e790ba
Compare
Base automatically changed from
users/arsenm/gfx950/verify-f8f6f4-format-assembler
to
main
November 27, 2024 04:45
f83b18c
to
ff296c1
Compare
ff296c1
to
11464f0
Compare
This was referenced Dec 2, 2024
searlmc1
pushed a commit
to ROCm/llvm-project
that referenced
this pull request
Feb 3, 2025
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Add this suggestion to a batch that can be applied as a single commit.
This suggestion is invalid because no changes were made to the code.
Suggestions cannot be applied while the pull request is closed.
Suggestions cannot be applied while viewing a subset of changes.
Only one suggestion per line can be applied in a batch.
Add this suggestion to a batch that can be applied as a single commit.
Applying suggestions on deleted lines is not supported.
You must change the existing code in this line in order to create a valid suggestion.
Outdated suggestions cannot be applied.
This suggestion has been applied or marked resolved.
Suggestions cannot be applied from pending reviews.
Suggestions cannot be applied on multi-line comments.
Suggestions cannot be applied while the pull request is queued to merge.
Suggestion cannot be applied right now. Please check back later.
No description provided.