-
Notifications
You must be signed in to change notification settings - Fork 13.6k
AMDGPU: Add v_permlane16_swap_b32 and v_permlane32_swap_b32 for gfx950 #117260
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Conversation
@llvm/pr-subscribers-llvm-ir @llvm/pr-subscribers-backend-amdgpu Author: Matt Arsenault (arsenm) ChangesThis was a bit annoying because these introduce a new special case The 2 registers are swapped, so this has 2 defs. Ideally the builtin Patch is 57.19 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/117260.diff 28 Files Affected:
diff --git a/clang/include/clang/Basic/BuiltinsAMDGPU.def b/clang/include/clang/Basic/BuiltinsAMDGPU.def
index 51a5b1dbad495c..548bcc8ad55f48 100644
--- a/clang/include/clang/Basic/BuiltinsAMDGPU.def
+++ b/clang/include/clang/Basic/BuiltinsAMDGPU.def
@@ -459,6 +459,9 @@ TARGET_BUILTIN(__builtin_amdgcn_smfmac_f32_32x32x64_bf8_fp8, "V16fV4iV8iV16fiIiI
TARGET_BUILTIN(__builtin_amdgcn_smfmac_f32_32x32x64_fp8_bf8, "V16fV4iV8iV16fiIiIi", "nc", "gfx950-insts")
TARGET_BUILTIN(__builtin_amdgcn_smfmac_f32_32x32x64_fp8_fp8, "V16fV4iV8iV16fiIiIi", "nc", "gfx950-insts")
+TARGET_BUILTIN(__builtin_amdgcn_permlane16_swap, "V2UiUiUiIbIb", "nc", "permlane16-swap")
+TARGET_BUILTIN(__builtin_amdgcn_permlane32_swap, "V2UiUiUiIbIb", "nc", "permlane32-swap")
+
//===----------------------------------------------------------------------===//
// GFX12+ only builtins.
//===----------------------------------------------------------------------===//
diff --git a/clang/lib/CodeGen/CGBuiltin.cpp b/clang/lib/CodeGen/CGBuiltin.cpp
index ff7132fd8bc1e7..3b3c46b56868cf 100644
--- a/clang/lib/CodeGen/CGBuiltin.cpp
+++ b/clang/lib/CodeGen/CGBuiltin.cpp
@@ -20162,6 +20162,32 @@ Value *CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned BuiltinID,
CGM.getIntrinsic(Intrinsic::amdgcn_s_sendmsg_rtn, {ResultType});
return Builder.CreateCall(F, {Arg});
}
+ case AMDGPU::BI__builtin_amdgcn_permlane16_swap:
+ case AMDGPU::BI__builtin_amdgcn_permlane32_swap: {
+ // Because builtin types are limited, and the intrinsic uses a struct/pair
+ // output, marshal the pair-of-i32 to <2 x i32>.
+ Value *VDstOld = EmitScalarExpr(E->getArg(0));
+ Value *VSrcOld = EmitScalarExpr(E->getArg(1));
+ Value *FI = EmitScalarExpr(E->getArg(2));
+ Value *BoundCtrl = EmitScalarExpr(E->getArg(3));
+ Function *F =
+ CGM.getIntrinsic(BuiltinID == AMDGPU::BI__builtin_amdgcn_permlane16_swap
+ ? Intrinsic::amdgcn_permlane16_swap
+ : Intrinsic::amdgcn_permlane32_swap);
+ llvm::CallInst *Call =
+ Builder.CreateCall(F, {VDstOld, VSrcOld, FI, BoundCtrl});
+
+ llvm::Value *Elt0 = Builder.CreateExtractValue(Call, 0);
+ llvm::Value *Elt1 = Builder.CreateExtractValue(Call, 1);
+
+ llvm::Type *ResultType = ConvertType(E->getType());
+
+ llvm::Value *Insert0 = Builder.CreateInsertElement(
+ llvm::PoisonValue::get(ResultType), Elt0, UINT64_C(0));
+ llvm::Value *AsVector =
+ Builder.CreateInsertElement(Insert0, Elt1, UINT64_C(1));
+ return AsVector;
+ }
case AMDGPU::BI__builtin_amdgcn_make_buffer_rsrc:
return emitBuiltinWithOneOverloadedType<4>(
*this, E, Intrinsic::amdgcn_make_buffer_rsrc);
diff --git a/clang/test/CodeGenOpenCL/amdgpu-features.cl b/clang/test/CodeGenOpenCL/amdgpu-features.cl
index 61cbf5e65d0d21..f9e07fbc6b0480 100644
--- a/clang/test/CodeGenOpenCL/amdgpu-features.cl
+++ b/clang/test/CodeGenOpenCL/amdgpu-features.cl
@@ -89,7 +89,7 @@
// GFX941: "target-features"="+16-bit-insts,+atomic-buffer-global-pk-add-f16-insts,+atomic-ds-pk-add-16-insts,+atomic-fadd-rtn-insts,+atomic-flat-pk-add-16-insts,+atomic-global-pk-add-bf16-inst,+ci-insts,+dl-insts,+dot1-insts,+dot10-insts,+dot2-insts,+dot3-insts,+dot4-insts,+dot5-insts,+dot6-insts,+dot7-insts,+dpp,+fp8-conversion-insts,+fp8-insts,+gfx8-insts,+gfx9-insts,+gfx90a-insts,+gfx940-insts,+mai-insts,+s-memrealtime,+s-memtime-inst,+wavefrontsize64,+xf32-insts"
// GFX942: "target-features"="+16-bit-insts,+atomic-buffer-global-pk-add-f16-insts,+atomic-ds-pk-add-16-insts,+atomic-fadd-rtn-insts,+atomic-flat-pk-add-16-insts,+atomic-global-pk-add-bf16-inst,+ci-insts,+dl-insts,+dot1-insts,+dot10-insts,+dot2-insts,+dot3-insts,+dot4-insts,+dot5-insts,+dot6-insts,+dot7-insts,+dpp,+fp8-conversion-insts,+fp8-insts,+gfx8-insts,+gfx9-insts,+gfx90a-insts,+gfx940-insts,+mai-insts,+s-memrealtime,+s-memtime-inst,+wavefrontsize64,+xf32-insts"
// GFX9_4_Generic: "target-features"="+16-bit-insts,+atomic-buffer-global-pk-add-f16-insts,+atomic-ds-pk-add-16-insts,+atomic-fadd-rtn-insts,+atomic-flat-pk-add-16-insts,+atomic-global-pk-add-bf16-inst,+ci-insts,+dl-insts,+dot1-insts,+dot10-insts,+dot2-insts,+dot3-insts,+dot4-insts,+dot5-insts,+dot6-insts,+dot7-insts,+dpp,+gfx8-insts,+gfx9-insts,+gfx90a-insts,+gfx940-insts,+mai-insts,+s-memrealtime,+s-memtime-inst,+wavefrontsize64"
-// GFX950: "target-features"="+16-bit-insts,+atomic-buffer-global-pk-add-f16-insts,+atomic-ds-pk-add-16-insts,+atomic-fadd-rtn-insts,+atomic-flat-pk-add-16-insts,+atomic-global-pk-add-bf16-inst,+ci-insts,+dl-insts,+dot1-insts,+dot10-insts,+dot2-insts,+dot3-insts,+dot4-insts,+dot5-insts,+dot6-insts,+dot7-insts,+dpp,+fp8-conversion-insts,+fp8-insts,+gfx8-insts,+gfx9-insts,+gfx90a-insts,+gfx940-insts,+gfx950-insts,+mai-insts,+prng-inst,+s-memrealtime,+s-memtime-inst,+wavefrontsize64"
+// GFX950: "target-features"="+16-bit-insts,+atomic-buffer-global-pk-add-f16-insts,+atomic-ds-pk-add-16-insts,+atomic-fadd-rtn-insts,+atomic-flat-pk-add-16-insts,+atomic-global-pk-add-bf16-inst,+ci-insts,+dl-insts,+dot1-insts,+dot10-insts,+dot2-insts,+dot3-insts,+dot4-insts,+dot5-insts,+dot6-insts,+dot7-insts,+dpp,+fp8-conversion-insts,+fp8-insts,+gfx8-insts,+gfx9-insts,+gfx90a-insts,+gfx940-insts,+gfx950-insts,+mai-insts,+permlane16-swap,+permlane32-swap,+prng-inst,+s-memrealtime,+s-memtime-inst,+wavefrontsize64"
// GFX1010: "target-features"="+16-bit-insts,+ci-insts,+dl-insts,+dpp,+gfx10-insts,+gfx8-insts,+gfx9-insts,+s-memrealtime,+s-memtime-inst,+wavefrontsize32"
// GFX1011: "target-features"="+16-bit-insts,+ci-insts,+dl-insts,+dot1-insts,+dot10-insts,+dot2-insts,+dot5-insts,+dot6-insts,+dot7-insts,+dpp,+gfx10-insts,+gfx8-insts,+gfx9-insts,+s-memrealtime,+s-memtime-inst,+wavefrontsize32"
// GFX1012: "target-features"="+16-bit-insts,+ci-insts,+dl-insts,+dot1-insts,+dot10-insts,+dot2-insts,+dot5-insts,+dot6-insts,+dot7-insts,+dpp,+gfx10-insts,+gfx8-insts,+gfx9-insts,+s-memrealtime,+s-memtime-inst,+wavefrontsize32"
diff --git a/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx950-err.cl b/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx950-err.cl
index 86f4f73c81c0fc..5b75ee417e545b 100644
--- a/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx950-err.cl
+++ b/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx950-err.cl
@@ -11,6 +11,10 @@
// REQUIRES: amdgpu-registered-target
typedef unsigned int uint;
-void test_prng_b32(global uint* out, uint a) {
+typedef unsigned int uint2 __attribute__((ext_vector_type(2)));
+
+void test(global uint* out, global uint2* out_v2u32, uint a, uint b) {
*out = __builtin_amdgcn_prng_b32(a); // expected-error{{'__builtin_amdgcn_prng_b32' needs target feature prng-inst}}
+ *out_v2u32 = __builtin_amdgcn_permlane16_swap(a, b, false, false); // expected-error{{'__builtin_amdgcn_permlane16_swap' needs target feature permlane16-swap}}
+ *out_v2u32 = __builtin_amdgcn_permlane32_swap(a, b, false, false); // expected-error{{'__builtin_amdgcn_permlane32_swap' needs target feature permlane32-swap}}
}
diff --git a/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx950.cl b/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx950.cl
index f31ba85a52a7ad..49f85982faf5a5 100644
--- a/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx950.cl
+++ b/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx950.cl
@@ -3,6 +3,7 @@
// REQUIRES: amdgpu-registered-target
typedef unsigned int uint;
+typedef unsigned int __attribute__((ext_vector_type(2))) uint2;
// CHECK-LABEL: @test_prng_b32(
// CHECK-NEXT: entry:
@@ -19,3 +20,89 @@ typedef unsigned int uint;
void test_prng_b32(global uint* out, uint a) {
*out = __builtin_amdgcn_prng_b32(a);
}
+
+// CHECK-LABEL: @test_permlane16_swap(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[OUT_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5)
+// CHECK-NEXT: [[OLD_ADDR:%.*]] = alloca i32, align 4, addrspace(5)
+// CHECK-NEXT: [[SRC_ADDR:%.*]] = alloca i32, align 4, addrspace(5)
+// CHECK-NEXT: store ptr addrspace(1) [[OUT:%.*]], ptr addrspace(5) [[OUT_ADDR]], align 8
+// CHECK-NEXT: store i32 [[OLD:%.*]], ptr addrspace(5) [[OLD_ADDR]], align 4
+// CHECK-NEXT: store i32 [[SRC:%.*]], ptr addrspace(5) [[SRC_ADDR]], align 4
+// CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr addrspace(5) [[OLD_ADDR]], align 4
+// CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr addrspace(5) [[SRC_ADDR]], align 4
+// CHECK-NEXT: [[TMP2:%.*]] = call { i32, i32 } @llvm.amdgcn.permlane16.swap(i32 [[TMP0]], i32 [[TMP1]], i1 false, i1 false)
+// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { i32, i32 } [[TMP2]], 0
+// CHECK-NEXT: [[TMP4:%.*]] = extractvalue { i32, i32 } [[TMP2]], 1
+// CHECK-NEXT: [[TMP5:%.*]] = insertelement <2 x i32> poison, i32 [[TMP3]], i64 0
+// CHECK-NEXT: [[TMP6:%.*]] = insertelement <2 x i32> [[TMP5]], i32 [[TMP4]], i64 1
+// CHECK-NEXT: [[TMP7:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
+// CHECK-NEXT: store <2 x i32> [[TMP6]], ptr addrspace(1) [[TMP7]], align 8
+// CHECK-NEXT: [[TMP8:%.*]] = load i32, ptr addrspace(5) [[OLD_ADDR]], align 4
+// CHECK-NEXT: [[TMP9:%.*]] = load i32, ptr addrspace(5) [[SRC_ADDR]], align 4
+// CHECK-NEXT: [[TMP10:%.*]] = call { i32, i32 } @llvm.amdgcn.permlane16.swap(i32 [[TMP8]], i32 [[TMP9]], i1 true, i1 false)
+// CHECK-NEXT: [[TMP11:%.*]] = extractvalue { i32, i32 } [[TMP10]], 0
+// CHECK-NEXT: [[TMP12:%.*]] = extractvalue { i32, i32 } [[TMP10]], 1
+// CHECK-NEXT: [[TMP13:%.*]] = insertelement <2 x i32> poison, i32 [[TMP11]], i64 0
+// CHECK-NEXT: [[TMP14:%.*]] = insertelement <2 x i32> [[TMP13]], i32 [[TMP12]], i64 1
+// CHECK-NEXT: [[TMP15:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
+// CHECK-NEXT: store <2 x i32> [[TMP14]], ptr addrspace(1) [[TMP15]], align 8
+// CHECK-NEXT: [[TMP16:%.*]] = load i32, ptr addrspace(5) [[OLD_ADDR]], align 4
+// CHECK-NEXT: [[TMP17:%.*]] = load i32, ptr addrspace(5) [[SRC_ADDR]], align 4
+// CHECK-NEXT: [[TMP18:%.*]] = call { i32, i32 } @llvm.amdgcn.permlane16.swap(i32 [[TMP16]], i32 [[TMP17]], i1 false, i1 true)
+// CHECK-NEXT: [[TMP19:%.*]] = extractvalue { i32, i32 } [[TMP18]], 0
+// CHECK-NEXT: [[TMP20:%.*]] = extractvalue { i32, i32 } [[TMP18]], 1
+// CHECK-NEXT: [[TMP21:%.*]] = insertelement <2 x i32> poison, i32 [[TMP19]], i64 0
+// CHECK-NEXT: [[TMP22:%.*]] = insertelement <2 x i32> [[TMP21]], i32 [[TMP20]], i64 1
+// CHECK-NEXT: [[TMP23:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
+// CHECK-NEXT: store <2 x i32> [[TMP22]], ptr addrspace(1) [[TMP23]], align 8
+// CHECK-NEXT: ret void
+//
+void test_permlane16_swap(global uint2* out, uint old, uint src) {
+ *out = __builtin_amdgcn_permlane16_swap(old, src, false, false);
+ *out = __builtin_amdgcn_permlane16_swap(old, src, true, false);
+ *out = __builtin_amdgcn_permlane16_swap(old, src, false, true);
+}
+
+// CHECK-LABEL: @test_permlane32_swap(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[OUT_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5)
+// CHECK-NEXT: [[OLD_ADDR:%.*]] = alloca i32, align 4, addrspace(5)
+// CHECK-NEXT: [[SRC_ADDR:%.*]] = alloca i32, align 4, addrspace(5)
+// CHECK-NEXT: store ptr addrspace(1) [[OUT:%.*]], ptr addrspace(5) [[OUT_ADDR]], align 8
+// CHECK-NEXT: store i32 [[OLD:%.*]], ptr addrspace(5) [[OLD_ADDR]], align 4
+// CHECK-NEXT: store i32 [[SRC:%.*]], ptr addrspace(5) [[SRC_ADDR]], align 4
+// CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr addrspace(5) [[OLD_ADDR]], align 4
+// CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr addrspace(5) [[SRC_ADDR]], align 4
+// CHECK-NEXT: [[TMP2:%.*]] = call { i32, i32 } @llvm.amdgcn.permlane32.swap(i32 [[TMP0]], i32 [[TMP1]], i1 false, i1 false)
+// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { i32, i32 } [[TMP2]], 0
+// CHECK-NEXT: [[TMP4:%.*]] = extractvalue { i32, i32 } [[TMP2]], 1
+// CHECK-NEXT: [[TMP5:%.*]] = insertelement <2 x i32> poison, i32 [[TMP3]], i64 0
+// CHECK-NEXT: [[TMP6:%.*]] = insertelement <2 x i32> [[TMP5]], i32 [[TMP4]], i64 1
+// CHECK-NEXT: [[TMP7:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
+// CHECK-NEXT: store <2 x i32> [[TMP6]], ptr addrspace(1) [[TMP7]], align 8
+// CHECK-NEXT: [[TMP8:%.*]] = load i32, ptr addrspace(5) [[OLD_ADDR]], align 4
+// CHECK-NEXT: [[TMP9:%.*]] = load i32, ptr addrspace(5) [[SRC_ADDR]], align 4
+// CHECK-NEXT: [[TMP10:%.*]] = call { i32, i32 } @llvm.amdgcn.permlane32.swap(i32 [[TMP8]], i32 [[TMP9]], i1 true, i1 false)
+// CHECK-NEXT: [[TMP11:%.*]] = extractvalue { i32, i32 } [[TMP10]], 0
+// CHECK-NEXT: [[TMP12:%.*]] = extractvalue { i32, i32 } [[TMP10]], 1
+// CHECK-NEXT: [[TMP13:%.*]] = insertelement <2 x i32> poison, i32 [[TMP11]], i64 0
+// CHECK-NEXT: [[TMP14:%.*]] = insertelement <2 x i32> [[TMP13]], i32 [[TMP12]], i64 1
+// CHECK-NEXT: [[TMP15:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
+// CHECK-NEXT: store <2 x i32> [[TMP14]], ptr addrspace(1) [[TMP15]], align 8
+// CHECK-NEXT: [[TMP16:%.*]] = load i32, ptr addrspace(5) [[OLD_ADDR]], align 4
+// CHECK-NEXT: [[TMP17:%.*]] = load i32, ptr addrspace(5) [[SRC_ADDR]], align 4
+// CHECK-NEXT: [[TMP18:%.*]] = call { i32, i32 } @llvm.amdgcn.permlane32.swap(i32 [[TMP16]], i32 [[TMP17]], i1 false, i1 true)
+// CHECK-NEXT: [[TMP19:%.*]] = extractvalue { i32, i32 } [[TMP18]], 0
+// CHECK-NEXT: [[TMP20:%.*]] = extractvalue { i32, i32 } [[TMP18]], 1
+// CHECK-NEXT: [[TMP21:%.*]] = insertelement <2 x i32> poison, i32 [[TMP19]], i64 0
+// CHECK-NEXT: [[TMP22:%.*]] = insertelement <2 x i32> [[TMP21]], i32 [[TMP20]], i64 1
+// CHECK-NEXT: [[TMP23:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
+// CHECK-NEXT: store <2 x i32> [[TMP22]], ptr addrspace(1) [[TMP23]], align 8
+// CHECK-NEXT: ret void
+//
+void test_permlane32_swap(global uint2* out, uint old, uint src) {
+ *out = __builtin_amdgcn_permlane32_swap(old, src, false, false);
+ *out = __builtin_amdgcn_permlane32_swap(old, src, true, false);
+ *out = __builtin_amdgcn_permlane32_swap(old, src, false, true);
+}
diff --git a/clang/test/SemaOpenCL/builtins-amdgcn-error-gfx950-param.cl b/clang/test/SemaOpenCL/builtins-amdgcn-error-gfx950-param.cl
index b3b359a1e0c65b..2f1d312da7786c 100644
--- a/clang/test/SemaOpenCL/builtins-amdgcn-error-gfx950-param.cl
+++ b/clang/test/SemaOpenCL/builtins-amdgcn-error-gfx950-param.cl
@@ -148,3 +148,13 @@ void test_smfmac_f32_32x32x64_fp8_fp8(global float16* out, int4 a, int8 b, float
*out = __builtin_amdgcn_smfmac_f32_32x32x64_fp8_fp8(a, b, c, idx, d, 0); // expected-error{{argument to '__builtin_amdgcn_smfmac_f32_32x32x64_fp8_fp8' must be a constant integer}}
*out = __builtin_amdgcn_smfmac_f32_32x32x64_fp8_fp8(a, b, c, idx, 0, d); // expected-error{{argument to '__builtin_amdgcn_smfmac_f32_32x32x64_fp8_fp8' must be a constant integer}}
}
+
+void test_permlane16_swap(__global int* out, int old, int src, bool X) {
+ *out = __builtin_amdgcn_permlane16_swap(old, src, X, false); // expected-error{{argument to '__builtin_amdgcn_permlane16_swap' must be a constant integer}}
+ *out = __builtin_amdgcn_permlane16_swap(old, src, false, X); // expected-error{{argument to '__builtin_amdgcn_permlane16_swap' must be a constant integer}}
+}
+
+void test_permlane32_swap(__global int* out, int old, int src, bool X) {
+ *out = __builtin_amdgcn_permlane32_swap(old, src, X, false); // expected-error{{argument to '__builtin_amdgcn_permlane32_swap' must be a constant integer}}
+ *out = __builtin_amdgcn_permlane32_swap(old, src, false, X); // expected-error{{argument to '__builtin_amdgcn_permlane32_swap' must be a constant integer}}
+}
diff --git a/clang/test/SemaOpenCL/builtins-amdgcn-error-gfx950.cl b/clang/test/SemaOpenCL/builtins-amdgcn-error-gfx950.cl
index 57523cf0af1b18..e0cde1d3ad87bb 100644
--- a/clang/test/SemaOpenCL/builtins-amdgcn-error-gfx950.cl
+++ b/clang/test/SemaOpenCL/builtins-amdgcn-error-gfx950.cl
@@ -27,7 +27,8 @@ void test(__global float4* out0, half8 a0, half8 b0, float4 c0,
__global float4* out12, int4 a12, int8 b12, float4 c12,
__global float16* out13, int4 a13, int8 b13, float16 c13,
__global float4* out14, int8 a14, int8 b14, float4 c14, int d14, int e14,
- __global float16* out15, int8 a15, int8 b15, float16 c15, int d15, int e15) {
+ __global float16* out15, int8 a15, int8 b15, float16 c15, int d15, int e15,
+ __global uint2* out16, int a16, int b16) {
*out0 = __builtin_amdgcn_mfma_f32_16x16x32_f16(a0, b0, c0, 0, 0, 0); // expected-error{{'__builtin_amdgcn_mfma_f32_16x16x32_f16' needs target feature gfx950-insts}}
*out1 = __builtin_amdgcn_mfma_f32_32x32x16_f16(a1, b1, c1, 0, 0, 0); // expected-error{{'__builtin_amdgcn_mfma_f32_32x32x16_f16' needs target feature gfx950-insts}}
*out2 = __builtin_amdgcn_mfma_f32_32x32x16_bf16(a2, b2, c2, 0, 0, 0); // expected-error{{'__builtin_amdgcn_mfma_f32_32x32x16_bf16' needs target feature gfx950-insts}}
@@ -50,4 +51,6 @@ void test(__global float4* out0, half8 a0, half8 b0, float4 c0,
*out13 = __builtin_amdgcn_smfmac_f32_32x32x64_fp8_fp8(a13, b13, c13, 0, 0, 0); // expected-error{{'__builtin_amdgcn_smfmac_f32_32x32x64_fp8_fp8' needs target feature gfx950-insts}}
*out14 = __builtin_amdgcn_mfma_scale_f32_16x16x128_f8f6f4(a14, b14, c14, 0, 0, 0, d14, 0, e14); // expected-error{{'__builtin_amdgcn_mfma_scale_f32_16x16x128_f8f6f4' needs target feature gfx950-insts}}
*out15 = __builtin_amdgcn_mfma_scale_f32_32x32x64_f8f6f4(a15, b15, c15, 0, 0, 0, d15, 0, e15); // expected-error{{'__builtin_amdgcn_mfma_scale_f32_32x32x64_f8f6f4' needs target feature gfx950-insts}}
+ *out16 = __builtin_amdgcn_permlane16_swap(a16, b16, false, false); // expected-error{{'__builtin_amdgcn_permlane16_swap' needs target feature permlane16-swap}}
+ *out16 = __builtin_amdgcn_permlane32_swap(a16, b16, false, false); // expected-error{{'__builtin_amdgcn_permlane32_swap' needs target feature permlane32-swap}}
}
diff --git a/llvm/docs/AMDGPUUsage.rst b/llvm/docs/AMDGPUUsage.rst
index 161363e0dd6bcc..411a1209ef947e 100644
--- a/llvm/docs/AMDGPUUsage.rst
+++ b/llvm/docs/AMDGPUUsage.rst
@@ -1407,6 +1407,19 @@ The AMDGPU backend implements the following LLVM IR intrinsics.
llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4 Emit `v_mfma_scale_f32_32x32x64_f8f6f4`
+ llvm.amdgcn.permlane16.swap Provide direct access to `v_permlane16_swap_b32` instruction on supported targets.
+ Swaps the values across lanes of first 2 operands. Odd rows of the first operand are
+ swapped with even rows of the second operand (one row is 16 lanes).
+ Returns a pair for the swapped registers. The first element of the return corresponds
+ to the swapped element of the first argument.
+
+
+ llvm.amdgcn.permlane32.swap Provide direct access to `v_permlane32_swap_b32` instruction on supported targets.
+ Swaps the values across lanes of first 2 operands. Rows 2 and 3 of the first operand are
+ swapped with rows 0 and 1 of the second operand (one row is 16 lanes).
+ Returns a pair for the swapped registers. The first element of the return
+ corresponds to the swapped element of the first argument.
+
============================================== ==========================================================
.. TODO::
diff --git a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
index ef9547745c6041..755f8c71d67b5e 100644
--- a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
+++ b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
@@ -3168,6 +3168,20 @@ def int_amdgcn_smfmac_f32_32x32x64_fp8_bf8 : AMDGPUMSmfmacIntrinsic<llvm_v16f32_
def int_amdgcn_smfmac_f32_32x32x64_fp8_fp8 : AMDGPUMSmfmacIntrinsic<llvm_v16f32_ty, llvm_v4i32_ty, llvm_v8i32_ty>;
}
+// { vdst_new, vsrc_new } llvm.amdgcn.permlane16.swap <vdst_old> <vsrc_old> <fi> <bound_control>
+def int_amdgcn_permlane16_swap :
+ Intrinsic<[llvm_i32_ty, llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty,
+ ...
[truncated]
|
llvm#117260) This was a bit annoying because these introduce a new special case encoding usage. op_sel is repurposed as a subset of dpp controls, and is eligible for VOP3->VOP1 shrinking. For some reason fi also uses an enum value, so we need to convert the raw boolean to 1 instead of -1. The 2 registers are swapped, so this has 2 defs. Ideally the builtin would return a pair, but that's difficult so return a vector instead. This would make a hypothetical builtin that supports v2f16 directly uglier.
This was a bit annoying because these introduce a new special case
encoding usage. op_sel is repurposed as a subset of dpp controls,
and is eligible for VOP3->VOP1 shrinking. For some reason fi also
uses an enum value, so we need to convert the raw boolean to 1 instead
of -1.
The 2 registers are swapped, so this has 2 defs. Ideally the builtin
would return a pair, but that's difficult so return a vector instead.
This would make a hypothetical builtin that supports v2f16 directly
uglier.