Skip to content

AMDGPU: Add v_permlane16_swap_b32 and v_permlane32_swap_b32 for gfx950 #117260

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 1 commit into from
Nov 23, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions clang/include/clang/Basic/BuiltinsAMDGPU.def
Original file line number Diff line number Diff line change
Expand Up @@ -459,6 +459,9 @@ TARGET_BUILTIN(__builtin_amdgcn_smfmac_f32_32x32x64_bf8_fp8, "V16fV4iV8iV16fiIiI
TARGET_BUILTIN(__builtin_amdgcn_smfmac_f32_32x32x64_fp8_bf8, "V16fV4iV8iV16fiIiIi", "nc", "gfx950-insts")
TARGET_BUILTIN(__builtin_amdgcn_smfmac_f32_32x32x64_fp8_fp8, "V16fV4iV8iV16fiIiIi", "nc", "gfx950-insts")

TARGET_BUILTIN(__builtin_amdgcn_permlane16_swap, "V2UiUiUiIbIb", "nc", "permlane16-swap")
TARGET_BUILTIN(__builtin_amdgcn_permlane32_swap, "V2UiUiUiIbIb", "nc", "permlane32-swap")

//===----------------------------------------------------------------------===//
// GFX12+ only builtins.
//===----------------------------------------------------------------------===//
Expand Down
26 changes: 26 additions & 0 deletions clang/lib/CodeGen/CGBuiltin.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -20200,6 +20200,32 @@ Value *CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned BuiltinID,
CGM.getIntrinsic(Intrinsic::amdgcn_s_sendmsg_rtn, {ResultType});
return Builder.CreateCall(F, {Arg});
}
case AMDGPU::BI__builtin_amdgcn_permlane16_swap:
case AMDGPU::BI__builtin_amdgcn_permlane32_swap: {
// Because builtin types are limited, and the intrinsic uses a struct/pair
// output, marshal the pair-of-i32 to <2 x i32>.
Value *VDstOld = EmitScalarExpr(E->getArg(0));
Value *VSrcOld = EmitScalarExpr(E->getArg(1));
Value *FI = EmitScalarExpr(E->getArg(2));
Value *BoundCtrl = EmitScalarExpr(E->getArg(3));
Function *F =
CGM.getIntrinsic(BuiltinID == AMDGPU::BI__builtin_amdgcn_permlane16_swap
? Intrinsic::amdgcn_permlane16_swap
: Intrinsic::amdgcn_permlane32_swap);
llvm::CallInst *Call =
Builder.CreateCall(F, {VDstOld, VSrcOld, FI, BoundCtrl});

llvm::Value *Elt0 = Builder.CreateExtractValue(Call, 0);
llvm::Value *Elt1 = Builder.CreateExtractValue(Call, 1);

llvm::Type *ResultType = ConvertType(E->getType());

llvm::Value *Insert0 = Builder.CreateInsertElement(
llvm::PoisonValue::get(ResultType), Elt0, UINT64_C(0));
llvm::Value *AsVector =
Builder.CreateInsertElement(Insert0, Elt1, UINT64_C(1));
return AsVector;
}
case AMDGPU::BI__builtin_amdgcn_make_buffer_rsrc:
return emitBuiltinWithOneOverloadedType<4>(
*this, E, Intrinsic::amdgcn_make_buffer_rsrc);
Expand Down
2 changes: 1 addition & 1 deletion clang/test/CodeGenOpenCL/amdgpu-features.cl
Original file line number Diff line number Diff line change
Expand Up @@ -89,7 +89,7 @@
// GFX941: "target-features"="+16-bit-insts,+atomic-buffer-global-pk-add-f16-insts,+atomic-ds-pk-add-16-insts,+atomic-fadd-rtn-insts,+atomic-flat-pk-add-16-insts,+atomic-global-pk-add-bf16-inst,+ci-insts,+dl-insts,+dot1-insts,+dot10-insts,+dot2-insts,+dot3-insts,+dot4-insts,+dot5-insts,+dot6-insts,+dot7-insts,+dpp,+fp8-conversion-insts,+fp8-insts,+gfx8-insts,+gfx9-insts,+gfx90a-insts,+gfx940-insts,+mai-insts,+s-memrealtime,+s-memtime-inst,+wavefrontsize64,+xf32-insts"
// GFX942: "target-features"="+16-bit-insts,+atomic-buffer-global-pk-add-f16-insts,+atomic-ds-pk-add-16-insts,+atomic-fadd-rtn-insts,+atomic-flat-pk-add-16-insts,+atomic-global-pk-add-bf16-inst,+ci-insts,+dl-insts,+dot1-insts,+dot10-insts,+dot2-insts,+dot3-insts,+dot4-insts,+dot5-insts,+dot6-insts,+dot7-insts,+dpp,+fp8-conversion-insts,+fp8-insts,+gfx8-insts,+gfx9-insts,+gfx90a-insts,+gfx940-insts,+mai-insts,+s-memrealtime,+s-memtime-inst,+wavefrontsize64,+xf32-insts"
// GFX9_4_Generic: "target-features"="+16-bit-insts,+atomic-buffer-global-pk-add-f16-insts,+atomic-ds-pk-add-16-insts,+atomic-fadd-rtn-insts,+atomic-flat-pk-add-16-insts,+atomic-global-pk-add-bf16-inst,+ci-insts,+dl-insts,+dot1-insts,+dot10-insts,+dot2-insts,+dot3-insts,+dot4-insts,+dot5-insts,+dot6-insts,+dot7-insts,+dpp,+gfx8-insts,+gfx9-insts,+gfx90a-insts,+gfx940-insts,+mai-insts,+s-memrealtime,+s-memtime-inst,+wavefrontsize64"
// GFX950: "target-features"="+16-bit-insts,+atomic-buffer-global-pk-add-f16-insts,+atomic-ds-pk-add-16-insts,+atomic-fadd-rtn-insts,+atomic-flat-pk-add-16-insts,+atomic-global-pk-add-bf16-inst,+ci-insts,+dl-insts,+dot1-insts,+dot10-insts,+dot2-insts,+dot3-insts,+dot4-insts,+dot5-insts,+dot6-insts,+dot7-insts,+dpp,+fp8-conversion-insts,+fp8-insts,+gfx8-insts,+gfx9-insts,+gfx90a-insts,+gfx940-insts,+gfx950-insts,+mai-insts,+prng-inst,+s-memrealtime,+s-memtime-inst,+wavefrontsize64"
// GFX950: "target-features"="+16-bit-insts,+atomic-buffer-global-pk-add-f16-insts,+atomic-ds-pk-add-16-insts,+atomic-fadd-rtn-insts,+atomic-flat-pk-add-16-insts,+atomic-global-pk-add-bf16-inst,+ci-insts,+dl-insts,+dot1-insts,+dot10-insts,+dot2-insts,+dot3-insts,+dot4-insts,+dot5-insts,+dot6-insts,+dot7-insts,+dpp,+fp8-conversion-insts,+fp8-insts,+gfx8-insts,+gfx9-insts,+gfx90a-insts,+gfx940-insts,+gfx950-insts,+mai-insts,+permlane16-swap,+permlane32-swap,+prng-inst,+s-memrealtime,+s-memtime-inst,+wavefrontsize64"
// GFX1010: "target-features"="+16-bit-insts,+ci-insts,+dl-insts,+dpp,+gfx10-insts,+gfx8-insts,+gfx9-insts,+s-memrealtime,+s-memtime-inst,+wavefrontsize32"
// GFX1011: "target-features"="+16-bit-insts,+ci-insts,+dl-insts,+dot1-insts,+dot10-insts,+dot2-insts,+dot5-insts,+dot6-insts,+dot7-insts,+dpp,+gfx10-insts,+gfx8-insts,+gfx9-insts,+s-memrealtime,+s-memtime-inst,+wavefrontsize32"
// GFX1012: "target-features"="+16-bit-insts,+ci-insts,+dl-insts,+dot1-insts,+dot10-insts,+dot2-insts,+dot5-insts,+dot6-insts,+dot7-insts,+dpp,+gfx10-insts,+gfx8-insts,+gfx9-insts,+s-memrealtime,+s-memtime-inst,+wavefrontsize32"
Expand Down
6 changes: 5 additions & 1 deletion clang/test/CodeGenOpenCL/builtins-amdgcn-gfx950-err.cl
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,10 @@
// REQUIRES: amdgpu-registered-target

typedef unsigned int uint;
void test_prng_b32(global uint* out, uint a) {
typedef unsigned int uint2 __attribute__((ext_vector_type(2)));

void test(global uint* out, global uint2* out_v2u32, uint a, uint b) {
*out = __builtin_amdgcn_prng_b32(a); // expected-error{{'__builtin_amdgcn_prng_b32' needs target feature prng-inst}}
*out_v2u32 = __builtin_amdgcn_permlane16_swap(a, b, false, false); // expected-error{{'__builtin_amdgcn_permlane16_swap' needs target feature permlane16-swap}}
*out_v2u32 = __builtin_amdgcn_permlane32_swap(a, b, false, false); // expected-error{{'__builtin_amdgcn_permlane32_swap' needs target feature permlane32-swap}}
}
87 changes: 87 additions & 0 deletions clang/test/CodeGenOpenCL/builtins-amdgcn-gfx950.cl
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
// REQUIRES: amdgpu-registered-target

typedef unsigned int uint;
typedef unsigned int __attribute__((ext_vector_type(2))) uint2;

// CHECK-LABEL: @test_prng_b32(
// CHECK-NEXT: entry:
Expand All @@ -19,3 +20,89 @@ typedef unsigned int uint;
void test_prng_b32(global uint* out, uint a) {
*out = __builtin_amdgcn_prng_b32(a);
}

// CHECK-LABEL: @test_permlane16_swap(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[OUT_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5)
// CHECK-NEXT: [[OLD_ADDR:%.*]] = alloca i32, align 4, addrspace(5)
// CHECK-NEXT: [[SRC_ADDR:%.*]] = alloca i32, align 4, addrspace(5)
// CHECK-NEXT: store ptr addrspace(1) [[OUT:%.*]], ptr addrspace(5) [[OUT_ADDR]], align 8
// CHECK-NEXT: store i32 [[OLD:%.*]], ptr addrspace(5) [[OLD_ADDR]], align 4
// CHECK-NEXT: store i32 [[SRC:%.*]], ptr addrspace(5) [[SRC_ADDR]], align 4
// CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr addrspace(5) [[OLD_ADDR]], align 4
// CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr addrspace(5) [[SRC_ADDR]], align 4
// CHECK-NEXT: [[TMP2:%.*]] = call { i32, i32 } @llvm.amdgcn.permlane16.swap(i32 [[TMP0]], i32 [[TMP1]], i1 false, i1 false)
// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { i32, i32 } [[TMP2]], 0
// CHECK-NEXT: [[TMP4:%.*]] = extractvalue { i32, i32 } [[TMP2]], 1
// CHECK-NEXT: [[TMP5:%.*]] = insertelement <2 x i32> poison, i32 [[TMP3]], i64 0
// CHECK-NEXT: [[TMP6:%.*]] = insertelement <2 x i32> [[TMP5]], i32 [[TMP4]], i64 1
// CHECK-NEXT: [[TMP7:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
// CHECK-NEXT: store <2 x i32> [[TMP6]], ptr addrspace(1) [[TMP7]], align 8
// CHECK-NEXT: [[TMP8:%.*]] = load i32, ptr addrspace(5) [[OLD_ADDR]], align 4
// CHECK-NEXT: [[TMP9:%.*]] = load i32, ptr addrspace(5) [[SRC_ADDR]], align 4
// CHECK-NEXT: [[TMP10:%.*]] = call { i32, i32 } @llvm.amdgcn.permlane16.swap(i32 [[TMP8]], i32 [[TMP9]], i1 true, i1 false)
// CHECK-NEXT: [[TMP11:%.*]] = extractvalue { i32, i32 } [[TMP10]], 0
// CHECK-NEXT: [[TMP12:%.*]] = extractvalue { i32, i32 } [[TMP10]], 1
// CHECK-NEXT: [[TMP13:%.*]] = insertelement <2 x i32> poison, i32 [[TMP11]], i64 0
// CHECK-NEXT: [[TMP14:%.*]] = insertelement <2 x i32> [[TMP13]], i32 [[TMP12]], i64 1
// CHECK-NEXT: [[TMP15:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
// CHECK-NEXT: store <2 x i32> [[TMP14]], ptr addrspace(1) [[TMP15]], align 8
// CHECK-NEXT: [[TMP16:%.*]] = load i32, ptr addrspace(5) [[OLD_ADDR]], align 4
// CHECK-NEXT: [[TMP17:%.*]] = load i32, ptr addrspace(5) [[SRC_ADDR]], align 4
// CHECK-NEXT: [[TMP18:%.*]] = call { i32, i32 } @llvm.amdgcn.permlane16.swap(i32 [[TMP16]], i32 [[TMP17]], i1 false, i1 true)
// CHECK-NEXT: [[TMP19:%.*]] = extractvalue { i32, i32 } [[TMP18]], 0
// CHECK-NEXT: [[TMP20:%.*]] = extractvalue { i32, i32 } [[TMP18]], 1
// CHECK-NEXT: [[TMP21:%.*]] = insertelement <2 x i32> poison, i32 [[TMP19]], i64 0
// CHECK-NEXT: [[TMP22:%.*]] = insertelement <2 x i32> [[TMP21]], i32 [[TMP20]], i64 1
// CHECK-NEXT: [[TMP23:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
// CHECK-NEXT: store <2 x i32> [[TMP22]], ptr addrspace(1) [[TMP23]], align 8
// CHECK-NEXT: ret void
//
void test_permlane16_swap(global uint2* out, uint old, uint src) {
*out = __builtin_amdgcn_permlane16_swap(old, src, false, false);
*out = __builtin_amdgcn_permlane16_swap(old, src, true, false);
*out = __builtin_amdgcn_permlane16_swap(old, src, false, true);
}

// CHECK-LABEL: @test_permlane32_swap(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[OUT_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5)
// CHECK-NEXT: [[OLD_ADDR:%.*]] = alloca i32, align 4, addrspace(5)
// CHECK-NEXT: [[SRC_ADDR:%.*]] = alloca i32, align 4, addrspace(5)
// CHECK-NEXT: store ptr addrspace(1) [[OUT:%.*]], ptr addrspace(5) [[OUT_ADDR]], align 8
// CHECK-NEXT: store i32 [[OLD:%.*]], ptr addrspace(5) [[OLD_ADDR]], align 4
// CHECK-NEXT: store i32 [[SRC:%.*]], ptr addrspace(5) [[SRC_ADDR]], align 4
// CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr addrspace(5) [[OLD_ADDR]], align 4
// CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr addrspace(5) [[SRC_ADDR]], align 4
// CHECK-NEXT: [[TMP2:%.*]] = call { i32, i32 } @llvm.amdgcn.permlane32.swap(i32 [[TMP0]], i32 [[TMP1]], i1 false, i1 false)
// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { i32, i32 } [[TMP2]], 0
// CHECK-NEXT: [[TMP4:%.*]] = extractvalue { i32, i32 } [[TMP2]], 1
// CHECK-NEXT: [[TMP5:%.*]] = insertelement <2 x i32> poison, i32 [[TMP3]], i64 0
// CHECK-NEXT: [[TMP6:%.*]] = insertelement <2 x i32> [[TMP5]], i32 [[TMP4]], i64 1
// CHECK-NEXT: [[TMP7:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
// CHECK-NEXT: store <2 x i32> [[TMP6]], ptr addrspace(1) [[TMP7]], align 8
// CHECK-NEXT: [[TMP8:%.*]] = load i32, ptr addrspace(5) [[OLD_ADDR]], align 4
// CHECK-NEXT: [[TMP9:%.*]] = load i32, ptr addrspace(5) [[SRC_ADDR]], align 4
// CHECK-NEXT: [[TMP10:%.*]] = call { i32, i32 } @llvm.amdgcn.permlane32.swap(i32 [[TMP8]], i32 [[TMP9]], i1 true, i1 false)
// CHECK-NEXT: [[TMP11:%.*]] = extractvalue { i32, i32 } [[TMP10]], 0
// CHECK-NEXT: [[TMP12:%.*]] = extractvalue { i32, i32 } [[TMP10]], 1
// CHECK-NEXT: [[TMP13:%.*]] = insertelement <2 x i32> poison, i32 [[TMP11]], i64 0
// CHECK-NEXT: [[TMP14:%.*]] = insertelement <2 x i32> [[TMP13]], i32 [[TMP12]], i64 1
// CHECK-NEXT: [[TMP15:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
// CHECK-NEXT: store <2 x i32> [[TMP14]], ptr addrspace(1) [[TMP15]], align 8
// CHECK-NEXT: [[TMP16:%.*]] = load i32, ptr addrspace(5) [[OLD_ADDR]], align 4
// CHECK-NEXT: [[TMP17:%.*]] = load i32, ptr addrspace(5) [[SRC_ADDR]], align 4
// CHECK-NEXT: [[TMP18:%.*]] = call { i32, i32 } @llvm.amdgcn.permlane32.swap(i32 [[TMP16]], i32 [[TMP17]], i1 false, i1 true)
// CHECK-NEXT: [[TMP19:%.*]] = extractvalue { i32, i32 } [[TMP18]], 0
// CHECK-NEXT: [[TMP20:%.*]] = extractvalue { i32, i32 } [[TMP18]], 1
// CHECK-NEXT: [[TMP21:%.*]] = insertelement <2 x i32> poison, i32 [[TMP19]], i64 0
// CHECK-NEXT: [[TMP22:%.*]] = insertelement <2 x i32> [[TMP21]], i32 [[TMP20]], i64 1
// CHECK-NEXT: [[TMP23:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
// CHECK-NEXT: store <2 x i32> [[TMP22]], ptr addrspace(1) [[TMP23]], align 8
// CHECK-NEXT: ret void
//
void test_permlane32_swap(global uint2* out, uint old, uint src) {
*out = __builtin_amdgcn_permlane32_swap(old, src, false, false);
*out = __builtin_amdgcn_permlane32_swap(old, src, true, false);
*out = __builtin_amdgcn_permlane32_swap(old, src, false, true);
}
10 changes: 10 additions & 0 deletions clang/test/SemaOpenCL/builtins-amdgcn-error-gfx950-param.cl
Original file line number Diff line number Diff line change
Expand Up @@ -148,3 +148,13 @@ void test_smfmac_f32_32x32x64_fp8_fp8(global float16* out, int4 a, int8 b, float
*out = __builtin_amdgcn_smfmac_f32_32x32x64_fp8_fp8(a, b, c, idx, d, 0); // expected-error{{argument to '__builtin_amdgcn_smfmac_f32_32x32x64_fp8_fp8' must be a constant integer}}
*out = __builtin_amdgcn_smfmac_f32_32x32x64_fp8_fp8(a, b, c, idx, 0, d); // expected-error{{argument to '__builtin_amdgcn_smfmac_f32_32x32x64_fp8_fp8' must be a constant integer}}
}

void test_permlane16_swap(__global int* out, int old, int src, bool X) {
*out = __builtin_amdgcn_permlane16_swap(old, src, X, false); // expected-error{{argument to '__builtin_amdgcn_permlane16_swap' must be a constant integer}}
*out = __builtin_amdgcn_permlane16_swap(old, src, false, X); // expected-error{{argument to '__builtin_amdgcn_permlane16_swap' must be a constant integer}}
}

void test_permlane32_swap(__global int* out, int old, int src, bool X) {
*out = __builtin_amdgcn_permlane32_swap(old, src, X, false); // expected-error{{argument to '__builtin_amdgcn_permlane32_swap' must be a constant integer}}
*out = __builtin_amdgcn_permlane32_swap(old, src, false, X); // expected-error{{argument to '__builtin_amdgcn_permlane32_swap' must be a constant integer}}
}
5 changes: 4 additions & 1 deletion clang/test/SemaOpenCL/builtins-amdgcn-error-gfx950.cl
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,8 @@ void test(__global float4* out0, half8 a0, half8 b0, float4 c0,
__global float4* out12, int4 a12, int8 b12, float4 c12,
__global float16* out13, int4 a13, int8 b13, float16 c13,
__global float4* out14, int8 a14, int8 b14, float4 c14, int d14, int e14,
__global float16* out15, int8 a15, int8 b15, float16 c15, int d15, int e15) {
__global float16* out15, int8 a15, int8 b15, float16 c15, int d15, int e15,
__global uint2* out16, int a16, int b16) {
*out0 = __builtin_amdgcn_mfma_f32_16x16x32_f16(a0, b0, c0, 0, 0, 0); // expected-error{{'__builtin_amdgcn_mfma_f32_16x16x32_f16' needs target feature gfx950-insts}}
*out1 = __builtin_amdgcn_mfma_f32_32x32x16_f16(a1, b1, c1, 0, 0, 0); // expected-error{{'__builtin_amdgcn_mfma_f32_32x32x16_f16' needs target feature gfx950-insts}}
*out2 = __builtin_amdgcn_mfma_f32_32x32x16_bf16(a2, b2, c2, 0, 0, 0); // expected-error{{'__builtin_amdgcn_mfma_f32_32x32x16_bf16' needs target feature gfx950-insts}}
Expand All @@ -50,4 +51,6 @@ void test(__global float4* out0, half8 a0, half8 b0, float4 c0,
*out13 = __builtin_amdgcn_smfmac_f32_32x32x64_fp8_fp8(a13, b13, c13, 0, 0, 0); // expected-error{{'__builtin_amdgcn_smfmac_f32_32x32x64_fp8_fp8' needs target feature gfx950-insts}}
*out14 = __builtin_amdgcn_mfma_scale_f32_16x16x128_f8f6f4(a14, b14, c14, 0, 0, 0, d14, 0, e14); // expected-error{{'__builtin_amdgcn_mfma_scale_f32_16x16x128_f8f6f4' needs target feature gfx950-insts}}
*out15 = __builtin_amdgcn_mfma_scale_f32_32x32x64_f8f6f4(a15, b15, c15, 0, 0, 0, d15, 0, e15); // expected-error{{'__builtin_amdgcn_mfma_scale_f32_32x32x64_f8f6f4' needs target feature gfx950-insts}}
*out16 = __builtin_amdgcn_permlane16_swap(a16, b16, false, false); // expected-error{{'__builtin_amdgcn_permlane16_swap' needs target feature permlane16-swap}}
*out16 = __builtin_amdgcn_permlane32_swap(a16, b16, false, false); // expected-error{{'__builtin_amdgcn_permlane32_swap' needs target feature permlane32-swap}}
}
13 changes: 13 additions & 0 deletions llvm/docs/AMDGPUUsage.rst
Original file line number Diff line number Diff line change
Expand Up @@ -1407,6 +1407,19 @@ The AMDGPU backend implements the following LLVM IR intrinsics.

llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4 Emit `v_mfma_scale_f32_32x32x64_f8f6f4`

llvm.amdgcn.permlane16.swap Provide direct access to `v_permlane16_swap_b32` instruction on supported targets.
Swaps the values across lanes of first 2 operands. Odd rows of the first operand are
swapped with even rows of the second operand (one row is 16 lanes).
Returns a pair for the swapped registers. The first element of the return corresponds
to the swapped element of the first argument.


llvm.amdgcn.permlane32.swap Provide direct access to `v_permlane32_swap_b32` instruction on supported targets.
Swaps the values across lanes of first 2 operands. Rows 2 and 3 of the first operand are
swapped with rows 0 and 1 of the second operand (one row is 16 lanes).
Returns a pair for the swapped registers. The first element of the return
corresponds to the swapped element of the first argument.

============================================== ==========================================================

.. TODO::
Expand Down
14 changes: 14 additions & 0 deletions llvm/include/llvm/IR/IntrinsicsAMDGPU.td
Original file line number Diff line number Diff line change
Expand Up @@ -3166,6 +3166,20 @@ def int_amdgcn_smfmac_f32_32x32x64_fp8_bf8 : AMDGPUMSmfmacIntrinsic<llvm_v16f32_
def int_amdgcn_smfmac_f32_32x32x64_fp8_fp8 : AMDGPUMSmfmacIntrinsic<llvm_v16f32_ty, llvm_v4i32_ty, llvm_v8i32_ty>;
}

// { vdst_new, vsrc_new } llvm.amdgcn.permlane16.swap <vdst_old> <vsrc_old> <fi> <bound_control>
def int_amdgcn_permlane16_swap :
Intrinsic<[llvm_i32_ty, llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty,
llvm_i1_ty, llvm_i1_ty],
[IntrNoMem, IntrConvergent, IntrWillReturn,
ImmArg<ArgIndex<2>>, ImmArg<ArgIndex<3>>, IntrNoCallback, IntrNoFree]>;

// { vdst_new, vsrc_new } llvm.amdgcn.permlane32.swap <vdst_old> <vsrc_old> <fi> <bound_control>
def int_amdgcn_permlane32_swap :
Intrinsic<[llvm_i32_ty, llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty,
llvm_i1_ty, llvm_i1_ty],
[IntrNoMem, IntrConvergent, IntrWillReturn,
ImmArg<ArgIndex<2>>, ImmArg<ArgIndex<3>>, IntrNoCallback, IntrNoFree]>;

//===----------------------------------------------------------------------===//
// Special Intrinsics for backend internal use only. No frontend
// should emit calls to these.
Expand Down
23 changes: 22 additions & 1 deletion llvm/lib/Target/AMDGPU/AMDGPU.td
Original file line number Diff line number Diff line change
Expand Up @@ -372,10 +372,23 @@ def FeatureGFX940Insts : SubtargetFeature<"gfx940-insts",
"Additional instructions for GFX940+"
>;

def FeaturePermlane16Swap : SubtargetFeature<"permlane16-swap",
"HasPermlane16Swap",
"true",
"Has v_permlane16_swap_b32 instructions"
>;

def FeaturePermlane32Swap : SubtargetFeature<"permlane32-swap",
"HasPermlane32Swap",
"true",
"Has v_permlane32_swap_b32 instructions"
>;

def FeatureGFX950Insts : SubtargetFeature<"gfx950-insts",
"GFX950Insts",
"true",
"Additional instructions for GFX950+"
"Additional instructions for GFX950+",
[FeaturePermlane16Swap, FeaturePermlane32Swap]
>;

def FeatureGFX10Insts : SubtargetFeature<"gfx10-insts",
Expand Down Expand Up @@ -1987,6 +2000,14 @@ def HasGFX950Insts :
Predicate<"Subtarget->hasGFX950Insts()">,
AssemblerPredicate<(all_of FeatureGFX950Insts)>;

def HasPermlane16Swap :
Predicate<"Subtarget->hasPermlane16Swap()">,
AssemblerPredicate<(all_of FeaturePermlane16Swap)>;

def HasPermlane32Swap :
Predicate<"Subtarget->hasPermlane32Swap()">,
AssemblerPredicate<(all_of FeaturePermlane32Swap)>;

def isGFX8GFX9NotGFX940 :
Predicate<"!Subtarget->hasGFX940Insts() &&"
"(Subtarget->getGeneration() == AMDGPUSubtarget::VOLCANIC_ISLANDS ||"
Expand Down
Loading
Loading