-
Notifications
You must be signed in to change notification settings - Fork 13.5k
[AMDGPU] Call the FINI_ARRAY
destructors in the correct order
#71815
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Conversation
Summary: The AMDGPU backend uses the linker-provided INIT_ARRAY and FINI_ARRAY sections to call all the global constructors in a single kernel. Previously this mistakenly used the same iteration logic for both arrays. The destructors stored in FINI_ARRAY are actually stored in reverse order, so we must start at the end of the array and decrement. This patch makes the neccesarry changes to properly respect priority.
@llvm/pr-subscribers-backend-amdgpu Author: Joseph Huber (jhuber6) ChangesSummary: Full diff: https://github.com/llvm/llvm-project/pull/71815.diff 4 Files Affected:
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUCtorDtorLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUCtorDtorLowering.cpp
index a13447586bd4ba3..8814d2ca456d8f5 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUCtorDtorLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUCtorDtorLowering.cpp
@@ -53,13 +53,22 @@ static Function *createInitOrFiniKernelFunction(Module &M, bool IsCtor) {
//
// extern "C" void * __init_array_start[];
// extern "C" void * __init_array_end[];
+// extern "C" void * __fini_array_start[];
+// extern "C" void * __fini_array_end[];
//
// using InitCallback = void();
+// using FiniCallback = void(void);
//
// void call_init_array_callbacks() {
// for (auto start = __init_array_start; start != __init_array_end; ++start)
// reinterpret_cast<InitCallback *>(*start)();
// }
+//
+// void call_fini_array_callbacks() {
+// size_t fini_array_size = __fini_array_end - __fini_array_start;
+// for (size_t i = fini_array_size; i > 0; --i)
+// reinterpret_cast<FiniCallback *>(__fini_array_start[i - 1])();
+// }
static void createInitOrFiniCalls(Function &F, bool IsCtor) {
Module &M = *F.getParent();
LLVMContext &C = M.getContext();
@@ -96,15 +105,39 @@ static void createInitOrFiniCalls(Function &F, bool IsCtor) {
// for now we just call them with no arguments.
auto *CallBackTy = FunctionType::get(IRB.getVoidTy(), {});
- IRB.CreateCondBr(IRB.CreateICmpNE(Begin, End), LoopBB, ExitBB);
+ Constant *Start = Begin;
+ Constant *Stop = End;
+ // The destructor array must be called in reverse order. Get a constant
+ // expression to the end of the array and iterate backwards instead.
+ if (!IsCtor) {
+ Type *Int64Ty = IntegerType::getInt64Ty(C);
+ auto *Offset = ConstantExpr::getSub(
+ ConstantExpr::getAShr(
+ ConstantExpr::getSub(ConstantExpr::getPtrToInt(End, Int64Ty),
+ ConstantExpr::getPtrToInt(Begin, Int64Ty)),
+ ConstantInt::get(Int64Ty, 3)),
+ ConstantInt::get(Int64Ty, 1));
+ Start = ConstantExpr::getGetElementPtr(
+ ArrayType::get(IRB.getPtrTy(), 0), Begin,
+ ArrayRef<Constant *>({ConstantInt::get(Int64Ty, 0), Offset}),
+ /*InBounds=*/true);
+ Stop = Begin;
+ }
+
+ IRB.CreateCondBr(
+ IRB.CreateCmp(IsCtor ? ICmpInst::ICMP_NE : ICmpInst::ICMP_UGE, Start,
+ Stop),
+ LoopBB, ExitBB);
IRB.SetInsertPoint(LoopBB);
auto *CallBackPHI = IRB.CreatePHI(PtrTy, 2, "ptr");
auto *CallBack = IRB.CreateLoad(CallBackTy->getPointerTo(F.getAddressSpace()),
CallBackPHI, "callback");
IRB.CreateCall(CallBackTy, CallBack);
- auto *NewCallBack = IRB.CreateConstGEP1_64(PtrTy, CallBackPHI, 1, "next");
- auto *EndCmp = IRB.CreateICmpEQ(NewCallBack, End, "end");
- CallBackPHI->addIncoming(Begin, &F.getEntryBlock());
+ auto *NewCallBack =
+ IRB.CreateConstGEP1_64(PtrTy, CallBackPHI, IsCtor ? 1 : -1, "next");
+ auto *EndCmp = IRB.CreateCmp(IsCtor ? ICmpInst::ICMP_EQ : ICmpInst::ICMP_ULT,
+ NewCallBack, Stop, "end");
+ CallBackPHI->addIncoming(Start, &F.getEntryBlock());
CallBackPHI->addIncoming(NewCallBack, LoopBB);
IRB.CreateCondBr(EndCmp, ExitBB, LoopBB);
IRB.SetInsertPoint(ExitBB);
diff --git a/llvm/test/CodeGen/AMDGPU/lower-ctor-dtor-constexpr-alias.ll b/llvm/test/CodeGen/AMDGPU/lower-ctor-dtor-constexpr-alias.ll
index a1929a2e8931c11..f9dfa8b4e106656 100644
--- a/llvm/test/CodeGen/AMDGPU/lower-ctor-dtor-constexpr-alias.ll
+++ b/llvm/test/CodeGen/AMDGPU/lower-ctor-dtor-constexpr-alias.ll
@@ -25,8 +25,6 @@ define void @bar() addrspace(1) {
ret void
}
-
-
;.
; CHECK: @[[LLVM_GLOBAL_CTORS:[a-zA-Z0-9_$"\\.-]+]] = appending addrspace(1) global [2 x { i32, ptr, ptr }] [{ i32, ptr, ptr } { i32 1, ptr @foo.alias, ptr null }, { i32, ptr, ptr } { i32 1, ptr inttoptr (i64 4096 to ptr), ptr null }]
; CHECK: @[[LLVM_GLOBAL_DTORS:[a-zA-Z0-9_$"\\.-]+]] = appending addrspace(1) global [1 x { i32, ptr, ptr }] [{ i32, ptr, ptr } { i32 1, ptr addrspacecast (ptr addrspace(1) @bar to ptr), ptr null }]
@@ -65,13 +63,13 @@ define void @bar() addrspace(1) {
; CHECK-LABEL: define weak_odr amdgpu_kernel void @amdgcn.device.fini(
; CHECK-SAME: ) #[[ATTR2:[0-9]+]] {
; CHECK-NEXT: entry:
-; CHECK-NEXT: br i1 icmp ne (ptr addrspace(1) @__fini_array_start, ptr addrspace(1) @__fini_array_end), label [[WHILE_ENTRY:%.*]], label [[WHILE_END:%.*]]
+; CHECK-NEXT: br i1 icmp uge (ptr addrspace(1) getelementptr inbounds ([0 x ptr], ptr addrspace(1) @__fini_array_start, i64 0, i64 sub (i64 ashr (i64 sub (i64 ptrtoint (ptr addrspace(1) @__fini_array_end to i64), i64 ptrtoint (ptr addrspace(1) @__fini_array_start to i64)), i64 3), i64 1)), ptr addrspace(1) @__fini_array_start), label [[WHILE_ENTRY:%.*]], label [[WHILE_END:%.*]]
; CHECK: while.entry:
-; CHECK-NEXT: [[PTR:%.*]] = phi ptr addrspace(1) [ @__fini_array_start, [[ENTRY:%.*]] ], [ [[NEXT:%.*]], [[WHILE_ENTRY]] ]
+; CHECK-NEXT: [[PTR:%.*]] = phi ptr addrspace(1) [ getelementptr inbounds ([0 x ptr], ptr addrspace(1) @__fini_array_start, i64 0, i64 sub (i64 ashr (i64 sub (i64 ptrtoint (ptr addrspace(1) @__fini_array_end to i64), i64 ptrtoint (ptr addrspace(1) @__fini_array_start to i64)), i64 3), i64 1)), [[ENTRY:%.*]] ], [ [[NEXT:%.*]], [[WHILE_ENTRY]] ]
; CHECK-NEXT: [[CALLBACK:%.*]] = load ptr, ptr addrspace(1) [[PTR]], align 8
; CHECK-NEXT: call void [[CALLBACK]]()
-; CHECK-NEXT: [[NEXT]] = getelementptr ptr addrspace(1), ptr addrspace(1) [[PTR]], i64 1
-; CHECK-NEXT: [[END:%.*]] = icmp eq ptr addrspace(1) [[NEXT]], @__fini_array_end
+; CHECK-NEXT: [[NEXT]] = getelementptr ptr addrspace(1), ptr addrspace(1) [[PTR]], i64 -1
+; CHECK-NEXT: [[END:%.*]] = icmp ult ptr addrspace(1) [[NEXT]], @__fini_array_start
; CHECK-NEXT: br i1 [[END]], label [[WHILE_END]], label [[WHILE_ENTRY]]
; CHECK: while.end:
; CHECK-NEXT: ret void
diff --git a/llvm/test/CodeGen/AMDGPU/lower-ctor-dtor.ll b/llvm/test/CodeGen/AMDGPU/lower-ctor-dtor.ll
index 968442182229723..4f228af90c65a00 100644
--- a/llvm/test/CodeGen/AMDGPU/lower-ctor-dtor.ll
+++ b/llvm/test/CodeGen/AMDGPU/lower-ctor-dtor.ll
@@ -12,20 +12,19 @@
@llvm.global_ctors = appending addrspace(1) global [1 x { i32, ptr, ptr }] [{ i32, ptr, ptr } { i32 1, ptr @foo, ptr null }]
@llvm.global_dtors = appending addrspace(1) global [1 x { i32, ptr, ptr }] [{ i32, ptr, ptr } { i32 1, ptr @bar, ptr null }]
-
-
-
-
; VISIBILITY: FUNC WEAK PROTECTED {{.*}} amdgcn.device.init
; VISIBILITY: OBJECT WEAK DEFAULT {{.*}} amdgcn.device.init.kd
; VISIBILITY: FUNC WEAK PROTECTED {{.*}} amdgcn.device.fini
; VISIBILITY: OBJECT WEAK DEFAULT {{.*}} amdgcn.device.fini.kd
+
; SECTION: .init_array.1 INIT_ARRAY {{.*}} {{.*}} 000008 00 WA 0 0 8
; SECTION: .fini_array.1 FINI_ARRAY {{.*}} {{.*}} 000008 00 WA 0 0 8
+
; DISABLED-NOT: FUNC GLOBAL PROTECTED {{.*}} amdgcn.device.init
; DISABLED-NOT: OBJECT GLOBAL DEFAULT {{.*}} amdgcn.device.init.kd
; DISABLED-NOT: FUNC GLOBAL PROTECTED {{.*}} amdgcn.device.fini
; DISABLED-NOT: OBJECT GLOBAL DEFAULT {{.*}} amdgcn.device.fini.kd
+
; METADATA: amdhsa.kernels:
; METADATA: .kind: init
; METADATA: .max_flat_workgroup_size: 1
@@ -53,13 +52,6 @@ define internal void @bar() {
; CHECK: @[[__FINI_ARRAY_END:[a-zA-Z0-9_$"\\.-]+]] = external addrspace(1) constant [0 x ptr addrspace(1)]
; CHECK: @[[LLVM_USED:[a-zA-Z0-9_$"\\.-]+]] = appending addrspace(1) global [2 x ptr] [ptr @amdgcn.device.init, ptr @amdgcn.device.fini], section "llvm.metadata"
;.
-; CHECK-LABEL: define internal void @foo() {
-; CHECK-NEXT: ret void
-;
-;
-; CHECK-LABEL: define internal void @bar() {
-; CHECK-NEXT: ret void
-;
;
; CHECK-LABEL: define weak_odr amdgpu_kernel void @amdgcn.device.init(
; CHECK-SAME: ) #[[ATTR0:[0-9]+]] {
@@ -79,13 +71,13 @@ define internal void @bar() {
; CHECK-LABEL: define weak_odr amdgpu_kernel void @amdgcn.device.fini(
; CHECK-SAME: ) #[[ATTR1:[0-9]+]] {
; CHECK-NEXT: entry:
-; CHECK-NEXT: br i1 icmp ne (ptr addrspace(1) @__fini_array_start, ptr addrspace(1) @__fini_array_end), label [[WHILE_ENTRY:%.*]], label [[WHILE_END:%.*]]
+; CHECK-NEXT: br i1 icmp uge (ptr addrspace(1) getelementptr inbounds ([0 x ptr], ptr addrspace(1) @__fini_array_start, i64 0, i64 sub (i64 ashr (i64 sub (i64 ptrtoint (ptr addrspace(1) @__fini_array_end to i64), i64 ptrtoint (ptr addrspace(1) @__fini_array_start to i64)), i64 3), i64 1)), ptr addrspace(1) @__fini_array_start), label [[WHILE_ENTRY:%.*]], label [[WHILE_END:%.*]]
; CHECK: while.entry:
-; CHECK-NEXT: [[PTR:%.*]] = phi ptr addrspace(1) [ @__fini_array_start, [[ENTRY:%.*]] ], [ [[NEXT:%.*]], [[WHILE_ENTRY]] ]
+; CHECK-NEXT: [[PTR:%.*]] = phi ptr addrspace(1) [ getelementptr inbounds ([0 x ptr], ptr addrspace(1) @__fini_array_start, i64 0, i64 sub (i64 ashr (i64 sub (i64 ptrtoint (ptr addrspace(1) @__fini_array_end to i64), i64 ptrtoint (ptr addrspace(1) @__fini_array_start to i64)), i64 3), i64 1)), [[ENTRY:%.*]] ], [ [[NEXT:%.*]], [[WHILE_ENTRY]] ]
; CHECK-NEXT: [[CALLBACK:%.*]] = load ptr, ptr addrspace(1) [[PTR]], align 8
; CHECK-NEXT: call void [[CALLBACK]]()
-; CHECK-NEXT: [[NEXT]] = getelementptr ptr addrspace(1), ptr addrspace(1) [[PTR]], i64 1
-; CHECK-NEXT: [[END:%.*]] = icmp eq ptr addrspace(1) [[NEXT]], @__fini_array_end
+; CHECK-NEXT: [[NEXT]] = getelementptr ptr addrspace(1), ptr addrspace(1) [[PTR]], i64 -1
+; CHECK-NEXT: [[END:%.*]] = icmp ult ptr addrspace(1) [[NEXT]], @__fini_array_start
; CHECK-NEXT: br i1 [[END]], label [[WHILE_END]], label [[WHILE_ENTRY]]
; CHECK: while.end:
; CHECK-NEXT: ret void
@@ -93,4 +85,3 @@ define internal void @bar() {
;.
; CHECK: attributes #[[ATTR0]] = { "amdgpu-flat-work-group-size"="1,1" "device-init" }
; CHECK: attributes #[[ATTR1]] = { "amdgpu-flat-work-group-size"="1,1" "device-fini" }
-;.
diff --git a/llvm/test/CodeGen/AMDGPU/lower-multiple-ctor-dtor.ll b/llvm/test/CodeGen/AMDGPU/lower-multiple-ctor-dtor.ll
index 83bb61d1a632351..75445b99719281c 100644
--- a/llvm/test/CodeGen/AMDGPU/lower-multiple-ctor-dtor.ll
+++ b/llvm/test/CodeGen/AMDGPU/lower-multiple-ctor-dtor.ll
@@ -3,10 +3,10 @@
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx700 -filetype=obj -o - < %s | llvm-readelf -s - 2>&1 | FileCheck %s -check-prefix=CHECK-VIS
-; UTC_ARGS: --disable
@llvm.global_ctors = appending addrspace(1) global [2 x { i32, ptr, ptr }] [{ i32, ptr, ptr } { i32 1, ptr @foo, ptr null }, { i32, ptr, ptr } { i32 1, ptr @foo.5, ptr null }]
@llvm.global_dtors = appending addrspace(1) global [2 x { i32, ptr, ptr }] [{ i32, ptr, ptr } { i32 1, ptr @bar, ptr null }, { i32, ptr, ptr } { i32 1, ptr @bar.5, ptr null }]
+; UTC_ARGS: --disable
; CHECK: @__init_array_start = external addrspace(1) constant [0 x ptr addrspace(1)]
; CHECK: @__init_array_end = external addrspace(1) constant [0 x ptr addrspace(1)]
; CHECK: @__fini_array_start = external addrspace(1) constant [0 x ptr addrspace(1)]
@@ -36,22 +36,6 @@ define internal void @bar.5() {
ret void
}
-; CHECK-LABEL: define internal void @foo() {
-; CHECK-NEXT: ret void
-;
-;
-; CHECK-LABEL: define internal void @bar() {
-; CHECK-NEXT: ret void
-;
-;
-; CHECK-LABEL: define internal void @foo.5() {
-; CHECK-NEXT: ret void
-;
-;
-; CHECK-LABEL: define internal void @bar.5() {
-; CHECK-NEXT: ret void
-;
-;
; CHECK-LABEL: define weak_odr amdgpu_kernel void @amdgcn.device.init(
; CHECK-SAME: ) #[[ATTR0:[0-9]+]] {
; CHECK-NEXT: entry:
@@ -70,14 +54,13 @@ define internal void @bar.5() {
; CHECK-LABEL: define weak_odr amdgpu_kernel void @amdgcn.device.fini(
; CHECK-SAME: ) #[[ATTR1:[0-9]+]] {
; CHECK-NEXT: entry:
-; CHECK-NEXT: br i1 icmp ne (ptr addrspace(1) @__fini_array_start, ptr addrspace(1) @__fini_array_end), label [[WHILE_ENTRY:%.*]], label [[WHILE_END:%.*]]
+; CHECK-NEXT: br i1 icmp uge (ptr addrspace(1) getelementptr inbounds ([0 x ptr], ptr addrspace(1) @__fini_array_start, i64 0, i64 sub (i64 ashr (i64 sub (i64 ptrtoint (ptr addrspace(1) @__fini_array_end to i64), i64 ptrtoint (ptr addrspace(1) @__fini_array_start to i64)), i64 3), i64 1)), ptr addrspace(1) @__fini_array_start), label [[WHILE_ENTRY:%.*]], label [[WHILE_END:%.*]]
; CHECK: while.entry:
-; CHECK-NEXT: [[PTR:%.*]] = phi ptr addrspace(1) [ @__fini_array_start, [[ENTRY:%.*]] ], [ [[NEXT:%.*]], [[WHILE_ENTRY]] ]
+; CHECK-NEXT: [[PTR:%.*]] = phi ptr addrspace(1) [ getelementptr inbounds ([0 x ptr], ptr addrspace(1) @__fini_array_start, i64 0, i64 sub (i64 ashr (i64 sub (i64 ptrtoint (ptr addrspace(1) @__fini_array_end to i64), i64 ptrtoint (ptr addrspace(1) @__fini_array_start to i64)), i64 3), i64 1)), [[ENTRY:%.*]] ], [ [[NEXT:%.*]], [[WHILE_ENTRY]] ]
; CHECK-NEXT: [[CALLBACK:%.*]] = load ptr, ptr addrspace(1) [[PTR]], align 8
; CHECK-NEXT: call void [[CALLBACK]]()
-; CHECK-NEXT: [[NEXT]] = getelementptr ptr addrspace(1), ptr addrspace(1) [[PTR]], i64 1
-; CHECK-NEXT: [[END:%.*]] = icmp eq ptr addrspace(1) [[NEXT]], @__fini_array_end
+; CHECK-NEXT: [[NEXT]] = getelementptr ptr addrspace(1), ptr addrspace(1) [[PTR]], i64 -1
+; CHECK-NEXT: [[END:%.*]] = icmp ult ptr addrspace(1) [[NEXT]], @__fini_array_start
; CHECK-NEXT: br i1 [[END]], label [[WHILE_END]], label [[WHILE_ENTRY]]
; CHECK: while.end:
; CHECK-NEXT: ret void
-;
|
This was tested using the support in #71739 |
The requirement is for destructors to be called in reverse order to constructors in pairwise fashion. The only way we have to merge these arrays between translation units is concatenation in link object order. The choice we have is whether to iterate the arrays in order or not, and what order to put elements in the array per TU. Assume without loss of generality that constructors will be iterated from 0 to N. Store constructors and destructors in the same order Concatenates to (ctorA0 ctorA1 ctorB0 ctorB1} {dtorA0 dtorA1 dtorB0 dtorB1} Store constructors and destructors in opposite order (ctorA0 ctorA1 ctorB0 ctorB1) (dtorA1 dtorA0 dtorB1 dtorB0) Therefore either your commit message is wrong:
Or the implementation is wrong. I note that the test cases all use a single array and thus would not notice. Constructor and destructor arrays must be in the same order - object A, then object B - as otherwise concatenation between translation units does not work. |
There is no concatenation here. This is handled entirely by the linker and it sorts all objects in this section by priority order. For destructors we want to call things in reverse priority order so we walk it backwards. If you look at implementations of loaders that walk this section on x64 it will walk the array backwards, we are merely encoding this logic in LLVM-IR. |
This needs tests with multiple translation units. Multiple IR files with appending linkage would also be fine. It also needs tests that don't force the ordering with priority. Destructors need to fire in reverse order of constructors. If your comment is accurate to the implementation:
Then that won't happen for all of the above test cases. |
The tests here are only concerned with the backend. For lowering of these to the
The commit message was not clear enough. The The execution of this should be identical to the handling you would find in |
Constructors without additional constraints execute in somewhat arbitrary order. Destructors, in the C++ sense, do not. Their execution order is the reverse of the arbitrary constructor call order, i.e. exactly fixed. However that does not detract from this patch looking like the right thing. Thank you for changing the commit message to better correlate with the existing implementation. |
Summary: The AMDGPU backend uses the linker-provided INIT_ARRAY and FINI_ARRAY sections to call all the global constructors in a single kernel. Previously this mistakenly used the same iteration logic for both arrays. The destructors stored in FINI_ARRAY are stored in the same order as the ones in the INIT_ARRAY section so we need to traverse it in reverse order. Relanding after the revert in fe7b5e2 using the IR builder interface instead of ConstantExpr.
…#71815) Summary: The AMDGPU backend uses the linker-provided INIT_ARRAY and FINI_ARRAY sections to call all the global constructors in a single kernel. Previously this mistakenly used the same iteration logic for both arrays. The destructors stored in FINI_ARRAY are stored in the same order as the ones in the INIT_ARRAY section so we need to traverse it in reverse order.
…er (llvm#71815)" This reverts commit c1d5865. Introduces a new use of ConstantExpr::getAShr().
…#71815) Summary: The AMDGPU backend uses the linker-provided INIT_ARRAY and FINI_ARRAY sections to call all the global constructors in a single kernel. Previously this mistakenly used the same iteration logic for both arrays. The destructors stored in FINI_ARRAY are stored in the same order as the ones in the INIT_ARRAY section so we need to traverse it in reverse order. Relanding after the revert in fe7b5e2 using the IR builder interface instead of ConstantExpr.
Local branch amd-gfx f34f6bd Merged main:8474bfdd149b into amd-gfx:24c3950d1abc Remote branch main c1d5865 [AMDGPU] Call the `FINI_ARRAY` destructors in the correct order (llvm#71815)
Summary:
The AMDGPU backend uses the linker-provided INIT_ARRAY and FINI_ARRAY
sections to call all the global constructors in a single kernel.
Previously this mistakenly used the same iteration logic for both
arrays. The destructors stored in FINI_ARRAY are stored in the same order as
the ones in the INIT_ARRAY section so we need to traverse it in reverse order.