Skip to content

Commit eabe6b0

Browse files
committed
[OpenMP] Ensure the actual kernel is annotated with launch bounds
1 parent a422d89 commit eabe6b0

File tree

8 files changed

+1562
-1545
lines changed

8 files changed

+1562
-1545
lines changed

clang/lib/CodeGen/CGStmtOpenMP.cpp

Lines changed: 27 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -644,27 +644,42 @@ CodeGenFunction::GenerateOpenMPCapturedStmtFunction(const CapturedStmt &S,
644644
// Build the argument list.
645645
bool NeedWrapperFunction =
646646
getDebugInfo() && CGM.getCodeGenOpts().hasReducedDebugInfo();
647-
FunctionArgList Args;
648-
llvm::MapVector<const Decl *, std::pair<const VarDecl *, Address>> LocalAddrs;
649-
llvm::DenseMap<const Decl *, std::pair<const Expr *, llvm::Value *>> VLASizes;
647+
FunctionArgList Args, WrapperArgs;
648+
llvm::MapVector<const Decl *, std::pair<const VarDecl *, Address>> LocalAddrs,
649+
WrapperLocalAddrs;
650+
llvm::DenseMap<const Decl *, std::pair<const Expr *, llvm::Value *>> VLASizes,
651+
WrapperVLASizes;
650652
SmallString<256> Buffer;
651653
llvm::raw_svector_ostream Out(Buffer);
652654
Out << CapturedStmtInfo->getHelperName();
653-
if (NeedWrapperFunction)
655+
656+
CodeGenFunction WrapperCGF(CGM, /*suppressNewContext=*/true);
657+
llvm::Function *WrapperF = nullptr;
658+
if (NeedWrapperFunction) {
659+
// Emit the final kernel early to allow attributes to be added by the
660+
// OpenMPI-IR-Builder.
661+
FunctionOptions WrapperFO(&S, /*UIntPtrCastRequired=*/true,
662+
/*RegisterCastedArgsOnly=*/true,
663+
CapturedStmtInfo->getHelperName(), Loc);
664+
WrapperCGF.CapturedStmtInfo = CapturedStmtInfo;
665+
WrapperF =
666+
emitOutlinedFunctionPrologue(WrapperCGF, Args, LocalAddrs, VLASizes,
667+
WrapperCGF.CXXThisValue, WrapperFO);
654668
Out << "_debug__";
669+
}
655670
FunctionOptions FO(&S, !NeedWrapperFunction, /*RegisterCastedArgsOnly=*/false,
656671
Out.str(), Loc);
657-
llvm::Function *F = emitOutlinedFunctionPrologue(*this, Args, LocalAddrs,
658-
VLASizes, CXXThisValue, FO);
672+
llvm::Function *F = emitOutlinedFunctionPrologue(
673+
*this, WrapperArgs, WrapperLocalAddrs, WrapperVLASizes, CXXThisValue, FO);
659674
CodeGenFunction::OMPPrivateScope LocalScope(*this);
660-
for (const auto &LocalAddrPair : LocalAddrs) {
675+
for (const auto &LocalAddrPair : WrapperLocalAddrs) {
661676
if (LocalAddrPair.second.first) {
662677
LocalScope.addPrivate(LocalAddrPair.second.first,
663678
LocalAddrPair.second.second);
664679
}
665680
}
666681
(void)LocalScope.Privatize();
667-
for (const auto &VLASizePair : VLASizes)
682+
for (const auto &VLASizePair : WrapperVLASizes)
668683
VLASizeMap[VLASizePair.second.first] = VLASizePair.second.second;
669684
PGO.assignRegionCounters(GlobalDecl(CD), F);
670685
CapturedStmtInfo->EmitBody(*this, CD->getBody());
@@ -673,17 +688,10 @@ CodeGenFunction::GenerateOpenMPCapturedStmtFunction(const CapturedStmt &S,
673688
if (!NeedWrapperFunction)
674689
return F;
675690

676-
FunctionOptions WrapperFO(&S, /*UIntPtrCastRequired=*/true,
677-
/*RegisterCastedArgsOnly=*/true,
678-
CapturedStmtInfo->getHelperName(), Loc);
679-
CodeGenFunction WrapperCGF(CGM, /*suppressNewContext=*/true);
680-
WrapperCGF.CapturedStmtInfo = CapturedStmtInfo;
681-
Args.clear();
682-
LocalAddrs.clear();
683-
VLASizes.clear();
684-
llvm::Function *WrapperF =
685-
emitOutlinedFunctionPrologue(WrapperCGF, Args, LocalAddrs, VLASizes,
686-
WrapperCGF.CXXThisValue, WrapperFO);
691+
// Reverse the order.
692+
WrapperF->removeFromParent();
693+
F->getParent()->getFunctionList().insertAfter(F->getIterator(), WrapperF);
694+
687695
llvm::SmallVector<llvm::Value *, 4> CallArgs;
688696
auto *PI = F->arg_begin();
689697
for (const auto *Arg : Args) {

clang/test/OpenMP/ompx_attributes_codegen.cpp

Lines changed: 8 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -3,15 +3,17 @@
33
// RUN: %clang_cc1 -fopenmp -x c++ -std=c++11 -triple x86_64-unknown-unknown -fopenmp-targets=amdgcn-amd-amdhsa -emit-llvm-bc %s -o %t-ppc-host.bc
44
// RUN: %clang_cc1 -fopenmp -x c++ -std=c++11 -triple amdgcn-amd-amdhsa -fopenmp-targets=amdgcn-amd-amdhsa -emit-llvm %s -fopenmp-is-target-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - | FileCheck %s --check-prefix=AMD
55
// RUN: %clang_cc1 -target-cpu gfx900 -fopenmp -x c++ -std=c++11 -triple amdgcn-amd-amdhsa -fopenmp-targets=amdgcn-amd-amdhsa -emit-llvm %s -fopenmp-is-target-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - | FileCheck %s --check-prefix=AMD
6+
// RUN: %clang_cc1 -target-cpu gfx900 -fopenmp -x c++ -std=c++11 -triple amdgcn-amd-amdhsa -fopenmp-targets=amdgcn-amd-amdhsa -dwarf-version=5 -emit-llvm %s -fopenmp-is-target-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - | FileCheck %s --check-prefix=AMD
67
// RUN: %clang_cc1 -fopenmp -x c++ -std=c++11 -triple nvptx64 -fopenmp-targets=nvptx64 -emit-llvm %s -fopenmp-is-target-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - | FileCheck %s --check-prefix=NVIDIA
8+
// RUN: %clang_cc1 -fopenmp -x c++ -std=c++11 -triple nvptx64 -fopenmp-targets=nvptx64 -emit-llvm %s -fopenmp-is-target-device -dwarf-version=5 -fopenmp-host-ir-file-path %t-ppc-host.bc -o - | FileCheck %s --check-prefix=NVIDIA
79
// expected-no-diagnostics
810

911

1012
// Check that the target attributes are set on the generated kernel
1113
void func() {
12-
// AMD: amdgpu_kernel void @__omp_offloading[[HASH:.*]]_l16(ptr {{[^,]+}}) #0
13-
// AMD: amdgpu_kernel void @__omp_offloading[[HASH:.*]]_l18(ptr {{[^,]+}})
14-
// AMD: amdgpu_kernel void @__omp_offloading[[HASH:.*]]_l20(ptr {{[^,]+}}) #4
14+
// AMD: amdgpu_kernel void @__omp_offloading[[HASH:.*]]_l18(ptr {{[^,]+}}) #0
15+
// AMD: amdgpu_kernel void @__omp_offloading[[HASH:.*]]_l20(ptr {{[^,]+}})
16+
// AMD: amdgpu_kernel void @__omp_offloading[[HASH:.*]]_l22(ptr {{[^,]+}}) #4
1517

1618
#pragma omp target ompx_attribute([[clang::amdgpu_flat_work_group_size(10, 20)]])
1719
{}
@@ -35,6 +37,6 @@ void func() {
3537
// NVIDIA: "omp_target_thread_limit"="20"
3638
// NVIDIA: "omp_target_thread_limit"="45"
3739
// NVIDIA: "omp_target_thread_limit"="17"
38-
// NVIDIA: !{ptr @__omp_offloading[[HASH1:.*]]_l16, !"maxntidx", i32 20}
39-
// NVIDIA: !{ptr @__omp_offloading[[HASH2:.*]]_l18, !"maxntidx", i32 45}
40-
// NVIDIA: !{ptr @__omp_offloading[[HASH3:.*]]_l20, !"maxntidx", i32 17}
40+
// NVIDIA: !{ptr @__omp_offloading[[HASH1:.*]]_l18, !"maxntidx", i32 20}
41+
// NVIDIA: !{ptr @__omp_offloading[[HASH2:.*]]_l20, !"maxntidx", i32 45}
42+
// NVIDIA: !{ptr @__omp_offloading[[HASH3:.*]]_l22, !"maxntidx", i32 17}

0 commit comments

Comments
 (0)