Skip to content

Commit d499a84

Browse files
committed
[OpenMP] Ensure the actual kernel is annotated with launch bounds
1 parent 345c93c commit d499a84

File tree

8 files changed

+1571
-1554
lines changed

8 files changed

+1571
-1554
lines changed

clang/lib/CodeGen/CGStmtOpenMP.cpp

Lines changed: 27 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -639,27 +639,42 @@ CodeGenFunction::GenerateOpenMPCapturedStmtFunction(const CapturedStmt &S,
639639
// Build the argument list.
640640
bool NeedWrapperFunction =
641641
getDebugInfo() && CGM.getCodeGenOpts().hasReducedDebugInfo();
642-
FunctionArgList Args;
643-
llvm::MapVector<const Decl *, std::pair<const VarDecl *, Address>> LocalAddrs;
644-
llvm::DenseMap<const Decl *, std::pair<const Expr *, llvm::Value *>> VLASizes;
642+
FunctionArgList Args, WrapperArgs;
643+
llvm::MapVector<const Decl *, std::pair<const VarDecl *, Address>> LocalAddrs,
644+
WrapperLocalAddrs;
645+
llvm::DenseMap<const Decl *, std::pair<const Expr *, llvm::Value *>> VLASizes,
646+
WrapperVLASizes;
645647
SmallString<256> Buffer;
646648
llvm::raw_svector_ostream Out(Buffer);
647649
Out << CapturedStmtInfo->getHelperName();
648-
if (NeedWrapperFunction)
650+
651+
CodeGenFunction WrapperCGF(CGM, /*suppressNewContext=*/true);
652+
llvm::Function *WrapperF = nullptr;
653+
if (NeedWrapperFunction) {
654+
// Emit the final kernel early to allow attributes to be added by the
655+
// OpenMPI-IR-Builder.
656+
FunctionOptions WrapperFO(&S, /*UIntPtrCastRequired=*/true,
657+
/*RegisterCastedArgsOnly=*/true,
658+
CapturedStmtInfo->getHelperName(), Loc);
659+
WrapperCGF.CapturedStmtInfo = CapturedStmtInfo;
660+
WrapperF =
661+
emitOutlinedFunctionPrologue(WrapperCGF, Args, LocalAddrs, VLASizes,
662+
WrapperCGF.CXXThisValue, WrapperFO);
649663
Out << "_debug__";
664+
}
650665
FunctionOptions FO(&S, !NeedWrapperFunction, /*RegisterCastedArgsOnly=*/false,
651666
Out.str(), Loc);
652-
llvm::Function *F = emitOutlinedFunctionPrologue(*this, Args, LocalAddrs,
653-
VLASizes, CXXThisValue, FO);
667+
llvm::Function *F = emitOutlinedFunctionPrologue(
668+
*this, WrapperArgs, WrapperLocalAddrs, WrapperVLASizes, CXXThisValue, FO);
654669
CodeGenFunction::OMPPrivateScope LocalScope(*this);
655-
for (const auto &LocalAddrPair : LocalAddrs) {
670+
for (const auto &LocalAddrPair : WrapperLocalAddrs) {
656671
if (LocalAddrPair.second.first) {
657672
LocalScope.addPrivate(LocalAddrPair.second.first,
658673
LocalAddrPair.second.second);
659674
}
660675
}
661676
(void)LocalScope.Privatize();
662-
for (const auto &VLASizePair : VLASizes)
677+
for (const auto &VLASizePair : WrapperVLASizes)
663678
VLASizeMap[VLASizePair.second.first] = VLASizePair.second.second;
664679
PGO.assignRegionCounters(GlobalDecl(CD), F);
665680
CapturedStmtInfo->EmitBody(*this, CD->getBody());
@@ -668,17 +683,10 @@ CodeGenFunction::GenerateOpenMPCapturedStmtFunction(const CapturedStmt &S,
668683
if (!NeedWrapperFunction)
669684
return F;
670685

671-
FunctionOptions WrapperFO(&S, /*UIntPtrCastRequired=*/true,
672-
/*RegisterCastedArgsOnly=*/true,
673-
CapturedStmtInfo->getHelperName(), Loc);
674-
CodeGenFunction WrapperCGF(CGM, /*suppressNewContext=*/true);
675-
WrapperCGF.CapturedStmtInfo = CapturedStmtInfo;
676-
Args.clear();
677-
LocalAddrs.clear();
678-
VLASizes.clear();
679-
llvm::Function *WrapperF =
680-
emitOutlinedFunctionPrologue(WrapperCGF, Args, LocalAddrs, VLASizes,
681-
WrapperCGF.CXXThisValue, WrapperFO);
686+
// Reverse the order.
687+
WrapperF->removeFromParent();
688+
F->getParent()->getFunctionList().insertAfter(F->getIterator(), WrapperF);
689+
682690
llvm::SmallVector<llvm::Value *, 4> CallArgs;
683691
auto *PI = F->arg_begin();
684692
for (const auto *Arg : Args) {

clang/test/OpenMP/ompx_attributes_codegen.cpp

Lines changed: 8 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -3,15 +3,17 @@
33
// RUN: %clang_cc1 -fopenmp -x c++ -std=c++11 -triple x86_64-unknown-unknown -fopenmp-targets=amdgcn-amd-amdhsa -emit-llvm-bc %s -o %t-ppc-host.bc
44
// RUN: %clang_cc1 -fopenmp -x c++ -std=c++11 -triple amdgcn-amd-amdhsa -fopenmp-targets=amdgcn-amd-amdhsa -emit-llvm %s -fopenmp-is-target-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - | FileCheck %s --check-prefix=AMD
55
// RUN: %clang_cc1 -target-cpu gfx900 -fopenmp -x c++ -std=c++11 -triple amdgcn-amd-amdhsa -fopenmp-targets=amdgcn-amd-amdhsa -emit-llvm %s -fopenmp-is-target-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - | FileCheck %s --check-prefix=AMD
6+
// RUN: %clang_cc1 -target-cpu gfx900 -fopenmp -x c++ -std=c++11 -triple amdgcn-amd-amdhsa -fopenmp-targets=amdgcn-amd-amdhsa -dwarf-version=5 -emit-llvm %s -fopenmp-is-target-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - | FileCheck %s --check-prefix=AMD
67
// RUN: %clang_cc1 -fopenmp -x c++ -std=c++11 -triple nvptx64 -fopenmp-targets=nvptx64 -emit-llvm %s -fopenmp-is-target-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - | FileCheck %s --check-prefix=NVIDIA
8+
// RUN: %clang_cc1 -fopenmp -x c++ -std=c++11 -triple nvptx64 -fopenmp-targets=nvptx64 -emit-llvm %s -fopenmp-is-target-device -dwarf-version=5 -fopenmp-host-ir-file-path %t-ppc-host.bc -o - | FileCheck %s --check-prefix=NVIDIA
79
// expected-no-diagnostics
810

911

1012
// Check that the target attributes are set on the generated kernel
1113
void func() {
12-
// AMD: amdgpu_kernel void @__omp_offloading[[HASH:.*]]_l16(ptr {{[^,]+}}) #0
13-
// AMD: amdgpu_kernel void @__omp_offloading[[HASH:.*]]_l18(ptr {{[^,]+}})
14-
// AMD: amdgpu_kernel void @__omp_offloading[[HASH:.*]]_l20(ptr {{[^,]+}}) #4
14+
// AMD: amdgpu_kernel void @__omp_offloading[[HASH:.*]]_l18(ptr {{[^,]+}}) #0
15+
// AMD: amdgpu_kernel void @__omp_offloading[[HASH:.*]]_l20(ptr {{[^,]+}})
16+
// AMD: amdgpu_kernel void @__omp_offloading[[HASH:.*]]_l22(ptr {{[^,]+}}) #4
1517

1618
#pragma omp target ompx_attribute([[clang::amdgpu_flat_work_group_size(10, 20)]])
1719
{}
@@ -35,6 +37,6 @@ void func() {
3537
// NVIDIA: "omp_target_thread_limit"="20"
3638
// NVIDIA: "omp_target_thread_limit"="45"
3739
// NVIDIA: "omp_target_thread_limit"="17"
38-
// NVIDIA: !{ptr @__omp_offloading[[HASH1:.*]]_l16, !"maxntidx", i32 20}
39-
// NVIDIA: !{ptr @__omp_offloading[[HASH2:.*]]_l18, !"maxntidx", i32 45}
40-
// NVIDIA: !{ptr @__omp_offloading[[HASH3:.*]]_l20, !"maxntidx", i32 17}
40+
// NVIDIA: !{ptr @__omp_offloading[[HASH1:.*]]_l18, !"maxntidx", i32 20}
41+
// NVIDIA: !{ptr @__omp_offloading[[HASH2:.*]]_l20, !"maxntidx", i32 45}
42+
// NVIDIA: !{ptr @__omp_offloading[[HASH3:.*]]_l22, !"maxntidx", i32 17}

0 commit comments

Comments
 (0)