Skip to content

Commit 5b78370

Browse files
committed
[CUDA][HIP] Make template implicitly host device
Added option -foffload-implicit-host-device-templates which is off by default. When the option is on, template functions and specializations without host/device attributes have implicit host device attributes. They can be overridden by device template functions with the same signagure. They are emitted on device side only if they are used on device side. This feature is added as an extension. `__has_extension(cuda_implicit_host_device_templates)` can be used to check whether it is enabled. This is to facilitate using standard C++ headers for device. Fixes: #69956 Fixes: SWDEV-428314
1 parent 3c727a9 commit 5b78370

File tree

12 files changed

+241
-4
lines changed

12 files changed

+241
-4
lines changed

clang/include/clang/AST/ASTContext.h

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1156,6 +1156,10 @@ class ASTContext : public RefCountedBase<ASTContext> {
11561156
/// host code.
11571157
llvm::DenseSet<const ValueDecl *> CUDAExternalDeviceDeclODRUsedByHost;
11581158

1159+
/// Keep track of CUDA/HIP implicit host device functions used on device side
1160+
/// in device compilation.
1161+
llvm::DenseSet<const FunctionDecl *> CUDAImplicitHostDeviceFunUsedByDevice;
1162+
11591163
ASTContext(LangOptions &LOpts, SourceManager &SM, IdentifierTable &idents,
11601164
SelectorTable &sels, Builtin::Context &builtins,
11611165
TranslationUnitKind TUKind);

clang/include/clang/Basic/Features.def

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -283,6 +283,7 @@ FEATURE(cxx_abi_relative_vtable, LangOpts.CPlusPlus && LangOpts.RelativeCXXABIVT
283283

284284
// CUDA/HIP Features
285285
FEATURE(cuda_noinline_keyword, LangOpts.CUDA)
286+
EXTENSION(cuda_implicit_host_device_templates, LangOpts.CUDA && LangOpts.OffloadImplicitHostDeviceTemplates)
286287

287288
#undef EXTENSION
288289
#undef FEATURE

clang/include/clang/Basic/LangOptions.def

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -268,6 +268,7 @@ LANGOPT(CUDAAllowVariadicFunctions, 1, 0, "allowing variadic functions in CUDA d
268268
LANGOPT(CUDAHostDeviceConstexpr, 1, 1, "treating unattributed constexpr functions as __host__ __device__")
269269
LANGOPT(GPUDeviceApproxTranscendentals, 1, 0, "using approximate transcendental functions")
270270
LANGOPT(GPURelocatableDeviceCode, 1, 0, "generate relocatable device code")
271+
LANGOPT(OffloadImplicitHostDeviceTemplates, 1, 0, "assume template functions to be implicitly host device by default for CUDA/HIP")
271272
LANGOPT(GPUAllowDeviceInit, 1, 0, "allowing device side global init functions for HIP")
272273
LANGOPT(GPUMaxThreadsPerBlock, 32, 1024, "default max threads per block for kernel launch bounds for HIP")
273274
LANGOPT(GPUDeferDiag, 1, 0, "defer host/device related diagnostic messages for CUDA/HIP")

clang/include/clang/Driver/Options.td

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1146,6 +1146,14 @@ defm gpu_rdc : BoolFOption<"gpu-rdc",
11461146
"Generate relocatable device code, also known as separate compilation mode">,
11471147
NegFlag<SetFalse>>;
11481148

1149+
defm offload_implicit_host_device_templates :
1150+
BoolFOption<"offload-implicit-host-device-templates",
1151+
LangOpts<"OffloadImplicitHostDeviceTemplates">, DefaultFalse,
1152+
PosFlag<SetTrue, [], [ClangOption, CC1Option],
1153+
"Template functions or specializations without host, device and "
1154+
"global attributes have implicit host device attributes (CUDA/HIP only)">,
1155+
NegFlag<SetFalse>>;
1156+
11491157
def fgpu_default_stream_EQ : Joined<["-"], "fgpu-default-stream=">,
11501158
HelpText<"Specify default stream. The default value is 'legacy'. (CUDA/HIP only)">,
11511159
Visibility<[ClangOption, CC1Option]>,

clang/include/clang/Sema/Sema.h

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -13489,6 +13489,10 @@ class Sema final {
1348913489
/// host or device attribute.
1349013490
void CUDASetLambdaAttrs(CXXMethodDecl *Method);
1349113491

13492+
/// Record \p FD if it is a CUDA/HIP implicit host device function used on
13493+
/// device side in device compilation.
13494+
void CUDARecordImplicitHostDeviceFuncUsedByDevice(const FunctionDecl *FD);
13495+
1349213496
/// Finds a function in \p Matches with highest calling priority
1349313497
/// from \p Caller context and erases all functions with lower
1349413498
/// calling priority.

clang/lib/CodeGen/CodeGenModule.cpp

Lines changed: 19 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -28,6 +28,7 @@
2828
#include "CoverageMappingGen.h"
2929
#include "TargetInfo.h"
3030
#include "clang/AST/ASTContext.h"
31+
#include "clang/AST/ASTLambda.h"
3132
#include "clang/AST/CharUnits.h"
3233
#include "clang/AST/DeclCXX.h"
3334
#include "clang/AST/DeclObjC.h"
@@ -3565,6 +3566,14 @@ ConstantAddress CodeGenModule::GetWeakRefReference(const ValueDecl *VD) {
35653566
return ConstantAddress(Aliasee, DeclTy, Alignment);
35663567
}
35673568

3569+
template <typename AttrT> static bool hasImplicitAttr(const ValueDecl *D) {
3570+
if (!D)
3571+
return false;
3572+
if (auto *A = D->getAttr<AttrT>())
3573+
return A->isImplicit();
3574+
return D->isImplicit();
3575+
}
3576+
35683577
void CodeGenModule::EmitGlobal(GlobalDecl GD) {
35693578
const auto *Global = cast<ValueDecl>(GD.getDecl());
35703579

@@ -3586,16 +3595,23 @@ void CodeGenModule::EmitGlobal(GlobalDecl GD) {
35863595
return emitCPUDispatchDefinition(GD);
35873596

35883597
// If this is CUDA, be selective about which declarations we emit.
3598+
// Non-constexpr non-lambda implicit host device functions are not emitted
3599+
// unless they are used on device side.
35893600
if (LangOpts.CUDA) {
35903601
if (LangOpts.CUDAIsDevice) {
3591-
if (!Global->hasAttr<CUDADeviceAttr>() &&
3602+
const auto *FD = dyn_cast<FunctionDecl>(Global);
3603+
if ((!Global->hasAttr<CUDADeviceAttr>() ||
3604+
(LangOpts.OffloadImplicitHostDeviceTemplates && FD &&
3605+
hasImplicitAttr<CUDAHostAttr>(FD) &&
3606+
hasImplicitAttr<CUDADeviceAttr>(FD) && !FD->isConstexpr() &&
3607+
!isLambdaCallOperator(FD) &&
3608+
!getContext().CUDAImplicitHostDeviceFunUsedByDevice.count(FD))) &&
35923609
!Global->hasAttr<CUDAGlobalAttr>() &&
35933610
!Global->hasAttr<CUDAConstantAttr>() &&
35943611
!Global->hasAttr<CUDASharedAttr>() &&
35953612
!Global->getType()->isCUDADeviceBuiltinSurfaceType() &&
35963613
!Global->getType()->isCUDADeviceBuiltinTextureType() &&
3597-
!(LangOpts.HIPStdPar &&
3598-
isa<FunctionDecl>(Global) &&
3614+
!(LangOpts.HIPStdPar && isa<FunctionDecl>(Global) &&
35993615
!Global->hasAttr<CUDAHostAttr>()))
36003616
return;
36013617
} else {

clang/lib/Driver/ToolChains/Clang.cpp

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7395,6 +7395,9 @@ void Clang::ConstructJob(Compilation &C, const JobAction &JA,
73957395
Args.AddLastArg(CmdArgs, options::OPT_foffload_uniform_block,
73967396
options::OPT_fno_offload_uniform_block);
73977397

7398+
Args.AddLastArg(CmdArgs, options::OPT_foffload_implicit_host_device_templates,
7399+
options::OPT_fno_offload_implicit_host_device_templates);
7400+
73987401
if (IsCudaDevice || IsHIPDevice) {
73997402
StringRef InlineThresh =
74007403
Args.getLastArgValue(options::OPT_fgpu_inline_threshold_EQ);

clang/lib/Sema/SemaCUDA.cpp

Lines changed: 41 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -678,6 +678,27 @@ void Sema::checkAllowedCUDAInitializer(VarDecl *VD) {
678678
}
679679
}
680680

681+
void Sema::CUDARecordImplicitHostDeviceFuncUsedByDevice(
682+
const FunctionDecl *Callee) {
683+
FunctionDecl *Caller = getCurFunctionDecl(/*AllowLambda=*/true);
684+
if (!Caller)
685+
return;
686+
687+
if (!isCUDAImplicitHostDeviceFunction(Callee))
688+
return;
689+
690+
CUDAFunctionTarget CallerTarget = IdentifyCUDATarget(Caller);
691+
692+
// Record whether an implicit host device function is used on device side.
693+
if (CallerTarget != CFT_Device && CallerTarget != CFT_Global &&
694+
(CallerTarget != CFT_HostDevice ||
695+
(isCUDAImplicitHostDeviceFunction(Caller) &&
696+
!getASTContext().CUDAImplicitHostDeviceFunUsedByDevice.count(Caller))))
697+
return;
698+
699+
getASTContext().CUDAImplicitHostDeviceFunUsedByDevice.insert(Callee);
700+
}
701+
681702
// With -fcuda-host-device-constexpr, an unattributed constexpr function is
682703
// treated as implicitly __host__ __device__, unless:
683704
// * it is a variadic function (device-side variadic functions are not
@@ -702,6 +723,18 @@ void Sema::maybeAddCUDAHostDeviceAttrs(FunctionDecl *NewD,
702723
return;
703724
}
704725

726+
// If a template function has no host/device/global attributes,
727+
// make it implicitly host device function.
728+
if (getLangOpts().OffloadImplicitHostDeviceTemplates &&
729+
!NewD->hasAttr<CUDAHostAttr>() && !NewD->hasAttr<CUDADeviceAttr>() &&
730+
!NewD->hasAttr<CUDAGlobalAttr>() &&
731+
(NewD->getDescribedFunctionTemplate() ||
732+
NewD->isFunctionTemplateSpecialization())) {
733+
NewD->addAttr(CUDAHostAttr::CreateImplicit(Context));
734+
NewD->addAttr(CUDADeviceAttr::CreateImplicit(Context));
735+
return;
736+
}
737+
705738
if (!getLangOpts().CUDAHostDeviceConstexpr || !NewD->isConstexpr() ||
706739
NewD->isVariadic() || NewD->hasAttr<CUDAHostAttr>() ||
707740
NewD->hasAttr<CUDADeviceAttr>() || NewD->hasAttr<CUDAGlobalAttr>())
@@ -950,7 +983,14 @@ void Sema::checkCUDATargetOverload(FunctionDecl *NewFD,
950983
// HD/global functions "exist" in some sense on both the host and device, so
951984
// should have the same implementation on both sides.
952985
if (NewTarget != OldTarget &&
953-
((NewTarget == CFT_HostDevice) || (OldTarget == CFT_HostDevice) ||
986+
((NewTarget == CFT_HostDevice &&
987+
!(LangOpts.OffloadImplicitHostDeviceTemplates &&
988+
isCUDAImplicitHostDeviceFunction(NewFD) &&
989+
OldTarget == CFT_Device)) ||
990+
(OldTarget == CFT_HostDevice &&
991+
!(LangOpts.OffloadImplicitHostDeviceTemplates &&
992+
isCUDAImplicitHostDeviceFunction(OldFD) &&
993+
NewTarget == CFT_Device)) ||
954994
(NewTarget == CFT_Global) || (OldTarget == CFT_Global)) &&
955995
!IsOverload(NewFD, OldFD, /* UseMemberUsingDeclRules = */ false,
956996
/* ConsiderCudaAttrs = */ false)) {

clang/lib/Sema/SemaExpr.cpp

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -19096,6 +19096,13 @@ void Sema::MarkFunctionReferenced(SourceLocation Loc, FunctionDecl *Func,
1909619096
if (FPT && isUnresolvedExceptionSpec(FPT->getExceptionSpecType()))
1909719097
ResolveExceptionSpec(Loc, FPT);
1909819098

19099+
// A callee could be called by a host function then by a device function.
19100+
// If we only try recording once, we will miss recording the use on device
19101+
// side. Therefore keep trying until it is recorded.
19102+
if (LangOpts.OffloadImplicitHostDeviceTemplates && LangOpts.CUDAIsDevice &&
19103+
!getASTContext().CUDAImplicitHostDeviceFunUsedByDevice.count(Func))
19104+
CUDARecordImplicitHostDeviceFuncUsedByDevice(Func);
19105+
1909919106
// If this is the first "real" use, act on that.
1910019107
if (OdrUse == OdrUseContext::Used && !Func->isUsed(/*CheckUsedAttr=*/false)) {
1910119108
// Keep track of used but undefined functions.
Lines changed: 118 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,118 @@
1+
// RUN: %clang_cc1 -triple x86_64-pc-linux-gnu \
2+
// RUN: -foffload-implicit-host-device-templates \
3+
// RUN: -emit-llvm -o - -x hip %s 2>&1 | \
4+
// RUN: FileCheck -check-prefixes=COMM,HOST %s
5+
// RUN: %clang_cc1 -triple amdgcn-amd-amdhsa -fcuda-is-device \
6+
// RUN: -target-cpu gfx1100 \
7+
// RUN: -foffload-implicit-host-device-templates \
8+
// RUN: -emit-llvm -o - -x hip %s 2>&1 | \
9+
// RUN: FileCheck -check-prefixes=COMM,DEV %s
10+
// RUN: %clang_cc1 -triple amdgcn-amd-amdhsa -fcuda-is-device \
11+
// RUN: -target-cpu gfx1100 \
12+
// RUN: -foffload-implicit-host-device-templates \
13+
// RUN: -emit-llvm -o - -x hip %s 2>&1 | \
14+
// RUN: FileCheck -check-prefixes=DEV-NEG %s
15+
16+
#include "Inputs/cuda.h"
17+
18+
// Implicit host device template not overloaded by device template.
19+
// Used by both device and host function.
20+
// Emitted on both host and device.
21+
22+
// COMM-LABEL: define {{.*}}@_Z20template_no_overloadIiET_S0_(
23+
// COMM: ret i32 1
24+
template<typename T>
25+
T template_no_overload(T x) {
26+
return 1;
27+
}
28+
29+
// Implicit host device template overloaded by device template.
30+
// Used by both device and host function.
31+
// Implicit host device template emitted on host.
32+
// Device template emitted on device.
33+
34+
// COMM-LABEL: define {{.*}}@_Z22template_with_overloadIiET_S0_(
35+
// HOST: ret i32 2
36+
// DEV: ret i32 3
37+
template<typename T>
38+
T template_with_overload(T x) {
39+
return 2;
40+
}
41+
42+
template<typename T>
43+
__device__ T template_with_overload(T x) {
44+
return 3;
45+
}
46+
47+
// Implicit host device template used by host function only.
48+
// Emitted on host only.
49+
// HOST-LABEL: define {{.*}}@_Z21template_used_by_hostIiET_S0_(
50+
// DEV-NEG-NOT: define {{.*}}@_Z21template_used_by_hostIiET_S0_(
51+
// HOST: ret i32 10
52+
template<typename T>
53+
T template_used_by_host(T x) {
54+
return 10;
55+
}
56+
57+
// Implicit host device template indirectly used by host function only.
58+
// Emitted on host only.
59+
// HOST-LABEL: define {{.*}}@_Z32template_indirectly_used_by_hostIiET_S0_(
60+
// DEV-NEG-NOT: define {{.*}}@_Z32template_indirectly_used_by_hostIiET_S0_(
61+
// HOST: ret i32 11
62+
template<typename T>
63+
T template_indirectly_used_by_host(T x) {
64+
return 11;
65+
}
66+
67+
template<typename T>
68+
T template_in_middle_by_host(T x) {
69+
template_indirectly_used_by_host(x);
70+
return 12;
71+
}
72+
73+
// Implicit host device template indirectly used by device function only.
74+
// Emitted on device.
75+
// DEVICE-LABEL: define {{.*}}@_Z34template_indirectly_used_by_deviceIiET_S0_(
76+
// DEVICE: ret i32 21
77+
template<typename T>
78+
T template_indirectly_used_by_device(T x) {
79+
return 21;
80+
}
81+
82+
template<typename T>
83+
T template_in_middle_by_device(T x) {
84+
template_indirectly_used_by_device(x);
85+
return 22;
86+
}
87+
88+
// Implicit host device template indirectly used by host device function only.
89+
// Emitted on host and device.
90+
// COMMON-LABEL: define {{.*}}@_Z39template_indirectly_used_by_host_deviceIiET_S0_(
91+
// COMMON: ret i32 31
92+
template<typename T>
93+
T template_indirectly_used_by_host_device(T x) {
94+
return 31;
95+
}
96+
97+
template<typename T>
98+
T template_in_middle_by_host_device(T x) {
99+
template_indirectly_used_by_host_device(x);
100+
return 32;
101+
}
102+
103+
void host_fun() {
104+
template_no_overload(0);
105+
template_with_overload(0);
106+
template_used_by_host(0);
107+
template_in_middle_by_host(0);
108+
}
109+
110+
__device__ void device_fun() {
111+
template_no_overload(0);
112+
template_with_overload(0);
113+
template_in_middle_by_device(0);
114+
}
115+
116+
__host__ __device__ void host_device_fun() {
117+
template_in_middle_by_host_device(0);
118+
}

clang/test/Lexer/has_extension.cu

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,13 @@
1+
// RUN: %clang_cc1 -E -triple x86_64-linux-gnu %s -o - \
2+
// RUN: | FileCheck -check-prefix=NOHDT %s
3+
// RUN: %clang_cc1 -E -triple x86_64-linux-gnu %s -o - \
4+
// RUN: -foffload-implicit-host-device-templates \
5+
// RUN: | FileCheck -check-prefix=HDT %s
6+
7+
// NOHDT: no_implicit_host_device_templates
8+
// HDT: has_implicit_host_device_templates
9+
#if __has_extension(cuda_implicit_host_device_templates)
10+
int has_implicit_host_device_templates();
11+
#else
12+
int no_implicit_host_device_templates();
13+
#endif
Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,22 @@
1+
// RUN: %clang_cc1 -isystem %S/Inputs -fsyntax-only %s
2+
// RUN: %clang_cc1 -isystem %S/Inputs -fcuda-is-device -fsyntax-only %s
3+
// RUN: %clang_cc1 -isystem %S/Inputs -foffload-implicit-host-device-templates -fsyntax-only %s
4+
// RUN: %clang_cc1 -isystem %S/Inputs -foffload-implicit-host-device-templates -fcuda-is-device -fsyntax-only %s
5+
6+
#include <cuda.h>
7+
8+
template<typename T>
9+
void tempf(T x) {
10+
}
11+
12+
template<typename T>
13+
__device__ void tempf(T x) {
14+
}
15+
16+
void host_fun() {
17+
tempf(1);
18+
}
19+
20+
__device__ void device_fun() {
21+
tempf(1);
22+
}

0 commit comments

Comments
 (0)