Skip to content

Commit dde3dc2

Browse files
committed
[CUDA] Added --[no-]cuda-include-ptx=sm_XX|all option.
Currently we always include PTX into the fatbin along with the GPU code. It about doubles the size of the GPU binary we need to carry in the executable. These options allow controlling the inclusion of PTX into the GPU binary. This patch does not change the defaults, though we may consider making no-PTX the default in the future. Differential Revision: https://reviews.llvm.org/D45495 llvm-svn: 329737
1 parent 5da361a commit dde3dc2

File tree

4 files changed

+78
-0
lines changed

4 files changed

+78
-0
lines changed

clang/docs/ClangCommandLineReference.rst

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -144,6 +144,10 @@ Compile CUDA code for device only
144144

145145
CUDA GPU architecture (e.g. sm\_35). May be specified more than once.
146146

147+
.. option:: --cuda-include-ptx=<arg>, --no-cuda-include-ptx=<arg>
148+
149+
Include (or not) PTX along with CUDA GPU binary for the given architecture (e.g. sm\_35). Argument may be 'all'. The option may be specified more than once. Default: --cuda-include-ptx=all
150+
147151
.. option:: --cuda-host-only
148152

149153
Compile CUDA code for host only. Has no effect on non-CUDA compilations.

clang/include/clang/Driver/Options.td

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -546,6 +546,10 @@ def cuda_host_only : Flag<["--"], "cuda-host-only">,
546546
def cuda_compile_host_device : Flag<["--"], "cuda-compile-host-device">,
547547
HelpText<"Compile CUDA code for both host and device (default). Has no "
548548
"effect on non-CUDA compilations.">;
549+
def cuda_include_ptx_EQ : Joined<["--"], "cuda-include-ptx=">, Flags<[DriverOption]>,
550+
HelpText<"Include PTX for the following GPU architecture (e.g. sm_35) or 'all'. May be specified more than once.">;
551+
def no_cuda_include_ptx_EQ : Joined<["--"], "no-cuda-include-ptx=">, Flags<[DriverOption]>,
552+
HelpText<"Do not include PTX for the following GPU architecture (e.g. sm_35) or 'all'. May be specified more than once.">;
549553
def cuda_gpu_arch_EQ : Joined<["--"], "cuda-gpu-arch=">, Flags<[DriverOption]>,
550554
HelpText<"CUDA GPU architecture (e.g. sm_35). May be specified more than once.">;
551555
def no_cuda_gpu_arch_EQ : Joined<["--"], "no-cuda-gpu-arch=">, Flags<[DriverOption]>,

clang/lib/Driver/ToolChains/Cuda.cpp

Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -377,6 +377,22 @@ void NVPTX::Assembler::ConstructJob(Compilation &C, const JobAction &JA,
377377
C.addCommand(llvm::make_unique<Command>(JA, *this, Exec, CmdArgs, Inputs));
378378
}
379379

380+
// Decides whether PTX for the GPU architecture named by `gpu_arch` should be
// included, based on the --cuda-include-ptx= and --no-cuda-include-ptx=
// options found in `Args`.
//
// The options are visited in the order `Args` yields them. Each option whose
// value is "all" or equals `gpu_arch` overwrites the current decision, so the
// last applicable option wins. If no option applies, the result is true.
//
// Every --[no-]cuda-include-ptx= argument encountered is claimed, including
// those whose value names a different architecture.
static bool shouldIncludePTX(const ArgList &Args, const char *gpu_arch) {
381+
// Default: include PTX unless an applicable option says otherwise.
bool includePTX = true;
382+
for (Arg *A : Args) {
383+
// Ignore everything except the two PTX-inclusion options.
if (!(A->getOption().matches(options::OPT_cuda_include_ptx_EQ) ||
384+
A->getOption().matches(options::OPT_no_cuda_include_ptx_EQ)))
385+
continue;
386+
// Claimed before the architecture check, so non-matching values are claimed too.
A->claim();
387+
const StringRef ArchStr = A->getValue();
388+
if (ArchStr == "all" || ArchStr == gpu_arch) {
389+
// True for --cuda-include-ptx=, false for --no-cuda-include-ptx=.
includePTX = A->getOption().matches(options::OPT_cuda_include_ptx_EQ);
390+
// No effect on control flow: this is already the last statement of the loop body.
continue;
391+
}
392+
}
393+
return includePTX;
394+
}
395+
380396
// All inputs to this linker must be from CudaDeviceActions, as we need to look
381397
// at the Inputs' Actions in order to figure out which GPU architecture they
382398
// correspond to.
@@ -404,6 +420,9 @@ void NVPTX::Linker::ConstructJob(Compilation &C, const JobAction &JA,
404420
"Device action expected to have associated a GPU architecture!");
405421
CudaArch gpu_arch = StringToCudaArch(gpu_arch_str);
406422

423+
if (II.getType() == types::TY_PP_Asm &&
424+
!shouldIncludePTX(Args, gpu_arch_str))
425+
continue;
407426
// We need to pass an Arch of the form "sm_XX" for cubin files and
408427
// "compute_XX" for ptx.
409428
const char *Arch =

clang/test/Driver/cuda-options.cu

Lines changed: 51 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -142,6 +142,48 @@
142142
// RUN: -c %s 2>&1 \
143143
// RUN: | FileCheck -check-prefix ARCHALLERROR %s
144144

145+
146+
// Verify that --[no-]cuda-include-ptx arguments are handled correctly.
147+
// a) by default we're including PTX for all GPUs.
148+
// RUN: %clang -### -target x86_64-linux-gnu \
149+
// RUN: --cuda-gpu-arch=sm_35 --cuda-gpu-arch=sm_30 \
150+
// RUN: -c %s 2>&1 \
151+
// RUN: | FileCheck -check-prefixes FATBIN-COMMON,PTX-SM35,PTX-SM30 %s
152+
153+
// b) --no-cuda-include-ptx=all disables PTX inclusion for all GPUs
154+
// RUN: %clang -### -target x86_64-linux-gnu \
155+
// RUN: --cuda-gpu-arch=sm_35 --cuda-gpu-arch=sm_30 \
156+
// RUN: --no-cuda-include-ptx=all \
157+
// RUN: -c %s 2>&1 \
158+
// RUN: | FileCheck -check-prefixes FATBIN-COMMON,NOPTX-SM35,NOPTX-SM30 %s
159+
160+
// c) --no-cuda-include-ptx=sm_XX disables PTX inclusion for that GPU only.
161+
// RUN: %clang -### -target x86_64-linux-gnu \
162+
// RUN: --cuda-gpu-arch=sm_35 --cuda-gpu-arch=sm_30 \
163+
// RUN: --no-cuda-include-ptx=sm_35 \
164+
// RUN: -c %s 2>&1 \
165+
// RUN: | FileCheck -check-prefixes FATBIN-COMMON,NOPTX-SM35,PTX-SM30 %s
166+
// RUN: %clang -### -target x86_64-linux-gnu \
167+
// RUN: --cuda-gpu-arch=sm_35 --cuda-gpu-arch=sm_30 \
168+
// RUN: --no-cuda-include-ptx=sm_30 \
169+
// RUN: -c %s 2>&1 \
170+
// RUN: | FileCheck -check-prefixes FATBIN-COMMON,PTX-SM35,NOPTX-SM30 %s
171+
172+
// d) --cuda-include-ptx=all overrides preceding --no-cuda-include-ptx=all
173+
// RUN: %clang -### -target x86_64-linux-gnu \
174+
// RUN: --cuda-gpu-arch=sm_35 --cuda-gpu-arch=sm_30 \
175+
// RUN: --no-cuda-include-ptx=all --cuda-include-ptx=all \
176+
// RUN: -c %s 2>&1 \
177+
// RUN: | FileCheck -check-prefixes FATBIN-COMMON,PTX-SM35,PTX-SM30 %s
178+
179+
// e) --cuda-include-ptx=all overrides preceding --no-cuda-include-ptx=sm_XX
180+
// RUN: %clang -### -target x86_64-linux-gnu \
181+
// RUN: --cuda-gpu-arch=sm_35 --cuda-gpu-arch=sm_30 \
182+
// RUN: --no-cuda-include-ptx=sm_30 --cuda-include-ptx=all \
183+
// RUN: -c %s 2>&1 \
184+
// RUN: | FileCheck -check-prefixes FATBIN-COMMON,PTX-SM35,PTX-SM30 %s
185+
186+
145187
// ARCH-SM20: "-cc1"{{.*}}"-target-cpu" "sm_20"
146188
// NOARCH-SM20-NOT: "-cc1"{{.*}}"-target-cpu" "sm_20"
147189
// ARCH-SM30: "-cc1"{{.*}}"-target-cpu" "sm_30"
@@ -236,3 +278,12 @@
236278

237279
// Match no linker.
238280
// NOLINK-NOT: "{{.*}}{{ld|link}}{{(.exe)?}}"
281+
282+
// FATBIN-COMMON:fatbinary
283+
// FATBIN-COMMON: "--create" "[[FATBINARY:[^"]*]]"
284+
// FATBIN-COMMON: "--image=profile=sm_30,file=
285+
// PTX-SM30: "--image=profile=compute_30,file=
286+
// NOPTX-SM30-NOT: "--image=profile=compute_30,file=
287+
// FATBIN-COMMON: "--image=profile=sm_35,file=
288+
// PTX-SM35: "--image=profile=compute_35,file=
289+
// NOPTX-SM35-NOT: "--image=profile=compute_35,file=

0 commit comments

Comments
 (0)