Skip to content

Commit f7f17a7

Browse files
author
Jorge Aparicio
committed
initial support for PTX generation
this PR adds two targets: - `nvptx-unknown-unknown` (32-bit machine model) - `nvptx64-unknown-unknown` (64-bit machine model) that can be used to generate PTX code from Rust source code: ``` $ rustc --target nvptx64-unknown-unknown --emit=asm foo.rs $ head foo.s // // Generated by LLVM NVPTX Back-End // .version 3.2 .target sm_20 .address_size 64 (..) ``` this PR also adds new intrinsics that are equivalent to the following CUDA variables/functions: - `threadIdx.{x,y,z}` - `blockIdx.{x,y,z}` - `blockDim.{x,y,z}` - `gridDim.{x,y,z}` - `__syncthreads` this PR has been tested by writing a kernel that `memcpy`s a chunk of memory to other: ``` rust #![no_core] #[no_mangle] pub fn memcpy_(src: *const f32, dst: *mut f32, n: isize) { unsafe { let i = overflowing_add(overflowing_mul(block_idx_x(), block_dim_x()), thread_idx_x()) as isize; if i < n { *(offset(dst, i) as *mut f32) = *offset(src, i) } } } // undeclared functions are intrinsics // omitted: lang items ``` which translates to: ``` ptx // // Generated by LLVM NVPTX Back-End // .version 3.2 .target sm_20 .address_size 64 // .globl memcpy_ .visible .func memcpy_( .param .b64 memcpy__param_0, .param .b64 memcpy__param_1, .param .b64 memcpy__param_2 ) { .reg .pred %p<2>; .reg .s32 %r<6>; .reg .s64 %rd<8>; mov.u32 %r1, %ctaid.x; ld.param.u64 %rd5, [memcpy__param_2]; mov.u32 %r2, %ntid.x; mov.u32 %r3, %tid.x; mad.lo.s32 %r4, %r2, %r1, %r3; cvt.s64.s32 %rd6, %r4; setp.ge.s64 %p1, %rd6, %rd5; @%p1 bra LBB0_2; ld.param.u64 %rd3, [memcpy__param_0]; ld.param.u64 %rd4, [memcpy__param_1]; mul.wide.s32 %rd7, %r4, 4; add.s64 %rd1, %rd3, %rd7; add.s64 %rd2, %rd4, %rd7; ld.u32 %r5, [%rd1]; st.u32 [%rd2], %r5; LBB0_2: ret; } ``` however, this PTX code can't be directly used in a CUDA program because the `memcpy_` function is marked as a "device function" (`.func memcpy_`). Device functions can only be called from other GPU code. To be usable from a CUDA program `memcpy_` should be marked as a "kernel function" (`.entry memcpy_`): ``` diff // .globl memcpy_ -.visible .entry memcpy_( +.visible .func memcpy_( .param .b64 memcpy__param_0, .param .b64 memcpy__param_1, .param .b64 memcpy__param_2 ``` After patching the generated PTX code the kernel became callable from a CUDA program. ### unresolved questions - we need to provide a way to differentiate functions that will be translated to "kernel functions" from the ones that will be translated to "device functions". CUDA uses the `__global__` and `__device__` attributes for this. - we need to provide a way to let the user choose on which memory region [2] variables should be placed. CUDA exposes the `__shared__` and `__constant__` attributes for this. ### FIXMEs - pointer arguments in kernel and device functions should be marked with the `addrspace(1)` attribute in LLVM IR. - compiling a rlib produces an empty archive (no PTX in it) [1]: http://llvm.org/docs/NVPTXUsage.html#kernel-metadata [2]: http://llvm.org/docs/NVPTXUsage.html#id10
1 parent bb4a79b commit f7f17a7

File tree

10 files changed

+135
-3
lines changed

10 files changed

+135
-3
lines changed

src/bootstrap/build/native.rs

+1-1
Original file line numberDiff line numberDiff line change
@@ -63,7 +63,7 @@ pub fn llvm(build: &Build, target: &str) {
6363
.out_dir(&dst)
6464
.profile(if build.config.llvm_optimize {"Release"} else {"Debug"})
6565
.define("LLVM_ENABLE_ASSERTIONS", assertions)
66-
.define("LLVM_TARGETS_TO_BUILD", "X86;ARM;AArch64;Mips;PowerPC")
66+
.define("LLVM_TARGETS_TO_BUILD", "X86;ARM;AArch64;Mips;PowerPC;NVPTX")
6767
.define("LLVM_INCLUDE_EXAMPLES", "OFF")
6868
.define("LLVM_INCLUDE_TESTS", "OFF")
6969
.define("LLVM_INCLUDE_DOCS", "OFF")

src/libcore/intrinsics.rs

+18
Original file line numberDiff line numberDiff line change
@@ -602,4 +602,22 @@ extern "rust-intrinsic" {
602602
/// on MSVC it's `*mut [usize; 2]`. For more information see the compiler's
603603
/// source as well as std's catch implementation.
604604
pub fn try(f: fn(*mut u8), data: *mut u8, local_ptr: *mut u8) -> i32;
605+
606+
}
607+
608+
#[cfg(not(stage0))]
609+
extern "rust-intrinsic" {
610+
pub fn thread_idx_x() -> i32;
611+
pub fn thread_idx_y() -> i32;
612+
pub fn thread_idx_z() -> i32;
613+
pub fn block_idx_x() -> i32;
614+
pub fn block_idx_y() -> i32;
615+
pub fn block_idx_z() -> i32;
616+
pub fn block_dim_x() -> i32;
617+
pub fn block_dim_y() -> i32;
618+
pub fn block_dim_z() -> i32;
619+
pub fn grid_dim_x() -> i32;
620+
pub fn grid_dim_y() -> i32;
621+
pub fn grid_dim_z() -> i32;
622+
pub fn syncthreads();
605623
}

src/librustc_back/target/mod.rs

+3-1
Original file line numberDiff line numberDiff line change
@@ -141,7 +141,9 @@ supported_targets! {
141141
("i586-pc-windows-msvc", i586_pc_windows_msvc),
142142

143143
("le32-unknown-nacl", le32_unknown_nacl),
144-
("asmjs-unknown-emscripten", asmjs_unknown_emscripten)
144+
("asmjs-unknown-emscripten", asmjs_unknown_emscripten),
145+
("nvptx-unknown-unknown", nvptx_unknown_unknown),
146+
("nvptx64-unknown-unknown", nvptx64_unknown_unknown)
145147
}
146148

147149
/// Everything `rustc` knows about how to compile for a specific target.
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,36 @@
1+
// Copyright 2015 The Rust Project Developers. See the COPYRIGHT
2+
// file at the top-level directory of this distribution and at
3+
// http://rust-lang.org/COPYRIGHT.
4+
//
5+
// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
6+
// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
7+
// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
8+
// option. This file may not be copied, modified, or distributed
9+
// except according to those terms.
10+
11+
use super::{Target, TargetOptions};
12+
13+
pub fn target() -> Target {
14+
let opts = TargetOptions {
15+
linker: "".to_string(),
16+
ar: "".to_string(),
17+
18+
cpu: "sm_20".to_string(),
19+
dynamic_linking: false,
20+
executables: false,
21+
no_compiler_rt: true,
22+
allow_asm: false,
23+
.. Default::default()
24+
};
25+
Target {
26+
llvm_target: "nvptx64-unknown-unknown".to_string(),
27+
target_endian: "little".to_string(),
28+
target_pointer_width: "64".to_string(),
29+
target_os: "none".to_string(),
30+
target_env: "".to_string(),
31+
target_vendor: "unknown".to_string(),
32+
data_layout: "e-i64:64-v16:16-v32:32-n16:32:64".to_string(),
33+
arch: "nvptx".to_string(),
34+
options: opts,
35+
}
36+
}
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,36 @@
1+
// Copyright 2015 The Rust Project Developers. See the COPYRIGHT
2+
// file at the top-level directory of this distribution and at
3+
// http://rust-lang.org/COPYRIGHT.
4+
//
5+
// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
6+
// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
7+
// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
8+
// option. This file may not be copied, modified, or distributed
9+
// except according to those terms.
10+
11+
use super::{Target, TargetOptions};
12+
13+
pub fn target() -> Target {
14+
let opts = TargetOptions {
15+
linker: "".to_string(),
16+
ar: "".to_string(),
17+
18+
cpu: "sm_20".to_string(),
19+
dynamic_linking: false,
20+
executables: false,
21+
no_compiler_rt: true,
22+
allow_asm: false,
23+
.. Default::default()
24+
};
25+
Target {
26+
llvm_target: "nvptx-unknown-unknown".to_string(),
27+
target_endian: "little".to_string(),
28+
target_pointer_width: "32".to_string(),
29+
target_os: "none".to_string(),
30+
target_env: "".to_string(),
31+
target_vendor: "unknown".to_string(),
32+
data_layout: "e-p:32:32-i64:64-v16:16-v32:32-n16:32:64".to_string(),
33+
arch: "nvptx".to_string(),
34+
options: opts,
35+
}
36+
}

src/librustc_llvm/build.rs

+1-1
Original file line numberDiff line numberDiff line change
@@ -68,7 +68,7 @@ fn main() {
6868
let host = env::var("HOST").unwrap();
6969
let is_crossed = target != host;
7070

71-
let optional_components = ["x86", "arm", "aarch64", "mips", "powerpc", "pnacl"];
71+
let optional_components = ["x86", "arm", "aarch64", "mips", "powerpc", "pnacl", "nvptx"];
7272

7373
// FIXME: surely we don't need all these components, right? Stuff like mcjit
7474
// or interpreter the compiler itself never uses.

src/librustc_llvm/lib.rs

+5
Original file line numberDiff line numberDiff line change
@@ -2407,6 +2407,11 @@ pub fn initialize_available_targets() {
24072407
LLVMInitializeMipsTargetMC,
24082408
LLVMInitializeMipsAsmPrinter,
24092409
LLVMInitializeMipsAsmParser);
2410+
init_target!(llvm_component = "nvptx",
2411+
LLVMInitializeNVPTXTargetInfo,
2412+
LLVMInitializeNVPTXTarget,
2413+
LLVMInitializeNVPTXTargetMC,
2414+
LLVMInitializeNVPTXAsmPrinter);
24102415
init_target!(llvm_component = "powerpc",
24112416
LLVMInitializePowerPCTargetInfo,
24122417
LLVMInitializePowerPCTarget,

src/librustc_trans/context.rs

+13
Original file line numberDiff line numberDiff line change
@@ -1097,6 +1097,19 @@ fn declare_intrinsic(ccx: &CrateContext, key: &str) -> Option<ValueRef> {
10971097
ifn!("llvm.localrecover", fn(i8p, i8p, t_i32) -> i8p);
10981098
ifn!("llvm.x86.seh.recoverfp", fn(i8p, i8p) -> i8p);
10991099

1100+
ifn!("llvm.cuda.syncthreads", fn() -> void);
1101+
ifn!("llvm.nvvm.read.ptx.sreg.tid.x", fn() -> t_i32);
1102+
ifn!("llvm.nvvm.read.ptx.sreg.tid.y", fn() -> t_i32);
1103+
ifn!("llvm.nvvm.read.ptx.sreg.tid.z", fn() -> t_i32);
1104+
ifn!("llvm.nvvm.read.ptx.sreg.ctaid.x", fn() -> t_i32);
1105+
ifn!("llvm.nvvm.read.ptx.sreg.ctaid.y", fn() -> t_i32);
1106+
ifn!("llvm.nvvm.read.ptx.sreg.ctaid.z", fn() -> t_i32);
1107+
ifn!("llvm.nvvm.read.ptx.sreg.ntid.x", fn() -> t_i32);
1108+
ifn!("llvm.nvvm.read.ptx.sreg.ntid.y", fn() -> t_i32);
1109+
ifn!("llvm.nvvm.read.ptx.sreg.ntid.z", fn() -> t_i32);
1110+
ifn!("llvm.nvvm.read.ptx.sreg.nctaid.x", fn() -> t_i32);
1111+
ifn!("llvm.nvvm.read.ptx.sreg.nctaid.y", fn() -> t_i32);
1112+
ifn!("llvm.nvvm.read.ptx.sreg.nctaid.z", fn() -> t_i32);
11001113
ifn!("llvm.assume", fn(i1) -> void);
11011114

11021115
if ccx.sess().opts.debuginfo != NoDebugInfo {

src/librustc_trans/intrinsic.rs

+13
Original file line numberDiff line numberDiff line change
@@ -89,6 +89,19 @@ fn get_simple_intrinsic(ccx: &CrateContext, name: &str) -> Option<ValueRef> {
8989
"roundf32" => "llvm.round.f32",
9090
"roundf64" => "llvm.round.f64",
9191
"assume" => "llvm.assume",
92+
"thread_idx_x" => "llvm.nvvm.read.ptx.sreg.tid.x",
93+
"thread_idx_y" => "llvm.nvvm.read.ptx.sreg.tid.y",
94+
"thread_idx_z" => "llvm.nvvm.read.ptx.sreg.tid.z",
95+
"block_idx_x" => "llvm.nvvm.read.ptx.sreg.ctaid.x",
96+
"block_idx_y" => "llvm.nvvm.read.ptx.sreg.ctaid.y",
97+
"block_idx_z" => "llvm.nvvm.read.ptx.sreg.ctaid.z",
98+
"block_dim_x" => "llvm.nvvm.read.ptx.sreg.ntid.x",
99+
"block_dim_y" => "llvm.nvvm.read.ptx.sreg.ntid.y",
100+
"block_dim_z" => "llvm.nvvm.read.ptx.sreg.ntid.z",
101+
"grid_dim_x" => "llvm.nvvm.read.ptx.sreg.nctaid.x",
102+
"grid_dim_y" => "llvm.nvvm.read.ptx.sreg.nctaid.y",
103+
"grid_dim_z" => "llvm.nvvm.read.ptx.sreg.nctaid.z",
104+
"syncthreads" => "llvm.cuda.syncthreads",
92105
_ => return None
93106
};
94107
Some(ccx.get_intrinsic(&llvm_name))

src/librustc_typeck/check/intrinsic.rs

+9
Original file line numberDiff line numberDiff line change
@@ -298,6 +298,15 @@ pub fn check_intrinsic_type(ccx: &CrateCtxt, it: &hir::ForeignItem) {
298298
(0, vec![tcx.mk_fn_ptr(fn_ty), mut_u8, mut_u8], tcx.types.i32)
299299
}
300300

301+
"thread_idx_x" | "thread_idx_y" | "thread_idx_z" |
302+
"block_idx_x" | "block_idx_y" | "block_idx_z" |
303+
"block_dim_x" | "block_dim_y" | "block_dim_z" |
304+
"grid_dim_x" | "grid_dim_y" | "grid_dim_z" => {
305+
(0, vec![], tcx.types.i32)
306+
}
307+
308+
"syncthreads" => (0, vec![], tcx.mk_nil()),
309+
301310
ref other => {
302311
span_err!(tcx.sess, it.span, E0093,
303312
"unrecognized intrinsic function: `{}`", *other);

0 commit comments

Comments
 (0)