Skip to content

Commit d5e9691

Browse files
authored
[NVPTX] Add elect.sync Intrinsic (#104780)
This patch adds an NVVM intrinsic and NVPTX codegen for the elect.sync PTX instruction. Lit tests are added in elect.ll and verified through ptxas. PTX ISA reference: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-elect-sync Signed-off-by: Durgadoss R <[email protected]>
1 parent 55d744e commit d5e9691

File tree

4 files changed

+109
-1
lines changed

4 files changed

+109
-1
lines changed

llvm/docs/NVPTXUsage.rst

Lines changed: 27 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -251,10 +251,36 @@ Overview:
251251
The '``@llvm.nvvm.barrier0()``' intrinsic emits a PTX ``bar.sync 0``
252252
instruction, equivalent to the ``__syncthreads()`` call in CUDA.
253253

254+
Electing a thread
255+
-----------------
256+
257+
'``llvm.nvvm.elect.sync``'
258+
^^^^^^^^^^^^^^^^^^^^^^^^^^
259+
260+
Syntax:
261+
"""""""
262+
263+
.. code-block:: llvm
264+
265+
declare {i32, i1} @llvm.nvvm.elect.sync(i32 %membermask)
266+
267+
Overview:
268+
"""""""""
269+
270+
The '``@llvm.nvvm.elect.sync``' intrinsic generates the ``elect.sync``
271+
PTX instruction, which elects one predicated active leader thread from
272+
a set of threads specified by ``membermask``. The behavior is undefined
273+
if the executing thread is not in ``membermask``. The laneid of the
274+
elected thread is captured in the i32 return value. The i1 return
275+
value is set to ``True`` for the leader thread and ``False`` for all
276+
the other threads. Election of a leader thread happens deterministically,
277+
i.e. the same leader thread is elected for the same ``membermask``
278+
every time. For more information, refer to the PTX ISA
279+
`<https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-elect-sync>`_.
280+
254281
Membar/Fences
255282
-------------
256283

257-
258284
'``llvm.nvvm.fence.proxy.tensormap_generic.*``'
259285
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
260286

llvm/include/llvm/IR/IntrinsicsNVVM.td

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4706,6 +4706,14 @@ def int_nvvm_match_all_sync_i64p :
47064706
Intrinsic<[llvm_i32_ty, llvm_i1_ty], [llvm_i32_ty, llvm_i64_ty],
47074707
[IntrInaccessibleMemOnly, IntrConvergent, IntrNoCallback], "llvm.nvvm.match.all.sync.i64p">;
47084708

4709+
//
4710+
// ELECT.SYNC
4711+
//
4712+
// elect.sync dst|pred, membermask  (dst: i32 laneid of the elected leader; pred: i1, true only in the leader thread)
4713+
def int_nvvm_elect_sync :
4714+
DefaultAttrsIntrinsic<[llvm_i32_ty, llvm_i1_ty], [llvm_i32_ty],
4715+
[IntrInaccessibleMemOnly, IntrConvergent]>;
4716+
47094717
//
47104718
// REDUX.SYNC
47114719
//

llvm/lib/Target/NVPTX/NVPTXIntrinsics.td

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -243,6 +243,16 @@ defm VOTE_SYNC_ANY : VOTE_SYNC<Int1Regs, "any.pred", int_nvvm_vote_any_sync>;
243243
defm VOTE_SYNC_UNI : VOTE_SYNC<Int1Regs, "uni.pred", int_nvvm_vote_uni_sync>;
244244
defm VOTE_SYNC_BALLOT : VOTE_SYNC<Int32Regs, "ballot.b32", int_nvvm_vote_ballot_sync>;
245245

246+
// elect.sync: _I matches an immediate membermask, _R a register membermask; both are gated on hasPTX<80> and hasSM<90>.
247+
def INT_ELECT_SYNC_I : NVPTXInst<(outs Int32Regs:$dest, Int1Regs:$pred), (ins i32imm:$mask),
248+
"elect.sync \t$dest|$pred, $mask;",
249+
[(set Int32Regs:$dest, Int1Regs:$pred, (int_nvvm_elect_sync imm:$mask))]>,
250+
Requires<[hasPTX<80>, hasSM<90>]>;
251+
def INT_ELECT_SYNC_R : NVPTXInst<(outs Int32Regs:$dest, Int1Regs:$pred), (ins Int32Regs:$mask),
252+
"elect.sync \t$dest|$pred, $mask;",
253+
[(set Int32Regs:$dest, Int1Regs:$pred, (int_nvvm_elect_sync Int32Regs:$mask))]>,
254+
Requires<[hasPTX<80>, hasSM<90>]>;
255+
246256
multiclass MATCH_ANY_SYNC<NVPTXRegClass regclass, string ptxtype, Intrinsic IntOp,
247257
Operand ImmOp> {
248258
def ii : NVPTXInst<(outs Int32Regs:$dest), (ins i32imm:$mask, ImmOp:$value),

llvm/test/CodeGen/NVPTX/elect.ll

Lines changed: 64 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,64 @@
1+
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
2+
; RUN: llc < %s -march=nvptx64 -mcpu=sm_90 -mattr=+ptx80 | FileCheck %s
3+
; RUN: %if ptxas-12.0 %{ llc < %s -march=nvptx64 -mcpu=sm_90 -mattr=+ptx80 | %ptxas-verify -arch=sm_90 %}
4+
5+
target triple = "nvptx64-nvidia-cuda"
6+
7+
declare {i32, i1} @llvm.nvvm.elect.sync(i32)
8+
9+
define {i32, i1} @elect_sync(i32 %mask) {
10+
; CHECK-LABEL: elect_sync(
11+
; CHECK: {
12+
; CHECK-NEXT: .reg .pred %p<2>;
13+
; CHECK-NEXT: .reg .b16 %rs<2>;
14+
; CHECK-NEXT: .reg .b32 %r<3>;
15+
; CHECK-EMPTY:
16+
; CHECK-NEXT: // %bb.0:
17+
; CHECK-NEXT: ld.param.u32 %r1, [elect_sync_param_0];
18+
; CHECK-NEXT: elect.sync %r2|%p1, %r1;
19+
; CHECK-NEXT: st.param.b32 [func_retval0+0], %r2;
20+
; CHECK-NEXT: selp.u16 %rs1, -1, 0, %p1;
21+
; CHECK-NEXT: st.param.b8 [func_retval0+4], %rs1;
22+
; CHECK-NEXT: ret;
; The membermask is a function argument, so the checks above expect the
; register-operand form of elect.sync.
23+
%val = call {i32, i1} @llvm.nvvm.elect.sync(i32 %mask)
24+
ret {i32, i1} %val
25+
}
26+
27+
define {i32, i1} @elect_sync_imm() {
28+
; CHECK-LABEL: elect_sync_imm(
29+
; CHECK: {
30+
; CHECK-NEXT: .reg .pred %p<2>;
31+
; CHECK-NEXT: .reg .b16 %rs<2>;
32+
; CHECK-NEXT: .reg .b32 %r<2>;
33+
; CHECK-EMPTY:
34+
; CHECK-NEXT: // %bb.0:
35+
; CHECK-NEXT: elect.sync %r1|%p1, -1;
36+
; CHECK-NEXT: st.param.b32 [func_retval0+0], %r1;
37+
; CHECK-NEXT: selp.u16 %rs1, -1, 0, %p1;
38+
; CHECK-NEXT: st.param.b8 [func_retval0+4], %rs1;
39+
; CHECK-NEXT: ret;
; The membermask is the constant u0xffffffff; the checks above expect it to be
; emitted directly as the immediate operand -1, with no register load.
40+
%val = call {i32, i1} @llvm.nvvm.elect.sync(i32 u0xffffffff)
41+
ret {i32, i1} %val
42+
}
43+
44+
; When there are two identical elect.sync calls, make sure that
45+
; the second one is not optimized away, even though its result (%val2) is unused.
46+
define {i32, i1} @elect_sync_twice(i32 %mask) {
47+
; CHECK-LABEL: elect_sync_twice(
48+
; CHECK: {
49+
; CHECK-NEXT: .reg .pred %p<3>;
50+
; CHECK-NEXT: .reg .b16 %rs<2>;
51+
; CHECK-NEXT: .reg .b32 %r<4>;
52+
; CHECK-EMPTY:
53+
; CHECK-NEXT: // %bb.0:
54+
; CHECK-NEXT: ld.param.u32 %r1, [elect_sync_twice_param_0];
55+
; CHECK-NEXT: elect.sync %r2|%p1, %r1;
56+
; CHECK-NEXT: elect.sync %r3|%p2, %r1;
57+
; CHECK-NEXT: st.param.b32 [func_retval0+0], %r2;
58+
; CHECK-NEXT: selp.u16 %rs1, -1, 0, %p1;
59+
; CHECK-NEXT: st.param.b8 [func_retval0+4], %rs1;
60+
; CHECK-NEXT: ret;
61+
%val = call {i32, i1} @llvm.nvvm.elect.sync(i32 %mask)
62+
%val2 = call {i32, i1} @llvm.nvvm.elect.sync(i32 %mask)
63+
ret {i32, i1} %val
64+
}

0 commit comments

Comments
 (0)