Skip to content

Commit f13f2f6

Browse files
committed
[NVPTX] Support copysign PTX instruction (llvm#107800)
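Lower `fcopysign` SDNodes into `copysign` PTX instructions where possible. See [PTX ISA: 9.7.3.2. Floating Point Instructions: copysign](https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#floating-point-instructions-copysign). Note that PTX `copysign d, a, b` copies the sign of `a` onto the magnitude of `b`, whereas `fcopysign(x, y)` takes the magnitude of `x` and the sign of `y`, so the selection patterns swap the two operands.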
1 parent 7a91af4 commit f13f2f6

File tree

4 files changed

+61
-15
lines changed

4 files changed

+61
-15
lines changed

llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -838,8 +838,8 @@ NVPTXTargetLowering::NVPTXTargetLowering(const NVPTXTargetMachine &TM,
838838
setOperationAction(ISD::FCOPYSIGN, MVT::v2f16, Expand);
839839
setOperationAction(ISD::FCOPYSIGN, MVT::bf16, Expand);
840840
setOperationAction(ISD::FCOPYSIGN, MVT::v2bf16, Expand);
841-
setOperationAction(ISD::FCOPYSIGN, MVT::f32, Expand);
842-
setOperationAction(ISD::FCOPYSIGN, MVT::f64, Expand);
841+
setOperationAction(ISD::FCOPYSIGN, MVT::f32, Legal);
842+
setOperationAction(ISD::FCOPYSIGN, MVT::f64, Legal);
843843

844844
// These map to corresponding instructions for f32/f64. f16 must be
845845
// promoted to f32. v2f16 is expanded to f16, which is then promoted

llvm/lib/Target/NVPTX/NVPTXIntrinsics.td

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -977,6 +977,20 @@ def INT_NVVM_FABS_F : F_MATH_1<"abs.f32 \t$dst, $src0;", Float32Regs,
977977
def INT_NVVM_FABS_D : F_MATH_1<"abs.f64 \t$dst, $src0;", Float64Regs,
978978
Float64Regs, int_nvvm_fabs_d>;
979979

980+
//
981+
// copysign
982+
//
983+
984+
def COPYSIGN_F :
985+
NVPTXInst<(outs Float32Regs:$dst), (ins Float32Regs:$src0, Float32Regs:$src1),
986+
"copysign.f32 \t$dst, $src0, $src1;",
987+
[(set Float32Regs:$dst, (fcopysign Float32Regs:$src1, Float32Regs:$src0))]>;
988+
989+
def COPYSIGN_D :
990+
NVPTXInst<(outs Float64Regs:$dst), (ins Float64Regs:$src0, Float64Regs:$src1),
991+
"copysign.f64 \t$dst, $src0, $src1;",
992+
[(set Float64Regs:$dst, (fcopysign Float64Regs:$src1, Float64Regs:$src0))]>;
993+
980994
//
981995
// Abs, Neg bf16, bf16x2
982996
//

llvm/test/CodeGen/NVPTX/copysign.ll

Lines changed: 39 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,39 @@
1+
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
2+
; RUN: llc < %s -march=nvptx64 -mcpu=sm_20 -verify-machineinstrs | FileCheck %s
3+
; RUN: %if ptxas %{ llc < %s -march=nvptx64 -mcpu=sm_20 -verify-machineinstrs | %ptxas-verify %}
4+
5+
target triple = "nvptx64-nvidia-cuda"
6+
target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v16:16:16-v32:32:32-v64:64:64-v128:128:128-n16:32:64"
7+
8+
define float @fcopysign_f(float %a, float %b) {
9+
; CHECK-LABEL: fcopysign_f(
10+
; CHECK: {
11+
; CHECK-NEXT: .reg .f32 %f<4>;
12+
; CHECK-EMPTY:
13+
; CHECK-NEXT: // %bb.0:
14+
; CHECK-NEXT: ld.param.f32 %f1, [fcopysign_f_param_0];
15+
; CHECK-NEXT: ld.param.f32 %f2, [fcopysign_f_param_1];
16+
; CHECK-NEXT: copysign.f32 %f3, %f2, %f1;
17+
; CHECK-NEXT: st.param.f32 [func_retval0+0], %f3;
18+
; CHECK-NEXT: ret;
19+
%val = call float @llvm.copysign.f32(float %a, float %b)
20+
ret float %val
21+
}
22+
23+
define double @fcopysign_d(double %a, double %b) {
24+
; CHECK-LABEL: fcopysign_d(
25+
; CHECK: {
26+
; CHECK-NEXT: .reg .f64 %fd<4>;
27+
; CHECK-EMPTY:
28+
; CHECK-NEXT: // %bb.0:
29+
; CHECK-NEXT: ld.param.f64 %fd1, [fcopysign_d_param_0];
30+
; CHECK-NEXT: ld.param.f64 %fd2, [fcopysign_d_param_1];
31+
; CHECK-NEXT: copysign.f64 %fd3, %fd2, %fd1;
32+
; CHECK-NEXT: st.param.f64 [func_retval0+0], %fd3;
33+
; CHECK-NEXT: ret;
34+
%val = call double @llvm.copysign.f64(double %a, double %b)
35+
ret double %val
36+
}
37+
38+
declare float @llvm.copysign.f32(float, float)
39+
declare double @llvm.copysign.f64(double, double)

llvm/test/CodeGen/NVPTX/math-intrins.ll

Lines changed: 6 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -195,9 +195,8 @@ define double @round_double(double %a) {
195195
; check the use of 0.5 to implement round
196196
; CHECK-LABEL: round_double(
197197
; CHECK: {
198-
; CHECK-NEXT: .reg .pred %p<4>;
199-
; CHECK-NEXT: .reg .b64 %rd<4>;
200-
; CHECK-NEXT: .reg .f64 %fd<10>;
198+
; CHECK-NEXT: .reg .pred %p<3>;
199+
; CHECK-NEXT: .reg .f64 %fd<8>;
201200
; CHECK-EMPTY:
202201
; CHECK-NEXT: // %bb.0:
203202
; CHECK-NEXT: ld.param.f64 %fd1, [round_double_param_0];
@@ -206,16 +205,10 @@ define double @round_double(double %a) {
206205
; CHECK-NEXT: add.rn.f64 %fd3, %fd2, 0d3FE0000000000000;
207206
; CHECK-NEXT: cvt.rzi.f64.f64 %fd4, %fd3;
208207
; CHECK-NEXT: selp.f64 %fd5, 0d0000000000000000, %fd4, %p1;
209-
; CHECK-NEXT: abs.f64 %fd6, %fd5;
210-
; CHECK-NEXT: neg.f64 %fd7, %fd6;
211-
; CHECK-NEXT: mov.b64 %rd1, %fd1;
212-
; CHECK-NEXT: shr.u64 %rd2, %rd1, 63;
213-
; CHECK-NEXT: and.b64 %rd3, %rd2, 1;
214-
; CHECK-NEXT: setp.eq.b64 %p2, %rd3, 1;
215-
; CHECK-NEXT: selp.f64 %fd8, %fd7, %fd6, %p2;
216-
; CHECK-NEXT: setp.gt.f64 %p3, %fd2, 0d4330000000000000;
217-
; CHECK-NEXT: selp.f64 %fd9, %fd1, %fd8, %p3;
218-
; CHECK-NEXT: st.param.f64 [func_retval0+0], %fd9;
208+
; CHECK-NEXT: copysign.f64 %fd6, %fd1, %fd5;
209+
; CHECK-NEXT: setp.gt.f64 %p2, %fd2, 0d4330000000000000;
210+
; CHECK-NEXT: selp.f64 %fd7, %fd1, %fd6, %p2;
211+
; CHECK-NEXT: st.param.f64 [func_retval0+0], %fd7;
219212
; CHECK-NEXT: ret;
220213
%b = call double @llvm.round.f64(double %a)
221214
ret double %b

0 commit comments

Comments
 (0)