Skip to content

Commit 9520bf8

Browse files
committed
Scalarize i16x2 ops when not natively supported instead of expanding
1 parent ac430b4 commit 9520bf8

File tree

2 files changed

+55
-33
lines changed

2 files changed

+55
-33
lines changed

llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp

Lines changed: 45 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -621,21 +621,21 @@ NVPTXTargetLowering::NVPTXTargetLowering(const NVPTXTargetMachine &TM,
621621
setOperationAction(ISD::CTLZ, Ty, Legal);
622622
}
623623

624-
setI16x2OperationAction(ISD::ABS, MVT::v2i16, Legal, Expand);
625-
setI16x2OperationAction(ISD::SMIN, MVT::v2i16, Legal, Expand);
626-
setI16x2OperationAction(ISD::SMAX, MVT::v2i16, Legal, Expand);
627-
setI16x2OperationAction(ISD::UMIN, MVT::v2i16, Legal, Expand);
628-
setI16x2OperationAction(ISD::UMAX, MVT::v2i16, Legal, Expand);
624+
setI16x2OperationAction(ISD::ABS, MVT::v2i16, Legal, Custom);
625+
setI16x2OperationAction(ISD::SMIN, MVT::v2i16, Legal, Custom);
626+
setI16x2OperationAction(ISD::SMAX, MVT::v2i16, Legal, Custom);
627+
setI16x2OperationAction(ISD::UMIN, MVT::v2i16, Legal, Custom);
628+
setI16x2OperationAction(ISD::UMAX, MVT::v2i16, Legal, Custom);
629629
setI16x2OperationAction(ISD::CTPOP, MVT::v2i16, Legal, Expand);
630630
setI16x2OperationAction(ISD::CTLZ, MVT::v2i16, Legal, Expand);
631631

632-
setI16x2OperationAction(ISD::ADD, MVT::v2i16, Legal, Expand);
633-
setI16x2OperationAction(ISD::SUB, MVT::v2i16, Legal, Expand);
634-
setI16x2OperationAction(ISD::AND, MVT::v2i16, Legal, Expand);
635-
setI16x2OperationAction(ISD::MUL, MVT::v2i16, Legal, Expand);
636-
setI16x2OperationAction(ISD::SHL, MVT::v2i16, Legal, Expand);
637-
setI16x2OperationAction(ISD::SREM, MVT::v2i16, Legal, Expand);
638-
setI16x2OperationAction(ISD::UREM, MVT::v2i16, Legal, Expand);
632+
setI16x2OperationAction(ISD::ADD, MVT::v2i16, Legal, Custom);
633+
setI16x2OperationAction(ISD::SUB, MVT::v2i16, Legal, Custom);
634+
setI16x2OperationAction(ISD::AND, MVT::v2i16, Legal, Custom);
635+
setI16x2OperationAction(ISD::MUL, MVT::v2i16, Legal, Custom);
636+
setI16x2OperationAction(ISD::SHL, MVT::v2i16, Legal, Custom);
637+
setI16x2OperationAction(ISD::SREM, MVT::v2i16, Legal, Custom);
638+
setI16x2OperationAction(ISD::UREM, MVT::v2i16, Legal, Custom);
639639

640640
setOperationAction(ISD::ADDC, MVT::i32, Legal);
641641
setOperationAction(ISD::ADDE, MVT::i32, Legal);
@@ -2418,7 +2418,26 @@ SDValue NVPTXTargetLowering::LowerFROUND64(SDValue Op,
24182418
return DAG.getNode(ISD::SELECT, SL, VT, IsLarge, A, RoundedA);
24192419
}
24202420

2421-
2421+
/// Scalarize a v2i16 arithmetic node into per-lane scalar operations.
///
/// For every lane of the result, each operand of \p Op is reduced to its
/// scalar element with EXTRACT_VECTOR_ELT, the same opcode is re-emitted on
/// the element type, and the per-lane results are recombined with
/// BUILD_VECTOR.
///
/// \param Op  The node to lower. Every operand is extracted per lane, so all
///            operands are expected to be vectors with the same lane count as
///            the result.
/// \param DAG The SelectionDAG in which the replacement nodes are created.
/// \returns   \p Op unchanged when its value type is not v2i16; otherwise the
///            BUILD_VECTOR of the scalarized results.
static SDValue LowerVectorArith(SDValue Op, SelectionDAG &DAG) {
  const EVT VT = Op.getValueType();
  if (VT != MVT::v2i16)
    return Op;

  SDLoc DL(Op);
  const EVT EltVT = VT.getVectorElementType();
  const unsigned NumElts = VT.getVectorNumElements();
  // Carry the original node's flags (e.g. nsw/nuw) over to each scalar op so
  // that scalarization does not discard information later combines can use.
  const SDNodeFlags Flags = Op->getFlags();

  SmallVector<SDValue> VecElements;
  VecElements.reserve(NumElts);
  for (unsigned Lane = 0; Lane != NumElts; ++Lane) {
    // The lane index is the same for every operand; build it once per lane.
    const SDValue LaneIdx = DAG.getVectorIdxConstant(Lane, DL);

    SmallVector<SDValue> ScalarArgs;
    ScalarArgs.reserve(Op.getNumOperands());
    for (SDValue Operand : Op->op_values())
      ScalarArgs.push_back(DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, EltVT,
                                       Operand, LaneIdx));

    VecElements.push_back(
        DAG.getNode(Op.getOpcode(), DL, EltVT, ScalarArgs, Flags));
  }

  return DAG.getNode(ISD::BUILD_VECTOR, DL, VT, VecElements);
}
24222441

24232442
SDValue
24242443
NVPTXTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
@@ -2456,6 +2475,19 @@ NVPTXTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
24562475
return LowerVAARG(Op, DAG);
24572476
case ISD::VASTART:
24582477
return LowerVASTART(Op, DAG);
2478+
case ISD::ABS:
2479+
case ISD::SMIN:
2480+
case ISD::SMAX:
2481+
case ISD::UMIN:
2482+
case ISD::UMAX:
2483+
case ISD::ADD:
2484+
case ISD::SUB:
2485+
case ISD::AND:
2486+
case ISD::MUL:
2487+
case ISD::SHL:
2488+
case ISD::SREM:
2489+
case ISD::UREM:
2490+
return LowerVectorArith(Op, DAG);
24592491
default:
24602492
llvm_unreachable("Custom lowering not defined for operation");
24612493
}

llvm/test/CodeGen/NVPTX/i16x2-instructions.ll

Lines changed: 10 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -9,13 +9,11 @@
99
; RUN: %}
1010
; ## No support for i16x2 instructions
1111
; RUN: llc < %s -mtriple=nvptx64-nvidia-cuda -mcpu=sm_53 -asm-verbose=false \
12-
; RUN: -O0 -disable-post-ra -frame-pointer=all --nvptx-no-f16-math \
13-
; RUN: -verify-machineinstrs \
12+
; RUN: -O0 -disable-post-ra -frame-pointer=all -verify-machineinstrs \
1413
; RUN: | FileCheck -allow-deprecated-dag-overlap -check-prefixes CHECK,CHECK-NOI16x2 %s
1514
; RUN: %if ptxas %{ \
1615
; RUN: llc < %s -mtriple=nvptx64-nvidia-cuda -mcpu=sm_53 -asm-verbose=false \
17-
; RUN: -O0 -disable-post-ra -frame-pointer=all --nvptx-no-f16-math \
18-
; RUN: -verify-machineinstrs \
16+
; RUN: -O0 -disable-post-ra -frame-pointer=all -verify-machineinstrs \
1917
; RUN: | %ptxas-verify -arch=sm_53 \
2018
; RUN: %}
2119

@@ -148,10 +146,8 @@ define <2 x i16> @test_sub(<2 x i16> %a, <2 x i16> %b) #0 {
148146
;
149147
; CHECK-NOI16x2-DAG: mov.b32 {[[RS0:%rs[0-9]+]], [[RS1:%rs[0-9]+]]}, [[A]];
150148
; CHECK-NOI16x2-DAG: mov.b32 {[[RS2:%rs[0-9]+]], [[RS3:%rs[0-9]+]]}, [[B]];
151-
; CHECK-NOI16x2-DAG: setp.gt.s16 [[P0:%p[0-9]+]], [[RS0]], [[RS2]];
152-
; CHECK-NOI16x2-DAG: setp.gt.s16 [[P1:%p[0-9]+]], [[RS1]], [[RS3]];
153-
; CHECK-NOI16x2-DAG: selp.b16 [[RS4:%rs[0-9]+]], [[RS0]], [[RS2]], [[P0]];
154-
; CHECK-NOI16x2-DAG: selp.b16 [[RS5:%rs[0-9]+]], [[RS1]], [[RS3]], [[P1]];
149+
; CHECK-NOI16x2-DAG: max.s16 [[RS4:%rs[0-9]+]], [[RS0]], [[RS2]];
150+
; CHECK-NOI16x2-DAG: max.s16 [[RS5:%rs[0-9]+]], [[RS1]], [[RS3]];
155151
; CHECK-NOI16x2-DAG: mov.b32 [[R:%r[0-9]+]], {[[RS4]], [[RS5]]};
156152
;
157153
; CHECK-NEXT: st.param.b32 [func_retval0+0], [[R]];
@@ -170,10 +166,8 @@ define <2 x i16> @test_smax(<2 x i16> %a, <2 x i16> %b) #0 {
170166
;
171167
; CHECK-NOI16x2-DAG: mov.b32 {[[RS0:%rs[0-9]+]], [[RS1:%rs[0-9]+]]}, [[A]];
172168
; CHECK-NOI16x2-DAG: mov.b32 {[[RS2:%rs[0-9]+]], [[RS3:%rs[0-9]+]]}, [[B]];
173-
; CHECK-NOI16x2-DAG: setp.gt.u16 [[P0:%p[0-9]+]], [[RS0]], [[RS2]];
174-
; CHECK-NOI16x2-DAG: setp.gt.u16 [[P1:%p[0-9]+]], [[RS1]], [[RS3]];
175-
; CHECK-NOI16x2-DAG: selp.b16 [[RS4:%rs[0-9]+]], [[RS0]], [[RS2]], [[P0]];
176-
; CHECK-NOI16x2-DAG: selp.b16 [[RS5:%rs[0-9]+]], [[RS1]], [[RS3]], [[P1]];
169+
; CHECK-NOI16x2-DAG: max.u16 [[RS4:%rs[0-9]+]], [[RS0]], [[RS2]];
170+
; CHECK-NOI16x2-DAG: max.u16 [[RS5:%rs[0-9]+]], [[RS1]], [[RS3]];
177171
; CHECK-NOI16x2-DAG: mov.b32 [[R:%r[0-9]+]], {[[RS4]], [[RS5]]};
178172
;
179173
; CHECK-NEXT: st.param.b32 [func_retval0+0], [[R]];
@@ -192,10 +186,8 @@ define <2 x i16> @test_umax(<2 x i16> %a, <2 x i16> %b) #0 {
192186
;
193187
; CHECK-NOI16x2-DAG: mov.b32 {[[RS0:%rs[0-9]+]], [[RS1:%rs[0-9]+]]}, [[A]];
194188
; CHECK-NOI16x2-DAG: mov.b32 {[[RS2:%rs[0-9]+]], [[RS3:%rs[0-9]+]]}, [[B]];
195-
; CHECK-NOI16x2-DAG: setp.le.s16 [[P0:%p[0-9]+]], [[RS0]], [[RS2]];
196-
; CHECK-NOI16x2-DAG: setp.le.s16 [[P1:%p[0-9]+]], [[RS1]], [[RS3]];
197-
; CHECK-NOI16x2-DAG: selp.b16 [[RS4:%rs[0-9]+]], [[RS0]], [[RS2]], [[P0]];
198-
; CHECK-NOI16x2-DAG: selp.b16 [[RS5:%rs[0-9]+]], [[RS1]], [[RS3]], [[P1]];
189+
; CHECK-NOI16x2-DAG: min.s16 [[RS4:%rs[0-9]+]], [[RS0]], [[RS2]];
190+
; CHECK-NOI16x2-DAG: min.s16 [[RS5:%rs[0-9]+]], [[RS1]], [[RS3]];
199191
; CHECK-NOI16x2-DAG: mov.b32 [[R:%r[0-9]+]], {[[RS4]], [[RS5]]};
200192
;
201193
; CHECK-NEXT: st.param.b32 [func_retval0+0], [[R]];
@@ -214,10 +206,8 @@ define <2 x i16> @test_smin(<2 x i16> %a, <2 x i16> %b) #0 {
214206
;
215207
; CHECK-NOI16x2-DAG: mov.b32 {[[RS0:%rs[0-9]+]], [[RS1:%rs[0-9]+]]}, [[A]];
216208
; CHECK-NOI16x2-DAG: mov.b32 {[[RS2:%rs[0-9]+]], [[RS3:%rs[0-9]+]]}, [[B]];
217-
; CHECK-NOI16x2-DAG: setp.le.u16 [[P0:%p[0-9]+]], [[RS0]], [[RS2]];
218-
; CHECK-NOI16x2-DAG: setp.le.u16 [[P1:%p[0-9]+]], [[RS1]], [[RS3]];
219-
; CHECK-NOI16x2-DAG: selp.b16 [[RS4:%rs[0-9]+]], [[RS0]], [[RS2]], [[P0]];
220-
; CHECK-NOI16x2-DAG: selp.b16 [[RS5:%rs[0-9]+]], [[RS1]], [[RS3]], [[P1]];
209+
; CHECK-NOI16x2-DAG: min.u16 [[RS4:%rs[0-9]+]], [[RS0]], [[RS2]];
210+
; CHECK-NOI16x2-DAG: min.u16 [[RS5:%rs[0-9]+]], [[RS1]], [[RS3]];
221211
; CHECK-NOI16x2-DAG: mov.b32 [[R:%r[0-9]+]], {[[RS4]], [[RS5]]};
222212
;
223213
; CHECK-NEXT: st.param.b32 [func_retval0+0], [[R]];

0 commit comments

Comments
 (0)