Skip to content

Commit e9fa6ff

Browse files
authored
[RISCV] Fold (vXi8 (trunc (vselect (setltu, X, 256), X, (sext (setgt X, 0))))) to vmax+vnclipu. (#94720)
This pattern is an obscure way to express saturating a signed value into a smaller unsigned value. If (setltu, X, 256) is true, then the value is already in the desired range, so we can pick X. If it's false, we select (sext (setgt X, 0)), which is 0 for negative values and all ones for positive values. The all-ones value, when truncated to the final type, will still be all ones, as we want.
1 parent 0605e98 commit e9fa6ff

File tree

2 files changed

+283
-2
lines changed

2 files changed

+283
-2
lines changed

llvm/lib/Target/RISCV/RISCVISelLowering.cpp

Lines changed: 73 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1480,7 +1480,8 @@ RISCVTargetLowering::RISCVTargetLowering(const TargetMachine &TM,
14801480
if (Subtarget.hasStdExtZbb())
14811481
setTargetDAGCombine({ISD::UMAX, ISD::UMIN, ISD::SMAX, ISD::SMIN});
14821482

1483-
if (Subtarget.hasStdExtZbs() && Subtarget.is64Bit())
1483+
if ((Subtarget.hasStdExtZbs() && Subtarget.is64Bit()) ||
1484+
Subtarget.hasStdExtV())
14841485
setTargetDAGCombine(ISD::TRUNCATE);
14851486

14861487
if (Subtarget.hasStdExtZbkb())
@@ -13404,6 +13405,76 @@ static SDValue combineDeMorganOfBoolean(SDNode *N, SelectionDAG &DAG) {
1340413405
return DAG.getNode(ISD::XOR, DL, VT, Logic, DAG.getConstant(1, DL, VT));
1340513406
}
1340613407

13408+
// Fold (vXi8 (trunc (vselect (setltu, X, 256), X, (sext (setgt X, 0))))) to
13409+
// (vXi8 (trunc (smin (smax X, 0), 255))). This represents saturating a signed
13410+
// value to an unsigned value. This will be lowered to vmax and series of
13411+
// vnclipu instructions later. This can be extended to other truncated types
13412+
// other than i8 by replacing 256 and 255 with the equivalent constants for the
13413+
// type.
13414+
static SDValue combineTruncSelectToSMaxUSat(SDNode *N, SelectionDAG &DAG) {
13415+
EVT VT = N->getValueType(0);
13416+
SDValue N0 = N->getOperand(0);
13417+
EVT SrcVT = N0.getValueType();
13418+
13419+
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
13420+
if (!VT.isVector() || !TLI.isTypeLegal(VT) || !TLI.isTypeLegal(SrcVT))
13421+
return SDValue();
13422+
13423+
if (N0.getOpcode() != ISD::VSELECT || !N0.hasOneUse())
13424+
return SDValue();
13425+
13426+
SDValue Cond = N0.getOperand(0);
13427+
SDValue True = N0.getOperand(1);
13428+
SDValue False = N0.getOperand(2);
13429+
13430+
if (Cond.getOpcode() != ISD::SETCC)
13431+
return SDValue();
13432+
13433+
// FIXME: Support the version of this pattern with the select operands
13434+
// swapped.
13435+
ISD::CondCode CCVal = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
13436+
if (CCVal != ISD::SETULT)
13437+
return SDValue();
13438+
13439+
SDValue CondLHS = Cond.getOperand(0);
13440+
SDValue CondRHS = Cond.getOperand(1);
13441+
13442+
if (CondLHS != True)
13443+
return SDValue();
13444+
13445+
unsigned ScalarBits = VT.getScalarSizeInBits();
13446+
13447+
// FIXME: Support other constants.
13448+
ConstantSDNode *CondRHSC = isConstOrConstSplat(CondRHS);
13449+
if (!CondRHSC || CondRHSC->getAPIntValue() != (1ULL << ScalarBits))
13450+
return SDValue();
13451+
13452+
if (False.getOpcode() != ISD::SIGN_EXTEND)
13453+
return SDValue();
13454+
13455+
False = False.getOperand(0);
13456+
13457+
if (False.getOpcode() != ISD::SETCC || False.getOperand(0) != True)
13458+
return SDValue();
13459+
13460+
ConstantSDNode *FalseRHSC = isConstOrConstSplat(False.getOperand(1));
13461+
if (!FalseRHSC || !FalseRHSC->isZero())
13462+
return SDValue();
13463+
13464+
ISD::CondCode CCVal2 = cast<CondCodeSDNode>(False.getOperand(2))->get();
13465+
if (CCVal2 != ISD::SETGT)
13466+
return SDValue();
13467+
13468+
// Emit the signed to unsigned saturation pattern.
13469+
SDLoc DL(N);
13470+
SDValue Max =
13471+
DAG.getNode(ISD::SMAX, DL, SrcVT, True, DAG.getConstant(0, DL, SrcVT));
13472+
SDValue Min =
13473+
DAG.getNode(ISD::SMIN, DL, SrcVT, Max,
13474+
DAG.getConstant((1ULL << ScalarBits) - 1, DL, SrcVT));
13475+
return DAG.getNode(ISD::TRUNCATE, DL, VT, Min);
13476+
}
13477+
1340713478
static SDValue performTRUNCATECombine(SDNode *N, SelectionDAG &DAG,
1340813479
const RISCVSubtarget &Subtarget) {
1340913480
SDValue N0 = N->getOperand(0);
@@ -13424,7 +13495,7 @@ static SDValue performTRUNCATECombine(SDNode *N, SelectionDAG &DAG,
1342413495
return DAG.getNode(ISD::TRUNCATE, SDLoc(N), VT, Srl);
1342513496
}
1342613497

13427-
return SDValue();
13498+
return combineTruncSelectToSMaxUSat(N, DAG);
1342813499
}
1342913500

1343013501
// Combines two comparison operation and logic operation to one selection
Lines changed: 210 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,210 @@
1+
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
2+
; RUN: llc < %s -mtriple=riscv64 -mattr=+v | FileCheck %s
3+
4+
; Fixed-length <4 x i16> -> <4 x i8>: select %x when %x u< 256, otherwise sext(%x s> 0). Expects vmax.vx followed by a single vnclipu.wi.
define <4 x i8> @test_v4i16_v4i8(<4 x i16> %x) {
5+
; CHECK-LABEL: test_v4i16_v4i8:
6+
; CHECK: # %bb.0:
7+
; CHECK-NEXT: vsetivli zero, 4, e16, mf2, ta, ma
8+
; CHECK-NEXT: vmax.vx v8, v8, zero
9+
; CHECK-NEXT: vsetvli zero, zero, e8, mf4, ta, ma
10+
; CHECK-NEXT: vnclipu.wi v8, v8, 0
11+
; CHECK-NEXT: ret
12+
%a = icmp sgt <4 x i16> %x, zeroinitializer
13+
%b = sext <4 x i1> %a to <4 x i16>
14+
%c = icmp ult <4 x i16> %x, splat (i16 256)
15+
%d = select <4 x i1> %c, <4 x i16> %x, <4 x i16> %b
16+
%e = trunc <4 x i16> %d to <4 x i8>
17+
ret <4 x i8> %e
18+
}
19+
20+
; Fixed-length <4 x i32> -> <4 x i8> with bound 256. Expects vmax.vx followed by two vnclipu.wi steps (e16, then e8).
define <4 x i8> @test_v4i32_v4i8(<4 x i32> %x) {
21+
; CHECK-LABEL: test_v4i32_v4i8:
22+
; CHECK: # %bb.0:
23+
; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma
24+
; CHECK-NEXT: vmax.vx v8, v8, zero
25+
; CHECK-NEXT: vsetvli zero, zero, e16, mf2, ta, ma
26+
; CHECK-NEXT: vnclipu.wi v8, v8, 0
27+
; CHECK-NEXT: vsetvli zero, zero, e8, mf4, ta, ma
28+
; CHECK-NEXT: vnclipu.wi v8, v8, 0
29+
; CHECK-NEXT: ret
30+
%a = icmp sgt <4 x i32> %x, zeroinitializer
31+
%b = sext <4 x i1> %a to <4 x i32>
32+
%c = icmp ult <4 x i32> %x, splat (i32 256)
33+
%d = select <4 x i1> %c, <4 x i32> %x, <4 x i32> %b
34+
%e = trunc <4 x i32> %d to <4 x i8>
35+
ret <4 x i8> %e
36+
}
37+
38+
; Fixed-length <4 x i64> -> <4 x i8> with bound 256. Expects vmax.vx followed by three vnclipu.wi steps (e32, e16, e8).
define <4 x i8> @test_v4i64_v4i8(<4 x i64> %x) {
39+
; CHECK-LABEL: test_v4i64_v4i8:
40+
; CHECK: # %bb.0:
41+
; CHECK-NEXT: vsetivli zero, 4, e64, m2, ta, ma
42+
; CHECK-NEXT: vmax.vx v8, v8, zero
43+
; CHECK-NEXT: vsetvli zero, zero, e32, m1, ta, ma
44+
; CHECK-NEXT: vnclipu.wi v10, v8, 0
45+
; CHECK-NEXT: vsetvli zero, zero, e16, mf2, ta, ma
46+
; CHECK-NEXT: vnclipu.wi v8, v10, 0
47+
; CHECK-NEXT: vsetvli zero, zero, e8, mf4, ta, ma
48+
; CHECK-NEXT: vnclipu.wi v8, v8, 0
49+
; CHECK-NEXT: ret
50+
%a = icmp sgt <4 x i64> %x, zeroinitializer
51+
%b = sext <4 x i1> %a to <4 x i64>
52+
%c = icmp ult <4 x i64> %x, splat (i64 256)
53+
%d = select <4 x i1> %c, <4 x i64> %x, <4 x i64> %b
54+
%e = trunc <4 x i64> %d to <4 x i8>
55+
ret <4 x i8> %e
56+
}
57+
58+
; Fixed-length <4 x i32> -> <4 x i16> with bound 65536. Expects vmax.vx followed by a single vnclipu.wi.
define <4 x i16> @test_v4i32_v4i16(<4 x i32> %x) {
59+
; CHECK-LABEL: test_v4i32_v4i16:
60+
; CHECK: # %bb.0:
61+
; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma
62+
; CHECK-NEXT: vmax.vx v8, v8, zero
63+
; CHECK-NEXT: vsetvli zero, zero, e16, mf2, ta, ma
64+
; CHECK-NEXT: vnclipu.wi v8, v8, 0
65+
; CHECK-NEXT: ret
66+
%a = icmp sgt <4 x i32> %x, zeroinitializer
67+
%b = sext <4 x i1> %a to <4 x i32>
68+
%c = icmp ult <4 x i32> %x, splat (i32 65536)
69+
%d = select <4 x i1> %c, <4 x i32> %x, <4 x i32> %b
70+
%e = trunc <4 x i32> %d to <4 x i16>
71+
ret <4 x i16> %e
72+
}
73+
74+
; Fixed-length <4 x i64> -> <4 x i16> with bound 65536. Expects vmax.vx followed by two vnclipu.wi steps (e32, then e16).
define <4 x i16> @test_v4i64_v4i16(<4 x i64> %x) {
75+
; CHECK-LABEL: test_v4i64_v4i16:
76+
; CHECK: # %bb.0:
77+
; CHECK-NEXT: vsetivli zero, 4, e64, m2, ta, ma
78+
; CHECK-NEXT: vmax.vx v8, v8, zero
79+
; CHECK-NEXT: vsetvli zero, zero, e32, m1, ta, ma
80+
; CHECK-NEXT: vnclipu.wi v10, v8, 0
81+
; CHECK-NEXT: vsetvli zero, zero, e16, mf2, ta, ma
82+
; CHECK-NEXT: vnclipu.wi v8, v10, 0
83+
; CHECK-NEXT: ret
84+
%a = icmp sgt <4 x i64> %x, zeroinitializer
85+
%b = sext <4 x i1> %a to <4 x i64>
86+
%c = icmp ult <4 x i64> %x, splat (i64 65536)
87+
%d = select <4 x i1> %c, <4 x i64> %x, <4 x i64> %b
88+
%e = trunc <4 x i64> %d to <4 x i16>
89+
ret <4 x i16> %e
90+
}
91+
92+
; Fixed-length <4 x i64> -> <4 x i32> with bound 4294967296. Expects vmax.vx followed by a single vnclipu.wi.
define <4 x i32> @test_v4i64_v4i32(<4 x i64> %x) {
93+
; CHECK-LABEL: test_v4i64_v4i32:
94+
; CHECK: # %bb.0:
95+
; CHECK-NEXT: vsetivli zero, 4, e64, m2, ta, ma
96+
; CHECK-NEXT: vmax.vx v10, v8, zero
97+
; CHECK-NEXT: vsetvli zero, zero, e32, m1, ta, ma
98+
; CHECK-NEXT: vnclipu.wi v8, v10, 0
99+
; CHECK-NEXT: ret
100+
%a = icmp sgt <4 x i64> %x, zeroinitializer
101+
%b = sext <4 x i1> %a to <4 x i64>
102+
%c = icmp ult <4 x i64> %x, splat (i64 4294967296)
103+
%d = select <4 x i1> %c, <4 x i64> %x, <4 x i64> %b
104+
%e = trunc <4 x i64> %d to <4 x i32>
105+
ret <4 x i32> %e
106+
}
107+
108+
; Scalable <vscale x 4 x i16> -> <vscale x 4 x i8> with bound 256. Expects vmax.vx followed by a single vnclipu.wi.
define <vscale x 4 x i8> @test_nxv4i16_nxv4i8(<vscale x 4 x i16> %x) {
109+
; CHECK-LABEL: test_nxv4i16_nxv4i8:
110+
; CHECK: # %bb.0:
111+
; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, ma
112+
; CHECK-NEXT: vmax.vx v8, v8, zero
113+
; CHECK-NEXT: vsetvli zero, zero, e8, mf2, ta, ma
114+
; CHECK-NEXT: vnclipu.wi v8, v8, 0
115+
; CHECK-NEXT: ret
116+
%a = icmp sgt <vscale x 4 x i16> %x, zeroinitializer
117+
%b = sext <vscale x 4 x i1> %a to <vscale x 4 x i16>
118+
%c = icmp ult <vscale x 4 x i16> %x, splat (i16 256)
119+
%d = select <vscale x 4 x i1> %c, <vscale x 4 x i16> %x, <vscale x 4 x i16> %b
120+
%e = trunc <vscale x 4 x i16> %d to <vscale x 4 x i8>
121+
ret <vscale x 4 x i8> %e
122+
}
123+
124+
; Scalable <vscale x 4 x i32> -> <vscale x 4 x i8> with bound 256. Expects vmax.vx followed by two vnclipu.wi steps (e16, then e8).
define <vscale x 4 x i8> @test_nxv4i32_nxv4i8(<vscale x 4 x i32> %x) {
125+
; CHECK-LABEL: test_nxv4i32_nxv4i8:
126+
; CHECK: # %bb.0:
127+
; CHECK-NEXT: vsetvli a0, zero, e32, m2, ta, ma
128+
; CHECK-NEXT: vmax.vx v8, v8, zero
129+
; CHECK-NEXT: vsetvli zero, zero, e16, m1, ta, ma
130+
; CHECK-NEXT: vnclipu.wi v10, v8, 0
131+
; CHECK-NEXT: vsetvli zero, zero, e8, mf2, ta, ma
132+
; CHECK-NEXT: vnclipu.wi v8, v10, 0
133+
; CHECK-NEXT: ret
134+
%a = icmp sgt <vscale x 4 x i32> %x, zeroinitializer
135+
%b = sext <vscale x 4 x i1> %a to <vscale x 4 x i32>
136+
%c = icmp ult <vscale x 4 x i32> %x, splat (i32 256)
137+
%d = select <vscale x 4 x i1> %c, <vscale x 4 x i32> %x, <vscale x 4 x i32> %b
138+
%e = trunc <vscale x 4 x i32> %d to <vscale x 4 x i8>
139+
ret <vscale x 4 x i8> %e
140+
}
141+
142+
; Scalable <vscale x 4 x i64> -> <vscale x 4 x i8> with bound 256. Expects vmax.vx followed by three vnclipu.wi steps (e32, e16, e8).
define <vscale x 4 x i8> @test_nxv4i64_nxv4i8(<vscale x 4 x i64> %x) {
143+
; CHECK-LABEL: test_nxv4i64_nxv4i8:
144+
; CHECK: # %bb.0:
145+
; CHECK-NEXT: vsetvli a0, zero, e64, m4, ta, ma
146+
; CHECK-NEXT: vmax.vx v8, v8, zero
147+
; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, ma
148+
; CHECK-NEXT: vnclipu.wi v12, v8, 0
149+
; CHECK-NEXT: vsetvli zero, zero, e16, m1, ta, ma
150+
; CHECK-NEXT: vnclipu.wi v8, v12, 0
151+
; CHECK-NEXT: vsetvli zero, zero, e8, mf2, ta, ma
152+
; CHECK-NEXT: vnclipu.wi v8, v8, 0
153+
; CHECK-NEXT: ret
154+
%a = icmp sgt <vscale x 4 x i64> %x, zeroinitializer
155+
%b = sext <vscale x 4 x i1> %a to <vscale x 4 x i64>
156+
%c = icmp ult <vscale x 4 x i64> %x, splat (i64 256)
157+
%d = select <vscale x 4 x i1> %c, <vscale x 4 x i64> %x, <vscale x 4 x i64> %b
158+
%e = trunc <vscale x 4 x i64> %d to <vscale x 4 x i8>
159+
ret <vscale x 4 x i8> %e
160+
}
161+
162+
; Scalable <vscale x 4 x i32> -> <vscale x 4 x i16> with bound 65536. Expects vmax.vx followed by a single vnclipu.wi.
define <vscale x 4 x i16> @test_nxv4i32_nxv4i16(<vscale x 4 x i32> %x) {
163+
; CHECK-LABEL: test_nxv4i32_nxv4i16:
164+
; CHECK: # %bb.0:
165+
; CHECK-NEXT: vsetvli a0, zero, e32, m2, ta, ma
166+
; CHECK-NEXT: vmax.vx v10, v8, zero
167+
; CHECK-NEXT: vsetvli zero, zero, e16, m1, ta, ma
168+
; CHECK-NEXT: vnclipu.wi v8, v10, 0
169+
; CHECK-NEXT: ret
170+
%a = icmp sgt <vscale x 4 x i32> %x, zeroinitializer
171+
%b = sext <vscale x 4 x i1> %a to <vscale x 4 x i32>
172+
%c = icmp ult <vscale x 4 x i32> %x, splat (i32 65536)
173+
%d = select <vscale x 4 x i1> %c, <vscale x 4 x i32> %x, <vscale x 4 x i32> %b
174+
%e = trunc <vscale x 4 x i32> %d to <vscale x 4 x i16>
175+
ret <vscale x 4 x i16> %e
176+
}
177+
178+
; Scalable <vscale x 4 x i64> -> <vscale x 4 x i16> with bound 65536. Expects vmax.vx followed by two vnclipu.wi steps (e32, then e16).
define <vscale x 4 x i16> @test_nxv4i64_nxv4i16(<vscale x 4 x i64> %x) {
179+
; CHECK-LABEL: test_nxv4i64_nxv4i16:
180+
; CHECK: # %bb.0:
181+
; CHECK-NEXT: vsetvli a0, zero, e64, m4, ta, ma
182+
; CHECK-NEXT: vmax.vx v8, v8, zero
183+
; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, ma
184+
; CHECK-NEXT: vnclipu.wi v12, v8, 0
185+
; CHECK-NEXT: vsetvli zero, zero, e16, m1, ta, ma
186+
; CHECK-NEXT: vnclipu.wi v8, v12, 0
187+
; CHECK-NEXT: ret
188+
%a = icmp sgt <vscale x 4 x i64> %x, zeroinitializer
189+
%b = sext <vscale x 4 x i1> %a to <vscale x 4 x i64>
190+
%c = icmp ult <vscale x 4 x i64> %x, splat (i64 65536)
191+
%d = select <vscale x 4 x i1> %c, <vscale x 4 x i64> %x, <vscale x 4 x i64> %b
192+
%e = trunc <vscale x 4 x i64> %d to <vscale x 4 x i16>
193+
ret <vscale x 4 x i16> %e
194+
}
195+
196+
; Scalable <vscale x 4 x i64> -> <vscale x 4 x i32> with bound 4294967296. Expects vmax.vx followed by a single vnclipu.wi.
define <vscale x 4 x i32> @test_nxv4i64_nxv4i32(<vscale x 4 x i64> %x) {
197+
; CHECK-LABEL: test_nxv4i64_nxv4i32:
198+
; CHECK: # %bb.0:
199+
; CHECK-NEXT: vsetvli a0, zero, e64, m4, ta, ma
200+
; CHECK-NEXT: vmax.vx v12, v8, zero
201+
; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, ma
202+
; CHECK-NEXT: vnclipu.wi v8, v12, 0
203+
; CHECK-NEXT: ret
204+
%a = icmp sgt <vscale x 4 x i64> %x, zeroinitializer
205+
%b = sext <vscale x 4 x i1> %a to <vscale x 4 x i64>
206+
%c = icmp ult <vscale x 4 x i64> %x, splat (i64 4294967296)
207+
%d = select <vscale x 4 x i1> %c, <vscale x 4 x i64> %x, <vscale x 4 x i64> %b
208+
%e = trunc <vscale x 4 x i64> %d to <vscale x 4 x i32>
209+
ret <vscale x 4 x i32> %e
210+
}

0 commit comments

Comments
 (0)