Skip to content

Commit d1a4f0c

Browse files
[AArch64] Lower extending sitofp using tbl (llvm#92528)
In a similar manner to https://reviews.llvm.org/D133494, use `TBL` to place the source bytes in the *upper* part of `i32` elements and then convert to float using the fixed-point form of `scvtf`, i.e. `scvtf Vd.4s, Vn.4s, #24`.
1 parent 5914a56 commit d1a4f0c

File tree

3 files changed

+313
-3
lines changed

3 files changed

+313
-3
lines changed

llvm/lib/CodeGen/CodeGenPrepare.cpp

+2-1
Original file line numberDiff line numberDiff line change
@@ -8331,7 +8331,8 @@ bool CodeGenPrepare::optimizeInst(Instruction *I, ModifyDT &ModifiedDT) {
83318331
if (OptimizeNoopCopyExpression(CI, *TLI, *DL))
83328332
return true;
83338333

8334-
if ((isa<UIToFPInst>(I) || isa<FPToUIInst>(I) || isa<TruncInst>(I)) &&
8334+
if ((isa<UIToFPInst>(I) || isa<SIToFPInst>(I) || isa<FPToUIInst>(I) ||
8335+
isa<TruncInst>(I)) &&
83358336
TLI->optimizeExtendOrTruncateConversion(
83368337
I, LI->getLoopFor(I->getParent()), *TTI))
83378338
return true;

llvm/lib/Target/AArch64/AArch64ISelLowering.cpp

+35-2
Original file line numberDiff line numberDiff line change
@@ -15869,6 +15869,24 @@ static Value *createTblShuffleForZExt(IRBuilderBase &Builder, Value *Op,
1586915869
return Result;
1587015870
}
1587115871

15872+
// Builds the byte shuffle used when lowering a signed i8-vector to float
// conversion through `tbl` (see the SIToFP handling in
// optimizeExtendOrTruncateConversion, which bitcasts the result, applies an
// arithmetic shift right by 24 and then converts with sitofp).
//
// \param Builder        IR builder used to emit the new instructions.
// \param Op             Value to shuffle; its type must be a fixed vector with
//                       integer elements (enforced via cast<>).
// \param DstTy          Fixed vector type with integer elements; only its
//                       element bit width is read here.
// \param IsLittleEndian Endianness of the subtarget.
// \returns the shufflevector value, or nullptr when createTblShuffleMask
//          reports that no mask could be built.
static Value *createTblShuffleForSExt(IRBuilderBase &Builder, Value *Op,
15873+
FixedVectorType *DstTy,
15874+
bool IsLittleEndian) {
15875+
auto *SrcTy = cast<FixedVectorType>(Op->getType());
15876+
auto SrcWidth = cast<IntegerType>(SrcTy->getElementType())->getBitWidth();
15877+
auto DstWidth = cast<IntegerType>(DstTy->getElementType())->getBitWidth();
15878+
15879+
SmallVector<int> Mask;
// The endianness flag is deliberately passed inverted.
// NOTE(review): presumably this makes createTblShuffleMask put each source
// byte into the most significant byte of its destination element instead of
// the least significant one - confirm against createTblShuffleMask.
15880+
if (!createTblShuffleMask(SrcWidth, DstWidth, SrcTy->getNumElements(),
15881+
!IsLittleEndian, Mask))
15882+
return nullptr;
15883+
15884+
// Second shuffle operand: a vector of SrcTy whose lane 0 is zero and whose
// remaining lanes are poison.
// NOTE(review): getInt8(0) only matches SrcTy when the source elements are
// i8; the visible SIToFP caller checks isIntegerTy(8) before calling.
auto *FirstEltZero = Builder.CreateInsertElement(
15885+
PoisonValue::get(SrcTy), Builder.getInt8(0), uint64_t(0));
15886+
15887+
return Builder.CreateShuffleVector(Op, FirstEltZero, Mask);
15888+
}
15889+
1587215890
static void createTblForTrunc(TruncInst *TI, bool IsLittleEndian) {
1587315891
IRBuilder<> Builder(TI);
1587415892
SmallVector<Value *> Parts;
@@ -16049,14 +16067,29 @@ bool AArch64TargetLowering::optimizeExtendOrTruncateConversion(
1604916067
Value *ZExt = createTblShuffleForZExt(
1605016068
Builder, I->getOperand(0), FixedVectorType::getInteger(DstTy),
1605116069
FixedVectorType::getInteger(DstTy), Subtarget->isLittleEndian());
16052-
if (!ZExt)
16053-
return false;
16070+
assert(ZExt && "Cannot fail for the i8 to float conversion");
1605416071
auto *UI = Builder.CreateUIToFP(ZExt, DstTy);
1605516072
I->replaceAllUsesWith(UI);
1605616073
I->eraseFromParent();
1605716074
return true;
1605816075
}
1605916076

16077+
auto *SIToFP = dyn_cast<SIToFPInst>(I);
16078+
if (SIToFP && SrcTy->getElementType()->isIntegerTy(8) &&
16079+
DstTy->getElementType()->isFloatTy()) {
16080+
IRBuilder<> Builder(I);
16081+
auto *Shuffle = createTblShuffleForSExt(Builder, I->getOperand(0),
16082+
FixedVectorType::getInteger(DstTy),
16083+
Subtarget->isLittleEndian());
16084+
assert(Shuffle && "Cannot fail for the i8 to float conversion");
16085+
auto *Cast = Builder.CreateBitCast(Shuffle, VectorType::getInteger(DstTy));
16086+
auto *AShr = Builder.CreateAShr(Cast, 24, "", true);
16087+
auto *SI = Builder.CreateSIToFP(AShr, DstTy);
16088+
I->replaceAllUsesWith(SI);
16089+
I->eraseFromParent();
16090+
return true;
16091+
}
16092+
1606016093
// Convert 'fptoui <(8|16) x float> to <(8|16) x i8>' to a wide fptoui
1606116094
// followed by a truncate lowered to using tbl.4.
1606216095
auto *FPToUI = dyn_cast<FPToUIInst>(I);
+276
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,276 @@
1+
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
2+
; RUN: llc -verify-machineinstrs < %s | FileCheck %s
3+
4+
target triple = "aarch64-linux"
5+
6+
; CHECK-LABEL: .LCPI0_0:
7+
; CHECK-NEXT: .byte 255
8+
; CHECK-NEXT: .byte 255
9+
; CHECK-NEXT: .byte 255
10+
; CHECK-NEXT: .byte 4
11+
; CHECK-NEXT: .byte 255
12+
; CHECK-NEXT: .byte 255
13+
; CHECK-NEXT: .byte 255
14+
; CHECK-NEXT: .byte 5
15+
; CHECK-NEXT: .byte 255
16+
; CHECK-NEXT: .byte 255
17+
; CHECK-NEXT: .byte 255
18+
; CHECK-NEXT: .byte 6
19+
; CHECK-NEXT: .byte 255
20+
; CHECK-NEXT: .byte 255
21+
; CHECK-NEXT: .byte 255
22+
; CHECK-NEXT: .byte 7
23+
; CHECK-NEXT: .LCPI0_1:
24+
; CHECK-NEXT: .byte 255
25+
; CHECK-NEXT: .byte 255
26+
; CHECK-NEXT: .byte 255
27+
; CHECK-NEXT: .byte 0
28+
; CHECK-NEXT: .byte 255
29+
; CHECK-NEXT: .byte 255
30+
; CHECK-NEXT: .byte 255
31+
; CHECK-NEXT: .byte 1
32+
; CHECK-NEXT: .byte 255
33+
; CHECK-NEXT: .byte 255
34+
; CHECK-NEXT: .byte 255
35+
; CHECK-NEXT: .byte 2
36+
; CHECK-NEXT: .byte 255
37+
; CHECK-NEXT: .byte 255
38+
; CHECK-NEXT: .byte 255
39+
; CHECK-NEXT: .byte 3
40+
41+
; Positive test: `sitofp <8 x i8> ... to <8 x float>` inside a loop. The CHECK
; lines expect two `tbl` instructions followed by two fixed-point
; `scvtf ..., #24` conversions.
define void @sitofp_v8i8_to_v8f32(ptr %src, ptr %dst) {
42+
; CHECK-LABEL: sitofp_v8i8_to_v8f32:
43+
; CHECK: // %bb.0: // %entry
44+
; CHECK-NEXT: adrp x8, .LCPI0_0
45+
; CHECK-NEXT: adrp x9, .LCPI0_1
46+
; CHECK-NEXT: ldr q0, [x8, :lo12:.LCPI0_0]
47+
; CHECK-NEXT: ldr q1, [x9, :lo12:.LCPI0_1]
48+
; CHECK-NEXT: mov x8, xzr
49+
; CHECK-NEXT: .LBB0_1: // %loop
50+
; CHECK-NEXT: // =>This Inner Loop Header: Depth=1
51+
; CHECK-NEXT: ldr d2, [x0, x8, lsl #3]
52+
; CHECK-NEXT: add x9, x1, x8, lsl #5
53+
; CHECK-NEXT: add x8, x8, #1
54+
; CHECK-NEXT: cmp x8, #1000
55+
; CHECK-NEXT: tbl v3.16b, { v2.16b }, v0.16b
56+
; CHECK-NEXT: tbl v2.16b, { v2.16b }, v1.16b
57+
; CHECK-NEXT: scvtf v3.4s, v3.4s, #24
58+
; CHECK-NEXT: scvtf v2.4s, v2.4s, #24
59+
; CHECK-NEXT: stp q2, q3, [x9]
60+
; CHECK-NEXT: b.eq .LBB0_1
61+
; CHECK-NEXT: // %bb.2: // %exit
62+
; CHECK-NEXT: ret
63+
entry:
64+
br label %loop
65+
66+
loop:
67+
%iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
68+
%gep.src = getelementptr inbounds <8 x i8>, ptr %src, i64 %iv
69+
%l = load <8 x i8>, ptr %gep.src
70+
%conv = sitofp <8 x i8> %l to <8 x float>
71+
%gep.dst = getelementptr inbounds <8 x float>, ptr %dst, i64 %iv
72+
store <8 x float> %conv, ptr %gep.dst
73+
%iv.next = add i64 %iv, 1
74+
%ec = icmp eq i64 %iv.next, 1000
75+
; NOTE(review): the back-edge to %loop is taken only when %iv.next == 1000, so
; starting from %iv = 0 the body runs once before falling through to %exit.
; This matches the `b.eq` in the CHECK lines; confirm the polarity is intended.
br i1 %ec, label %loop, label %exit
76+
77+
exit:
78+
ret void
79+
}
80+
81+
; CHECK-LABEL: .LCPI1_0:
82+
; CHECK-NEXT: .byte 255
83+
; CHECK-NEXT: .byte 255
84+
; CHECK-NEXT: .byte 255
85+
; CHECK-NEXT: .byte 12
86+
; CHECK-NEXT: .byte 255
87+
; CHECK-NEXT: .byte 255
88+
; CHECK-NEXT: .byte 255
89+
; CHECK-NEXT: .byte 13
90+
; CHECK-NEXT: .byte 255
91+
; CHECK-NEXT: .byte 255
92+
; CHECK-NEXT: .byte 255
93+
; CHECK-NEXT: .byte 14
94+
; CHECK-NEXT: .byte 255
95+
; CHECK-NEXT: .byte 255
96+
; CHECK-NEXT: .byte 255
97+
; CHECK-NEXT: .byte 15
98+
; CHECK-NEXT: .LCPI1_1:
99+
; CHECK-NEXT: .byte 255
100+
; CHECK-NEXT: .byte 255
101+
; CHECK-NEXT: .byte 255
102+
; CHECK-NEXT: .byte 8
103+
; CHECK-NEXT: .byte 255
104+
; CHECK-NEXT: .byte 255
105+
; CHECK-NEXT: .byte 255
106+
; CHECK-NEXT: .byte 9
107+
; CHECK-NEXT: .byte 255
108+
; CHECK-NEXT: .byte 255
109+
; CHECK-NEXT: .byte 255
110+
; CHECK-NEXT: .byte 10
111+
; CHECK-NEXT: .byte 255
112+
; CHECK-NEXT: .byte 255
113+
; CHECK-NEXT: .byte 255
114+
; CHECK-NEXT: .byte 11
115+
; CHECK-NEXT: .LCPI1_2:
116+
; CHECK-NEXT: .byte 255
117+
; CHECK-NEXT: .byte 255
118+
; CHECK-NEXT: .byte 255
119+
; CHECK-NEXT: .byte 4
120+
; CHECK-NEXT: .byte 255
121+
; CHECK-NEXT: .byte 255
122+
; CHECK-NEXT: .byte 255
123+
; CHECK-NEXT: .byte 5
124+
; CHECK-NEXT: .byte 255
125+
; CHECK-NEXT: .byte 255
126+
; CHECK-NEXT: .byte 255
127+
; CHECK-NEXT: .byte 6
128+
; CHECK-NEXT: .byte 255
129+
; CHECK-NEXT: .byte 255
130+
; CHECK-NEXT: .byte 255
131+
; CHECK-NEXT: .byte 7
132+
; CHECK-NEXT: .LCPI1_3:
133+
; CHECK-NEXT: .byte 255
134+
; CHECK-NEXT: .byte 255
135+
; CHECK-NEXT: .byte 255
136+
; CHECK-NEXT: .byte 0
137+
; CHECK-NEXT: .byte 255
138+
; CHECK-NEXT: .byte 255
139+
; CHECK-NEXT: .byte 255
140+
; CHECK-NEXT: .byte 1
141+
; CHECK-NEXT: .byte 255
142+
; CHECK-NEXT: .byte 255
143+
; CHECK-NEXT: .byte 255
144+
; CHECK-NEXT: .byte 2
145+
; CHECK-NEXT: .byte 255
146+
; CHECK-NEXT: .byte 255
147+
; CHECK-NEXT: .byte 255
148+
; CHECK-NEXT: .byte 3
149+
150+
; Positive test: `sitofp <16 x i8> ... to <16 x float>` inside a loop. The
; CHECK lines expect four `tbl` instructions (one per constant-pool mask)
; followed by four fixed-point `scvtf ..., #24` conversions.
define void @sitofp_v16i8_to_v16f32(ptr %src, ptr %dst) {
151+
; CHECK-LABEL: sitofp_v16i8_to_v16f32:
152+
; CHECK: // %bb.0: // %entry
153+
; CHECK-NEXT: adrp x8, .LCPI1_0
154+
; CHECK-NEXT: adrp x9, .LCPI1_1
155+
; CHECK-NEXT: adrp x10, .LCPI1_2
156+
; CHECK-NEXT: ldr q0, [x8, :lo12:.LCPI1_0]
157+
; CHECK-NEXT: adrp x8, .LCPI1_3
158+
; CHECK-NEXT: ldr q1, [x9, :lo12:.LCPI1_1]
159+
; CHECK-NEXT: ldr q2, [x10, :lo12:.LCPI1_2]
160+
; CHECK-NEXT: ldr q3, [x8, :lo12:.LCPI1_3]
161+
; CHECK-NEXT: mov x8, xzr
162+
; CHECK-NEXT: .LBB1_1: // %loop
163+
; CHECK-NEXT: // =>This Inner Loop Header: Depth=1
164+
; CHECK-NEXT: ldr q4, [x0, x8, lsl #4]
165+
; CHECK-NEXT: add x9, x1, x8, lsl #6
166+
; CHECK-NEXT: add x8, x8, #1
167+
; CHECK-NEXT: cmp x8, #1000
168+
; CHECK-NEXT: tbl v5.16b, { v4.16b }, v0.16b
169+
; CHECK-NEXT: tbl v6.16b, { v4.16b }, v1.16b
170+
; CHECK-NEXT: tbl v7.16b, { v4.16b }, v2.16b
171+
; CHECK-NEXT: tbl v4.16b, { v4.16b }, v3.16b
172+
; CHECK-NEXT: scvtf v5.4s, v5.4s, #24
173+
; CHECK-NEXT: scvtf v6.4s, v6.4s, #24
174+
; CHECK-NEXT: scvtf v7.4s, v7.4s, #24
175+
; CHECK-NEXT: scvtf v4.4s, v4.4s, #24
176+
; CHECK-NEXT: stp q6, q5, [x9, #32]
177+
; CHECK-NEXT: stp q4, q7, [x9]
178+
; CHECK-NEXT: b.eq .LBB1_1
179+
; CHECK-NEXT: // %bb.2: // %exit
180+
; CHECK-NEXT: ret
181+
entry:
182+
br label %loop
183+
184+
loop:
185+
%iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
186+
%gep.src = getelementptr inbounds <16 x i8>, ptr %src, i64 %iv
187+
%l = load <16 x i8>, ptr %gep.src
188+
%conv = sitofp <16 x i8> %l to <16 x float>
189+
%gep.dst = getelementptr inbounds <16 x float>, ptr %dst, i64 %iv
190+
store <16 x float> %conv, ptr %gep.dst
191+
%iv.next = add i64 %iv, 1
192+
%ec = icmp eq i64 %iv.next, 1000
193+
; NOTE(review): the back-edge to %loop is taken only when %iv.next == 1000, so
; starting from %iv = 0 the body runs once before falling through to %exit.
; This matches the `b.eq` in the CHECK lines; confirm the polarity is intended.
br i1 %ec, label %loop, label %exit
194+
195+
exit:
196+
ret void
197+
}
198+
199+
200+
; Negative test: currently we don't convert to f16/bf16 via `tbl`.
201+
; `sitofp <8 x i8> ... to <8 x half>` inside a loop. The CHECK lines expect the
; regular widening sequence (`sshll`/`sshll2`, `scvtf`, `fcvtn`/`fcvtn2`) and no
; `tbl`.
define void @sitofp_v8i8_to_v8f16(ptr %src, ptr %dst) {
202+
; CHECK-LABEL: sitofp_v8i8_to_v8f16:
203+
; CHECK: // %bb.0: // %entry
204+
; CHECK-NEXT: mov x8, xzr
205+
; CHECK-NEXT: .LBB2_1: // %loop
206+
; CHECK-NEXT: // =>This Inner Loop Header: Depth=1
207+
; CHECK-NEXT: ldr d0, [x0, x8, lsl #3]
208+
; CHECK-NEXT: sshll v0.8h, v0.8b, #0
209+
; CHECK-NEXT: sshll v1.4s, v0.4h, #0
210+
; CHECK-NEXT: sshll2 v0.4s, v0.8h, #0
211+
; CHECK-NEXT: scvtf v1.4s, v1.4s
212+
; CHECK-NEXT: scvtf v0.4s, v0.4s
213+
; CHECK-NEXT: fcvtn v1.4h, v1.4s
214+
; CHECK-NEXT: fcvtn2 v1.8h, v0.4s
215+
; CHECK-NEXT: str q1, [x1, x8, lsl #4]
216+
; CHECK-NEXT: add x8, x8, #1
217+
; CHECK-NEXT: cmp x8, #1000
218+
; CHECK-NEXT: b.eq .LBB2_1
219+
; CHECK-NEXT: // %bb.2: // %exit
220+
; CHECK-NEXT: ret
221+
entry:
222+
br label %loop
223+
224+
loop:
225+
%iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
226+
%gep.src = getelementptr inbounds <8 x i8>, ptr %src, i64 %iv
227+
%l = load <8 x i8>, ptr %gep.src
228+
%conv = sitofp <8 x i8> %l to <8 x half>
229+
%gep.dst = getelementptr inbounds <8 x half>, ptr %dst, i64 %iv
230+
store <8 x half> %conv, ptr %gep.dst
231+
%iv.next = add i64 %iv, 1
232+
%ec = icmp eq i64 %iv.next, 1000
233+
; NOTE(review): the back-edge to %loop is taken only when %iv.next == 1000, so
; starting from %iv = 0 the body runs once before falling through to %exit.
; This matches the `b.eq` in the CHECK lines; confirm the polarity is intended.
br i1 %ec, label %loop, label %exit
234+
235+
exit:
236+
ret void
237+
}
238+
239+
240+
; Negative test: conversion to double with the help of `tbl` is not implemented yet (TODO).
241+
; `sitofp <2 x i8> ... to <2 x double>` inside a loop. The CHECK lines expect
; scalar sign-extending loads (`ldrsb`), a lane insert, `sshll` and a plain
; `scvtf v0.2d`, with no `tbl`.
define void @sitofp_v2i8_to_v2f64(ptr %src, ptr %dst) {
242+
; CHECK-LABEL: sitofp_v2i8_to_v2f64:
243+
; CHECK: // %bb.0: // %entry
244+
; CHECK-NEXT: mov x8, xzr
245+
; CHECK-NEXT: .LBB3_1: // %loop
246+
; CHECK-NEXT: // =>This Inner Loop Header: Depth=1
247+
; CHECK-NEXT: add x9, x0, x8, lsl #1
248+
; CHECK-NEXT: ldrsb w10, [x9]
249+
; CHECK-NEXT: ldrsb w9, [x9, #1]
250+
; CHECK-NEXT: fmov s0, w10
251+
; CHECK-NEXT: mov v0.s[1], w9
252+
; CHECK-NEXT: sshll v0.2d, v0.2s, #0
253+
; CHECK-NEXT: scvtf v0.2d, v0.2d
254+
; CHECK-NEXT: str q0, [x1, x8, lsl #4]
255+
; CHECK-NEXT: add x8, x8, #1
256+
; CHECK-NEXT: cmp x8, #1000
257+
; CHECK-NEXT: b.eq .LBB3_1
258+
; CHECK-NEXT: // %bb.2: // %exit
259+
; CHECK-NEXT: ret
260+
entry:
261+
br label %loop
262+
263+
loop:
264+
%iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
265+
%gep.src = getelementptr inbounds <2 x i8>, ptr %src, i64 %iv
266+
%l = load <2 x i8>, ptr %gep.src
267+
%conv = sitofp <2 x i8> %l to <2 x double>
268+
%gep.dst = getelementptr inbounds <2 x double>, ptr %dst, i64 %iv
269+
store <2 x double> %conv, ptr %gep.dst
270+
%iv.next = add i64 %iv, 1
271+
%ec = icmp eq i64 %iv.next, 1000
272+
; NOTE(review): the back-edge to %loop is taken only when %iv.next == 1000, so
; starting from %iv = 0 the body runs once before falling through to %exit.
; This matches the `b.eq` in the CHECK lines; confirm the polarity is intended.
br i1 %ec, label %loop, label %exit
273+
274+
exit:
275+
ret void
276+
}

0 commit comments

Comments
 (0)