Skip to content

Commit 3b12ec6

Browse files
[AArch64] SLP can vectorize frem
When a vector library call is available for frem with the given scalar type and vector length, the SLP vectorizer now costs the vector frem as the cheaper of the regular arithmetic-instruction cost and the cost of a call, matching LoopVectorizer's functionality. This allows 'superword-level' vectorization of frem; the resulting vector frem can then be converted to a vector library call by later passes. Add tests that vectorize code containing 2 x double and 4 x float frem instructions.
1 parent 982d28b commit 3b12ec6

File tree

2 files changed

+22
-27
lines changed

2 files changed

+22
-27
lines changed

llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp

Lines changed: 14 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -8362,9 +8362,20 @@ BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef<Value *> VectorizedVals,
83628362
unsigned OpIdx = isa<UnaryOperator>(VL0) ? 0 : 1;
83638363
TTI::OperandValueInfo Op1Info = getOperandInfo(E->getOperand(0));
83648364
TTI::OperandValueInfo Op2Info = getOperandInfo(E->getOperand(OpIdx));
8365-
return TTI->getArithmeticInstrCost(ShuffleOrOp, VecTy, CostKind, Op1Info,
8366-
Op2Info) +
8367-
CommonCost;
8365+
auto VecCost = TTI->getArithmeticInstrCost(ShuffleOrOp, VecTy, CostKind,
8366+
Op1Info, Op2Info);
8367+
// Some targets can replace frem with vector library calls.
8368+
if (ShuffleOrOp == Instruction::FRem) {
8369+
LibFunc Func;
8370+
if (TLI->getLibFunc(ShuffleOrOp, ScalarTy, Func) &&
8371+
TLI->isFunctionVectorizable(TLI->getName(Func),
8372+
VecTy->getElementCount())) {
8373+
auto VecCallCost = TTI->getCallInstrCost(
8374+
nullptr, VecTy, {ScalarTy, ScalarTy}, CostKind);
8375+
VecCost = std::min(VecCost, VecCallCost);
8376+
}
8377+
}
8378+
return VecCost + CommonCost;
83688379
};
83698380
return GetCostDiff(GetScalarCost, GetVectorCost);
83708381
}

llvm/test/Transforms/SLPVectorizer/AArch64/slp-frem.ll

Lines changed: 8 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -6,14 +6,10 @@
66
define void @frem_v2double() {
77
; CHECK-LABEL: define void @frem_v2double() {
88
; CHECK-NEXT: entry:
9-
; CHECK-NEXT: [[A0:%.*]] = load double, ptr @a, align 8
10-
; CHECK-NEXT: [[A1:%.*]] = load double, ptr getelementptr inbounds (double, ptr @a, i64 1), align 8
11-
; CHECK-NEXT: [[B0:%.*]] = load double, ptr @a, align 8
12-
; CHECK-NEXT: [[B1:%.*]] = load double, ptr getelementptr inbounds (double, ptr @a, i64 1), align 8
13-
; CHECK-NEXT: [[R0:%.*]] = frem double [[A0]], [[B0]]
14-
; CHECK-NEXT: [[R1:%.*]] = frem double [[A1]], [[B1]]
15-
; CHECK-NEXT: store double [[R0]], ptr @a, align 8
16-
; CHECK-NEXT: store double [[R1]], ptr getelementptr inbounds (double, ptr @a, i64 1), align 8
9+
; CHECK-NEXT: [[TMP0:%.*]] = load <2 x double>, ptr @a, align 8
10+
; CHECK-NEXT: [[TMP1:%.*]] = load <2 x double>, ptr @a, align 8
11+
; CHECK-NEXT: [[TMP2:%.*]] = frem <2 x double> [[TMP0]], [[TMP1]]
12+
; CHECK-NEXT: store <2 x double> [[TMP2]], ptr @a, align 8
1713
; CHECK-NEXT: ret void
1814
;
1915
entry:
@@ -31,22 +27,10 @@ entry:
3127
define void @frem_v4float() {
3228
; CHECK-LABEL: define void @frem_v4float() {
3329
; CHECK-NEXT: entry:
34-
; CHECK-NEXT: [[A0:%.*]] = load float, ptr @a, align 8
35-
; CHECK-NEXT: [[A1:%.*]] = load float, ptr getelementptr inbounds (float, ptr @a, i64 1), align 8
36-
; CHECK-NEXT: [[A2:%.*]] = load float, ptr getelementptr inbounds (float, ptr @a, i64 2), align 8
37-
; CHECK-NEXT: [[A3:%.*]] = load float, ptr getelementptr inbounds (float, ptr @a, i64 3), align 8
38-
; CHECK-NEXT: [[B0:%.*]] = load float, ptr @a, align 8
39-
; CHECK-NEXT: [[B1:%.*]] = load float, ptr getelementptr inbounds (float, ptr @a, i64 1), align 8
40-
; CHECK-NEXT: [[B2:%.*]] = load float, ptr getelementptr inbounds (float, ptr @a, i64 2), align 8
41-
; CHECK-NEXT: [[B3:%.*]] = load float, ptr getelementptr inbounds (float, ptr @a, i64 3), align 8
42-
; CHECK-NEXT: [[R0:%.*]] = frem float [[A0]], [[B0]]
43-
; CHECK-NEXT: [[R1:%.*]] = frem float [[A1]], [[B1]]
44-
; CHECK-NEXT: [[R2:%.*]] = frem float [[A2]], [[B2]]
45-
; CHECK-NEXT: [[R3:%.*]] = frem float [[A3]], [[B3]]
46-
; CHECK-NEXT: store float [[R0]], ptr @a, align 8
47-
; CHECK-NEXT: store float [[R1]], ptr getelementptr inbounds (float, ptr @a, i64 1), align 8
48-
; CHECK-NEXT: store float [[R2]], ptr getelementptr inbounds (float, ptr @a, i64 2), align 8
49-
; CHECK-NEXT: store float [[R3]], ptr getelementptr inbounds (float, ptr @a, i64 3), align 8
30+
; CHECK-NEXT: [[TMP0:%.*]] = load <4 x float>, ptr @a, align 8
31+
; CHECK-NEXT: [[TMP1:%.*]] = load <4 x float>, ptr @a, align 8
32+
; CHECK-NEXT: [[TMP2:%.*]] = frem <4 x float> [[TMP0]], [[TMP1]]
33+
; CHECK-NEXT: store <4 x float> [[TMP2]], ptr @a, align 8
5034
; CHECK-NEXT: ret void
5135
;
5236
entry:

0 commit comments

Comments
 (0)