Skip to content

Commit 899855d

Browse files
[SLP]Improve minbitwidth analysis for operands of IToFP and ICmp instructions.
The compiler can improve the minbitwidth analysis for the operands of UIToFP/SIToFP instructions and for the operands of ICmp instructions. Reviewers: RKSimon. Reviewed By: RKSimon. Pull Request: #85966
1 parent 607b4bc commit 899855d

File tree

3 files changed

+48
-16
lines changed

3 files changed

+48
-16
lines changed

llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp

Lines changed: 40 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -1107,7 +1107,7 @@ class BoUpSLP {
11071107
MinBWs.clear();
11081108
ReductionBitWidth = 0;
11091109
CastMaxMinBWSizes.reset();
1110-
TruncNodes.clear();
1110+
ExtraBitWidthNodes.clear();
11111111
InstrElementSize.clear();
11121112
UserIgnoreList = nullptr;
11131113
PostponedGathers.clear();
@@ -3683,8 +3683,9 @@ class BoUpSLP {
36833683
/// type sizes, used in the tree.
36843684
std::optional<std::pair<unsigned, unsigned>> CastMaxMinBWSizes;
36853685

3686-
/// Indices of the vectorized trunc nodes.
3687-
DenseSet<unsigned> TruncNodes;
3686+
/// Indices of the vectorized nodes, which are supposed to be the roots of the new
3687+
/// bitwidth analysis attempt, like trunc, IToFP or ICmp.
3688+
DenseSet<unsigned> ExtraBitWidthNodes;
36883689
};
36893690

36903691
} // end namespace slpvectorizer
@@ -6612,7 +6613,18 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
66126613
PrevMaxBW),
66136614
std::min<unsigned>(DL->getTypeSizeInBits(VL0->getType()),
66146615
PrevMinBW));
6615-
TruncNodes.insert(VectorizableTree.size());
6616+
ExtraBitWidthNodes.insert(VectorizableTree.size() + 1);
6617+
} else if (ShuffleOrOp == Instruction::SIToFP ||
6618+
ShuffleOrOp == Instruction::UIToFP) {
6619+
unsigned NumSignBits =
6620+
ComputeNumSignBits(VL0->getOperand(0), *DL, 0, AC, nullptr, DT);
6621+
if (auto *OpI = dyn_cast<Instruction>(VL0->getOperand(0))) {
6622+
APInt Mask = DB->getDemandedBits(OpI);
6623+
NumSignBits = std::max(NumSignBits, Mask.countl_zero());
6624+
}
6625+
if (NumSignBits * 2 >=
6626+
DL->getTypeSizeInBits(VL0->getOperand(0)->getType()))
6627+
ExtraBitWidthNodes.insert(VectorizableTree.size() + 1);
66166628
}
66176629
TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
66186630
ReuseShuffleIndicies);
@@ -6660,6 +6672,18 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
66606672
TE->setOperand(1, Right);
66616673
buildTree_rec(Left, Depth + 1, {TE, 0});
66626674
buildTree_rec(Right, Depth + 1, {TE, 1});
6675+
if (ShuffleOrOp == Instruction::ICmp) {
6676+
unsigned NumSignBits0 =
6677+
ComputeNumSignBits(VL0->getOperand(0), *DL, 0, AC, nullptr, DT);
6678+
if (NumSignBits0 * 2 >=
6679+
DL->getTypeSizeInBits(VL0->getOperand(0)->getType()))
6680+
ExtraBitWidthNodes.insert(getOperandEntry(TE, 0)->Idx);
6681+
unsigned NumSignBits1 =
6682+
ComputeNumSignBits(VL0->getOperand(1), *DL, 0, AC, nullptr, DT);
6683+
if (NumSignBits1 * 2 >=
6684+
DL->getTypeSizeInBits(VL0->getOperand(1)->getType()))
6685+
ExtraBitWidthNodes.insert(getOperandEntry(TE, 1)->Idx);
6686+
}
66636687
return;
66646688
}
66656689
case Instruction::Select:
@@ -14302,7 +14326,8 @@ void BoUpSLP::computeMinimumValueSizes() {
1430214326
bool IsStoreOrInsertElt =
1430314327
VectorizableTree.front()->getOpcode() == Instruction::Store ||
1430414328
VectorizableTree.front()->getOpcode() == Instruction::InsertElement;
14305-
if ((IsStoreOrInsertElt || UserIgnoreList) && TruncNodes.size() <= 1 &&
14329+
if ((IsStoreOrInsertElt || UserIgnoreList) &&
14330+
ExtraBitWidthNodes.size() <= 1 &&
1430614331
(!CastMaxMinBWSizes || CastMaxMinBWSizes->second == 0 ||
1430714332
CastMaxMinBWSizes->first / CastMaxMinBWSizes->second <= 2))
1430814333
return;
@@ -14506,16 +14531,21 @@ void BoUpSLP::computeMinimumValueSizes() {
1450614531
IsTopRoot = false;
1450714532
IsProfitableToDemoteRoot = true;
1450814533

14509-
if (TruncNodes.empty()) {
14534+
if (ExtraBitWidthNodes.empty()) {
1451014535
NodeIdx = VectorizableTree.size();
1451114536
} else {
1451214537
unsigned NewIdx = 0;
1451314538
do {
14514-
NewIdx = *TruncNodes.begin() + 1;
14515-
TruncNodes.erase(TruncNodes.begin());
14516-
} while (NewIdx <= NodeIdx && !TruncNodes.empty());
14539+
NewIdx = *ExtraBitWidthNodes.begin();
14540+
ExtraBitWidthNodes.erase(ExtraBitWidthNodes.begin());
14541+
} while (NewIdx <= NodeIdx && !ExtraBitWidthNodes.empty());
1451714542
NodeIdx = NewIdx;
14518-
IsTruncRoot = true;
14543+
IsTruncRoot = any_of(
14544+
VectorizableTree[NewIdx]->UserTreeIndices, [](const EdgeInfo &EI) {
14545+
return EI.EdgeIdx == 0 &&
14546+
EI.UserTE->getOpcode() == Instruction::ICmp &&
14547+
!EI.UserTE->isAltShuffle();
14548+
});
1451914549
}
1452014550

1452114551
// If the maximum bit width we compute is less than the width of the roots'

llvm/test/Transforms/SLPVectorizer/X86/minbitwidth-icmp-to-trunc.ll

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -19,8 +19,8 @@ define i1 @test(ptr noalias %0, i64 %1, ptr noalias %p, ptr %p1) {
1919
; CHECK-NEXT: [[TMP9:%.*]] = icmp eq <2 x i24> [[TMP8]], <i24 24, i24 24>
2020
; CHECK-NEXT: [[TMP10:%.*]] = select <2 x i1> [[TMP9]], <2 x i24> <i24 23, i24 23>, <2 x i24> [[TMP8]]
2121
; CHECK-NEXT: [[TMP23:%.*]] = trunc <2 x i24> [[TMP10]] to <2 x i8>
22-
; CHECK-NEXT: [[TMP11:%.*]] = zext <2 x i8> [[TMP23]] to <2 x i32>
23-
; CHECK-NEXT: [[TMP12:%.*]] = and <2 x i32> [[TMP11]], <i32 254, i32 254>
22+
; CHECK-NEXT: [[TMP26:%.*]] = zext <2 x i8> [[TMP23]] to <2 x i32>
23+
; CHECK-NEXT: [[TMP12:%.*]] = and <2 x i32> [[TMP26]], <i32 254, i32 254>
2424
; CHECK-NEXT: [[TMP13:%.*]] = icmp eq <2 x i32> [[TMP12]], <i32 4, i32 4>
2525
; CHECK-NEXT: [[TMP25:%.*]] = select <2 x i1> [[TMP13]], <2 x i8> <i8 2, i8 2>, <2 x i8> [[TMP23]]
2626
; CHECK-NEXT: [[TMP14:%.*]] = zext <2 x i8> [[TMP25]] to <2 x i32>

llvm/test/Transforms/SLPVectorizer/X86/minbitwidth-node-with-multi-users.ll

Lines changed: 6 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -10,12 +10,14 @@ define void @test() {
1010
; CHECK-NEXT: [[TMP3:%.*]] = select i1 false, i32 0, i32 0
1111
; CHECK-NEXT: [[TMP4:%.*]] = insertelement <4 x i8> <i8 poison, i8 0, i8 poison, i8 poison>, i8 [[TMP1]], i32 0
1212
; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <4 x i8> [[TMP4]], <4 x i8> poison, <4 x i32> <i32 0, i32 0, i32 0, i32 1>
13-
; CHECK-NEXT: [[TMP6:%.*]] = sext <4 x i8> [[TMP5]] to <4 x i32>
13+
; CHECK-NEXT: [[TMP15:%.*]] = trunc <4 x i8> [[TMP5]] to <4 x i1>
1414
; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <4 x i8> [[TMP4]], <4 x i8> poison, <4 x i32> zeroinitializer
1515
; CHECK-NEXT: [[TMP8:%.*]] = or <4 x i8> [[TMP7]], zeroinitializer
16-
; CHECK-NEXT: [[TMP9:%.*]] = sext <4 x i8> [[TMP8]] to <4 x i32>
17-
; CHECK-NEXT: [[TMP10:%.*]] = or <4 x i32> zeroinitializer, [[TMP6]]
18-
; CHECK-NEXT: [[TMP11:%.*]] = icmp eq <4 x i32> [[TMP9]], [[TMP10]]
16+
; CHECK-NEXT: [[TMP9:%.*]] = trunc <4 x i8> [[TMP8]] to <4 x i1>
17+
; CHECK-NEXT: [[TMP10:%.*]] = or <4 x i1> zeroinitializer, [[TMP15]]
18+
; CHECK-NEXT: [[TMP11:%.*]] = icmp eq <4 x i1> [[TMP9]], [[TMP10]]
19+
; CHECK-NEXT: [[TMP16:%.*]] = shufflevector <4 x i1> [[TMP15]], <4 x i1> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
20+
; CHECK-NEXT: [[TMP6:%.*]] = zext <4 x i1> [[TMP16]] to <4 x i32>
1921
; CHECK-NEXT: [[TMP12:%.*]] = shufflevector <4 x i32> [[TMP6]], <4 x i32> <i32 0, i32 0, i32 poison, i32 0>, <4 x i32> <i32 4, i32 5, i32 2, i32 7>
2022
; CHECK-NEXT: [[TMP13:%.*]] = select <4 x i1> [[TMP11]], <4 x i32> [[TMP12]], <4 x i32> zeroinitializer
2123
; CHECK-NEXT: [[TMP14:%.*]] = call i32 @llvm.vector.reduce.and.v4i32(<4 x i32> [[TMP13]])

0 commit comments

Comments
 (0)