-
Notifications
You must be signed in to change notification settings - Fork 13.5k
[SLP]Improve minbitwidth analysis for operands of IToFP and ICmp instructions. #85966
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
[SLP]Improve minbitwidth analysis for operands of IToFP and ICmp instructions. #85966
Conversation
Created using spr 1.3.5
@llvm/pr-subscribers-llvm-transforms Author: Alexey Bataev (alexey-bataev) Changesinstructions. Compiler can improve analysis for operands of UIToFP/SIToFP instructions Full diff: https://github.com/llvm/llvm-project/pull/85966.diff 3 Files Affected:
diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index a52064e5417b27..d47c395c012c7a 100644
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -1088,7 +1088,7 @@ class BoUpSLP {
MinBWs.clear();
ReductionBitWidth = 0;
CastMaxMinBWSizes.reset();
- TruncNodes.clear();
+ ExtraBitWidthNodes.clear();
InstrElementSize.clear();
UserIgnoreList = nullptr;
PostponedGathers.clear();
@@ -3662,7 +3662,7 @@ class BoUpSLP {
std::optional<std::pair<unsigned, unsigned>> CastMaxMinBWSizes;
/// Indices of the vectorized trunc nodes.
- DenseSet<unsigned> TruncNodes;
+ DenseSet<unsigned> ExtraBitWidthNodes;
};
} // end namespace slpvectorizer
@@ -6595,7 +6595,13 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
PrevMaxBW),
std::min<unsigned>(DL->getTypeSizeInBits(VL0->getType()),
PrevMinBW));
- TruncNodes.insert(VectorizableTree.size());
+ ExtraBitWidthNodes.insert(VectorizableTree.size() + 1);
+ } else if (ShuffleOrOp == Instruction::SIToFP ||
+ ShuffleOrOp == Instruction::UIToFP) {
+ if (ComputeNumSignBits(VL0->getOperand(0), *DL, 0, AC, nullptr, DT) *
+ 2 >=
+ DL->getTypeSizeInBits(VL0->getOperand(0)->getType()))
+ ExtraBitWidthNodes.insert(VectorizableTree.size() + 1);
}
TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
ReuseShuffleIndicies);
@@ -6643,6 +6649,16 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
TE->setOperand(1, Right);
buildTree_rec(Left, Depth + 1, {TE, 0});
buildTree_rec(Right, Depth + 1, {TE, 1});
+ if (ShuffleOrOp == Instruction::ICmp) {
+ if (ComputeNumSignBits(VL0->getOperand(0), *DL, 0, AC, nullptr, DT) *
+ 2 >=
+ DL->getTypeSizeInBits(VL0->getOperand(0)->getType()))
+ ExtraBitWidthNodes.insert(getOperandEntry(TE, 0)->Idx);
+ if (ComputeNumSignBits(VL0->getOperand(1), *DL, 0, AC, nullptr, DT) *
+ 2 >=
+ DL->getTypeSizeInBits(VL0->getOperand(1)->getType()))
+ ExtraBitWidthNodes.insert(getOperandEntry(TE, 1)->Idx);
+ }
return;
}
case Instruction::Select:
@@ -12468,12 +12484,12 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E, bool PostponedPHIs) {
if (UseIntrinsic && isVectorIntrinsicWithOverloadTypeAtArg(ID, -1))
TysForDecl.push_back(
FixedVectorType::get(CI->getType(), E->Scalars.size()));
+ auto *CEI = cast<CallInst>(VL0);
for (unsigned I : seq<unsigned>(0, CI->arg_size())) {
ValueList OpVL;
// Some intrinsics have scalar arguments. This argument should not be
// vectorized.
if (UseIntrinsic && isVectorIntrinsicWithScalarOpAtArg(ID, I)) {
- CallInst *CEI = cast<CallInst>(VL0);
ScalarArg = CEI->getArgOperand(I);
OpVecs.push_back(CEI->getArgOperand(I));
if (isVectorIntrinsicWithOverloadTypeAtArg(ID, I))
@@ -12486,6 +12502,13 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E, bool PostponedPHIs) {
LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
return E->VectorizedValue;
}
+ ScalarArg = CEI->getArgOperand(I);
+ if (cast<VectorType>(OpVec->getType())->getElementType() !=
+ ScalarArg->getType()) {
+ auto *CastTy = FixedVectorType::get(ScalarArg->getType(),
+ VecTy->getNumElements());
+ OpVec = Builder.CreateIntCast(OpVec, CastTy, GetOperandSignedness(I));
+ }
LLVM_DEBUG(dbgs() << "SLP: OpVec[" << I << "]: " << *OpVec << "\n");
OpVecs.push_back(OpVec);
if (UseIntrinsic && isVectorIntrinsicWithOverloadTypeAtArg(ID, I))
@@ -14213,7 +14236,7 @@ void BoUpSLP::computeMinimumValueSizes() {
bool IsStoreOrInsertElt =
VectorizableTree.front()->getOpcode() == Instruction::Store ||
VectorizableTree.front()->getOpcode() == Instruction::InsertElement;
- if ((IsStoreOrInsertElt || UserIgnoreList) && TruncNodes.size() <= 1 &&
+ if ((IsStoreOrInsertElt || UserIgnoreList) && ExtraBitWidthNodes.size() <= 1 &&
(!CastMaxMinBWSizes || CastMaxMinBWSizes->second == 0 ||
CastMaxMinBWSizes->first / CastMaxMinBWSizes->second <= 2))
return;
@@ -14398,11 +14421,11 @@ void BoUpSLP::computeMinimumValueSizes() {
IsTopRoot = false;
IsProfitableToDemoteRoot = true;
- if (TruncNodes.empty()) {
+ if (ExtraBitWidthNodes.empty()) {
NodeIdx = VectorizableTree.size();
} else {
- NodeIdx = *TruncNodes.begin() + 1;
- TruncNodes.erase(TruncNodes.begin());
+ NodeIdx = *ExtraBitWidthNodes.begin();
+ ExtraBitWidthNodes.erase(ExtraBitWidthNodes.begin());
IsTruncRoot = true;
}
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/minbitwidth-icmp-to-trunc.ll b/llvm/test/Transforms/SLPVectorizer/X86/minbitwidth-icmp-to-trunc.ll
index fc28d7ab4ee746..c640b1ed63cc03 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/minbitwidth-icmp-to-trunc.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/minbitwidth-icmp-to-trunc.ll
@@ -19,8 +19,8 @@ define i1 @test(ptr noalias %0, i64 %1, ptr noalias %p, ptr %p1) {
; CHECK-NEXT: [[TMP9:%.*]] = icmp eq <2 x i24> [[TMP8]], <i24 24, i24 24>
; CHECK-NEXT: [[TMP10:%.*]] = select <2 x i1> [[TMP9]], <2 x i24> <i24 23, i24 23>, <2 x i24> [[TMP8]]
; CHECK-NEXT: [[TMP23:%.*]] = trunc <2 x i24> [[TMP10]] to <2 x i8>
-; CHECK-NEXT: [[TMP11:%.*]] = zext <2 x i8> [[TMP23]] to <2 x i32>
-; CHECK-NEXT: [[TMP12:%.*]] = and <2 x i32> [[TMP11]], <i32 254, i32 254>
+; CHECK-NEXT: [[TMP26:%.*]] = and <2 x i8> [[TMP23]], <i8 -2, i8 -2>
+; CHECK-NEXT: [[TMP12:%.*]] = zext <2 x i8> [[TMP26]] to <2 x i32>
; CHECK-NEXT: [[TMP13:%.*]] = icmp eq <2 x i32> [[TMP12]], <i32 4, i32 4>
; CHECK-NEXT: [[TMP25:%.*]] = select <2 x i1> [[TMP13]], <2 x i8> <i8 2, i8 2>, <2 x i8> [[TMP23]]
; CHECK-NEXT: [[TMP14:%.*]] = zext <2 x i8> [[TMP25]] to <2 x i32>
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/minbitwidth-node-with-multi-users.ll b/llvm/test/Transforms/SLPVectorizer/X86/minbitwidth-node-with-multi-users.ll
index 136ab64007732f..668d3c3c8c82c5 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/minbitwidth-node-with-multi-users.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/minbitwidth-node-with-multi-users.ll
@@ -10,12 +10,14 @@ define void @test() {
; CHECK-NEXT: [[TMP3:%.*]] = select i1 false, i32 0, i32 0
; CHECK-NEXT: [[TMP4:%.*]] = insertelement <4 x i8> <i8 poison, i8 0, i8 poison, i8 poison>, i8 [[TMP1]], i32 0
; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <4 x i8> [[TMP4]], <4 x i8> poison, <4 x i32> <i32 0, i32 0, i32 0, i32 1>
-; CHECK-NEXT: [[TMP6:%.*]] = sext <4 x i8> [[TMP5]] to <4 x i32>
+; CHECK-NEXT: [[TMP15:%.*]] = trunc <4 x i8> [[TMP5]] to <4 x i1>
; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <4 x i8> [[TMP4]], <4 x i8> poison, <4 x i32> zeroinitializer
; CHECK-NEXT: [[TMP8:%.*]] = or <4 x i8> [[TMP7]], zeroinitializer
-; CHECK-NEXT: [[TMP9:%.*]] = sext <4 x i8> [[TMP8]] to <4 x i32>
-; CHECK-NEXT: [[TMP10:%.*]] = or <4 x i32> zeroinitializer, [[TMP6]]
-; CHECK-NEXT: [[TMP11:%.*]] = icmp eq <4 x i32> [[TMP9]], [[TMP10]]
+; CHECK-NEXT: [[TMP9:%.*]] = trunc <4 x i8> [[TMP8]] to <4 x i1>
+; CHECK-NEXT: [[TMP10:%.*]] = or <4 x i1> zeroinitializer, [[TMP15]]
+; CHECK-NEXT: [[TMP11:%.*]] = icmp eq <4 x i1> [[TMP9]], [[TMP10]]
+; CHECK-NEXT: [[TMP16:%.*]] = shufflevector <4 x i1> [[TMP15]], <4 x i1> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT: [[TMP6:%.*]] = zext <4 x i1> [[TMP16]] to <4 x i32>
; CHECK-NEXT: [[TMP12:%.*]] = shufflevector <4 x i32> [[TMP6]], <4 x i32> <i32 0, i32 0, i32 poison, i32 0>, <4 x i32> <i32 4, i32 5, i32 2, i32 7>
; CHECK-NEXT: [[TMP13:%.*]] = select <4 x i1> [[TMP11]], <4 x i32> [[TMP12]], <4 x i32> zeroinitializer
; CHECK-NEXT: [[TMP14:%.*]] = call i32 @llvm.vector.reduce.and.v4i32(<4 x i32> [[TMP13]])
|
✅ With the latest revision this PR passed the C/C++ code formatter. |
Created using spr 1.3.5
✅ With the latest revision this PR passed the Python code formatter. |
Ping! |
Created using spr 1.3.5
Ping! |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
LGTM with one minor
…ructions. Compiler can improve analysis for operands of UIToFP/SIToFP instructions and operands of ICmp instruction. Reviewers: RKSimon Reviewed By: RKSimon Pull Request: #85966
I've bisected a miscompile back to this patch.
For input %in1=0x3 and %in2=ffff the input function returns 0, but after slp-vectorizer with this patch is returns 2. |
Thanks for the reproducer, will double check |
Must be fixed in 102a811 |
Yep, thanks |
Compiler can improve analysis for operands of UIToFP/SIToFP instructions
and operands of ICmp instruction.