[SLP]Improve minbitwidth analysis for operands of IToFP and ICmp instructions. #85966

alexey-bataev · 2024-03-20T17:06:51Z

The compiler can improve the minbitwidth analysis for operands of UIToFP/SIToFP
instructions and for operands of ICmp instructions.

Created using spr 1.3.5

llvmbot · 2024-03-20T17:07:22Z

@llvm/pr-subscribers-llvm-transforms

Author: Alexey Bataev (alexey-bataev)

Changes

instructions.

The compiler can improve the minbitwidth analysis for operands of UIToFP/SIToFP
instructions and for operands of ICmp instructions.

Full diff: https://github.com/llvm/llvm-project/pull/85966.diff

3 Files Affected:

(modified) llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp (+31-8)
(modified) llvm/test/Transforms/SLPVectorizer/X86/minbitwidth-icmp-to-trunc.ll (+2-2)
(modified) llvm/test/Transforms/SLPVectorizer/X86/minbitwidth-node-with-multi-users.ll (+6-4)

diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index a52064e5417b27..d47c395c012c7a 100644
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -1088,7 +1088,7 @@ class BoUpSLP {
     MinBWs.clear();
     ReductionBitWidth = 0;
     CastMaxMinBWSizes.reset();
-    TruncNodes.clear();
+    ExtraBitWidthNodes.clear();
     InstrElementSize.clear();
     UserIgnoreList = nullptr;
     PostponedGathers.clear();
@@ -3662,7 +3662,7 @@ class BoUpSLP {
   std::optional<std::pair<unsigned, unsigned>> CastMaxMinBWSizes;
 
   /// Indices of the vectorized trunc nodes.
-  DenseSet<unsigned> TruncNodes;
+  DenseSet<unsigned> ExtraBitWidthNodes;
 };
 
 } // end namespace slpvectorizer
@@ -6595,7 +6595,13 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
                 PrevMaxBW),
             std::min<unsigned>(DL->getTypeSizeInBits(VL0->getType()),
                                PrevMinBW));
-        TruncNodes.insert(VectorizableTree.size());
+        ExtraBitWidthNodes.insert(VectorizableTree.size() + 1);
+      } else if (ShuffleOrOp == Instruction::SIToFP ||
+                 ShuffleOrOp == Instruction::UIToFP) {
+        if (ComputeNumSignBits(VL0->getOperand(0), *DL, 0, AC, nullptr, DT) *
+                2 >=
+            DL->getTypeSizeInBits(VL0->getOperand(0)->getType()))
+          ExtraBitWidthNodes.insert(VectorizableTree.size() + 1);
       }
       TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
                                    ReuseShuffleIndicies);
@@ -6643,6 +6649,16 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
       TE->setOperand(1, Right);
       buildTree_rec(Left, Depth + 1, {TE, 0});
       buildTree_rec(Right, Depth + 1, {TE, 1});
+      if (ShuffleOrOp == Instruction::ICmp) {
+        if (ComputeNumSignBits(VL0->getOperand(0), *DL, 0, AC, nullptr, DT) *
+                2 >=
+            DL->getTypeSizeInBits(VL0->getOperand(0)->getType()))
+          ExtraBitWidthNodes.insert(getOperandEntry(TE, 0)->Idx);
+        if (ComputeNumSignBits(VL0->getOperand(1), *DL, 0, AC, nullptr, DT) *
+                2 >=
+            DL->getTypeSizeInBits(VL0->getOperand(1)->getType()))
+          ExtraBitWidthNodes.insert(getOperandEntry(TE, 1)->Idx);
+      }
       return;
     }
     case Instruction::Select:
@@ -12468,12 +12484,12 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E, bool PostponedPHIs) {
       if (UseIntrinsic && isVectorIntrinsicWithOverloadTypeAtArg(ID, -1))
         TysForDecl.push_back(
             FixedVectorType::get(CI->getType(), E->Scalars.size()));
+      auto *CEI = cast<CallInst>(VL0);
       for (unsigned I : seq<unsigned>(0, CI->arg_size())) {
         ValueList OpVL;
         // Some intrinsics have scalar arguments. This argument should not be
         // vectorized.
         if (UseIntrinsic && isVectorIntrinsicWithScalarOpAtArg(ID, I)) {
-          CallInst *CEI = cast<CallInst>(VL0);
           ScalarArg = CEI->getArgOperand(I);
           OpVecs.push_back(CEI->getArgOperand(I));
           if (isVectorIntrinsicWithOverloadTypeAtArg(ID, I))
@@ -12486,6 +12502,13 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E, bool PostponedPHIs) {
           LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
           return E->VectorizedValue;
         }
+        ScalarArg = CEI->getArgOperand(I);
+        if (cast<VectorType>(OpVec->getType())->getElementType() !=
+            ScalarArg->getType()) {
+          auto *CastTy = FixedVectorType::get(ScalarArg->getType(),
+                                              VecTy->getNumElements());
+          OpVec = Builder.CreateIntCast(OpVec, CastTy, GetOperandSignedness(I));
+        }
         LLVM_DEBUG(dbgs() << "SLP: OpVec[" << I << "]: " << *OpVec << "\n");
         OpVecs.push_back(OpVec);
         if (UseIntrinsic && isVectorIntrinsicWithOverloadTypeAtArg(ID, I))
@@ -14213,7 +14236,7 @@ void BoUpSLP::computeMinimumValueSizes() {
   bool IsStoreOrInsertElt =
       VectorizableTree.front()->getOpcode() == Instruction::Store ||
       VectorizableTree.front()->getOpcode() == Instruction::InsertElement;
-  if ((IsStoreOrInsertElt || UserIgnoreList) && TruncNodes.size() <= 1 &&
+  if ((IsStoreOrInsertElt || UserIgnoreList) && ExtraBitWidthNodes.size() <= 1 &&
       (!CastMaxMinBWSizes || CastMaxMinBWSizes->second == 0 ||
        CastMaxMinBWSizes->first / CastMaxMinBWSizes->second <= 2))
     return;
@@ -14398,11 +14421,11 @@ void BoUpSLP::computeMinimumValueSizes() {
     IsTopRoot = false;
     IsProfitableToDemoteRoot = true;
 
-    if (TruncNodes.empty()) {
+    if (ExtraBitWidthNodes.empty()) {
       NodeIdx = VectorizableTree.size();
     } else {
-      NodeIdx = *TruncNodes.begin() + 1;
-      TruncNodes.erase(TruncNodes.begin());
+      NodeIdx = *ExtraBitWidthNodes.begin();
+      ExtraBitWidthNodes.erase(ExtraBitWidthNodes.begin());
       IsTruncRoot = true;
     }
 
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/minbitwidth-icmp-to-trunc.ll b/llvm/test/Transforms/SLPVectorizer/X86/minbitwidth-icmp-to-trunc.ll
index fc28d7ab4ee746..c640b1ed63cc03 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/minbitwidth-icmp-to-trunc.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/minbitwidth-icmp-to-trunc.ll
@@ -19,8 +19,8 @@ define i1 @test(ptr noalias %0, i64 %1, ptr noalias %p, ptr %p1) {
 ; CHECK-NEXT:    [[TMP9:%.*]] = icmp eq <2 x i24> [[TMP8]], <i24 24, i24 24>
 ; CHECK-NEXT:    [[TMP10:%.*]] = select <2 x i1> [[TMP9]], <2 x i24> <i24 23, i24 23>, <2 x i24> [[TMP8]]
 ; CHECK-NEXT:    [[TMP23:%.*]] = trunc <2 x i24> [[TMP10]] to <2 x i8>
-; CHECK-NEXT:    [[TMP11:%.*]] = zext <2 x i8> [[TMP23]] to <2 x i32>
-; CHECK-NEXT:    [[TMP12:%.*]] = and <2 x i32> [[TMP11]], <i32 254, i32 254>
+; CHECK-NEXT:    [[TMP26:%.*]] = and <2 x i8> [[TMP23]], <i8 -2, i8 -2>
+; CHECK-NEXT:    [[TMP12:%.*]] = zext <2 x i8> [[TMP26]] to <2 x i32>
 ; CHECK-NEXT:    [[TMP13:%.*]] = icmp eq <2 x i32> [[TMP12]], <i32 4, i32 4>
 ; CHECK-NEXT:    [[TMP25:%.*]] = select <2 x i1> [[TMP13]], <2 x i8> <i8 2, i8 2>, <2 x i8> [[TMP23]]
 ; CHECK-NEXT:    [[TMP14:%.*]] = zext <2 x i8> [[TMP25]] to <2 x i32>
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/minbitwidth-node-with-multi-users.ll b/llvm/test/Transforms/SLPVectorizer/X86/minbitwidth-node-with-multi-users.ll
index 136ab64007732f..668d3c3c8c82c5 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/minbitwidth-node-with-multi-users.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/minbitwidth-node-with-multi-users.ll
@@ -10,12 +10,14 @@ define void @test() {
 ; CHECK-NEXT:    [[TMP3:%.*]] = select i1 false, i32 0, i32 0
 ; CHECK-NEXT:    [[TMP4:%.*]] = insertelement <4 x i8> <i8 poison, i8 0, i8 poison, i8 poison>, i8 [[TMP1]], i32 0
 ; CHECK-NEXT:    [[TMP5:%.*]] = shufflevector <4 x i8> [[TMP4]], <4 x i8> poison, <4 x i32> <i32 0, i32 0, i32 0, i32 1>
-; CHECK-NEXT:    [[TMP6:%.*]] = sext <4 x i8> [[TMP5]] to <4 x i32>
+; CHECK-NEXT:    [[TMP15:%.*]] = trunc <4 x i8> [[TMP5]] to <4 x i1>
 ; CHECK-NEXT:    [[TMP7:%.*]] = shufflevector <4 x i8> [[TMP4]], <4 x i8> poison, <4 x i32> zeroinitializer
 ; CHECK-NEXT:    [[TMP8:%.*]] = or <4 x i8> [[TMP7]], zeroinitializer
-; CHECK-NEXT:    [[TMP9:%.*]] = sext <4 x i8> [[TMP8]] to <4 x i32>
-; CHECK-NEXT:    [[TMP10:%.*]] = or <4 x i32> zeroinitializer, [[TMP6]]
-; CHECK-NEXT:    [[TMP11:%.*]] = icmp eq <4 x i32> [[TMP9]], [[TMP10]]
+; CHECK-NEXT:    [[TMP9:%.*]] = trunc <4 x i8> [[TMP8]] to <4 x i1>
+; CHECK-NEXT:    [[TMP10:%.*]] = or <4 x i1> zeroinitializer, [[TMP15]]
+; CHECK-NEXT:    [[TMP11:%.*]] = icmp eq <4 x i1> [[TMP9]], [[TMP10]]
+; CHECK-NEXT:    [[TMP16:%.*]] = shufflevector <4 x i1> [[TMP15]], <4 x i1> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT:    [[TMP6:%.*]] = zext <4 x i1> [[TMP16]] to <4 x i32>
 ; CHECK-NEXT:    [[TMP12:%.*]] = shufflevector <4 x i32> [[TMP6]], <4 x i32> <i32 0, i32 0, i32 poison, i32 0>, <4 x i32> <i32 4, i32 5, i32 2, i32 7>
 ; CHECK-NEXT:    [[TMP13:%.*]] = select <4 x i1> [[TMP11]], <4 x i32> [[TMP12]], <4 x i32> zeroinitializer
 ; CHECK-NEXT:    [[TMP14:%.*]] = call i32 @llvm.vector.reduce.and.v4i32(<4 x i32> [[TMP13]])

github-actions · 2024-03-20T17:10:17Z

✅ With the latest revision this PR passed the C/C++ code formatter.

Created using spr 1.3.5

github-actions · 2024-03-22T19:33:16Z

✅ With the latest revision this PR passed the Python code formatter.

Created using spr 1.3.5

alexey-bataev · 2024-03-27T13:40:04Z

Ping!

llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp

Created using spr 1.3.5

alexey-bataev · 2024-04-01T18:42:05Z

Ping!

RKSimon

LGTM with one minor

llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp

Created using spr 1.3.5

…ructions. Compiler can improve analysis for operands of UIToFP/SIToFP instructions and operands of ICmp instruction. Reviewers: RKSimon Reviewed By: RKSimon Pull Request: #85966

mikaelholmen · 2024-04-22T14:07:20Z

Hi @alexey-bataev

I've bisected a miscompile back to this patch.
Reproduce with

opt -passes=slp-vectorizer bbi-94784.ll -S -o - -mtriple=aarch64 -slp-threshold=-10 -slp-vectorize-hor=0

bbi-94784.ll.gz

For input %in1=0x3 and %in2=0xffff the input function returns 0, but after slp-vectorizer with this patch it returns 2.
I think the problem is that after vectorization the "mul" operates on i16 values, but it should really operate on i64; otherwise the compares with 196605 will go wrong.

alexey-bataev · 2024-04-22T14:10:22Z

Hi @alexey-bataev

I've bisected a miscompile back to this patch. Reproduce with
opt -passes=slp-vectorizer bbi-94784.ll -S -o - -mtriple=aarch64 -slp-threshold=-10 -slp-vectorize-hor=0
bbi-94784.ll.gz

For input %in1=0x3 and %in2=0xffff the input function returns 0, but after slp-vectorizer with this patch it returns 2. I think the problem is that after vectorization the "mul" operates on i16 values, but it should really operate on i64; otherwise the compares with 196605 will go wrong.

Thanks for the reproducer, will double check

alexey-bataev · 2024-04-22T15:27:43Z

-mtriple=aarch64 -slp-threshold=-10 -slp-vectorize-hor=0

Must be fixed in 102a811

mikaelholmen · 2024-04-23T05:32:19Z

-mtriple=aarch64 -slp-threshold=-10 -slp-vectorize-hor=0

Must be fixed in 102a811

Yep, thanks

[𝘀𝗽𝗿] initial version

cafdc5a

Created using spr 1.3.5

llvmbot added vectorizers llvm:transforms labels Mar 20, 2024

alexey-bataev requested a review from RKSimon March 20, 2024 17:07

Rebase, fix formatting

783569c

Created using spr 1.3.5

alexey-bataev changed the title ~~[SLP]Improve minbitwidth analysis for operands of IToFP and ICmp~~ [SLP]Improve minbitwidth analysis for operands of IToFP and ICmp instructions. Mar 21, 2024

Rebase

c913fff

Created using spr 1.3.5

Rebase

6abbd75

Created using spr 1.3.5

RKSimon reviewed Mar 27, 2024

View reviewed changes

llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp Outdated Show resolved Hide resolved

alexey-bataev added 2 commits March 28, 2024 14:59

Rebase, address comments

7bc23cf

Created using spr 1.3.5

Rebase

18cda8d

Created using spr 1.3.5

RKSimon approved these changes Apr 3, 2024

View reviewed changes

llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp Outdated Show resolved Hide resolved

Rebase

960071b

Created using spr 1.3.5

alexey-bataev merged commit 899855d into main Apr 3, 2024

alexey-bataev deleted the users/alexey-bataev/spr/slpimprove-minbitwidth-analysis-for-operands-of-itofp-and-icmp branch April 3, 2024 19:58

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

[SLP]Improve minbitwidth analysis for operands of IToFP and ICmp instructions. #85966

[SLP]Improve minbitwidth analysis for operands of IToFP and ICmp instructions. #85966

alexey-bataev commented Mar 20, 2024 •

edited

Loading

llvmbot commented Mar 20, 2024

github-actions bot commented Mar 20, 2024 •

edited

Loading

github-actions bot commented Mar 22, 2024

alexey-bataev commented Mar 27, 2024

alexey-bataev commented Apr 1, 2024

RKSimon left a comment

mikaelholmen commented Apr 22, 2024

alexey-bataev commented Apr 22, 2024

alexey-bataev commented Apr 22, 2024

mikaelholmen commented Apr 23, 2024

[SLP]Improve minbitwidth analysis for operands of IToFP and ICmp instructions. #85966

[SLP]Improve minbitwidth analysis for operands of IToFP and ICmp instructions. #85966

Conversation

alexey-bataev commented Mar 20, 2024 • edited Loading

llvmbot commented Mar 20, 2024

github-actions bot commented Mar 20, 2024 • edited Loading

github-actions bot commented Mar 22, 2024

alexey-bataev commented Mar 27, 2024

alexey-bataev commented Apr 1, 2024

RKSimon left a comment

Choose a reason for hiding this comment

mikaelholmen commented Apr 22, 2024

alexey-bataev commented Apr 22, 2024

alexey-bataev commented Apr 22, 2024

mikaelholmen commented Apr 23, 2024

alexey-bataev commented Mar 20, 2024 •

edited

Loading

github-actions bot commented Mar 20, 2024 •

edited

Loading