[SLP]Fix a crash for reduced values with minbitwidth, which are reused.

alexey-bataev · alexey-bataev · commit 39b2104b4a4e · 2024-01-12T04:49:48.000-08:00
If the reduced values are additionally affected by minbitwidth analysis,
need to cast them to a proper type before doing any math, if they are
reused.
diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -15214,6 +15214,19 @@ class HorizontalReduction {
     assert(IsSupportedHorRdxIdentityOp &&
            "The optimization of matched scalar identity horizontal reductions "
            "must be supported.");
+    auto *VTy = cast<FixedVectorType>(VectorizedValue->getType());
+    if (VTy->getElementType() != VL.front()->getType()) {
+      VectorizedValue = Builder.CreateIntCast(
+          VectorizedValue,
+          FixedVectorType::get(VL.front()->getType(), VTy->getNumElements()),
+          any_of(VL, [&](Value *R) {
+            KnownBits Known = computeKnownBits(
+                R, cast<Instruction>(ReductionOps.front().front())
+                       ->getModule()
+                       ->getDataLayout());
+            return !Known.isNonNegative();
+          }));
+    }
     switch (RdxKind) {
     case RecurKind::Add: {
       // root = mul prev_root, <1, 1, n, 1>
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/reused-reductions-with-minbitwidth.ll b/llvm/test/Transforms/SLPVectorizer/X86/reused-reductions-with-minbitwidth.ll
@@ -0,0 +1,30 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4
+; RUN: opt -passes=slp-vectorizer -S -mtriple=x86_64-unknown-linux -mcpu=skylake < %s | FileCheck %s
+
+define i1 @test(i1 %cmp5.not.31) {
+; CHECK-LABEL: define i1 @test(
+; CHECK-SAME: i1 [[CMP5_NOT_31:%.*]]) #[[ATTR0:[0-9]+]] {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TMP0:%.*]] = insertelement <4 x i1> <i1 poison, i1 false, i1 false, i1 false>, i1 [[CMP5_NOT_31]], i32 0
+; CHECK-NEXT:    [[TMP1:%.*]] = select <4 x i1> [[TMP0]], <4 x i32> zeroinitializer, <4 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP2:%.*]] = trunc <4 x i32> [[TMP1]] to <4 x i1>
+; CHECK-NEXT:    [[TMP3:%.*]] = zext <4 x i1> [[TMP2]] to <4 x i32>
+; CHECK-NEXT:    [[TMP4:%.*]] = mul <4 x i32> [[TMP3]], <i32 2, i32 1, i32 1, i32 1>
+; CHECK-NEXT:    [[TMP5:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP4]])
+; CHECK-NEXT:    [[TMP6:%.*]] = and i32 [[TMP5]], 0
+; CHECK-NEXT:    [[CMP_NOT_I_I:%.*]] = icmp eq i32 [[TMP6]], 0
+; CHECK-NEXT:    ret i1 [[CMP_NOT_I_I]]
+;
+entry:
+  %add7.31 = select i1 %cmp5.not.31, i32 0, i32 0
+  %add18 = select i1 false, i32 0, i32 0
+  %add19 = add i32 %add18, %add7.31
+  %add18.1 = select i1 false, i32 0, i32 0
+  %add19.1 = add i32 %add18.1, %add19
+  %add18.4 = select i1 false, i32 0, i32 0
+  %add19.4 = add i32 %add18.4, %add19.1
+  %add19.31 = add i32 %add7.31, %add19.4
+  %0 = and i32 %add19.31, 0
+  %cmp.not.i.i = icmp eq i32 %0, 0
+  ret i1 %cmp.not.i.i
+}