[AArch64] Prevent unnecessary truncation in bool vector reduce code generation #120096

Merged: 1 commit merged into llvm:main from Il-Capitano:optimize-vecreduce-bool2 on Dec 18, 2024

Conversation

Il-Capitano (Contributor)

Prevent unnecessarily truncating the results of 128-bit-wide vector comparisons to 64-bit-wide vector values in boolean vector reduce operations.

@llvmbot (Member) commented Dec 16, 2024

@llvm/pr-subscribers-backend-aarch64

Author: Csanád Hajdú (Il-Capitano)

Changes

Prevent unnecessarily truncating the results of 128-bit-wide vector comparisons to 64-bit-wide vector values in boolean vector reduce operations.
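
As a minimal sketch of the affected pattern (the any_negative wrapper is hypothetical; it mirrors the reduce_or_v4i32 test added in this patch):

declare i1 @llvm.vector.reduce.or.v4i1(<4 x i1>)

; "Is any lane of the 128-bit input negative?" Previously the <4 x i1>
; compare result was sign-extended into a 64-bit <4 x i16> vector,
; forcing an xtn before the reduction.
define i1 @any_negative(<4 x i32> %v) {
  %c = icmp slt <4 x i32> %v, zeroinitializer
  %r = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> %c)
  ret i1 %r
}

With this change the compare result stays 128 bits wide, so the updated CHECK lines below show cmlt v0.4s followed directly by umaxv s0, v0.4s, with no intervening xtn.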


Patch is 34.06 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/120096.diff

4 Files Affected:

  • (modified) llvm/lib/Target/AArch64/AArch64ISelLowering.cpp (+14-5)
  • (modified) llvm/test/CodeGen/AArch64/illegal-floating-point-vector-compares.ll (+1-2)
  • (modified) llvm/test/CodeGen/AArch64/vecreduce-bool.ll (+683-24)
  • (modified) llvm/test/CodeGen/AArch64/vector-extract-last-active.ll (+24-24)
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index c19265613c706d..ebfee8f442f365 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -15928,11 +15928,20 @@ static SDValue getVectorBitwiseReduce(unsigned Opcode, SDValue Vec, EVT VT,
       return getVectorBitwiseReduce(Opcode, HalfVec, VT, DL, DAG);
     }
 
-    // Vectors that are less than 64 bits get widened to neatly fit a 64 bit
-    // register, so e.g. <4 x i1> gets lowered to <4 x i16>. Sign extending to
-    // this element size leads to the best codegen, since e.g. setcc results
-    // might need to be truncated otherwise.
-    EVT ExtendedVT = MVT::getIntegerVT(std::max(64u / NumElems, 8u));
+    // Results of setcc operations get widened to 128 bits if their input
+    // operands are 128 bits wide and in case of reduce_and and reduce_or have
+    // at least 4 elements, otherwise vectors that are less than 64 bits get
+    // widened to neatly fit a 64 bit register, so e.g. <4 x i1> gets lowered to
+    // either <4 x i16> or <4 x i32>. Sign extending to this element size leads
+    // to the best codegen, since e.g. setcc results might need to be truncated
+    // otherwise.
+    unsigned ExtendedWidth = 64;
+    if ((ScalarOpcode == ISD::XOR || NumElems >= 4) &&
+        Vec.getOpcode() == ISD::SETCC &&
+        Vec.getOperand(0).getValueSizeInBits() >= 128) {
+      ExtendedWidth = 128;
+    }
+    EVT ExtendedVT = MVT::getIntegerVT(std::max(ExtendedWidth / NumElems, 8u));
 
     // any_ext doesn't work with umin/umax, so only use it for uadd.
     unsigned ExtendOp =
diff --git a/llvm/test/CodeGen/AArch64/illegal-floating-point-vector-compares.ll b/llvm/test/CodeGen/AArch64/illegal-floating-point-vector-compares.ll
index 767ca91a58bb10..f317a7b8083421 100644
--- a/llvm/test/CodeGen/AArch64/illegal-floating-point-vector-compares.ll
+++ b/llvm/test/CodeGen/AArch64/illegal-floating-point-vector-compares.ll
@@ -12,8 +12,7 @@ define i1 @unordered_floating_point_compare_on_v8f32(<8 x float> %a_vec) {
 ; CHECK-NEXT:    mov w8, #1 // =0x1
 ; CHECK-NEXT:    uzp1 v0.8h, v0.8h, v1.8h
 ; CHECK-NEXT:    mvn v0.16b, v0.16b
-; CHECK-NEXT:    xtn v0.8b, v0.8h
-; CHECK-NEXT:    umaxv b0, v0.8b
+; CHECK-NEXT:    umaxv h0, v0.8h
 ; CHECK-NEXT:    fmov w9, s0
 ; CHECK-NEXT:    bic w0, w8, w9
 ; CHECK-NEXT:    ret
diff --git a/llvm/test/CodeGen/AArch64/vecreduce-bool.ll b/llvm/test/CodeGen/AArch64/vecreduce-bool.ll
index 58020d28702b2f..55b9162921f88a 100644
--- a/llvm/test/CodeGen/AArch64/vecreduce-bool.ll
+++ b/llvm/test/CodeGen/AArch64/vecreduce-bool.ll
@@ -15,8 +15,15 @@ declare i1 @llvm.vector.reduce.or.v8i1(<8 x i1> %a)
 declare i1 @llvm.vector.reduce.or.v16i1(<16 x i1> %a)
 declare i1 @llvm.vector.reduce.or.v32i1(<32 x i1> %a)
 
-define i32 @reduce_and_v1(<1 x i8> %a0, i32 %a1, i32 %a2) nounwind {
-; CHECK-LABEL: reduce_and_v1:
+declare i1 @llvm.vector.reduce.xor.v1i1(<1 x i1> %a)
+declare i1 @llvm.vector.reduce.xor.v2i1(<2 x i1> %a)
+declare i1 @llvm.vector.reduce.xor.v4i1(<4 x i1> %a)
+declare i1 @llvm.vector.reduce.xor.v8i1(<8 x i1> %a)
+declare i1 @llvm.vector.reduce.xor.v16i1(<16 x i1> %a)
+declare i1 @llvm.vector.reduce.xor.v32i1(<32 x i1> %a)
+
+define i32 @reduce_and_v1i8(<1 x i8> %a0, i32 %a1, i32 %a2) nounwind {
+; CHECK-LABEL: reduce_and_v1i8:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 def $q0
 ; CHECK-NEXT:    smov w8, v0.b[0]
@@ -29,8 +36,8 @@ define i32 @reduce_and_v1(<1 x i8> %a0, i32 %a1, i32 %a2) nounwind {
   ret i32 %z
 }
 
-define i32 @reduce_and_v2(<2 x i8> %a0, i32 %a1, i32 %a2) nounwind {
-; CHECK-LABEL: reduce_and_v2:
+define i32 @reduce_and_v2i8(<2 x i8> %a0, i32 %a1, i32 %a2) nounwind {
+; CHECK-LABEL: reduce_and_v2i8:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    shl v0.2s, v0.2s, #24
 ; CHECK-NEXT:    sshr v0.2s, v0.2s, #24
@@ -46,8 +53,8 @@ define i32 @reduce_and_v2(<2 x i8> %a0, i32 %a1, i32 %a2) nounwind {
   ret i32 %z
 }
 
-define i32 @reduce_and_v4(<4 x i8> %a0, i32 %a1, i32 %a2) nounwind {
-; CHECK-LABEL: reduce_and_v4:
+define i32 @reduce_and_v4i8(<4 x i8> %a0, i32 %a1, i32 %a2) nounwind {
+; CHECK-LABEL: reduce_and_v4i8:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    shl v0.4h, v0.4h, #8
 ; CHECK-NEXT:    sshr v0.4h, v0.4h, #8
@@ -63,8 +70,8 @@ define i32 @reduce_and_v4(<4 x i8> %a0, i32 %a1, i32 %a2) nounwind {
   ret i32 %z
 }
 
-define i32 @reduce_and_v8(<8 x i8> %a0, i32 %a1, i32 %a2) nounwind {
-; CHECK-LABEL: reduce_and_v8:
+define i32 @reduce_and_v8i8(<8 x i8> %a0, i32 %a1, i32 %a2) nounwind {
+; CHECK-LABEL: reduce_and_v8i8:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    cmlt v0.8b, v0.8b, #0
 ; CHECK-NEXT:    uminv b0, v0.8b
@@ -78,8 +85,8 @@ define i32 @reduce_and_v8(<8 x i8> %a0, i32 %a1, i32 %a2) nounwind {
   ret i32 %z
 }
 
-define i32 @reduce_and_v16(<16 x i8> %a0, i32 %a1, i32 %a2) nounwind {
-; CHECK-LABEL: reduce_and_v16:
+define i32 @reduce_and_v16i8(<16 x i8> %a0, i32 %a1, i32 %a2) nounwind {
+; CHECK-LABEL: reduce_and_v16i8:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    cmlt v0.16b, v0.16b, #0
 ; CHECK-NEXT:    uminv b0, v0.16b
@@ -93,8 +100,8 @@ define i32 @reduce_and_v16(<16 x i8> %a0, i32 %a1, i32 %a2) nounwind {
   ret i32 %z
 }
 
-define i32 @reduce_and_v32(<32 x i8> %a0, i32 %a1, i32 %a2) nounwind {
-; CHECK-LABEL: reduce_and_v32:
+define i32 @reduce_and_v32i8(<32 x i8> %a0, i32 %a1, i32 %a2) nounwind {
+; CHECK-LABEL: reduce_and_v32i8:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    and v0.16b, v0.16b, v1.16b
 ; CHECK-NEXT:    cmlt v0.16b, v0.16b, #0
@@ -109,8 +116,194 @@ define i32 @reduce_and_v32(<32 x i8> %a0, i32 %a1, i32 %a2) nounwind {
   ret i32 %z
 }
 
-define i32 @reduce_or_v1(<1 x i8> %a0, i32 %a1, i32 %a2) nounwind {
-; CHECK-LABEL: reduce_or_v1:
+define i32 @reduce_and_v1i16(<1 x i16> %a0, i32 %a1, i32 %a2) nounwind {
+; CHECK-LABEL: reduce_and_v1i16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    // kill: def $d0 killed $d0 def $q0
+; CHECK-NEXT:    smov w8, v0.h[0]
+; CHECK-NEXT:    cmp w8, #0
+; CHECK-NEXT:    csel w0, w0, w1, lt
+; CHECK-NEXT:    ret
+  %x = icmp slt <1 x i16> %a0, zeroinitializer
+  %y = call i1 @llvm.vector.reduce.and.v1i1(<1 x i1> %x)
+  %z = select i1 %y, i32 %a1, i32 %a2
+  ret i32 %z
+}
+
+define i32 @reduce_and_v2i16(<2 x i16> %a0, i32 %a1, i32 %a2) nounwind {
+; CHECK-LABEL: reduce_and_v2i16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    shl v0.2s, v0.2s, #16
+; CHECK-NEXT:    sshr v0.2s, v0.2s, #16
+; CHECK-NEXT:    cmlt v0.2s, v0.2s, #0
+; CHECK-NEXT:    uminp v0.2s, v0.2s, v0.2s
+; CHECK-NEXT:    fmov w8, s0
+; CHECK-NEXT:    tst w8, #0x1
+; CHECK-NEXT:    csel w0, w0, w1, ne
+; CHECK-NEXT:    ret
+  %x = icmp slt <2 x i16> %a0, zeroinitializer
+  %y = call i1 @llvm.vector.reduce.and.v2i1(<2 x i1> %x)
+  %z = select i1 %y, i32 %a1, i32 %a2
+  ret i32 %z
+}
+
+define i32 @reduce_and_v4i16(<4 x i16> %a0, i32 %a1, i32 %a2) nounwind {
+; CHECK-LABEL: reduce_and_v4i16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    cmlt v0.4h, v0.4h, #0
+; CHECK-NEXT:    uminv h0, v0.4h
+; CHECK-NEXT:    fmov w8, s0
+; CHECK-NEXT:    tst w8, #0x1
+; CHECK-NEXT:    csel w0, w0, w1, ne
+; CHECK-NEXT:    ret
+  %x = icmp slt <4 x i16> %a0, zeroinitializer
+  %y = call i1 @llvm.vector.reduce.and.v4i1(<4 x i1> %x)
+  %z = select i1 %y, i32 %a1, i32 %a2
+  ret i32 %z
+}
+
+define i32 @reduce_and_v8i16(<8 x i16> %a0, i32 %a1, i32 %a2) nounwind {
+; CHECK-LABEL: reduce_and_v8i16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    cmlt v0.8h, v0.8h, #0
+; CHECK-NEXT:    uminv h0, v0.8h
+; CHECK-NEXT:    fmov w8, s0
+; CHECK-NEXT:    tst w8, #0x1
+; CHECK-NEXT:    csel w0, w0, w1, ne
+; CHECK-NEXT:    ret
+  %x = icmp slt <8 x i16> %a0, zeroinitializer
+  %y = call i1 @llvm.vector.reduce.and.v8i1(<8 x i1> %x)
+  %z = select i1 %y, i32 %a1, i32 %a2
+  ret i32 %z
+}
+
+define i32 @reduce_and_v16i16(<16 x i16> %a0, i32 %a1, i32 %a2) nounwind {
+; CHECK-LABEL: reduce_and_v16i16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    cmlt v1.8h, v1.8h, #0
+; CHECK-NEXT:    cmlt v0.8h, v0.8h, #0
+; CHECK-NEXT:    uzp1 v0.16b, v0.16b, v1.16b
+; CHECK-NEXT:    uminv b0, v0.16b
+; CHECK-NEXT:    fmov w8, s0
+; CHECK-NEXT:    tst w8, #0x1
+; CHECK-NEXT:    csel w0, w0, w1, ne
+; CHECK-NEXT:    ret
+  %x = icmp slt <16 x i16> %a0, zeroinitializer
+  %y = call i1 @llvm.vector.reduce.and.v16i1(<16 x i1> %x)
+  %z = select i1 %y, i32 %a1, i32 %a2
+  ret i32 %z
+}
+
+define i32 @reduce_and_v1i32(<1 x i32> %a0, i32 %a1, i32 %a2) nounwind {
+; CHECK-LABEL: reduce_and_v1i32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    // kill: def $d0 killed $d0 def $q0
+; CHECK-NEXT:    fmov w8, s0
+; CHECK-NEXT:    cmp w8, #0
+; CHECK-NEXT:    csel w0, w0, w1, lt
+; CHECK-NEXT:    ret
+  %x = icmp slt <1 x i32> %a0, zeroinitializer
+  %y = call i1 @llvm.vector.reduce.and.v1i1(<1 x i1> %x)
+  %z = select i1 %y, i32 %a1, i32 %a2
+  ret i32 %z
+}
+
+define i32 @reduce_and_v2i32(<2 x i32> %a0, i32 %a1, i32 %a2) nounwind {
+; CHECK-LABEL: reduce_and_v2i32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    cmlt v0.2s, v0.2s, #0
+; CHECK-NEXT:    uminp v0.2s, v0.2s, v0.2s
+; CHECK-NEXT:    fmov w8, s0
+; CHECK-NEXT:    tst w8, #0x1
+; CHECK-NEXT:    csel w0, w0, w1, ne
+; CHECK-NEXT:    ret
+  %x = icmp slt <2 x i32> %a0, zeroinitializer
+  %y = call i1 @llvm.vector.reduce.and.v2i1(<2 x i1> %x)
+  %z = select i1 %y, i32 %a1, i32 %a2
+  ret i32 %z
+}
+
+define i32 @reduce_and_v4i32(<4 x i32> %a0, i32 %a1, i32 %a2) nounwind {
+; CHECK-LABEL: reduce_and_v4i32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    cmlt v0.4s, v0.4s, #0
+; CHECK-NEXT:    uminv s0, v0.4s
+; CHECK-NEXT:    fmov w8, s0
+; CHECK-NEXT:    tst w8, #0x1
+; CHECK-NEXT:    csel w0, w0, w1, ne
+; CHECK-NEXT:    ret
+  %x = icmp slt <4 x i32> %a0, zeroinitializer
+  %y = call i1 @llvm.vector.reduce.and.v4i1(<4 x i1> %x)
+  %z = select i1 %y, i32 %a1, i32 %a2
+  ret i32 %z
+}
+
+define i32 @reduce_and_v8i32(<8 x i32> %a0, i32 %a1, i32 %a2) nounwind {
+; CHECK-LABEL: reduce_and_v8i32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    cmlt v1.4s, v1.4s, #0
+; CHECK-NEXT:    cmlt v0.4s, v0.4s, #0
+; CHECK-NEXT:    uzp1 v0.8h, v0.8h, v1.8h
+; CHECK-NEXT:    uminv h0, v0.8h
+; CHECK-NEXT:    fmov w8, s0
+; CHECK-NEXT:    tst w8, #0x1
+; CHECK-NEXT:    csel w0, w0, w1, ne
+; CHECK-NEXT:    ret
+  %x = icmp slt <8 x i32> %a0, zeroinitializer
+  %y = call i1 @llvm.vector.reduce.and.v8i1(<8 x i1> %x)
+  %z = select i1 %y, i32 %a1, i32 %a2
+  ret i32 %z
+}
+
+define i32 @reduce_and_v1i64(<1 x i64> %a0, i32 %a1, i32 %a2) nounwind {
+; CHECK-LABEL: reduce_and_v1i64:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    // kill: def $d0 killed $d0 def $q0
+; CHECK-NEXT:    fmov x8, d0
+; CHECK-NEXT:    cmp x8, #0
+; CHECK-NEXT:    csel w0, w0, w1, lt
+; CHECK-NEXT:    ret
+  %x = icmp slt <1 x i64> %a0, zeroinitializer
+  %y = call i1 @llvm.vector.reduce.and.v1i1(<1 x i1> %x)
+  %z = select i1 %y, i32 %a1, i32 %a2
+  ret i32 %z
+}
+
+define i32 @reduce_and_v2i64(<2 x i64> %a0, i32 %a1, i32 %a2) nounwind {
+; CHECK-LABEL: reduce_and_v2i64:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    cmlt v0.2d, v0.2d, #0
+; CHECK-NEXT:    xtn v0.2s, v0.2d
+; CHECK-NEXT:    uminp v0.2s, v0.2s, v0.2s
+; CHECK-NEXT:    fmov w8, s0
+; CHECK-NEXT:    tst w8, #0x1
+; CHECK-NEXT:    csel w0, w0, w1, ne
+; CHECK-NEXT:    ret
+  %x = icmp slt <2 x i64> %a0, zeroinitializer
+  %y = call i1 @llvm.vector.reduce.and.v2i1(<2 x i1> %x)
+  %z = select i1 %y, i32 %a1, i32 %a2
+  ret i32 %z
+}
+
+define i32 @reduce_and_v4i64(<4 x i64> %a0, i32 %a1, i32 %a2) nounwind {
+; CHECK-LABEL: reduce_and_v4i64:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    cmlt v1.2d, v1.2d, #0
+; CHECK-NEXT:    cmlt v0.2d, v0.2d, #0
+; CHECK-NEXT:    uzp1 v0.4s, v0.4s, v1.4s
+; CHECK-NEXT:    uminv s0, v0.4s
+; CHECK-NEXT:    fmov w8, s0
+; CHECK-NEXT:    tst w8, #0x1
+; CHECK-NEXT:    csel w0, w0, w1, ne
+; CHECK-NEXT:    ret
+  %x = icmp slt <4 x i64> %a0, zeroinitializer
+  %y = call i1 @llvm.vector.reduce.and.v4i1(<4 x i1> %x)
+  %z = select i1 %y, i32 %a1, i32 %a2
+  ret i32 %z
+}
+
+define i32 @reduce_or_v1i8(<1 x i8> %a0, i32 %a1, i32 %a2) nounwind {
+; CHECK-LABEL: reduce_or_v1i8:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 def $q0
 ; CHECK-NEXT:    smov w8, v0.b[0]
@@ -123,8 +316,8 @@ define i32 @reduce_or_v1(<1 x i8> %a0, i32 %a1, i32 %a2) nounwind {
   ret i32 %z
 }
 
-define i32 @reduce_or_v2(<2 x i8> %a0, i32 %a1, i32 %a2) nounwind {
-; CHECK-LABEL: reduce_or_v2:
+define i32 @reduce_or_v2i8(<2 x i8> %a0, i32 %a1, i32 %a2) nounwind {
+; CHECK-LABEL: reduce_or_v2i8:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    shl v0.2s, v0.2s, #24
 ; CHECK-NEXT:    sshr v0.2s, v0.2s, #24
@@ -140,8 +333,8 @@ define i32 @reduce_or_v2(<2 x i8> %a0, i32 %a1, i32 %a2) nounwind {
   ret i32 %z
 }
 
-define i32 @reduce_or_v4(<4 x i8> %a0, i32 %a1, i32 %a2) nounwind {
-; CHECK-LABEL: reduce_or_v4:
+define i32 @reduce_or_v4i8(<4 x i8> %a0, i32 %a1, i32 %a2) nounwind {
+; CHECK-LABEL: reduce_or_v4i8:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    shl v0.4h, v0.4h, #8
 ; CHECK-NEXT:    sshr v0.4h, v0.4h, #8
@@ -157,8 +350,8 @@ define i32 @reduce_or_v4(<4 x i8> %a0, i32 %a1, i32 %a2) nounwind {
   ret i32 %z
 }
 
-define i32 @reduce_or_v8(<8 x i8> %a0, i32 %a1, i32 %a2) nounwind {
-; CHECK-LABEL: reduce_or_v8:
+define i32 @reduce_or_v8i8(<8 x i8> %a0, i32 %a1, i32 %a2) nounwind {
+; CHECK-LABEL: reduce_or_v8i8:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    cmlt v0.8b, v0.8b, #0
 ; CHECK-NEXT:    umaxv b0, v0.8b
@@ -172,8 +365,8 @@ define i32 @reduce_or_v8(<8 x i8> %a0, i32 %a1, i32 %a2) nounwind {
   ret i32 %z
 }
 
-define i32 @reduce_or_v16(<16 x i8> %a0, i32 %a1, i32 %a2) nounwind {
-; CHECK-LABEL: reduce_or_v16:
+define i32 @reduce_or_v16i8(<16 x i8> %a0, i32 %a1, i32 %a2) nounwind {
+; CHECK-LABEL: reduce_or_v16i8:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    cmlt v0.16b, v0.16b, #0
 ; CHECK-NEXT:    umaxv b0, v0.16b
@@ -187,8 +380,8 @@ define i32 @reduce_or_v16(<16 x i8> %a0, i32 %a1, i32 %a2) nounwind {
   ret i32 %z
 }
 
-define i32 @reduce_or_v32(<32 x i8> %a0, i32 %a1, i32 %a2) nounwind {
-; CHECK-LABEL: reduce_or_v32:
+define i32 @reduce_or_v32i8(<32 x i8> %a0, i32 %a1, i32 %a2) nounwind {
+; CHECK-LABEL: reduce_or_v32i8:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    orr v0.16b, v0.16b, v1.16b
 ; CHECK-NEXT:    cmlt v0.16b, v0.16b, #0
@@ -202,3 +395,469 @@ define i32 @reduce_or_v32(<32 x i8> %a0, i32 %a1, i32 %a2) nounwind {
   %z = select i1 %y, i32 %a1, i32 %a2
   ret i32 %z
 }
+
+define i32 @reduce_or_v1i16(<1 x i16> %a0, i32 %a1, i32 %a2) nounwind {
+; CHECK-LABEL: reduce_or_v1i16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    // kill: def $d0 killed $d0 def $q0
+; CHECK-NEXT:    smov w8, v0.h[0]
+; CHECK-NEXT:    cmp w8, #0
+; CHECK-NEXT:    csel w0, w0, w1, lt
+; CHECK-NEXT:    ret
+  %x = icmp slt <1 x i16> %a0, zeroinitializer
+  %y = call i1 @llvm.vector.reduce.or.v1i1(<1 x i1> %x)
+  %z = select i1 %y, i32 %a1, i32 %a2
+  ret i32 %z
+}
+
+define i32 @reduce_or_v2i16(<2 x i16> %a0, i32 %a1, i32 %a2) nounwind {
+; CHECK-LABEL: reduce_or_v2i16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    shl v0.2s, v0.2s, #16
+; CHECK-NEXT:    sshr v0.2s, v0.2s, #16
+; CHECK-NEXT:    cmlt v0.2s, v0.2s, #0
+; CHECK-NEXT:    umaxp v0.2s, v0.2s, v0.2s
+; CHECK-NEXT:    fmov w8, s0
+; CHECK-NEXT:    tst w8, #0x1
+; CHECK-NEXT:    csel w0, w0, w1, ne
+; CHECK-NEXT:    ret
+  %x = icmp slt <2 x i16> %a0, zeroinitializer
+  %y = call i1 @llvm.vector.reduce.or.v2i1(<2 x i1> %x)
+  %z = select i1 %y, i32 %a1, i32 %a2
+  ret i32 %z
+}
+
+define i32 @reduce_or_v4i16(<4 x i16> %a0, i32 %a1, i32 %a2) nounwind {
+; CHECK-LABEL: reduce_or_v4i16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    cmlt v0.4h, v0.4h, #0
+; CHECK-NEXT:    umaxv h0, v0.4h
+; CHECK-NEXT:    fmov w8, s0
+; CHECK-NEXT:    tst w8, #0x1
+; CHECK-NEXT:    csel w0, w0, w1, ne
+; CHECK-NEXT:    ret
+  %x = icmp slt <4 x i16> %a0, zeroinitializer
+  %y = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> %x)
+  %z = select i1 %y, i32 %a1, i32 %a2
+  ret i32 %z
+}
+
+define i32 @reduce_or_v8i16(<8 x i16> %a0, i32 %a1, i32 %a2) nounwind {
+; CHECK-LABEL: reduce_or_v8i16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    cmlt v0.8h, v0.8h, #0
+; CHECK-NEXT:    umaxv h0, v0.8h
+; CHECK-NEXT:    fmov w8, s0
+; CHECK-NEXT:    tst w8, #0x1
+; CHECK-NEXT:    csel w0, w0, w1, ne
+; CHECK-NEXT:    ret
+  %x = icmp slt <8 x i16> %a0, zeroinitializer
+  %y = call i1 @llvm.vector.reduce.or.v8i1(<8 x i1> %x)
+  %z = select i1 %y, i32 %a1, i32 %a2
+  ret i32 %z
+}
+
+define i32 @reduce_or_v16i16(<16 x i16> %a0, i32 %a1, i32 %a2) nounwind {
+; CHECK-LABEL: reduce_or_v16i16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    cmlt v1.8h, v1.8h, #0
+; CHECK-NEXT:    cmlt v0.8h, v0.8h, #0
+; CHECK-NEXT:    uzp1 v0.16b, v0.16b, v1.16b
+; CHECK-NEXT:    umaxv b0, v0.16b
+; CHECK-NEXT:    fmov w8, s0
+; CHECK-NEXT:    tst w8, #0x1
+; CHECK-NEXT:    csel w0, w0, w1, ne
+; CHECK-NEXT:    ret
+  %x = icmp slt <16 x i16> %a0, zeroinitializer
+  %y = call i1 @llvm.vector.reduce.or.v16i1(<16 x i1> %x)
+  %z = select i1 %y, i32 %a1, i32 %a2
+  ret i32 %z
+}
+
+define i32 @reduce_or_v1i32(<1 x i32> %a0, i32 %a1, i32 %a2) nounwind {
+; CHECK-LABEL: reduce_or_v1i32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    // kill: def $d0 killed $d0 def $q0
+; CHECK-NEXT:    fmov w8, s0
+; CHECK-NEXT:    cmp w8, #0
+; CHECK-NEXT:    csel w0, w0, w1, lt
+; CHECK-NEXT:    ret
+  %x = icmp slt <1 x i32> %a0, zeroinitializer
+  %y = call i1 @llvm.vector.reduce.or.v1i1(<1 x i1> %x)
+  %z = select i1 %y, i32 %a1, i32 %a2
+  ret i32 %z
+}
+
+define i32 @reduce_or_v2i32(<2 x i32> %a0, i32 %a1, i32 %a2) nounwind {
+; CHECK-LABEL: reduce_or_v2i32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    cmlt v0.2s, v0.2s, #0
+; CHECK-NEXT:    umaxp v0.2s, v0.2s, v0.2s
+; CHECK-NEXT:    fmov w8, s0
+; CHECK-NEXT:    tst w8, #0x1
+; CHECK-NEXT:    csel w0, w0, w1, ne
+; CHECK-NEXT:    ret
+  %x = icmp slt <2 x i32> %a0, zeroinitializer
+  %y = call i1 @llvm.vector.reduce.or.v2i1(<2 x i1> %x)
+  %z = select i1 %y, i32 %a1, i32 %a2
+  ret i32 %z
+}
+
+define i32 @reduce_or_v4i32(<4 x i32> %a0, i32 %a1, i32 %a2) nounwind {
+; CHECK-LABEL: reduce_or_v4i32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    cmlt v0.4s, v0.4s, #0
+; CHECK-NEXT:    umaxv s0, v0.4s
+; CHECK-NEXT:    fmov w8, s0
+; CHECK-NEXT:    tst w8, #0x1
+; CHECK-NEXT:    csel w0, w0, w1, ne
+; CHECK-NEXT:    ret
+  %x = icmp slt <4 x i32> %a0, zeroinitializer
+  %y = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> %x)
+  %z = select i1 %y, i32 %a1, i32 %a2
+  ret i32 %z
+}
+
+define i32 @reduce_or_v8i32(<8 x i32> %a0, i32 %a1, i32 %a2) nounwind {
+; CHECK-LABEL: reduce_or_v8i32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    cmlt v1.4s, v1.4s, #0
+; CHECK-NEXT:    cmlt v0.4s, v0.4s, #0
+; CHECK-NEXT:    uzp1 v0.8h, v0.8h, v1.8h
+; CHECK-NEXT:    umaxv h0, v0.8h
+; CHECK-NEXT:    fmov w8, s0
+; CHECK-NEXT:    tst w8, #0x1
+; CHECK-NEXT:    csel w0, w0, w1, ne
+; CHECK-NEXT:    ret
+  %x = icmp slt <8 x i32> %a0, zeroinitializer
+  %y = call i1 @llvm.vector.reduce.or.v8i1(<8 x i1> %x)
+  %z = select i1 %y, i32 %a1, i32 %a2
+  ret i32 %z
+}
+
+define i32 @reduce_or_v1i64(<1 x i64> %a0, i32 %a1, i32 %a2) nounwind {
+; CHECK-LABEL: reduce_or_v1i64:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    // kill: def $d0 killed $d0 def $q0
+; CHECK-NEXT:    fmov x8, d0
+; CHECK-NEXT:    cmp x8, #0
+; CHECK-NEXT:    csel w0, w0, w1, lt
+; CHECK-NEXT:    ret
+  %x = icmp slt <1 x i64> %a0, zeroinitializer
+  %y = call i1 @llvm.vector.reduce.or.v1i1(<1 x i1> %x)
+  %z = select i1 %y, i32 %a1, i32 %a2
+  ret i32 %z
+}
+
+define i32 @reduce_or_v2i64(<2 x i64> %a0, i32 %a1, i32 %a2) nounwind {
+; CHECK-LABEL: reduce_or_v2i64:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    cmlt v0.2d, v0.2d, #0
+; CHECK-NEXT:    xtn v0.2s, v0.2d
+; CHECK-NEXT:    umaxp v0.2s, v0.2s, v0.2s
+; CHECK-NEXT:    fmov w8, s0
+; CHECK-NEXT:    tst w8, #0x1
+; CHECK-NEXT:    csel w0, w0, w1, ne
+; CHECK-NEXT:    ret
+  %x = icmp slt <2 x i64> %a0, zeroinitializer
+  %y = call i1 @llvm.vector.reduce.or.v2i1(<2 x i1> %x)
+  %z = select i1 %y, i32 %a1, i32 %a2
+  ret i32 %z
+}
+
+define i32 @reduce_or_v4i64(<4 x i64> %a0, i32 %a1, i32 %a2) nounwind {
+; CHECK-LABEL: reduce_or_v4i64:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    cmlt v1.2d, v1.2d, #0
+; CHECK-NEXT:    cmlt v0.2d, v0.2d, #0
+; CHECK-NEXT:    uzp1 v0.4s, v0.4s, v1.4s
+; CHECK-NEXT:    umaxv s0, v...
[truncated]

Commit: [AArch64] Prevent unnecessary truncation in bool vector reduce code generation

Prevent unnecessarily truncating results of 128 bit wide vector
comparisons to 64 bit wide vector values in boolean vector reduce
operations.
(review thread on llvm/test/CodeGen/AArch64/vector-extract-last-active.ll)
; SVE-FIXED-NEXT: and v2.8b, v1.8b, v2.8b
; SVE-FIXED-NEXT: umaxp v1.2s, v1.2s, v1.2s
; SVE-FIXED-NEXT: xtn v2.2s, v1.2d
; SVE-FIXED-NEXT: umaxv s1, v1.4s
Collaborator:

What's going on here? Going from umaxp to umaxv doesn't seem like an improvement.

Il-Capitano (Contributor, author):

In the originally generated code, both the and instruction and the umaxp instruction used the result of the xtn above. With my patch the truncation is elided for the reduce_or part, and the result of the cmtst is used directly by the umaxv. Going from umaxp to umaxv is not ideal here, but we remove the dependency on the result of the xtn, which is an improvement. I'm not sure how this affects performance in this case, but since the result of this umaxv is only used 6 instructions later, and we no longer have to wait for the xtn, I wouldn't be surprised if there was no difference. I could benchmark it to check.

Also, this codegen is specific to the vector.extract.last.active intrinsic (with 64-bit-wide elements), which is being actively worked on in #112738 and #118810. A sketch of the dependency change follows below.
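
For illustration, the dependency chains before and after, roughly reconstructed from the quoted CHECK lines above (not the exact scheduled output):

Before: cmtst -> xtn -> and
                  \---> umaxp    (both consumers wait on the xtn)

After:  cmtst -> umaxv           (the reduction reads the compare result directly)
        cmtst -> xtn -> and      (the truncate now feeds only the and)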

Collaborator:

Oh, I see, we got lucky with instruction reuse before... and if we don't get the reuse, S-form umaxv is faster than xtn+umaxp? In that case, I guess this change is okay.

@efriedma-quic (Collaborator) left a comment:

LGTM

@davemgreen merged commit 96bb281 into llvm:main on Dec 18, 2024
8 checks passed
@Il-Capitano deleted the optimize-vecreduce-bool2 branch on Dec 18, 2024 at 12:52