Skip to content

[AArch64] Avoid NEON ctpop in Streaming-SVE mode #93826

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged

Conversation

sdesmalen-arm
Copy link
Collaborator

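The NEON ctpop instruction (`cnt`) is also used to lower scalar ctpop, not just vector ctpop. In Streaming-SVE mode, where NEON instructions may not be available, the lowering must therefore check `isNeonAvailable()` rather than `hasNEON()`, so that it does not emit NEON `cnt` for scalars.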

The NEON ctpop instruction is also used for scalars.
@llvmbot
Copy link
Member

llvmbot commented May 30, 2024

@llvm/pr-subscribers-backend-aarch64

Author: Sander de Smalen (sdesmalen-arm)

Changes

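The NEON ctpop instruction (`cnt`) is also used for scalars, so `LowerCTPOP_PARITY` now checks `isNeonAvailable()` instead of `hasNEON()` and performs the SVE predicated-op lowering before that check.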


Patch is 106.18 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/93826.diff

2 Files Affected:

  • (modified) llvm/lib/Target/AArch64/AArch64ISelLowering.cpp (+6-6)
  • (modified) llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-bit-counting.ll (+1499-833)
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index 365ef68dcb19b..ac6f1e07c4184 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -9571,13 +9571,17 @@ SDValue AArch64TargetLowering::LowerCTPOP_PARITY(SDValue Op,
           Attribute::NoImplicitFloat))
     return SDValue();
 
-  if (!Subtarget->hasNEON())
+  EVT VT = Op.getValueType();
+  if (VT.isScalableVector() ||
+      useSVEForFixedLengthVectorVT(VT, !Subtarget->isNeonAvailable()))
+    return LowerToPredicatedOp(Op, DAG, AArch64ISD::CTPOP_MERGE_PASSTHRU);
+
+  if (!Subtarget->isNeonAvailable())
     return SDValue();
 
   bool IsParity = Op.getOpcode() == ISD::PARITY;
   SDValue Val = Op.getOperand(0);
   SDLoc DL(Op);
-  EVT VT = Op.getValueType();
 
   // for i32, general parity function using EORs is more efficient compared to
   // using floating point
@@ -9626,10 +9630,6 @@ SDValue AArch64TargetLowering::LowerCTPOP_PARITY(SDValue Op,
 
   assert(!IsParity && "ISD::PARITY of vector types not supported");
 
-  if (VT.isScalableVector() ||
-      useSVEForFixedLengthVectorVT(VT, !Subtarget->isNeonAvailable()))
-    return LowerToPredicatedOp(Op, DAG, AArch64ISD::CTPOP_MERGE_PASSTHRU);
-
   assert((VT == MVT::v1i64 || VT == MVT::v2i64 || VT == MVT::v2i32 ||
           VT == MVT::v4i32 || VT == MVT::v4i16 || VT == MVT::v8i16) &&
          "Unexpected type for custom ctpop lowering");
diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-bit-counting.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-bit-counting.ll
index f920efeb4892d..f662140327135 100644
--- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-bit-counting.ll
+++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-bit-counting.ll
@@ -741,37 +741,63 @@ define <4 x i8> @ctpop_v4i8(<4 x i8> %op) {
 ;
 ; NONEON-NOSVE-LABEL: ctpop_v4i8:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    sub sp, sp, #80
-; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 80
-; NONEON-NOSVE-NEXT:    str d0, [sp, #64]
-; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #70]
-; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #68]
-; NONEON-NOSVE-NEXT:    ldrb w10, [sp, #66]
-; NONEON-NOSVE-NEXT:    fmov d0, x8
-; NONEON-NOSVE-NEXT:    fmov d1, x9
-; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #64]
-; NONEON-NOSVE-NEXT:    fmov d2, x10
-; NONEON-NOSVE-NEXT:    fmov d3, x8
-; NONEON-NOSVE-NEXT:    cnt v0.8b, v0.8b
-; NONEON-NOSVE-NEXT:    cnt v1.8b, v1.8b
-; NONEON-NOSVE-NEXT:    cnt v2.8b, v2.8b
-; NONEON-NOSVE-NEXT:    cnt v3.8b, v3.8b
-; NONEON-NOSVE-NEXT:    uaddlv h0, v0.8b
-; NONEON-NOSVE-NEXT:    uaddlv h1, v1.8b
-; NONEON-NOSVE-NEXT:    uaddlv h2, v2.8b
-; NONEON-NOSVE-NEXT:    uaddlv h3, v3.8b
-; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #32]
-; NONEON-NOSVE-NEXT:    ldr w8, [sp, #48]
-; NONEON-NOSVE-NEXT:    stp q3, q2, [sp]
-; NONEON-NOSVE-NEXT:    strh w8, [sp, #78]
-; NONEON-NOSVE-NEXT:    ldr w8, [sp, #32]
-; NONEON-NOSVE-NEXT:    strh w8, [sp, #76]
-; NONEON-NOSVE-NEXT:    ldr w8, [sp, #16]
-; NONEON-NOSVE-NEXT:    strh w8, [sp, #74]
-; NONEON-NOSVE-NEXT:    ldr w8, [sp]
-; NONEON-NOSVE-NEXT:    strh w8, [sp, #72]
-; NONEON-NOSVE-NEXT:    ldr d0, [sp, #72]
-; NONEON-NOSVE-NEXT:    add sp, sp, #80
+; NONEON-NOSVE-NEXT:    mov w8, #16843009 // =0x1010101
+; NONEON-NOSVE-NEXT:    str d0, [sp, #-16]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #6]
+; NONEON-NOSVE-NEXT:    ldrb w11, [sp, #4]
+; NONEON-NOSVE-NEXT:    ldrb w10, [sp, #2]
+; NONEON-NOSVE-NEXT:    ldrb w12, [sp]
+; NONEON-NOSVE-NEXT:    lsr w13, w9, #1
+; NONEON-NOSVE-NEXT:    lsr w14, w11, #1
+; NONEON-NOSVE-NEXT:    lsr w15, w10, #1
+; NONEON-NOSVE-NEXT:    lsr w16, w12, #1
+; NONEON-NOSVE-NEXT:    and w13, w13, #0x55555555
+; NONEON-NOSVE-NEXT:    sub w9, w9, w13
+; NONEON-NOSVE-NEXT:    and w13, w14, #0x55555555
+; NONEON-NOSVE-NEXT:    and w14, w15, #0x55555555
+; NONEON-NOSVE-NEXT:    sub w11, w11, w13
+; NONEON-NOSVE-NEXT:    lsr w13, w9, #2
+; NONEON-NOSVE-NEXT:    and w15, w16, #0x55555555
+; NONEON-NOSVE-NEXT:    sub w10, w10, w14
+; NONEON-NOSVE-NEXT:    sub w12, w12, w15
+; NONEON-NOSVE-NEXT:    and w9, w9, #0x33333333
+; NONEON-NOSVE-NEXT:    and w13, w13, #0x33333333
+; NONEON-NOSVE-NEXT:    lsr w14, w11, #2
+; NONEON-NOSVE-NEXT:    lsr w15, w10, #2
+; NONEON-NOSVE-NEXT:    add w9, w9, w13
+; NONEON-NOSVE-NEXT:    lsr w13, w12, #2
+; NONEON-NOSVE-NEXT:    and w11, w11, #0x33333333
+; NONEON-NOSVE-NEXT:    and w10, w10, #0x33333333
+; NONEON-NOSVE-NEXT:    and w14, w14, #0x33333333
+; NONEON-NOSVE-NEXT:    and w15, w15, #0x33333333
+; NONEON-NOSVE-NEXT:    and w12, w12, #0x33333333
+; NONEON-NOSVE-NEXT:    and w13, w13, #0x33333333
+; NONEON-NOSVE-NEXT:    add w11, w11, w14
+; NONEON-NOSVE-NEXT:    add w10, w10, w15
+; NONEON-NOSVE-NEXT:    add w12, w12, w13
+; NONEON-NOSVE-NEXT:    add w9, w9, w9, lsr #4
+; NONEON-NOSVE-NEXT:    add w11, w11, w11, lsr #4
+; NONEON-NOSVE-NEXT:    add w10, w10, w10, lsr #4
+; NONEON-NOSVE-NEXT:    add w12, w12, w12, lsr #4
+; NONEON-NOSVE-NEXT:    and w9, w9, #0xf0f0f0f
+; NONEON-NOSVE-NEXT:    and w11, w11, #0xf0f0f0f
+; NONEON-NOSVE-NEXT:    and w10, w10, #0xf0f0f0f
+; NONEON-NOSVE-NEXT:    and w12, w12, #0xf0f0f0f
+; NONEON-NOSVE-NEXT:    mul w9, w9, w8
+; NONEON-NOSVE-NEXT:    mul w11, w11, w8
+; NONEON-NOSVE-NEXT:    mul w10, w10, w8
+; NONEON-NOSVE-NEXT:    mul w8, w12, w8
+; NONEON-NOSVE-NEXT:    lsr w9, w9, #24
+; NONEON-NOSVE-NEXT:    lsr w11, w11, #24
+; NONEON-NOSVE-NEXT:    lsr w10, w10, #24
+; NONEON-NOSVE-NEXT:    strh w9, [sp, #14]
+; NONEON-NOSVE-NEXT:    lsr w8, w8, #24
+; NONEON-NOSVE-NEXT:    strh w11, [sp, #12]
+; NONEON-NOSVE-NEXT:    strh w10, [sp, #10]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
 ; NONEON-NOSVE-NEXT:    ret
   %res = call <4 x i8> @llvm.ctpop.v4i8(<4 x i8> %op)
   ret <4 x i8> %res
@@ -788,67 +814,115 @@ define <8 x i8> @ctpop_v8i8(<8 x i8> %op) {
 ;
 ; NONEON-NOSVE-LABEL: ctpop_v8i8:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    sub sp, sp, #144
-; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 144
-; NONEON-NOSVE-NEXT:    str d0, [sp, #128]
-; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #135]
-; NONEON-NOSVE-NEXT:    fmov d0, x8
-; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #134]
-; NONEON-NOSVE-NEXT:    cnt v0.8b, v0.8b
-; NONEON-NOSVE-NEXT:    uaddlv h0, v0.8b
-; NONEON-NOSVE-NEXT:    str q0, [sp, #112]
-; NONEON-NOSVE-NEXT:    fmov d0, x8
-; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #133]
-; NONEON-NOSVE-NEXT:    cnt v0.8b, v0.8b
-; NONEON-NOSVE-NEXT:    uaddlv h0, v0.8b
-; NONEON-NOSVE-NEXT:    str q0, [sp, #96]
-; NONEON-NOSVE-NEXT:    fmov d0, x8
-; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #132]
-; NONEON-NOSVE-NEXT:    cnt v0.8b, v0.8b
-; NONEON-NOSVE-NEXT:    uaddlv h0, v0.8b
-; NONEON-NOSVE-NEXT:    str q0, [sp, #80]
-; NONEON-NOSVE-NEXT:    fmov d0, x8
-; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #131]
-; NONEON-NOSVE-NEXT:    cnt v0.8b, v0.8b
-; NONEON-NOSVE-NEXT:    uaddlv h0, v0.8b
-; NONEON-NOSVE-NEXT:    str q0, [sp, #64]
-; NONEON-NOSVE-NEXT:    fmov d0, x8
-; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #130]
-; NONEON-NOSVE-NEXT:    cnt v0.8b, v0.8b
-; NONEON-NOSVE-NEXT:    uaddlv h0, v0.8b
-; NONEON-NOSVE-NEXT:    str q0, [sp, #48]
-; NONEON-NOSVE-NEXT:    fmov d0, x8
-; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #129]
-; NONEON-NOSVE-NEXT:    cnt v0.8b, v0.8b
-; NONEON-NOSVE-NEXT:    uaddlv h0, v0.8b
-; NONEON-NOSVE-NEXT:    str q0, [sp, #32]
-; NONEON-NOSVE-NEXT:    fmov d0, x8
-; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #128]
-; NONEON-NOSVE-NEXT:    cnt v0.8b, v0.8b
-; NONEON-NOSVE-NEXT:    uaddlv h0, v0.8b
-; NONEON-NOSVE-NEXT:    str q0, [sp, #16]
-; NONEON-NOSVE-NEXT:    fmov d0, x8
-; NONEON-NOSVE-NEXT:    ldr w8, [sp, #112]
-; NONEON-NOSVE-NEXT:    strb w8, [sp, #143]
-; NONEON-NOSVE-NEXT:    ldr w8, [sp, #96]
-; NONEON-NOSVE-NEXT:    cnt v0.8b, v0.8b
-; NONEON-NOSVE-NEXT:    strb w8, [sp, #142]
-; NONEON-NOSVE-NEXT:    ldr w8, [sp, #80]
-; NONEON-NOSVE-NEXT:    strb w8, [sp, #141]
-; NONEON-NOSVE-NEXT:    ldr w8, [sp, #64]
-; NONEON-NOSVE-NEXT:    uaddlv h0, v0.8b
-; NONEON-NOSVE-NEXT:    strb w8, [sp, #140]
-; NONEON-NOSVE-NEXT:    ldr w8, [sp, #48]
-; NONEON-NOSVE-NEXT:    strb w8, [sp, #139]
-; NONEON-NOSVE-NEXT:    ldr w8, [sp, #32]
-; NONEON-NOSVE-NEXT:    str q0, [sp]
-; NONEON-NOSVE-NEXT:    strb w8, [sp, #138]
-; NONEON-NOSVE-NEXT:    ldr w8, [sp, #16]
-; NONEON-NOSVE-NEXT:    strb w8, [sp, #137]
-; NONEON-NOSVE-NEXT:    ldr w8, [sp]
-; NONEON-NOSVE-NEXT:    strb w8, [sp, #136]
-; NONEON-NOSVE-NEXT:    ldr d0, [sp, #136]
-; NONEON-NOSVE-NEXT:    add sp, sp, #144
+; NONEON-NOSVE-NEXT:    mov w8, #16843009 // =0x1010101
+; NONEON-NOSVE-NEXT:    str d0, [sp, #-16]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #7]
+; NONEON-NOSVE-NEXT:    lsr w10, w9, #1
+; NONEON-NOSVE-NEXT:    and w10, w10, #0x55555555
+; NONEON-NOSVE-NEXT:    sub w9, w9, w10
+; NONEON-NOSVE-NEXT:    lsr w10, w9, #2
+; NONEON-NOSVE-NEXT:    and w9, w9, #0x33333333
+; NONEON-NOSVE-NEXT:    and w10, w10, #0x33333333
+; NONEON-NOSVE-NEXT:    add w9, w9, w10
+; NONEON-NOSVE-NEXT:    add w9, w9, w9, lsr #4
+; NONEON-NOSVE-NEXT:    and w9, w9, #0xf0f0f0f
+; NONEON-NOSVE-NEXT:    mul w9, w9, w8
+; NONEON-NOSVE-NEXT:    lsr w9, w9, #24
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #15]
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #6]
+; NONEON-NOSVE-NEXT:    lsr w10, w9, #1
+; NONEON-NOSVE-NEXT:    and w10, w10, #0x55555555
+; NONEON-NOSVE-NEXT:    sub w9, w9, w10
+; NONEON-NOSVE-NEXT:    lsr w10, w9, #2
+; NONEON-NOSVE-NEXT:    and w9, w9, #0x33333333
+; NONEON-NOSVE-NEXT:    and w10, w10, #0x33333333
+; NONEON-NOSVE-NEXT:    add w9, w9, w10
+; NONEON-NOSVE-NEXT:    add w9, w9, w9, lsr #4
+; NONEON-NOSVE-NEXT:    and w9, w9, #0xf0f0f0f
+; NONEON-NOSVE-NEXT:    mul w9, w9, w8
+; NONEON-NOSVE-NEXT:    lsr w9, w9, #24
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #14]
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #5]
+; NONEON-NOSVE-NEXT:    lsr w10, w9, #1
+; NONEON-NOSVE-NEXT:    and w10, w10, #0x55555555
+; NONEON-NOSVE-NEXT:    sub w9, w9, w10
+; NONEON-NOSVE-NEXT:    lsr w10, w9, #2
+; NONEON-NOSVE-NEXT:    and w9, w9, #0x33333333
+; NONEON-NOSVE-NEXT:    and w10, w10, #0x33333333
+; NONEON-NOSVE-NEXT:    add w9, w9, w10
+; NONEON-NOSVE-NEXT:    add w9, w9, w9, lsr #4
+; NONEON-NOSVE-NEXT:    and w9, w9, #0xf0f0f0f
+; NONEON-NOSVE-NEXT:    mul w9, w9, w8
+; NONEON-NOSVE-NEXT:    lsr w9, w9, #24
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #13]
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #4]
+; NONEON-NOSVE-NEXT:    lsr w10, w9, #1
+; NONEON-NOSVE-NEXT:    and w10, w10, #0x55555555
+; NONEON-NOSVE-NEXT:    sub w9, w9, w10
+; NONEON-NOSVE-NEXT:    lsr w10, w9, #2
+; NONEON-NOSVE-NEXT:    and w9, w9, #0x33333333
+; NONEON-NOSVE-NEXT:    and w10, w10, #0x33333333
+; NONEON-NOSVE-NEXT:    add w9, w9, w10
+; NONEON-NOSVE-NEXT:    add w9, w9, w9, lsr #4
+; NONEON-NOSVE-NEXT:    and w9, w9, #0xf0f0f0f
+; NONEON-NOSVE-NEXT:    mul w9, w9, w8
+; NONEON-NOSVE-NEXT:    lsr w9, w9, #24
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #12]
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #3]
+; NONEON-NOSVE-NEXT:    lsr w10, w9, #1
+; NONEON-NOSVE-NEXT:    and w10, w10, #0x55555555
+; NONEON-NOSVE-NEXT:    sub w9, w9, w10
+; NONEON-NOSVE-NEXT:    lsr w10, w9, #2
+; NONEON-NOSVE-NEXT:    and w9, w9, #0x33333333
+; NONEON-NOSVE-NEXT:    and w10, w10, #0x33333333
+; NONEON-NOSVE-NEXT:    add w9, w9, w10
+; NONEON-NOSVE-NEXT:    add w9, w9, w9, lsr #4
+; NONEON-NOSVE-NEXT:    and w9, w9, #0xf0f0f0f
+; NONEON-NOSVE-NEXT:    mul w9, w9, w8
+; NONEON-NOSVE-NEXT:    lsr w9, w9, #24
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #11]
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #2]
+; NONEON-NOSVE-NEXT:    lsr w10, w9, #1
+; NONEON-NOSVE-NEXT:    and w10, w10, #0x55555555
+; NONEON-NOSVE-NEXT:    sub w9, w9, w10
+; NONEON-NOSVE-NEXT:    lsr w10, w9, #2
+; NONEON-NOSVE-NEXT:    and w9, w9, #0x33333333
+; NONEON-NOSVE-NEXT:    and w10, w10, #0x33333333
+; NONEON-NOSVE-NEXT:    add w9, w9, w10
+; NONEON-NOSVE-NEXT:    add w9, w9, w9, lsr #4
+; NONEON-NOSVE-NEXT:    and w9, w9, #0xf0f0f0f
+; NONEON-NOSVE-NEXT:    mul w9, w9, w8
+; NONEON-NOSVE-NEXT:    lsr w9, w9, #24
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #10]
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #1]
+; NONEON-NOSVE-NEXT:    lsr w10, w9, #1
+; NONEON-NOSVE-NEXT:    and w10, w10, #0x55555555
+; NONEON-NOSVE-NEXT:    sub w9, w9, w10
+; NONEON-NOSVE-NEXT:    lsr w10, w9, #2
+; NONEON-NOSVE-NEXT:    and w9, w9, #0x33333333
+; NONEON-NOSVE-NEXT:    and w10, w10, #0x33333333
+; NONEON-NOSVE-NEXT:    add w9, w9, w10
+; NONEON-NOSVE-NEXT:    add w9, w9, w9, lsr #4
+; NONEON-NOSVE-NEXT:    and w9, w9, #0xf0f0f0f
+; NONEON-NOSVE-NEXT:    mul w9, w9, w8
+; NONEON-NOSVE-NEXT:    lsr w9, w9, #24
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #9]
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp]
+; NONEON-NOSVE-NEXT:    lsr w10, w9, #1
+; NONEON-NOSVE-NEXT:    and w10, w10, #0x55555555
+; NONEON-NOSVE-NEXT:    sub w9, w9, w10
+; NONEON-NOSVE-NEXT:    lsr w10, w9, #2
+; NONEON-NOSVE-NEXT:    and w9, w9, #0x33333333
+; NONEON-NOSVE-NEXT:    and w10, w10, #0x33333333
+; NONEON-NOSVE-NEXT:    add w9, w9, w10
+; NONEON-NOSVE-NEXT:    add w9, w9, w9, lsr #4
+; NONEON-NOSVE-NEXT:    and w9, w9, #0xf0f0f0f
+; NONEON-NOSVE-NEXT:    mul w8, w9, w8
+; NONEON-NOSVE-NEXT:    lsr w8, w8, #24
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
 ; NONEON-NOSVE-NEXT:    ret
   %res = call <8 x i8> @llvm.ctpop.v8i8(<8 x i8> %op)
   ret <8 x i8> %res
@@ -865,126 +939,219 @@ define <16 x i8> @ctpop_v16i8(<16 x i8> %op) {
 ;
 ; NONEON-NOSVE-LABEL: ctpop_v16i8:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    sub sp, sp, #304
-; NONEON-NOSVE-NEXT:    str x29, [sp, #288] // 8-byte Folded Spill
-; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 304
-; NONEON-NOSVE-NEXT:    .cfi_offset w29, -16
-; NONEON-NOSVE-NEXT:    str q0, [sp, #256]
-; NONEON-NOSVE-NEXT:    ldr x29, [sp, #288] // 8-byte Folded Reload
-; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #271]
-; NONEON-NOSVE-NEXT:    fmov d0, x8
-; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #270]
-; NONEON-NOSVE-NEXT:    cnt v0.8b, v0.8b
-; NONEON-NOSVE-NEXT:    uaddlv h0, v0.8b
-; NONEON-NOSVE-NEXT:    str q0, [sp, #240]
-; NONEON-NOSVE-NEXT:    fmov d0, x8
-; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #269]
-; NONEON-NOSVE-NEXT:    cnt v0.8b, v0.8b
-; NONEON-NOSVE-NEXT:    uaddlv h0, v0.8b
-; NONEON-NOSVE-NEXT:    str q0, [sp, #224]
-; NONEON-NOSVE-NEXT:    fmov d0, x8
-; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #268]
-; NONEON-NOSVE-NEXT:    cnt v0.8b, v0.8b
-; NONEON-NOSVE-NEXT:    uaddlv h0, v0.8b
-; NONEON-NOSVE-NEXT:    str q0, [sp, #208]
-; NONEON-NOSVE-NEXT:    fmov d0, x8
-; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #267]
-; NONEON-NOSVE-NEXT:    cnt v0.8b, v0.8b
-; NONEON-NOSVE-NEXT:    uaddlv h0, v0.8b
-; NONEON-NOSVE-NEXT:    str q0, [sp, #192]
-; NONEON-NOSVE-NEXT:    fmov d0, x8
-; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #266]
-; NONEON-NOSVE-NEXT:    cnt v0.8b, v0.8b
-; NONEON-NOSVE-NEXT:    uaddlv h0, v0.8b
-; NONEON-NOSVE-NEXT:    str q0, [sp, #176]
-; NONEON-NOSVE-NEXT:    fmov d0, x8
-; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #265]
-; NONEON-NOSVE-NEXT:    cnt v0.8b, v0.8b
-; NONEON-NOSVE-NEXT:    uaddlv h0, v0.8b
-; NONEON-NOSVE-NEXT:    str q0, [sp, #160]
-; NONEON-NOSVE-NEXT:    fmov d0, x8
-; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #264]
-; NONEON-NOSVE-NEXT:    cnt v0.8b, v0.8b
-; NONEON-NOSVE-NEXT:    uaddlv h0, v0.8b
-; NONEON-NOSVE-NEXT:    str q0, [sp, #144]
-; NONEON-NOSVE-NEXT:    fmov d0, x8
-; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #263]
-; NONEON-NOSVE-NEXT:    cnt v0.8b, v0.8b
-; NONEON-NOSVE-NEXT:    uaddlv h0, v0.8b
-; NONEON-NOSVE-NEXT:    str q0, [sp, #128]
-; NONEON-NOSVE-NEXT:    fmov d0, x8
-; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #262]
-; NONEON-NOSVE-NEXT:    cnt v0.8b, v0.8b
-; NONEON-NOSVE-NEXT:    uaddlv h0, v0.8b
-; NONEON-NOSVE-NEXT:    str q0, [sp, #112]
-; NONEON-NOSVE-NEXT:    fmov d0, x8
-; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #261]
-; NONEON-NOSVE-NEXT:    cnt v0.8b, v0.8b
-; NONEON-NOSVE-NEXT:    uaddlv h0, v0.8b
-; NONEON-NOSVE-NEXT:    str q0, [sp, #96]
-; NONEON-NOSVE-NEXT:    fmov d0, x8
-; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #260]
-; NONEON-NOSVE-NEXT:    cnt v0.8b, v0.8b
-; NONEON-NOSVE-NEXT:    uaddlv h0, v0.8b
-; NONEON-NOSVE-NEXT:    str q0, [sp, #80]
-; NONEON-NOSVE-NEXT:    fmov d0, x8
-; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #259]
-; NONEON-NOSVE-NEXT:    cnt v0.8b, v0.8b
-; NONEON-NOSVE-NEXT:    uaddlv h0, v0.8b
-; NONEON-NOSVE-NEXT:    str q0, [sp, #64]
-; NONEON-NOSVE-NEXT:    fmov d0, x8
-; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #258]
-; NONEON-NOSVE-NEXT:    cnt v0.8b, v0.8b
-; NONEON-NOSVE-NEXT:    uaddlv h0, v0.8b
-; NONEON-NOSVE-NEXT:    str q0, [sp, #48]
-; NONEON-NOSVE-NEXT:    fmov d0, x8
-; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #257]
-; NONEON-NOSVE-NEXT:    cnt v0.8b, v0.8b
-; NONEON-NOSVE-NEXT:    uaddlv h0, v0.8b
-; NONEON-NOSVE-NEXT:    str q0, [sp, #32]
-; NONEON-NOSVE-NEXT:    fmov d0, x8
-; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #256]
-; NONEON-NOSVE-NEXT:    cnt v0.8b, v0.8b
-; NONEON-NOSVE-NEXT:    uaddlv h0, v0.8b
-; NONEON-NOSVE-NEXT:    str q0, [sp, #16]
-; NONEON-NOSVE-NEXT:    fmov d0, x8
-; NONEON-NOSVE-NEXT:    ldr w8, [sp, #240]
-; NONEON-NOSVE-NEXT:    strb w8, [sp, #287]
-; NONEON-NOSVE-NEXT:    ldr w8, [sp, #224]
-; NONEON-NOSVE-NEXT:    cnt v0.8b, v0.8b
-; NONEON-NOSVE-NEXT:    strb w8, [sp, #286]
-; NONEON-NOSVE-NEXT:    ldr w8, [sp, #208]
-; NONEON-NOSVE-NEXT:    strb w8, [sp, #285]
-; NONEON-NOSVE-NEXT:    ldr w8, [sp, #192]
-; NONEON-NOSVE-NEXT:    uaddlv h0, v0.8b
-; NONEON-NOSVE-NEXT:    strb w8, [sp, #284]
-; NONEON-NOSVE-NEXT:    ldr w8, [sp, #176]
-; NONEON-NOSVE-NEXT:    strb w8, [sp, #283]
-; NONEON-NOSVE-NEXT:    ldr w8, [sp, #160]
-; NONEON-NOSVE-NEXT:    str q0, [sp]
-; NONEON-NOSVE-NEXT:    strb w8, [sp, #282]
-; NONEON-NOSVE-NEXT:    ldr w8, [sp, #144]
-; NONEON-NOSVE-NEXT:    strb w8, [sp, #281]
-; NONEON-NOSVE-NEXT:    ldr w8, [sp, #128]
-; NONEON-NOSVE-NEXT:    strb w8, [sp, #280]
-; NONEON-NOSVE-NEXT:    ldr w8, [sp, #112]
-; NONEON-NOSVE-NEXT:    strb w8, [sp, #279]
-; NONEON-NOSVE-NEXT:    ldr w8, [sp, #96]
-; NONEON-NOSVE-NEXT:    strb w8, [sp, #278]
-; NONEON-NOSVE-NEXT:    ldr w8, [sp, #80]
-; NONEON-NOSVE-NEXT:    strb w8, [sp, #277]
-; NONEON-NOSVE-NEXT:    ldr w8, [sp, #64]
-; NONEON-NOSVE-NEXT:    strb w8, [sp, #276]
-; NONEON-NOSVE-NEXT:    ldr w8, [sp, #48]
-; NONEON-NOSVE-NEXT:    strb w8, [sp, #275]
-; NONEON-NOSVE-NEXT:    ldr w8, [sp, #32]
-; NONEON-NOSVE-NEXT:    strb w8, [sp, #274]
-; NONEON-NOSVE-NEXT:    ldr w8, [sp, #16]
-; NONEON-NOSVE-NEXT:    strb w8, [sp, #273]
-; NONEON-NOSVE-NEXT:    ldr w8, [sp]
-; NONEON-NOSVE-NEXT:    strb w8, [sp, #272]
-; NONEON-NOSVE-NEXT:    ldr q0, [sp, #272]
-; NONEON-NOSVE-NEXT:    add sp, sp, #304
+; NONEON-NOSVE-NEXT:    mov w8, #16843009 // =0x1010101
+; NONEON-NOSVE-NEXT:    str q0, [sp, #-32]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #15]
+; NONEON-NOSVE-NEXT:    lsr w10, w9, #1
+; NONEON-NOSVE-NEXT:    and w10, w10, #0x55555555
+; NONEON-NOSVE-NEXT:    sub w9, w9, w10
+; NONEON-NOSVE-NEXT:    lsr w10, w9, #2
+; NONEON-NOSVE-NEXT:    and w9, w9, #0x33333333
+; NONEON-NOSVE-NEXT:    and w10, w10, #0x33333333
+; NONEON-NOSVE-NEXT:    add w9, w9, w10
+; NONEON-NOSVE-NEXT:    add w9, w9, w9, lsr #4
+; NONEON-NOSVE-NEXT:    and w9, w9, #0xf0f0f0f
+; NONEON-NOSVE-NEXT:    mul w9, w9, w8
+; NONEON-NOSVE-NEXT:    lsr w9, w9, #24
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #31]
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #14]
+; NONEON-NOSVE-NEXT:    lsr w10, w9, #1
+; NONEON-NOSVE-NEXT:    and w10, w10, #0x55555555
+; NONEON-NOSVE-NEXT:    sub w9, w9, w10
+; NONEON-NOSVE-NEXT:    lsr w10, w9, #2
+; NONEON-NOSVE-NEXT:    and w9, w9, #0x33333333
+; NONEON-NOSVE-NEXT:    and w10, w10, #0x33333333
+; NONEON-NOSVE-NEXT:    add w9, w9, w10
+; NONEON-NOSVE-NEXT:    add w9, w9, w9, lsr #4
+; NONEON-NOSVE-NEXT:    and w9, w9, #0xf0f0f0f
+; NONEON-NOSVE-NEXT:    mul w9, w9, w8
+; NONEON-NOSVE-NEXT:    lsr w9, w9, #24
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #30]
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #13]
+; NONEON-NOSVE-NEXT:    lsr w10, w9, #1
+; NONEON-NOSVE-NEXT:    and w10, w10, #0x55555555
+; NONEON-NOSVE-NEXT:    ...
[truncated]

@sdesmalen-arm sdesmalen-arm merged commit f484c79 into llvm:main May 31, 2024
9 checks passed
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
Projects
None yet
Development

Successfully merging this pull request may close these issues.

3 participants