[LoongArch] lower vector shuffle to shift if possible #132866


Merged: @tangaac merged 4 commits into llvm:main from vector-byte-shift on Apr 10, 2025

Conversation

@tangaac (Contributor) commented Mar 25, 2025

No description provided.

@llvmbot (Member) commented Mar 25, 2025

@llvm/pr-subscribers-backend-loongarch

Author: None (tangaac)

Changes

Patch is 62.01 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/132866.diff

6 Files Affected:

  • (modified) llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp (+129-6)
  • (modified) llvm/lib/Target/LoongArch/LoongArchISelLowering.h (+9-1)
  • (modified) llvm/lib/Target/LoongArch/LoongArchLSXInstrInfo.td (+50)
  • (modified) llvm/test/CodeGen/LoongArch/lsx/build-vector.ll (+2-5)
  • (modified) llvm/test/CodeGen/LoongArch/lsx/vec-shuffle-bit-shift.ll (+22-96)
  • (modified) llvm/test/CodeGen/LoongArch/lsx/vec-shuffle-byte-shift.ll (+51-225)
diff --git a/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp b/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp
index 8d80a1ba55bcb..269921d80091e 100644
--- a/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp
+++ b/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp
@@ -519,6 +519,121 @@ SDValue LoongArchTargetLowering::lowerBITREVERSE(SDValue Op,
   }
 }
 
+/// Attempts to match a shuffle mask against the VBSLL, VBSRL, VSLLI and VSRLI
+/// instructions.
+// The function matches elements from one of the input vectors shuffled to the
+// left or right, with zeroable elements 'shifted in'. It handles both the
+// strictly bit-wise element shifts and the byte shift across an entire 128-bit
+// lane.
+// Mostly copied from X86.
+static int matchShuffleAsShift(MVT &ShiftVT, unsigned &Opcode,
+                               unsigned ScalarSizeInBits, ArrayRef<int> Mask,
+                               int MaskOffset, const APInt &Zeroable) {
+  int Size = Mask.size();
+  unsigned SizeInBits = Size * ScalarSizeInBits;
+
+  auto CheckZeros = [&](int Shift, int Scale, bool Left) {
+    for (int i = 0; i < Size; i += Scale)
+      for (int j = 0; j < Shift; ++j)
+        if (!Zeroable[i + j + (Left ? 0 : (Scale - Shift))])
+          return false;
+
+    return true;
+  };
+
+  auto isSequentialOrUndefInRange = [&](unsigned Pos, unsigned Size, int Low,
+                                        int Step = 1) {
+    for (unsigned i = Pos, e = Pos + Size; i != e; ++i, Low += Step)
+      if (!(Mask[i] == -1 || Mask[i] == Low))
+        return false;
+    return true;
+  };
+
+  auto MatchShift = [&](int Shift, int Scale, bool Left) {
+    for (int i = 0; i != Size; i += Scale) {
+      unsigned Pos = Left ? i + Shift : i;
+      unsigned Low = Left ? i : i + Shift;
+      unsigned Len = Scale - Shift;
+      if (!isSequentialOrUndefInRange(Pos, Len, Low + MaskOffset))
+        return -1;
+    }
+
+    int ShiftEltBits = ScalarSizeInBits * Scale;
+    bool ByteShift = ShiftEltBits > 64;
+    Opcode = Left ? (ByteShift ? LoongArchISD::VBSLL : LoongArchISD::VSLLI)
+                  : (ByteShift ? LoongArchISD::VBSRL : LoongArchISD::VSRLI);
+    int ShiftAmt = Shift * ScalarSizeInBits / (ByteShift ? 8 : 1);
+
+    // Normalize the scale for byte shifts to still produce an i64 element
+    // type.
+    Scale = ByteShift ? Scale / 2 : Scale;
+
+    // We need to round trip through the appropriate type for the shift.
+    MVT ShiftSVT = MVT::getIntegerVT(ScalarSizeInBits * Scale);
+    ShiftVT = ByteShift ? MVT::getVectorVT(MVT::i8, SizeInBits / 8)
+                        : MVT::getVectorVT(ShiftSVT, Size / Scale);
+    return (int)ShiftAmt;
+  };
+
+  unsigned MaxWidth = 128;
+  for (int Scale = 2; Scale * ScalarSizeInBits <= MaxWidth; Scale *= 2)
+    for (int Shift = 1; Shift != Scale; ++Shift)
+      for (bool Left : {true, false})
+        if (CheckZeros(Shift, Scale, Left)) {
+          int ShiftAmt = MatchShift(Shift, Scale, Left);
+          if (0 < ShiftAmt)
+            return ShiftAmt;
+        }
+
+  // no match
+  return -1;
+}
+
+/// Lower VECTOR_SHUFFLE as shift (if possible).
+///
+/// For example:
+///   %2 = shufflevector <4 x i32> %0, <4 x i32> zeroinitializer,
+///                      <4 x i32> <i32 4, i32 0, i32 1, i32 2>
+/// is lowered to:
+///     (VBSLL_V $v0, $v0, 4)
+///
+///   %2 = shufflevector <4 x i32> %0, <4 x i32> zeroinitializer,
+///                      <4 x i32> <i32 4, i32 0, i32 4, i32 2>
+/// is lowered to:
+///     (VSLLI_D $v0, $v0, 32)
+static SDValue lowerVECTOR_SHUFFLEAsShift(const SDLoc &DL, ArrayRef<int> Mask,
+                                          MVT VT, SDValue V1, SDValue V2,
+                                          SelectionDAG &DAG,
+                                          const APInt &Zeroable) {
+  int Size = Mask.size();
+  assert(Size == (int)VT.getVectorNumElements() && "Unexpected mask size");
+
+  MVT ShiftVT;
+  SDValue V = V1;
+  unsigned Opcode;
+
+  // Try to match shuffle against V1 shift.
+  int ShiftAmt = matchShuffleAsShift(ShiftVT, Opcode, VT.getScalarSizeInBits(),
+                                     Mask, 0, Zeroable);
+
+  // If V1 failed, try to match shuffle against V2 shift.
+  if (ShiftAmt < 0) {
+    ShiftAmt = matchShuffleAsShift(ShiftVT, Opcode, VT.getScalarSizeInBits(),
+                                   Mask, Size, Zeroable);
+    V = V2;
+  }
+
+  if (ShiftAmt < 0)
+    return SDValue();
+
+  assert(DAG.getTargetLoweringInfo().isTypeLegal(ShiftVT) &&
+         "Illegal integer vector type");
+  V = DAG.getBitcast(ShiftVT, V);
+  V = DAG.getNode(Opcode, DL, ShiftVT, V,
+                  DAG.getConstant(ShiftAmt, DL, MVT::i64));
+  return DAG.getBitcast(VT, V);
+}
+
 /// Determine whether a range fits a regular pattern of values.
 /// This function accounts for the possibility of jumping over the End iterator.
 template <typename ValType>
@@ -587,14 +702,12 @@ static void computeZeroableShuffleElements(ArrayRef<int> Mask, SDValue V1,
 static SDValue lowerVECTOR_SHUFFLEAsZeroOrAnyExtend(const SDLoc &DL,
                                                     ArrayRef<int> Mask, MVT VT,
                                                     SDValue V1, SDValue V2,
-                                                    SelectionDAG &DAG) {
+                                                    SelectionDAG &DAG,
+                                                    const APInt &Zeroable) {
   int Bits = VT.getSizeInBits();
   int EltBits = VT.getScalarSizeInBits();
   int NumElements = VT.getVectorNumElements();
 
-  APInt KnownUndef, KnownZero;
-  computeZeroableShuffleElements(Mask, V1, V2, KnownUndef, KnownZero);
-  APInt Zeroable = KnownUndef | KnownZero;
   if (Zeroable.isAllOnes())
     return DAG.getConstant(0, DL, VT);
 
@@ -1056,6 +1169,10 @@ static SDValue lower128BitShuffle(const SDLoc &DL, ArrayRef<int> Mask, MVT VT,
          "Unexpected mask size for shuffle!");
   assert(Mask.size() % 2 == 0 && "Expected even mask size.");
 
+  APInt KnownUndef, KnownZero;
+  computeZeroableShuffleElements(Mask, V1, V2, KnownUndef, KnownZero);
+  APInt Zeroable = KnownUndef | KnownZero;
+
   SDValue Result;
   // TODO: Add more comparison patterns.
   if (V2.isUndef()) {
@@ -1083,12 +1200,14 @@ static SDValue lower128BitShuffle(const SDLoc &DL, ArrayRef<int> Mask, MVT VT,
     return Result;
   if ((Result = lowerVECTOR_SHUFFLE_VPICKOD(DL, Mask, VT, V1, V2, DAG)))
     return Result;
+  if ((Result = lowerVECTOR_SHUFFLEAsZeroOrAnyExtend(DL, Mask, VT, V1, V2, DAG,
+                                                     Zeroable)))
+    return Result;
   if ((Result =
-           lowerVECTOR_SHUFFLEAsZeroOrAnyExtend(DL, Mask, VT, V1, V2, DAG)))
+           lowerVECTOR_SHUFFLEAsShift(DL, Mask, VT, V1, V2, DAG, Zeroable)))
     return Result;
   if ((Result = lowerVECTOR_SHUFFLE_VSHUF(DL, Mask, VT, V1, V2, DAG)))
     return Result;
-
   return SDValue();
 }
 
@@ -4997,6 +5116,10 @@ const char *LoongArchTargetLowering::getTargetNodeName(unsigned Opcode) const {
     NODE_NAME_CASE(VANY_NONZERO)
     NODE_NAME_CASE(FRECIPE)
     NODE_NAME_CASE(FRSQRTE)
+    NODE_NAME_CASE(VSLLI)
+    NODE_NAME_CASE(VSRLI)
+    NODE_NAME_CASE(VBSLL)
+    NODE_NAME_CASE(VBSRL)
   }
 #undef NODE_NAME_CASE
   return nullptr;
diff --git a/llvm/lib/Target/LoongArch/LoongArchISelLowering.h b/llvm/lib/Target/LoongArch/LoongArchISelLowering.h
index 002fad0e20759..52d88b9b24a6b 100644
--- a/llvm/lib/Target/LoongArch/LoongArchISelLowering.h
+++ b/llvm/lib/Target/LoongArch/LoongArchISelLowering.h
@@ -147,7 +147,15 @@ enum NodeType : unsigned {
 
   // Floating point approximate reciprocal operation
   FRECIPE,
-  FRSQRTE
+  FRSQRTE,
+
+  // Vector logical left / right shift by immediate
+  VSLLI,
+  VSRLI,
+
+  // Vector byte logical left / right shift
+  VBSLL,
+  VBSRL
 
   // Intrinsic operations end =============================================
 };
diff --git a/llvm/lib/Target/LoongArch/LoongArchLSXInstrInfo.td b/llvm/lib/Target/LoongArch/LoongArchLSXInstrInfo.td
index d2063a8aaae9b..ecbcd29d88aac 100644
--- a/llvm/lib/Target/LoongArch/LoongArchLSXInstrInfo.td
+++ b/llvm/lib/Target/LoongArch/LoongArchLSXInstrInfo.td
@@ -58,6 +58,12 @@ def loongarch_vreplgr2vr: SDNode<"LoongArchISD::VREPLGR2VR", SDT_LoongArchVreplg
 def loongarch_vfrecipe: SDNode<"LoongArchISD::FRECIPE", SDT_LoongArchVFRECIPE>;
 def loongarch_vfrsqrte: SDNode<"LoongArchISD::FRSQRTE", SDT_LoongArchVFRSQRTE>;
 
+def loongarch_vslli : SDNode<"LoongArchISD::VSLLI", SDT_LoongArchV1RUimm>;
+def loongarch_vsrli : SDNode<"LoongArchISD::VSRLI", SDT_LoongArchV1RUimm>;
+
+def loongarch_vbsll : SDNode<"LoongArchISD::VBSLL", SDT_LoongArchV1RUimm>;
+def loongarch_vbsrl : SDNode<"LoongArchISD::VBSRL", SDT_LoongArchV1RUimm>;
+
 def immZExt1 : ImmLeaf<i64, [{return isUInt<1>(Imm);}]>;
 def immZExt2 : ImmLeaf<i64, [{return isUInt<2>(Imm);}]>;
 def immZExt3 : ImmLeaf<i64, [{return isUInt<3>(Imm);}]>;
@@ -1494,15 +1500,59 @@ def : Pat<(or (v16i8 LSX128:$vj), (v16i8 (SplatPat_uimm8 uimm8:$imm))),
 def : Pat<(xor (v16i8 LSX128:$vj), (v16i8 (SplatPat_uimm8 uimm8:$imm))),
           (VXORI_B LSX128:$vj, uimm8:$imm)>;
 
+// VBSLL_V
+def : Pat<(loongarch_vbsll v16i8:$vj, uimm5:$imm), (VBSLL_V v16i8:$vj,
+                                                       uimm5:$imm)>;
+def : Pat<(loongarch_vbsll v8i16:$vj, uimm5:$imm), (VBSLL_V v8i16:$vj,
+                                                       uimm5:$imm)>;
+def : Pat<(loongarch_vbsll v4i32:$vj, uimm5:$imm), (VBSLL_V v4i32:$vj,
+                                                       uimm5:$imm)>;
+def : Pat<(loongarch_vbsll v2i64:$vj, uimm5:$imm), (VBSLL_V v2i64:$vj,
+                                                       uimm5:$imm)>;
+def : Pat<(loongarch_vbsll v4f32:$vj, uimm5:$imm), (VBSLL_V v4f32:$vj,
+                                                       uimm5:$imm)>;
+def : Pat<(loongarch_vbsll v2f64:$vj, uimm5:$imm), (VBSLL_V v2f64:$vj,
+                                                       uimm5:$imm)>;
+
+// VBSRL_V
+def : Pat<(loongarch_vbsrl v16i8:$vj, uimm5:$imm), (VBSRL_V v16i8:$vj,
+                                                       uimm5:$imm)>;
+def : Pat<(loongarch_vbsrl v8i16:$vj, uimm5:$imm), (VBSRL_V v8i16:$vj,
+                                                       uimm5:$imm)>;
+def : Pat<(loongarch_vbsrl v4i32:$vj, uimm5:$imm), (VBSRL_V v4i32:$vj,
+                                                       uimm5:$imm)>;
+def : Pat<(loongarch_vbsrl v2i64:$vj, uimm5:$imm), (VBSRL_V v2i64:$vj,
+                                                       uimm5:$imm)>;
+def : Pat<(loongarch_vbsrl v4f32:$vj, uimm5:$imm), (VBSRL_V v4f32:$vj,
+                                                       uimm5:$imm)>;
+def : Pat<(loongarch_vbsrl v2f64:$vj, uimm5:$imm), (VBSRL_V v2f64:$vj,
+                                                       uimm5:$imm)>;
+
 // VSLL[I]_{B/H/W/D}
 defm : PatVrVr<shl, "VSLL">;
 defm : PatShiftVrVr<shl, "VSLL">;
 defm : PatShiftVrUimm<shl, "VSLLI">;
+def : Pat<(loongarch_vslli v16i8:$vj, uimm3:$imm), (VSLLI_B v16i8:$vj,
+                                                       uimm3:$imm)>;
+def : Pat<(loongarch_vslli v8i16:$vj, uimm4:$imm), (VSLLI_H v8i16:$vj,
+                                                       uimm4:$imm)>;
+def : Pat<(loongarch_vslli v4i32:$vj, uimm5:$imm), (VSLLI_W v4i32:$vj,
+                                                       uimm5:$imm)>;
+def : Pat<(loongarch_vslli v2i64:$vj, uimm6:$imm), (VSLLI_D v2i64:$vj,
+                                                       uimm6:$imm)>;
 
 // VSRL[I]_{B/H/W/D}
 defm : PatVrVr<srl, "VSRL">;
 defm : PatShiftVrVr<srl, "VSRL">;
 defm : PatShiftVrUimm<srl, "VSRLI">;
+def : Pat<(loongarch_vsrli v16i8:$vj, uimm3:$imm), (VSRLI_B v16i8:$vj,
+                                                       uimm3:$imm)>;
+def : Pat<(loongarch_vsrli v8i16:$vj, uimm4:$imm), (VSRLI_H v8i16:$vj,
+                                                       uimm4:$imm)>;
+def : Pat<(loongarch_vsrli v4i32:$vj, uimm5:$imm), (VSRLI_W v4i32:$vj,
+                                                       uimm5:$imm)>;
+def : Pat<(loongarch_vsrli v2i64:$vj, uimm6:$imm), (VSRLI_D v2i64:$vj,
+                                                       uimm6:$imm)>;
 
 // VSRA[I]_{B/H/W/D}
 defm : PatVrVr<sra, "VSRA">;
diff --git a/llvm/test/CodeGen/LoongArch/lsx/build-vector.ll b/llvm/test/CodeGen/LoongArch/lsx/build-vector.ll
index 984b6f3d74866..d84e408cd28be 100644
--- a/llvm/test/CodeGen/LoongArch/lsx/build-vector.ll
+++ b/llvm/test/CodeGen/LoongArch/lsx/build-vector.ll
@@ -374,11 +374,8 @@ define void @extract1_i32_zext_insert0_i64_undef(ptr %src, ptr %dst) nounwind {
 ; CHECK-LABEL: extract1_i32_zext_insert0_i64_undef:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vld $vr0, $a0, 0
-; CHECK-NEXT:    pcalau12i $a0, %pc_hi20(.LCPI24_0)
-; CHECK-NEXT:    vld $vr1, $a0, %pc_lo12(.LCPI24_0)
-; CHECK-NEXT:    vrepli.b $vr2, 0
-; CHECK-NEXT:    vshuf.w $vr1, $vr2, $vr0
-; CHECK-NEXT:    vst $vr1, $a1, 0
+; CHECK-NEXT:    vsrli.d $vr0, $vr0, 32
+; CHECK-NEXT:    vst $vr0, $a1, 0
 ; CHECK-NEXT:    ret
   %v = load volatile <4 x i32>, ptr %src
   %e = extractelement <4 x i32> %v, i32 1
diff --git a/llvm/test/CodeGen/LoongArch/lsx/vec-shuffle-bit-shift.ll b/llvm/test/CodeGen/LoongArch/lsx/vec-shuffle-bit-shift.ll
index b590103511847..48f18a35a38c4 100644
--- a/llvm/test/CodeGen/LoongArch/lsx/vec-shuffle-bit-shift.ll
+++ b/llvm/test/CodeGen/LoongArch/lsx/vec-shuffle-bit-shift.ll
@@ -4,10 +4,7 @@
 define <16 x i8> @shuffle_to_vslli_h_8(<16 x i8> %a) nounwind {
 ; CHECK-LABEL: shuffle_to_vslli_h_8:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    pcalau12i $a0, %pc_hi20(.LCPI0_0)
-; CHECK-NEXT:    vld $vr1, $a0, %pc_lo12(.LCPI0_0)
-; CHECK-NEXT:    vrepli.b $vr2, 0
-; CHECK-NEXT:    vshuf.b $vr0, $vr2, $vr0, $vr1
+; CHECK-NEXT:    vslli.h $vr0, $vr0, 8
 ; CHECK-NEXT:    ret
   %shuffle = shufflevector <16 x i8> %a, <16 x i8> zeroinitializer, <16 x i32><i32 16, i32 0, i32 16, i32 2, i32 16, i32 4, i32 16, i32 6, i32 16, i32 8, i32 16, i32 10, i32 16, i32 12, i32 16, i32 14>
   ret <16 x i8> %shuffle
@@ -16,10 +13,7 @@ define <16 x i8> @shuffle_to_vslli_h_8(<16 x i8> %a) nounwind {
 define <16 x i8> @shuffle_to_vsrli_h_8(<16 x i8> %a) nounwind {
 ; CHECK-LABEL: shuffle_to_vsrli_h_8:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    pcalau12i $a0, %pc_hi20(.LCPI1_0)
-; CHECK-NEXT:    vld $vr1, $a0, %pc_lo12(.LCPI1_0)
-; CHECK-NEXT:    vrepli.b $vr2, 0
-; CHECK-NEXT:    vshuf.b $vr0, $vr2, $vr0, $vr1
+; CHECK-NEXT:    vsrli.h $vr0, $vr0, 8
 ; CHECK-NEXT:    ret
   %shuffle = shufflevector <16 x i8> %a, <16 x i8> zeroinitializer, <16 x i32><i32 1, i32 16, i32 3, i32 16, i32 5, i32 16, i32 7, i32 16, i32 9, i32 16, i32 11, i32 16, i32 13, i32 16, i32 15, i32 16>
   ret <16 x i8> %shuffle
@@ -28,10 +22,7 @@ define <16 x i8> @shuffle_to_vsrli_h_8(<16 x i8> %a) nounwind {
 define <16 x i8> @shuffle_to_vslli_w_8(<16 x i8> %a) nounwind {
 ; CHECK-LABEL: shuffle_to_vslli_w_8:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    pcalau12i $a0, %pc_hi20(.LCPI2_0)
-; CHECK-NEXT:    vld $vr1, $a0, %pc_lo12(.LCPI2_0)
-; CHECK-NEXT:    vrepli.b $vr2, 0
-; CHECK-NEXT:    vshuf.b $vr0, $vr2, $vr0, $vr1
+; CHECK-NEXT:    vslli.w $vr0, $vr0, 8
 ; CHECK-NEXT:    ret
   %shuffle = shufflevector <16 x i8> %a, <16 x i8> zeroinitializer, <16 x i32><i32 16, i32 0, i32 1, i32 2, i32 16, i32 4, i32 5, i32 6, i32 16, i32 8, i32 9, i32 10, i32 16, i32 12, i32 13, i32 14>
   ret <16 x i8> %shuffle
@@ -40,10 +31,7 @@ define <16 x i8> @shuffle_to_vslli_w_8(<16 x i8> %a) nounwind {
 define <16 x i8> @shuffle_to_vsrli_w_8(<16 x i8> %a) nounwind {
 ; CHECK-LABEL: shuffle_to_vsrli_w_8:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    pcalau12i $a0, %pc_hi20(.LCPI3_0)
-; CHECK-NEXT:    vld $vr1, $a0, %pc_lo12(.LCPI3_0)
-; CHECK-NEXT:    vrepli.b $vr2, 0
-; CHECK-NEXT:    vshuf.b $vr0, $vr2, $vr0, $vr1
+; CHECK-NEXT:    vsrli.w $vr0, $vr0, 8
 ; CHECK-NEXT:    ret
   %shuffle = shufflevector <16 x i8> %a, <16 x i8> zeroinitializer, <16 x i32><i32 1, i32 2, i32 3, i32 16, i32 5, i32 6, i32 7, i32 16, i32 9, i32 10, i32 11, i32 16, i32 13, i32 14, i32 15, i32 16>
   ret <16 x i8> %shuffle
@@ -52,11 +40,7 @@ define <16 x i8> @shuffle_to_vsrli_w_8(<16 x i8> %a) nounwind {
 define <8 x i16> @shuffle_to_vslli_w_16(<8 x i16> %a) nounwind {
 ; CHECK-LABEL: shuffle_to_vslli_w_16:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    pcalau12i $a0, %pc_hi20(.LCPI4_0)
-; CHECK-NEXT:    vld $vr1, $a0, %pc_lo12(.LCPI4_0)
-; CHECK-NEXT:    vrepli.b $vr2, 0
-; CHECK-NEXT:    vshuf.h $vr1, $vr2, $vr0
-; CHECK-NEXT:    vori.b $vr0, $vr1, 0
+; CHECK-NEXT:    vslli.w $vr0, $vr0, 16
 ; CHECK-NEXT:    ret
   %shuffle = shufflevector <8 x i16> %a, <8 x i16> zeroinitializer, <8 x i32><i32 8, i32 0, i32 8, i32 2, i32 8, i32 4, i32 8, i32 6>
   ret <8 x i16> %shuffle
@@ -65,11 +49,7 @@ define <8 x i16> @shuffle_to_vslli_w_16(<8 x i16> %a) nounwind {
 define <8 x i16> @shuffle_to_vsrli_w_16(<8 x i16> %a) nounwind {
 ; CHECK-LABEL: shuffle_to_vsrli_w_16:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    pcalau12i $a0, %pc_hi20(.LCPI5_0)
-; CHECK-NEXT:    vld $vr1, $a0, %pc_lo12(.LCPI5_0)
-; CHECK-NEXT:    vrepli.b $vr2, 0
-; CHECK-NEXT:    vshuf.h $vr1, $vr2, $vr0
-; CHECK-NEXT:    vori.b $vr0, $vr1, 0
+; CHECK-NEXT:    vsrli.w $vr0, $vr0, 16
 ; CHECK-NEXT:    ret
   %shuffle = shufflevector <8 x i16> %a, <8 x i16> zeroinitializer, <8 x i32><i32 1, i32 8, i32 3, i32 8, i32 5, i32 8, i32 7, i32 8>
   ret <8 x i16> %shuffle
@@ -78,10 +58,7 @@ define <8 x i16> @shuffle_to_vsrli_w_16(<8 x i16> %a) nounwind {
 define <16 x i8> @shuffle_to_vslli_w_24(<16 x i8> %a) nounwind {
 ; CHECK-LABEL: shuffle_to_vslli_w_24:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    pcalau12i $a0, %pc_hi20(.LCPI6_0)
-; CHECK-NEXT:    vld $vr1, $a0, %pc_lo12(.LCPI6_0)
-; CHECK-NEXT:    vrepli.b $vr2, 0
-; CHECK-NEXT:    vshuf.b $vr0, $vr2, $vr0, $vr1
+; CHECK-NEXT:    vslli.w $vr0, $vr0, 24
 ; CHECK-NEXT:    ret
   %shuffle = shufflevector <16 x i8> %a, <16 x i8> zeroinitializer, <16 x i32><i32 16, i32 16, i32 16, i32 0, i32 16, i32 16, i32 16, i32 4, i32 16, i32 16, i32 16, i32 8, i32 16, i32 16, i32 16, i32 12>
   ret <16 x i8> %shuffle
@@ -90,10 +67,7 @@ define <16 x i8> @shuffle_to_vslli_w_24(<16 x i8> %a) nounwind {
 define <16 x i8> @shuffle_to_vsrli_w_24(<16 x i8> %a) nounwind {
 ; CHECK-LABEL: shuffle_to_vsrli_w_24:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    pcalau12i $a0, %pc_hi20(.LCPI7_0)
-; CHECK-NEXT:    vld $vr1, $a0, %pc_lo12(.LCPI7_0)
-; CHECK-NEXT:    vrepli.b $vr2, 0
-; CHECK-NEXT:    vshuf.b $vr0, $vr2, $vr0, $vr1
+; CHECK-NEXT:    vsrli.w $vr0, $vr0, 24
 ; CHECK-NEXT:    ret
   %shuffle = shufflevector <16 x i8> %a, <16 x i8> zeroinitializer, <16 x i32><i32 3, i32 16, i32 16, i32 16, i32 7, i32 16, i32 16, i32 16, i32 11, i32 16, i32 16, i32 16, i32 15, i32 16, i32 16, i32 16>
   ret <16 x i8> %shuffle
@@ -102,10 +76,7 @@ define <16 x i8> @shuffle_to_vsrli_w_24(<16 x i8> %a) nounwind {
 define <16 x i8> @shuffle_to_vslli_d_8(<16 x i8> %a) nounwind {
 ; CHECK-LABEL: shuffle_to_vslli_d_8:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    pcalau12i $a0, %pc_hi20(.LCPI8_0)
-; CHECK-NEXT:    vld $vr1, $a0, %pc_lo12(.LCPI8_0)
-; CHECK-NEXT:    vrepli.b $vr2, 0
-; CHECK-NEXT:    vshuf.b $vr0, $vr2, $vr0, $vr1
+; CHECK-NEXT:    vslli.d $vr0, $vr0, 8
 ; CHECK-NEXT:    ret
   %shuffle = shufflevector <16 x i8> %a, <16 x i8> zeroinitializer, <16 x i32><i32 16, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 16, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14>
   ret <16 x i8> %shuffle
@@ -114,10 +85,7 @@ define <16 x i8> @shuffle_to_vslli_d_8(<16 x i8> %a) nounwind {
 define <16 x i8> @shuffle_to_vsrli_d_8(<16 x i8> %a) nounwind {
 ; CHECK-LABEL: shuffle_to_vsrli_d_8:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    pcalau12i $a0, %pc_hi20(.LCPI9_0)
-; CHECK-NEXT:    vld $vr1, $a0, %pc_lo12(.LCPI9_0)
-; CHECK-NEXT:    vrepli.b $vr2, 0
-; CHECK-NEXT:    vshuf.b $vr0, $vr2, $vr0, $vr1
+; CHECK-NEXT:    vsrli.d $vr0, $vr0, 8
 ; CHECK-NEXT:    ret
   %shuffle = shufflevector <16 x i8> %a, <16 x i8> zeroinitializer, <16 x i32><i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16>
   ret <16 x i8> %shuffle
@@ -126,11 +94,7 @@ define <16 x i8> @shuffle_to_vsrli_d_8(<16 x i8> %a) nounwind {
 define <8 x i16> @shuffle_to_vslli_d_16(<8 x i16> %a) nounwind {
 ; CHECK-LABEL: shuffle_to_vslli_d_...
[truncated]
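
For orientation while the rest of the diff is truncated: the byte-shift tests follow the same shape as the bit-shift tests shown above. Below is a minimal sketch of that pattern (hypothetical function name and shift amount, not taken verbatim from the patch; the CHECK prefix is assumed to match the other LSX tests), where a shuffle with zeroinitializer moves whole bytes across the 128-bit lane and is expected to lower to a single vbsll.v:

define <16 x i8> @shuffle_to_vbsll_v_4(<16 x i8> %a) nounwind {
; CHECK-LABEL: shuffle_to_vbsll_v_4:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vbsll.v $vr0, $vr0, 4
; CHECK-NEXT:    ret
  %shuffle = shufflevector <16 x i8> %a, <16 x i8> zeroinitializer, <16 x i32> <i32 16, i32 16, i32 16, i32 16, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
  ret <16 x i8> %shuffle
}

Here the first four result bytes come from the zero operand (mask index 16 and above select the zeroinitializer operand), so the shuffle is equivalent to shifting %a left by four bytes across the 128-bit lane.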

@SixWeining (Contributor) left a comment


Very nice! A couple of questions:

  • Do any workloads or benchmarks benefit from this change? If yes, could you post the data?
  • Does this optimization apply to 256-bit vectors?

@SixWeining (Contributor) commented

About the title, I think it's better to add "if possible" at the end.

Also, could you add some explanation of the implementation to the commit message?

@tangaac changed the title from "[LoongArch] lower vector shuffle to shift" to "[LoongArch] lower vector shuffle to shift if possible" on Mar 26, 2025
@tangaac (Contributor, Author) commented Mar 28, 2025

Very nice! A couple of questions:

  • Do any workloads or benchmarks benefit from this change? If yes, could you post the data?
  • Does this optimization apply to 256-bit vectors?
  1. 18 files under llvm-test-suite benefit from this change, each removing at least one constant-pool access:
MultiSource/Applications/ClamAV/CMakeFiles/clamscan.dir/shared_sha256.s
MultiSource/Applications/JM/lencod/CMakeFiles/lencod.dir/transform8x8.s
MultiSource/Benchmarks/7zip/CMakeFiles/7zip-benchmark.dir/C/Ppmd8.s
MultiSource/Benchmarks/7zip/CMakeFiles/7zip-benchmark.dir/C/Sha256.s
MultiSource/Benchmarks/7zip/CMakeFiles/7zip-benchmark.dir/CPP/7zip/Archive/Zip/ZipIn.s
MultiSource/Benchmarks/7zip/CMakeFiles/7zip-benchmark.dir/CPP/7zip/Crypto/HmacSha1.s
MultiSource/Benchmarks/MiBench/telecomm-gsm/CMakeFiles/telecomm-gsm.dir/long_term.s
MultiSource/Benchmarks/MiBench/telecomm-gsm/CMakeFiles/telecomm-gsm.dir/lpc.s
MultiSource/Benchmarks/Prolangs-C/TimberWolfMC/CMakeFiles/timberwolfmc.dir/readcells.s
MultiSource/Benchmarks/mediabench/gsm/toast/CMakeFiles/toast.dir/long_term.s
MultiSource/Benchmarks/mediabench/gsm/toast/CMakeFiles/toast.dir/lpc.s
MultiSource/Benchmarks/nbench/CMakeFiles/nbench.dir/nbench1.s
SingleSource/Benchmarks/Adobe-C++/CMakeFiles/simple_types_constant_folding.dir/simple_types_constant_folding.s
SingleSource/Benchmarks/Adobe-C++/CMakeFiles/simple_types_loop_invariant.dir/simple_types_loop_invariant.s
SingleSource/Regression/C/gcc-c-torture/execute/CMakeFiles/GCC-C-execute-20000412-6.dir/20000412-6.s
SingleSource/Regression/C/gcc-c-torture/execute/CMakeFiles/GCC-C-execute-20010224-1.dir/20010224-1.s
SingleSource/UnitTests/CMakeFiles/2002-12-13-MishaTest.dir/2002-12-13-MishaTest.s
SingleSource/UnitTests/Vectorizer/CMakeFiles/find-last.dir/find-last.s
  2. It does not work for LASX yet.

@tangaac force-pushed the vector-byte-shift branch from 75de3f4 to ded9f43 on April 2, 2025 09:11
@SixWeining requested a review from wangleiat on April 2, 2025 09:37
@tangaac (Contributor, Author) commented Apr 7, 2025

Ping

@heiher (Member) commented Apr 7, 2025

It does not work for LASX yet.

Will this optimization be added for LASX in a future patch, or is there a chance it could be included in the current one?

@tangaac (Contributor, Author) commented Apr 8, 2025

It does not work for LASX yet.

Will this optimization be added for LASX in a future patch, or is there a chance it could be included in the current one?

It supports LASX now.

@tangaac force-pushed the vector-byte-shift branch from 268039e to 5e8c1cc on April 9, 2025 03:31
@tangaac (Contributor, Author) commented Apr 9, 2025

Here are the files optimized by this PR in llvm-test-suite (LASX):
tangaac/loong-opt-cov-ts@8df0d5c

@heiher (Member) left a comment


LGTM

@tangaac merged commit 7818e5a into llvm:main on Apr 10, 2025
11 checks passed
var-const pushed a commit to ldionne/llvm-project that referenced this pull request Apr 17, 2025
@tangaac deleted the vector-byte-shift branch on April 28, 2025 06:26