
Commit 1a8d72d

lower vector shuffle to shift
1 parent 41d718b commit 1a8d72d

6 files changed (+263, -333 lines)

llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp

Lines changed: 129 additions & 6 deletions
@@ -525,6 +525,121 @@ SDValue LoongArchTargetLowering::lowerBITREVERSE(SDValue Op,
   }
 }
 
+/// Attempts to match a shuffle mask against the VBSLL, VBSRL, VSLLI and VSRLI
+/// instructions.
+// The function matches elements from one of the input vectors shuffled to the
+// left or right with zeroable elements 'shifted in'. It handles both the
+// strictly bit-wise element shifts and the byte shift across an entire 128-bit
+// lane.
+// This is mainly copied from X86.
+static int matchShuffleAsShift(MVT &ShiftVT, unsigned &Opcode,
+                               unsigned ScalarSizeInBits, ArrayRef<int> Mask,
+                               int MaskOffset, const APInt &Zeroable) {
+  int Size = Mask.size();
+  unsigned SizeInBits = Size * ScalarSizeInBits;
+
+  auto CheckZeros = [&](int Shift, int Scale, bool Left) {
+    for (int i = 0; i < Size; i += Scale)
+      for (int j = 0; j < Shift; ++j)
+        if (!Zeroable[i + j + (Left ? 0 : (Scale - Shift))])
+          return false;
+
+    return true;
+  };
+
+  auto isSequentialOrUndefInRange = [&](unsigned Pos, unsigned Size, int Low,
+                                        int Step = 1) {
+    for (unsigned i = Pos, e = Pos + Size; i != e; ++i, Low += Step)
+      if (!(Mask[i] == -1 || Mask[i] == Low))
+        return false;
+    return true;
+  };
+
+  auto MatchShift = [&](int Shift, int Scale, bool Left) {
+    for (int i = 0; i != Size; i += Scale) {
+      unsigned Pos = Left ? i + Shift : i;
+      unsigned Low = Left ? i : i + Shift;
+      unsigned Len = Scale - Shift;
+      if (!isSequentialOrUndefInRange(Pos, Len, Low + MaskOffset))
+        return -1;
+    }
+
+    int ShiftEltBits = ScalarSizeInBits * Scale;
+    bool ByteShift = ShiftEltBits > 64;
+    Opcode = Left ? (ByteShift ? LoongArchISD::VBSLL : LoongArchISD::VSLLI)
+                  : (ByteShift ? LoongArchISD::VBSRL : LoongArchISD::VSRLI);
+    int ShiftAmt = Shift * ScalarSizeInBits / (ByteShift ? 8 : 1);
+
+    // Normalize the scale for byte shifts to still produce an i64 element
+    // type.
+    Scale = ByteShift ? Scale / 2 : Scale;
+
+    // We need to round trip through the appropriate type for the shift.
+    MVT ShiftSVT = MVT::getIntegerVT(ScalarSizeInBits * Scale);
+    ShiftVT = ByteShift ? MVT::getVectorVT(MVT::i8, SizeInBits / 8)
+                        : MVT::getVectorVT(ShiftSVT, Size / Scale);
+    return (int)ShiftAmt;
+  };
+
+  unsigned MaxWidth = 128;
+  for (int Scale = 2; Scale * ScalarSizeInBits <= MaxWidth; Scale *= 2)
+    for (int Shift = 1; Shift != Scale; ++Shift)
+      for (bool Left : {true, false})
+        if (CheckZeros(Shift, Scale, Left)) {
+          int ShiftAmt = MatchShift(Shift, Scale, Left);
+          if (0 < ShiftAmt)
+            return ShiftAmt;
+        }
+
+  // no match
+  return -1;
+}
+
+/// Lower VECTOR_SHUFFLE as shift (if possible).
+///
+/// For example:
+///   %2 = shufflevector <4 x i32> %0, <4 x i32> zeroinitializer,
+///                      <4 x i32> <i32 4, i32 0, i32 1, i32 2>
+/// is lowered to:
+///   (VBSLL_V $v0, $v0, 4)
+///
+///   %2 = shufflevector <4 x i32> %0, <4 x i32> zeroinitializer,
+///                      <4 x i32> <i32 4, i32 0, i32 4, i32 2>
+/// is lowered to:
+///   (VSLLI_D $v0, $v0, 32)
+static SDValue lowerVECTOR_SHUFFLEAsShift(const SDLoc &DL, ArrayRef<int> Mask,
+                                          MVT VT, SDValue V1, SDValue V2,
+                                          SelectionDAG &DAG,
+                                          const APInt &Zeroable) {
+  int Size = Mask.size();
+  assert(Size == (int)VT.getVectorNumElements() && "Unexpected mask size");
+
+  MVT ShiftVT;
+  SDValue V = V1;
+  unsigned Opcode;
+
+  // Try to match shuffle against V1 shift.
+  int ShiftAmt = matchShuffleAsShift(ShiftVT, Opcode, VT.getScalarSizeInBits(),
+                                     Mask, 0, Zeroable);
+
+  // If V1 failed, try to match shuffle against V2 shift.
+  if (ShiftAmt < 0) {
+    ShiftAmt = matchShuffleAsShift(ShiftVT, Opcode, VT.getScalarSizeInBits(),
+                                   Mask, Size, Zeroable);
+    V = V2;
+  }
+
+  if (ShiftAmt < 0)
+    return SDValue();
+
+  assert(DAG.getTargetLoweringInfo().isTypeLegal(ShiftVT) &&
+         "Illegal integer vector type");
+  V = DAG.getBitcast(ShiftVT, V);
+  V = DAG.getNode(Opcode, DL, ShiftVT, V,
+                  DAG.getConstant(ShiftAmt, DL, MVT::i64));
+  return DAG.getBitcast(VT, V);
+}
+
 /// Determine whether a range fits a regular pattern of values.
 /// This function accounts for the possibility of jumping over the End iterator.
 template <typename ValType>
@@ -593,14 +708,12 @@ static void computeZeroableShuffleElements(ArrayRef<int> Mask, SDValue V1,
 static SDValue lowerVECTOR_SHUFFLEAsZeroOrAnyExtend(const SDLoc &DL,
                                                     ArrayRef<int> Mask, MVT VT,
                                                     SDValue V1, SDValue V2,
-                                                    SelectionDAG &DAG) {
+                                                    SelectionDAG &DAG,
+                                                    const APInt &Zeroable) {
   int Bits = VT.getSizeInBits();
   int EltBits = VT.getScalarSizeInBits();
   int NumElements = VT.getVectorNumElements();
 
-  APInt KnownUndef, KnownZero;
-  computeZeroableShuffleElements(Mask, V1, V2, KnownUndef, KnownZero);
-  APInt Zeroable = KnownUndef | KnownZero;
   if (Zeroable.isAllOnes())
     return DAG.getConstant(0, DL, VT);
@@ -1062,6 +1175,10 @@ static SDValue lower128BitShuffle(const SDLoc &DL, ArrayRef<int> Mask, MVT VT,
          "Unexpected mask size for shuffle!");
   assert(Mask.size() % 2 == 0 && "Expected even mask size.");
 
+  APInt KnownUndef, KnownZero;
+  computeZeroableShuffleElements(Mask, V1, V2, KnownUndef, KnownZero);
+  APInt Zeroable = KnownUndef | KnownZero;
+
   SDValue Result;
   // TODO: Add more comparison patterns.
   if (V2.isUndef()) {
@@ -1089,12 +1206,14 @@ static SDValue lower128BitShuffle(const SDLoc &DL, ArrayRef<int> Mask, MVT VT,
     return Result;
   if ((Result = lowerVECTOR_SHUFFLE_VPICKOD(DL, Mask, VT, V1, V2, DAG)))
     return Result;
+  if ((Result = lowerVECTOR_SHUFFLEAsZeroOrAnyExtend(DL, Mask, VT, V1, V2, DAG,
+                                                     Zeroable)))
+    return Result;
   if ((Result =
-           lowerVECTOR_SHUFFLEAsZeroOrAnyExtend(DL, Mask, VT, V1, V2, DAG)))
+           lowerVECTOR_SHUFFLEAsShift(DL, Mask, VT, V1, V2, DAG, Zeroable)))
     return Result;
   if ((Result = lowerVECTOR_SHUFFLE_VSHUF(DL, Mask, VT, V1, V2, DAG)))
     return Result;
-
   return SDValue();
 }
@@ -5041,6 +5160,10 @@ const char *LoongArchTargetLowering::getTargetNodeName(unsigned Opcode) const {
     NODE_NAME_CASE(VANY_NONZERO)
     NODE_NAME_CASE(FRECIPE)
     NODE_NAME_CASE(FRSQRTE)
+    NODE_NAME_CASE(VSLLI)
+    NODE_NAME_CASE(VSRLI)
+    NODE_NAME_CASE(VBSLL)
+    NODE_NAME_CASE(VBSRL)
   }
 #undef NODE_NAME_CASE
   return nullptr;
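To see the new path end to end, here is a minimal IR sketch of my own (it mirrors the first example in the lowerVECTOR_SHUFFLEAsShift doc comment and is not one of the commit's tests); run through something like llc --mtriple=loongarch64 --mattr=+lsx, the shuffle should now select a single byte shift instead of a constant-pool vshuf:

; Hypothetical example: the mask pulls one zero element into lane 0 and moves
; the elements of %v up by one 32-bit lane, i.e. by 4 bytes across the whole
; 128-bit register, so matchShuffleAsShift only succeeds at Scale = 4 and
; reports the byte-shift node VBSLL with amount 4.
define <4 x i32> @shuffle_as_vbsll(<4 x i32> %v) {
  %r = shufflevector <4 x i32> %v, <4 x i32> zeroinitializer,
                     <4 x i32> <i32 4, i32 0, i32 1, i32 2>
  ; expected (per the doc comment above): vbsll.v $vr0, $vr0, 4
  ret <4 x i32> %r
}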

llvm/lib/Target/LoongArch/LoongArchISelLowering.h

Lines changed: 9 additions & 1 deletion
@@ -147,7 +147,15 @@ enum NodeType : unsigned {
 
   // Floating point approximate reciprocal operation
   FRECIPE,
-  FRSQRTE
+  FRSQRTE,
+
+  // Vector logical left / right shift by immediate
+  VSLLI,
+  VSRLI,
+
+  // Vector byte logical left / right shift
+  VBSLL,
+  VBSRL
 
   // Intrinsic operations end =============================================
 };
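The new enumerators keep the per-element immediate shifts (VSLLI, VSRLI) apart from the whole-register byte shifts (VBSLL, VBSRL) because the matcher can prove a shift at either granularity. As an illustration of my own, reusing the second example from the doc comment in the .cpp change: when only every other result lane comes from the source, the match already succeeds at Scale = 2, i.e. inside each 64-bit lane, and a per-element shift is enough:

; Hypothetical illustration: lanes 0 and 2 of the result are zero while lanes
; 1 and 3 take elements 0 and 2 of %v, so each i64 lane equals the
; corresponding source lane shifted left by 32 bits; the matcher returns
; VSLLI with ShiftVT = v2i64.
define <4 x i32> @shuffle_as_vslli(<4 x i32> %v) {
  %r = shufflevector <4 x i32> %v, <4 x i32> zeroinitializer,
                     <4 x i32> <i32 4, i32 0, i32 4, i32 2>
  ; expected (per the doc comment above): vslli.d $vr0, $vr0, 32
  ret <4 x i32> %r
}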

llvm/lib/Target/LoongArch/LoongArchLSXInstrInfo.td

Lines changed: 50 additions & 0 deletions
@@ -58,6 +58,12 @@ def loongarch_vreplgr2vr: SDNode<"LoongArchISD::VREPLGR2VR", SDT_LoongArchVreplg
 def loongarch_vfrecipe: SDNode<"LoongArchISD::FRECIPE", SDT_LoongArchVFRECIPE>;
 def loongarch_vfrsqrte: SDNode<"LoongArchISD::FRSQRTE", SDT_LoongArchVFRSQRTE>;
 
+def loongarch_vslli : SDNode<"LoongArchISD::VSLLI", SDT_LoongArchV1RUimm>;
+def loongarch_vsrli : SDNode<"LoongArchISD::VSRLI", SDT_LoongArchV1RUimm>;
+
+def loongarch_vbsll : SDNode<"LoongArchISD::VBSLL", SDT_LoongArchV1RUimm>;
+def loongarch_vbsrl : SDNode<"LoongArchISD::VBSRL", SDT_LoongArchV1RUimm>;
+
 def immZExt1 : ImmLeaf<i64, [{return isUInt<1>(Imm);}]>;
 def immZExt2 : ImmLeaf<i64, [{return isUInt<2>(Imm);}]>;
 def immZExt3 : ImmLeaf<i64, [{return isUInt<3>(Imm);}]>;
@@ -1494,15 +1500,59 @@ def : Pat<(or (v16i8 LSX128:$vj), (v16i8 (SplatPat_uimm8 uimm8:$imm))),
 def : Pat<(xor (v16i8 LSX128:$vj), (v16i8 (SplatPat_uimm8 uimm8:$imm))),
           (VXORI_B LSX128:$vj, uimm8:$imm)>;
 
+// VBSLL_V
+def : Pat<(loongarch_vbsll v16i8:$vj, uimm5:$imm),
+          (VBSLL_V v16i8:$vj, uimm5:$imm)>;
+def : Pat<(loongarch_vbsll v8i16:$vj, uimm5:$imm),
+          (VBSLL_V v8i16:$vj, uimm5:$imm)>;
+def : Pat<(loongarch_vbsll v4i32:$vj, uimm5:$imm),
+          (VBSLL_V v4i32:$vj, uimm5:$imm)>;
+def : Pat<(loongarch_vbsll v2i64:$vj, uimm5:$imm),
+          (VBSLL_V v2i64:$vj, uimm5:$imm)>;
+def : Pat<(loongarch_vbsll v4f32:$vj, uimm5:$imm),
+          (VBSLL_V v4f32:$vj, uimm5:$imm)>;
+def : Pat<(loongarch_vbsll v2f64:$vj, uimm5:$imm),
+          (VBSLL_V v2f64:$vj, uimm5:$imm)>;
+
+// VBSRL_V
+def : Pat<(loongarch_vbsrl v16i8:$vj, uimm5:$imm),
+          (VBSRL_V v16i8:$vj, uimm5:$imm)>;
+def : Pat<(loongarch_vbsrl v8i16:$vj, uimm5:$imm),
+          (VBSRL_V v8i16:$vj, uimm5:$imm)>;
+def : Pat<(loongarch_vbsrl v4i32:$vj, uimm5:$imm),
+          (VBSRL_V v4i32:$vj, uimm5:$imm)>;
+def : Pat<(loongarch_vbsrl v2i64:$vj, uimm5:$imm),
+          (VBSRL_V v2i64:$vj, uimm5:$imm)>;
+def : Pat<(loongarch_vbsrl v4f32:$vj, uimm5:$imm),
+          (VBSRL_V v4f32:$vj, uimm5:$imm)>;
+def : Pat<(loongarch_vbsrl v2f64:$vj, uimm5:$imm),
+          (VBSRL_V v2f64:$vj, uimm5:$imm)>;
+
 // VSLL[I]_{B/H/W/D}
 defm : PatVrVr<shl, "VSLL">;
 defm : PatShiftVrVr<shl, "VSLL">;
 defm : PatShiftVrUimm<shl, "VSLLI">;
+def : Pat<(loongarch_vslli v16i8:$vj, uimm3:$imm),
+          (VSLLI_B v16i8:$vj, uimm3:$imm)>;
+def : Pat<(loongarch_vslli v8i16:$vj, uimm4:$imm),
+          (VSLLI_H v8i16:$vj, uimm4:$imm)>;
+def : Pat<(loongarch_vslli v4i32:$vj, uimm5:$imm),
+          (VSLLI_W v4i32:$vj, uimm5:$imm)>;
+def : Pat<(loongarch_vslli v2i64:$vj, uimm6:$imm),
+          (VSLLI_D v2i64:$vj, uimm6:$imm)>;
 
 // VSRL[I]_{B/H/W/D}
 defm : PatVrVr<srl, "VSRL">;
 defm : PatShiftVrVr<srl, "VSRL">;
 defm : PatShiftVrUimm<srl, "VSRLI">;
+def : Pat<(loongarch_vsrli v16i8:$vj, uimm3:$imm),
+          (VSRLI_B v16i8:$vj, uimm3:$imm)>;
+def : Pat<(loongarch_vsrli v8i16:$vj, uimm4:$imm),
+          (VSRLI_H v8i16:$vj, uimm4:$imm)>;
+def : Pat<(loongarch_vsrli v4i32:$vj, uimm5:$imm),
+          (VSRLI_W v4i32:$vj, uimm5:$imm)>;
+def : Pat<(loongarch_vsrli v2i64:$vj, uimm6:$imm),
+          (VSRLI_D v2i64:$vj, uimm6:$imm)>;
 
 // VSRA[I]_{B/H/W/D}
 defm : PatVrVr<sra, "VSRA">;
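The patterns also cover the right-shift direction. A sketch of my own, worked out from the matcher logic above rather than taken from the commit's tests: shifting the whole register down by one 32-bit element, with a zero entering at the top, should go through loongarch_vbsrl and select VBSRL_V:

; Hypothetical example: elements 1..3 of %v move down to lanes 0..2 and a
; zero enters lane 3, a 4-byte right shift of the 128-bit register. Scale = 2
; fails CheckZeros here (lane 1 is not zeroable), so the match succeeds only
; at Scale = 4 and the byte-shift node VBSRL is chosen.
define <4 x i32> @shuffle_as_vbsrl(<4 x i32> %v) {
  %r = shufflevector <4 x i32> %v, <4 x i32> zeroinitializer,
                     <4 x i32> <i32 1, i32 2, i32 3, i32 4>
  ; expected: vbsrl.v $vr0, $vr0, 4
  ret <4 x i32> %r
}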

llvm/test/CodeGen/LoongArch/lsx/build-vector.ll

Lines changed: 2 additions & 5 deletions
@@ -374,11 +374,8 @@ define void @extract1_i32_zext_insert0_i64_undef(ptr %src, ptr %dst) nounwind {
 ; CHECK-LABEL: extract1_i32_zext_insert0_i64_undef:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vld $vr0, $a0, 0
-; CHECK-NEXT:    pcalau12i $a0, %pc_hi20(.LCPI24_0)
-; CHECK-NEXT:    vld $vr1, $a0, %pc_lo12(.LCPI24_0)
-; CHECK-NEXT:    vrepli.b $vr2, 0
-; CHECK-NEXT:    vshuf.w $vr1, $vr2, $vr0
-; CHECK-NEXT:    vst $vr1, $a1, 0
+; CHECK-NEXT:    vsrli.d $vr0, $vr0, 32
+; CHECK-NEXT:    vst $vr0, $a1, 0
 ; CHECK-NEXT:    ret
   %v = load volatile <4 x i32>, ptr %src
   %e = extractelement <4 x i32> %v, i32 1
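The saving in this test falls straight out of the new matcher: extracting element 1 of a <4 x i32> and zero-extending it into the low i64 lane leaves that element in bits 0..31 with zeros above, which is exactly a 32-bit logical right shift of the lane. Here is one plausible shuffle form of that computation (my reconstruction, not necessarily the exact DAG the backend builds):

; Hypothetical reconstruction: viewed as <4 x i32>, lane 0 takes element 1 and
; lane 1 is zero; the upper lanes are undef. Positions 1..3 are all zeroable,
; so the matcher fires at Scale = 2 and emits VSRLI with ShiftVT = v2i64,
; i.e. the vsrli.d 32 seen in the updated CHECK lines.
define <2 x i64> @zext_extract_as_vsrli(<4 x i32> %v) {
  %s = shufflevector <4 x i32> %v, <4 x i32> zeroinitializer,
                     <4 x i32> <i32 1, i32 4, i32 undef, i32 undef>
  %r = bitcast <4 x i32> %s to <2 x i64>
  ret <2 x i64> %r
}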
