[LoongArch] lower vector shuffle to shift if possible #132866
Conversation
@llvm/pr-subscribers-backend-loongarch
Author: None (tangaac)
Changes
Patch is 62.01 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/132866.diff
6 Files Affected:
diff --git a/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp b/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp
index 8d80a1ba55bcb..269921d80091e 100644
--- a/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp
+++ b/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp
@@ -519,6 +519,121 @@ SDValue LoongArchTargetLowering::lowerBITREVERSE(SDValue Op,
}
}
+/// Attempts to match a shuffle mask against the VBSLL, VBSRL, VSLLI and VSRLI
+/// instructions.
+// The function matches elements from one of the input vectors shuffled to the
+// left or right with zeroable elements 'shifted in'. It handles both the
+// strictly bit-wise element shifts and the byte shift across an entire 128-bit
+// lane.
+// Mostly copied from X86.
+static int matchShuffleAsShift(MVT &ShiftVT, unsigned &Opcode,
+ unsigned ScalarSizeInBits, ArrayRef<int> Mask,
+ int MaskOffset, const APInt &Zeroable) {
+ int Size = Mask.size();
+ unsigned SizeInBits = Size * ScalarSizeInBits;
+
+ auto CheckZeros = [&](int Shift, int Scale, bool Left) {
+ for (int i = 0; i < Size; i += Scale)
+ for (int j = 0; j < Shift; ++j)
+ if (!Zeroable[i + j + (Left ? 0 : (Scale - Shift))])
+ return false;
+
+ return true;
+ };
+
+ auto isSequentialOrUndefInRange = [&](unsigned Pos, unsigned Size, int Low,
+ int Step = 1) {
+ for (unsigned i = Pos, e = Pos + Size; i != e; ++i, Low += Step)
+ if (!(Mask[i] == -1 || Mask[i] == Low))
+ return false;
+ return true;
+ };
+
+ auto MatchShift = [&](int Shift, int Scale, bool Left) {
+ for (int i = 0; i != Size; i += Scale) {
+ unsigned Pos = Left ? i + Shift : i;
+ unsigned Low = Left ? i : i + Shift;
+ unsigned Len = Scale - Shift;
+ if (!isSequentialOrUndefInRange(Pos, Len, Low + MaskOffset))
+ return -1;
+ }
+
+ int ShiftEltBits = ScalarSizeInBits * Scale;
+ bool ByteShift = ShiftEltBits > 64;
+ Opcode = Left ? (ByteShift ? LoongArchISD::VBSLL : LoongArchISD::VSLLI)
+ : (ByteShift ? LoongArchISD::VBSRL : LoongArchISD::VSRLI);
+ int ShiftAmt = Shift * ScalarSizeInBits / (ByteShift ? 8 : 1);
+
+ // Normalize the scale for byte shifts to still produce an i64 element
+ // type.
+ Scale = ByteShift ? Scale / 2 : Scale;
+
+ // We need to round trip through the appropriate type for the shift.
+ MVT ShiftSVT = MVT::getIntegerVT(ScalarSizeInBits * Scale);
+ ShiftVT = ByteShift ? MVT::getVectorVT(MVT::i8, SizeInBits / 8)
+ : MVT::getVectorVT(ShiftSVT, Size / Scale);
+ return (int)ShiftAmt;
+ };
+
+ unsigned MaxWidth = 128;
+ for (int Scale = 2; Scale * ScalarSizeInBits <= MaxWidth; Scale *= 2)
+ for (int Shift = 1; Shift != Scale; ++Shift)
+ for (bool Left : {true, false})
+ if (CheckZeros(Shift, Scale, Left)) {
+ int ShiftAmt = MatchShift(Shift, Scale, Left);
+ if (0 < ShiftAmt)
+ return ShiftAmt;
+ }
+
+ // no match
+ return -1;
+}
+
+/// Lower VECTOR_SHUFFLE as shift (if possible).
+///
+/// For example:
+/// %2 = shufflevector <4 x i32> %0, <4 x i32> zeroinitializer,
+/// <4 x i32> <i32 4, i32 0, i32 1, i32 2>
+/// is lowered to:
+/// (VBSLL_V $v0, $v0, 4)
+///
+/// %2 = shufflevector <4 x i32> %0, <4 x i32> zeroinitializer,
+/// <4 x i32> <i32 4, i32 0, i32 4, i32 2>
+/// is lowered to:
+/// (VSLLI_D $v0, $v0, 32)
+static SDValue lowerVECTOR_SHUFFLEAsShift(const SDLoc &DL, ArrayRef<int> Mask,
+ MVT VT, SDValue V1, SDValue V2,
+ SelectionDAG &DAG,
+ const APInt &Zeroable) {
+ int Size = Mask.size();
+ assert(Size == (int)VT.getVectorNumElements() && "Unexpected mask size");
+
+ MVT ShiftVT;
+ SDValue V = V1;
+ unsigned Opcode;
+
+ // Try to match shuffle against V1 shift.
+ int ShiftAmt = matchShuffleAsShift(ShiftVT, Opcode, VT.getScalarSizeInBits(),
+ Mask, 0, Zeroable);
+
+ // If V1 failed, try to match shuffle against V2 shift.
+ if (ShiftAmt < 0) {
+ ShiftAmt = matchShuffleAsShift(ShiftVT, Opcode, VT.getScalarSizeInBits(),
+ Mask, Size, Zeroable);
+ V = V2;
+ }
+
+ if (ShiftAmt < 0)
+ return SDValue();
+
+ assert(DAG.getTargetLoweringInfo().isTypeLegal(ShiftVT) &&
+ "Illegal integer vector type");
+ V = DAG.getBitcast(ShiftVT, V);
+ V = DAG.getNode(Opcode, DL, ShiftVT, V,
+ DAG.getConstant(ShiftAmt, DL, MVT::i64));
+ return DAG.getBitcast(VT, V);
+}
+
/// Determine whether a range fits a regular pattern of values.
/// This function accounts for the possibility of jumping over the End iterator.
template <typename ValType>
@@ -587,14 +702,12 @@ static void computeZeroableShuffleElements(ArrayRef<int> Mask, SDValue V1,
static SDValue lowerVECTOR_SHUFFLEAsZeroOrAnyExtend(const SDLoc &DL,
ArrayRef<int> Mask, MVT VT,
SDValue V1, SDValue V2,
- SelectionDAG &DAG) {
+ SelectionDAG &DAG,
+ const APInt &Zeroable) {
int Bits = VT.getSizeInBits();
int EltBits = VT.getScalarSizeInBits();
int NumElements = VT.getVectorNumElements();
- APInt KnownUndef, KnownZero;
- computeZeroableShuffleElements(Mask, V1, V2, KnownUndef, KnownZero);
- APInt Zeroable = KnownUndef | KnownZero;
if (Zeroable.isAllOnes())
return DAG.getConstant(0, DL, VT);
@@ -1056,6 +1169,10 @@ static SDValue lower128BitShuffle(const SDLoc &DL, ArrayRef<int> Mask, MVT VT,
"Unexpected mask size for shuffle!");
assert(Mask.size() % 2 == 0 && "Expected even mask size.");
+ APInt KnownUndef, KnownZero;
+ computeZeroableShuffleElements(Mask, V1, V2, KnownUndef, KnownZero);
+ APInt Zeroable = KnownUndef | KnownZero;
+
SDValue Result;
// TODO: Add more comparison patterns.
if (V2.isUndef()) {
@@ -1083,12 +1200,14 @@ static SDValue lower128BitShuffle(const SDLoc &DL, ArrayRef<int> Mask, MVT VT,
return Result;
if ((Result = lowerVECTOR_SHUFFLE_VPICKOD(DL, Mask, VT, V1, V2, DAG)))
return Result;
+ if ((Result = lowerVECTOR_SHUFFLEAsZeroOrAnyExtend(DL, Mask, VT, V1, V2, DAG,
+ Zeroable)))
+ return Result;
if ((Result =
- lowerVECTOR_SHUFFLEAsZeroOrAnyExtend(DL, Mask, VT, V1, V2, DAG)))
+ lowerVECTOR_SHUFFLEAsShift(DL, Mask, VT, V1, V2, DAG, Zeroable)))
return Result;
if ((Result = lowerVECTOR_SHUFFLE_VSHUF(DL, Mask, VT, V1, V2, DAG)))
return Result;
-
return SDValue();
}
@@ -4997,6 +5116,10 @@ const char *LoongArchTargetLowering::getTargetNodeName(unsigned Opcode) const {
NODE_NAME_CASE(VANY_NONZERO)
NODE_NAME_CASE(FRECIPE)
NODE_NAME_CASE(FRSQRTE)
+ NODE_NAME_CASE(VSLLI)
+ NODE_NAME_CASE(VSRLI)
+ NODE_NAME_CASE(VBSLL)
+ NODE_NAME_CASE(VBSRL)
}
#undef NODE_NAME_CASE
return nullptr;
diff --git a/llvm/lib/Target/LoongArch/LoongArchISelLowering.h b/llvm/lib/Target/LoongArch/LoongArchISelLowering.h
index 002fad0e20759..52d88b9b24a6b 100644
--- a/llvm/lib/Target/LoongArch/LoongArchISelLowering.h
+++ b/llvm/lib/Target/LoongArch/LoongArchISelLowering.h
@@ -147,7 +147,15 @@ enum NodeType : unsigned {
// Floating point approximate reciprocal operation
FRECIPE,
- FRSQRTE
+ FRSQRTE,
+
+ // Vector logical left / right shift by immediate
+ VSLLI,
+ VSRLI,
+
+ // Vector byte logical left / right shift
+ VBSLL,
+ VBSRL
// Intrinsic operations end =============================================
};
diff --git a/llvm/lib/Target/LoongArch/LoongArchLSXInstrInfo.td b/llvm/lib/Target/LoongArch/LoongArchLSXInstrInfo.td
index d2063a8aaae9b..ecbcd29d88aac 100644
--- a/llvm/lib/Target/LoongArch/LoongArchLSXInstrInfo.td
+++ b/llvm/lib/Target/LoongArch/LoongArchLSXInstrInfo.td
@@ -58,6 +58,12 @@ def loongarch_vreplgr2vr: SDNode<"LoongArchISD::VREPLGR2VR", SDT_LoongArchVreplg
def loongarch_vfrecipe: SDNode<"LoongArchISD::FRECIPE", SDT_LoongArchVFRECIPE>;
def loongarch_vfrsqrte: SDNode<"LoongArchISD::FRSQRTE", SDT_LoongArchVFRSQRTE>;
+def loongarch_vslli : SDNode<"LoongArchISD::VSLLI", SDT_LoongArchV1RUimm>;
+def loongarch_vsrli : SDNode<"LoongArchISD::VSRLI", SDT_LoongArchV1RUimm>;
+
+def loongarch_vbsll : SDNode<"LoongArchISD::VBSLL", SDT_LoongArchV1RUimm>;
+def loongarch_vbsrl : SDNode<"LoongArchISD::VBSRL", SDT_LoongArchV1RUimm>;
+
def immZExt1 : ImmLeaf<i64, [{return isUInt<1>(Imm);}]>;
def immZExt2 : ImmLeaf<i64, [{return isUInt<2>(Imm);}]>;
def immZExt3 : ImmLeaf<i64, [{return isUInt<3>(Imm);}]>;
@@ -1494,15 +1500,59 @@ def : Pat<(or (v16i8 LSX128:$vj), (v16i8 (SplatPat_uimm8 uimm8:$imm))),
def : Pat<(xor (v16i8 LSX128:$vj), (v16i8 (SplatPat_uimm8 uimm8:$imm))),
(VXORI_B LSX128:$vj, uimm8:$imm)>;
+// VBSLL_V
+def : Pat<(loongarch_vbsll v16i8:$vj, uimm5:$imm), (VBSLL_V v16i8:$vj,
+ uimm5:$imm)>;
+def : Pat<(loongarch_vbsll v8i16:$vj, uimm5:$imm), (VBSLL_V v8i16:$vj,
+ uimm5:$imm)>;
+def : Pat<(loongarch_vbsll v4i32:$vj, uimm5:$imm), (VBSLL_V v4i32:$vj,
+ uimm5:$imm)>;
+def : Pat<(loongarch_vbsll v2i64:$vj, uimm5:$imm), (VBSLL_V v2i64:$vj,
+ uimm5:$imm)>;
+def : Pat<(loongarch_vbsll v4f32:$vj, uimm5:$imm), (VBSLL_V v4f32:$vj,
+ uimm5:$imm)>;
+def : Pat<(loongarch_vbsll v2f64:$vj, uimm5:$imm), (VBSLL_V v2f64:$vj,
+ uimm5:$imm)>;
+
+// VBSRL_V
+def : Pat<(loongarch_vbsrl v16i8:$vj, uimm5:$imm), (VBSRL_V v16i8:$vj,
+ uimm5:$imm)>;
+def : Pat<(loongarch_vbsrl v8i16:$vj, uimm5:$imm), (VBSRL_V v8i16:$vj,
+ uimm5:$imm)>;
+def : Pat<(loongarch_vbsrl v4i32:$vj, uimm5:$imm), (VBSRL_V v4i32:$vj,
+ uimm5:$imm)>;
+def : Pat<(loongarch_vbsrl v2i64:$vj, uimm5:$imm), (VBSRL_V v2i64:$vj,
+ uimm5:$imm)>;
+def : Pat<(loongarch_vbsrl v4f32:$vj, uimm5:$imm), (VBSRL_V v4f32:$vj,
+ uimm5:$imm)>;
+def : Pat<(loongarch_vbsrl v2f64:$vj, uimm5:$imm), (VBSRL_V v2f64:$vj,
+ uimm5:$imm)>;
+
// VSLL[I]_{B/H/W/D}
defm : PatVrVr<shl, "VSLL">;
defm : PatShiftVrVr<shl, "VSLL">;
defm : PatShiftVrUimm<shl, "VSLLI">;
+def : Pat<(loongarch_vslli v16i8:$vj, uimm3:$imm), (VSLLI_B v16i8:$vj,
+ uimm3:$imm)>;
+def : Pat<(loongarch_vslli v8i16:$vj, uimm4:$imm), (VSLLI_H v8i16:$vj,
+ uimm4:$imm)>;
+def : Pat<(loongarch_vslli v4i32:$vj, uimm5:$imm), (VSLLI_W v4i32:$vj,
+ uimm5:$imm)>;
+def : Pat<(loongarch_vslli v2i64:$vj, uimm6:$imm), (VSLLI_D v2i64:$vj,
+ uimm6:$imm)>;
// VSRL[I]_{B/H/W/D}
defm : PatVrVr<srl, "VSRL">;
defm : PatShiftVrVr<srl, "VSRL">;
defm : PatShiftVrUimm<srl, "VSRLI">;
+def : Pat<(loongarch_vsrli v16i8:$vj, uimm3:$imm), (VSRLI_B v16i8:$vj,
+ uimm3:$imm)>;
+def : Pat<(loongarch_vsrli v8i16:$vj, uimm4:$imm), (VSRLI_H v8i16:$vj,
+ uimm4:$imm)>;
+def : Pat<(loongarch_vsrli v4i32:$vj, uimm5:$imm), (VSRLI_W v4i32:$vj,
+ uimm5:$imm)>;
+def : Pat<(loongarch_vsrli v2i64:$vj, uimm6:$imm), (VSRLI_D v2i64:$vj,
+ uimm6:$imm)>;
// VSRA[I]_{B/H/W/D}
defm : PatVrVr<sra, "VSRA">;
diff --git a/llvm/test/CodeGen/LoongArch/lsx/build-vector.ll b/llvm/test/CodeGen/LoongArch/lsx/build-vector.ll
index 984b6f3d74866..d84e408cd28be 100644
--- a/llvm/test/CodeGen/LoongArch/lsx/build-vector.ll
+++ b/llvm/test/CodeGen/LoongArch/lsx/build-vector.ll
@@ -374,11 +374,8 @@ define void @extract1_i32_zext_insert0_i64_undef(ptr %src, ptr %dst) nounwind {
; CHECK-LABEL: extract1_i32_zext_insert0_i64_undef:
; CHECK: # %bb.0:
; CHECK-NEXT: vld $vr0, $a0, 0
-; CHECK-NEXT: pcalau12i $a0, %pc_hi20(.LCPI24_0)
-; CHECK-NEXT: vld $vr1, $a0, %pc_lo12(.LCPI24_0)
-; CHECK-NEXT: vrepli.b $vr2, 0
-; CHECK-NEXT: vshuf.w $vr1, $vr2, $vr0
-; CHECK-NEXT: vst $vr1, $a1, 0
+; CHECK-NEXT: vsrli.d $vr0, $vr0, 32
+; CHECK-NEXT: vst $vr0, $a1, 0
; CHECK-NEXT: ret
%v = load volatile <4 x i32>, ptr %src
%e = extractelement <4 x i32> %v, i32 1
diff --git a/llvm/test/CodeGen/LoongArch/lsx/vec-shuffle-bit-shift.ll b/llvm/test/CodeGen/LoongArch/lsx/vec-shuffle-bit-shift.ll
index b590103511847..48f18a35a38c4 100644
--- a/llvm/test/CodeGen/LoongArch/lsx/vec-shuffle-bit-shift.ll
+++ b/llvm/test/CodeGen/LoongArch/lsx/vec-shuffle-bit-shift.ll
@@ -4,10 +4,7 @@
define <16 x i8> @shuffle_to_vslli_h_8(<16 x i8> %a) nounwind {
; CHECK-LABEL: shuffle_to_vslli_h_8:
; CHECK: # %bb.0:
-; CHECK-NEXT: pcalau12i $a0, %pc_hi20(.LCPI0_0)
-; CHECK-NEXT: vld $vr1, $a0, %pc_lo12(.LCPI0_0)
-; CHECK-NEXT: vrepli.b $vr2, 0
-; CHECK-NEXT: vshuf.b $vr0, $vr2, $vr0, $vr1
+; CHECK-NEXT: vslli.h $vr0, $vr0, 8
; CHECK-NEXT: ret
%shuffle = shufflevector <16 x i8> %a, <16 x i8> zeroinitializer, <16 x i32><i32 16, i32 0, i32 16, i32 2, i32 16, i32 4, i32 16, i32 6, i32 16, i32 8, i32 16, i32 10, i32 16, i32 12, i32 16, i32 14>
ret <16 x i8> %shuffle
@@ -16,10 +13,7 @@ define <16 x i8> @shuffle_to_vslli_h_8(<16 x i8> %a) nounwind {
define <16 x i8> @shuffle_to_vsrli_h_8(<16 x i8> %a) nounwind {
; CHECK-LABEL: shuffle_to_vsrli_h_8:
; CHECK: # %bb.0:
-; CHECK-NEXT: pcalau12i $a0, %pc_hi20(.LCPI1_0)
-; CHECK-NEXT: vld $vr1, $a0, %pc_lo12(.LCPI1_0)
-; CHECK-NEXT: vrepli.b $vr2, 0
-; CHECK-NEXT: vshuf.b $vr0, $vr2, $vr0, $vr1
+; CHECK-NEXT: vsrli.h $vr0, $vr0, 8
; CHECK-NEXT: ret
%shuffle = shufflevector <16 x i8> %a, <16 x i8> zeroinitializer, <16 x i32><i32 1, i32 16, i32 3, i32 16, i32 5, i32 16, i32 7, i32 16, i32 9, i32 16, i32 11, i32 16, i32 13, i32 16, i32 15, i32 16>
ret <16 x i8> %shuffle
@@ -28,10 +22,7 @@ define <16 x i8> @shuffle_to_vsrli_h_8(<16 x i8> %a) nounwind {
define <16 x i8> @shuffle_to_vslli_w_8(<16 x i8> %a) nounwind {
; CHECK-LABEL: shuffle_to_vslli_w_8:
; CHECK: # %bb.0:
-; CHECK-NEXT: pcalau12i $a0, %pc_hi20(.LCPI2_0)
-; CHECK-NEXT: vld $vr1, $a0, %pc_lo12(.LCPI2_0)
-; CHECK-NEXT: vrepli.b $vr2, 0
-; CHECK-NEXT: vshuf.b $vr0, $vr2, $vr0, $vr1
+; CHECK-NEXT: vslli.w $vr0, $vr0, 8
; CHECK-NEXT: ret
%shuffle = shufflevector <16 x i8> %a, <16 x i8> zeroinitializer, <16 x i32><i32 16, i32 0, i32 1, i32 2, i32 16, i32 4, i32 5, i32 6, i32 16, i32 8, i32 9, i32 10, i32 16, i32 12, i32 13, i32 14>
ret <16 x i8> %shuffle
@@ -40,10 +31,7 @@ define <16 x i8> @shuffle_to_vslli_w_8(<16 x i8> %a) nounwind {
define <16 x i8> @shuffle_to_vsrli_w_8(<16 x i8> %a) nounwind {
; CHECK-LABEL: shuffle_to_vsrli_w_8:
; CHECK: # %bb.0:
-; CHECK-NEXT: pcalau12i $a0, %pc_hi20(.LCPI3_0)
-; CHECK-NEXT: vld $vr1, $a0, %pc_lo12(.LCPI3_0)
-; CHECK-NEXT: vrepli.b $vr2, 0
-; CHECK-NEXT: vshuf.b $vr0, $vr2, $vr0, $vr1
+; CHECK-NEXT: vsrli.w $vr0, $vr0, 8
; CHECK-NEXT: ret
%shuffle = shufflevector <16 x i8> %a, <16 x i8> zeroinitializer, <16 x i32><i32 1, i32 2, i32 3, i32 16, i32 5, i32 6, i32 7, i32 16, i32 9, i32 10, i32 11, i32 16, i32 13, i32 14, i32 15, i32 16>
ret <16 x i8> %shuffle
@@ -52,11 +40,7 @@ define <16 x i8> @shuffle_to_vsrli_w_8(<16 x i8> %a) nounwind {
define <8 x i16> @shuffle_to_vslli_w_16(<8 x i16> %a) nounwind {
; CHECK-LABEL: shuffle_to_vslli_w_16:
; CHECK: # %bb.0:
-; CHECK-NEXT: pcalau12i $a0, %pc_hi20(.LCPI4_0)
-; CHECK-NEXT: vld $vr1, $a0, %pc_lo12(.LCPI4_0)
-; CHECK-NEXT: vrepli.b $vr2, 0
-; CHECK-NEXT: vshuf.h $vr1, $vr2, $vr0
-; CHECK-NEXT: vori.b $vr0, $vr1, 0
+; CHECK-NEXT: vslli.w $vr0, $vr0, 16
; CHECK-NEXT: ret
%shuffle = shufflevector <8 x i16> %a, <8 x i16> zeroinitializer, <8 x i32><i32 8, i32 0, i32 8, i32 2, i32 8, i32 4, i32 8, i32 6>
ret <8 x i16> %shuffle
@@ -65,11 +49,7 @@ define <8 x i16> @shuffle_to_vslli_w_16(<8 x i16> %a) nounwind {
define <8 x i16> @shuffle_to_vsrli_w_16(<8 x i16> %a) nounwind {
; CHECK-LABEL: shuffle_to_vsrli_w_16:
; CHECK: # %bb.0:
-; CHECK-NEXT: pcalau12i $a0, %pc_hi20(.LCPI5_0)
-; CHECK-NEXT: vld $vr1, $a0, %pc_lo12(.LCPI5_0)
-; CHECK-NEXT: vrepli.b $vr2, 0
-; CHECK-NEXT: vshuf.h $vr1, $vr2, $vr0
-; CHECK-NEXT: vori.b $vr0, $vr1, 0
+; CHECK-NEXT: vsrli.w $vr0, $vr0, 16
; CHECK-NEXT: ret
%shuffle = shufflevector <8 x i16> %a, <8 x i16> zeroinitializer, <8 x i32><i32 1, i32 8, i32 3, i32 8, i32 5, i32 8, i32 7, i32 8>
ret <8 x i16> %shuffle
@@ -78,10 +58,7 @@ define <8 x i16> @shuffle_to_vsrli_w_16(<8 x i16> %a) nounwind {
define <16 x i8> @shuffle_to_vslli_w_24(<16 x i8> %a) nounwind {
; CHECK-LABEL: shuffle_to_vslli_w_24:
; CHECK: # %bb.0:
-; CHECK-NEXT: pcalau12i $a0, %pc_hi20(.LCPI6_0)
-; CHECK-NEXT: vld $vr1, $a0, %pc_lo12(.LCPI6_0)
-; CHECK-NEXT: vrepli.b $vr2, 0
-; CHECK-NEXT: vshuf.b $vr0, $vr2, $vr0, $vr1
+; CHECK-NEXT: vslli.w $vr0, $vr0, 24
; CHECK-NEXT: ret
%shuffle = shufflevector <16 x i8> %a, <16 x i8> zeroinitializer, <16 x i32><i32 16, i32 16, i32 16, i32 0, i32 16, i32 16, i32 16, i32 4, i32 16, i32 16, i32 16, i32 8, i32 16, i32 16, i32 16, i32 12>
ret <16 x i8> %shuffle
@@ -90,10 +67,7 @@ define <16 x i8> @shuffle_to_vslli_w_24(<16 x i8> %a) nounwind {
define <16 x i8> @shuffle_to_vsrli_w_24(<16 x i8> %a) nounwind {
; CHECK-LABEL: shuffle_to_vsrli_w_24:
; CHECK: # %bb.0:
-; CHECK-NEXT: pcalau12i $a0, %pc_hi20(.LCPI7_0)
-; CHECK-NEXT: vld $vr1, $a0, %pc_lo12(.LCPI7_0)
-; CHECK-NEXT: vrepli.b $vr2, 0
-; CHECK-NEXT: vshuf.b $vr0, $vr2, $vr0, $vr1
+; CHECK-NEXT: vsrli.w $vr0, $vr0, 24
; CHECK-NEXT: ret
%shuffle = shufflevector <16 x i8> %a, <16 x i8> zeroinitializer, <16 x i32><i32 3, i32 16, i32 16, i32 16, i32 7, i32 16, i32 16, i32 16, i32 11, i32 16, i32 16, i32 16, i32 15, i32 16, i32 16, i32 16>
ret <16 x i8> %shuffle
@@ -102,10 +76,7 @@ define <16 x i8> @shuffle_to_vsrli_w_24(<16 x i8> %a) nounwind {
define <16 x i8> @shuffle_to_vslli_d_8(<16 x i8> %a) nounwind {
; CHECK-LABEL: shuffle_to_vslli_d_8:
; CHECK: # %bb.0:
-; CHECK-NEXT: pcalau12i $a0, %pc_hi20(.LCPI8_0)
-; CHECK-NEXT: vld $vr1, $a0, %pc_lo12(.LCPI8_0)
-; CHECK-NEXT: vrepli.b $vr2, 0
-; CHECK-NEXT: vshuf.b $vr0, $vr2, $vr0, $vr1
+; CHECK-NEXT: vslli.d $vr0, $vr0, 8
; CHECK-NEXT: ret
%shuffle = shufflevector <16 x i8> %a, <16 x i8> zeroinitializer, <16 x i32><i32 16, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 16, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14>
ret <16 x i8> %shuffle
@@ -114,10 +85,7 @@ define <16 x i8> @shuffle_to_vslli_d_8(<16 x i8> %a) nounwind {
define <16 x i8> @shuffle_to_vsrli_d_8(<16 x i8> %a) nounwind {
; CHECK-LABEL: shuffle_to_vsrli_d_8:
; CHECK: # %bb.0:
-; CHECK-NEXT: pcalau12i $a0, %pc_hi20(.LCPI9_0)
-; CHECK-NEXT: vld $vr1, $a0, %pc_lo12(.LCPI9_0)
-; CHECK-NEXT: vrepli.b $vr2, 0
-; CHECK-NEXT: vshuf.b $vr0, $vr2, $vr0, $vr1
+; CHECK-NEXT: vsrli.d $vr0, $vr0, 8
; CHECK-NEXT: ret
%shuffle = shufflevector <16 x i8> %a, <16 x i8> zeroinitializer, <16 x i32><i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16>
ret <16 x i8> %shuffle
@@ -126,11 +94,7 @@ define <16 x i8> @shuffle_to_vsrli_d_8(<16 x i8> %a) nounwind {
define <8 x i16> @shuffle_to_vslli_d_16(<8 x i16> %a) nounwind {
; CHECK-LABEL: shuffle_to_vslli_d_...
[truncated]
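For reference, the second example in the new lowerVECTOR_SHUFFLEAsShift docstring can be written as a standalone IR test in the style of the vec-shuffle-bit-shift.ll functions above. This is only an illustrative sketch: the function name is made up here, and the expected vslli.d line is the lowering stated in the docstring, not CHECK output regenerated from the patch.

define <4 x i32> @shuffle_to_vslli_d_32(<4 x i32> %a) nounwind {
; Result elements 0 and 2 are zero and elements 1 and 3 are %a[0] and %a[2],
; which per the docstring should lower to: vslli.d $vr0, $vr0, 32
  %shuffle = shufflevector <4 x i32> %a, <4 x i32> zeroinitializer, <4 x i32> <i32 4, i32 0, i32 4, i32 2>
  ret <4 x i32> %shuffle
}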
Very nice! A couple of questions:
- Do some workloads or benchmarks benefit from this change? If so, could you post the data?
- Does this optimization apply to 256-bit vectors?
About the title, I think it's better to add "if possible" at the end. And could you write a useful explanation of the implementation in the commit message?
Force-pushed: 75de3f4 to ded9f43
Ping
Will this optimization be added for lasx?
It's for lasx now.
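Since the branch now also covers LASX, a 256-bit analogue of the shuffle_to_vslli_h_8 test would presumably look like the sketch below. The function name and the xvslli.h output are assumptions made by analogy with the LSX tests; the 256-bit tests themselves are not visible in the truncated diff.

define <32 x i8> @shuffle_to_xvslli_h_8(<32 x i8> %a) nounwind {
; Assumed lowering, by analogy with the 128-bit case above (not taken
; from the patch): xvslli.h $xr0, $xr0, 8
  %shuffle = shufflevector <32 x i8> %a, <32 x i8> zeroinitializer, <32 x i32> <i32 32, i32 0, i32 32, i32 2, i32 32, i32 4, i32 32, i32 6, i32 32, i32 8, i32 32, i32 10, i32 32, i32 12, i32 32, i32 14, i32 32, i32 16, i32 32, i32 18, i32 32, i32 20, i32 32, i32 22, i32 32, i32 24, i32 32, i32 26, i32 32, i32 28, i32 32, i32 30>
  ret <32 x i8> %shuffle
}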
Force-pushed: 268039e to 5e8c1cc
Here are the files optimized by this PR on llvm-test-suite (lasx):
LGTM
No description provided.