Skip to content

Commit 7f4f237

Browse files
committed
[VectorCombine] foldShuffleOfShuffles - add missing arguments to getShuffleCost calls.
Ensure the operand and context-instruction arguments of the getShuffleCost calls are populated. This is a minor extension to #88743 that helps improve shuffle costs for certain corner cases (e.g. shuffles of loads).
1 parent 17fb3e8 commit 7f4f237

File tree

2 files changed

+20
-22
lines changed

2 files changed

+20
-22
lines changed

llvm/lib/Transforms/Vectorize/VectorCombine.cpp

Lines changed: 8 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1566,6 +1566,8 @@ bool VectorCombine::foldShuffleOfShuffles(Instruction &I) {
15661566
m_Mask(OuterMask))))
15671567
return false;
15681568

1569+
auto *ShufI0 = dyn_cast<Instruction>(I.getOperand(0));
1570+
auto *ShufI1 = dyn_cast<Instruction>(I.getOperand(1));
15691571
auto *ShuffleDstTy = dyn_cast<FixedVectorType>(I.getType());
15701572
auto *ShuffleSrcTy = dyn_cast<FixedVectorType>(V0->getType());
15711573
auto *ShuffleImmTy = dyn_cast<FixedVectorType>(I.getOperand(0)->getType());
@@ -1607,14 +1609,15 @@ bool VectorCombine::foldShuffleOfShuffles(Instruction &I) {
16071609

16081610
InstructionCost OldCost =
16091611
TTI.getShuffleCost(TargetTransformInfo::SK_PermuteSingleSrc, ShuffleSrcTy,
1610-
InnerMask0, CostKind) +
1612+
InnerMask0, CostKind, 0, nullptr, {V0, U0}, ShufI0) +
16111613
TTI.getShuffleCost(TargetTransformInfo::SK_PermuteSingleSrc, ShuffleSrcTy,
1612-
InnerMask1, CostKind) +
1614+
InnerMask1, CostKind, 0, nullptr, {V1, U1}, ShufI1) +
16131615
TTI.getShuffleCost(TargetTransformInfo::SK_PermuteTwoSrc, ShuffleImmTy,
1614-
OuterMask, CostKind, 0, nullptr, std::nullopt, &I);
1616+
OuterMask, CostKind, 0, nullptr, {ShufI0, ShufI1}, &I);
16151617

1616-
InstructionCost NewCost = TTI.getShuffleCost(
1617-
TargetTransformInfo::SK_PermuteTwoSrc, ShuffleSrcTy, NewMask, CostKind);
1618+
InstructionCost NewCost =
1619+
TTI.getShuffleCost(TargetTransformInfo::SK_PermuteTwoSrc, ShuffleSrcTy,
1620+
NewMask, CostKind, 0, nullptr, {V0, V1});
16181621

16191622
LLVM_DEBUG(dbgs() << "Found a shuffle feeding two shuffles: " << I
16201623
<< "\n OldCost: " << OldCost << " vs NewCost: " << NewCost

llvm/test/Transforms/VectorCombine/X86/shuffle-of-shuffles.ll

Lines changed: 12 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4
2-
; RUN: opt < %s -passes=vector-combine -S -mtriple=x86_64-- -mattr=sse2 | FileCheck %s --check-prefixes=CHECK,SSE
3-
; RUN: opt < %s -passes=vector-combine -S -mtriple=x86_64-- -mattr=avx2 | FileCheck %s --check-prefixes=CHECK,AVX
2+
; RUN: opt < %s -passes=vector-combine -S -mtriple=x86_64-- -mattr=sse2 | FileCheck %s
3+
; RUN: opt < %s -passes=vector-combine -S -mtriple=x86_64-- -mattr=avx2 | FileCheck %s
44

55
; fold to identity
66

@@ -44,22 +44,17 @@ define <8 x i32> @concat_extract_subvectors_poison(<8 x i32> %x) {
4444
ret <8 x i32> %concat
4545
}
4646

47+
; broadcast loads are free on AVX (and blends are much cheaper than general 2-operand shuffles)
48+
4749
define <4 x double> @blend_broadcasts_v4f64(ptr %p0, ptr %p1) {
48-
; SSE-LABEL: define <4 x double> @blend_broadcasts_v4f64(
49-
; SSE-SAME: ptr [[P0:%.*]], ptr [[P1:%.*]]) #[[ATTR0]] {
50-
; SSE-NEXT: [[LD0:%.*]] = load <4 x double>, ptr [[P0]], align 32
51-
; SSE-NEXT: [[LD1:%.*]] = load <4 x double>, ptr [[P1]], align 32
52-
; SSE-NEXT: [[BCST0:%.*]] = shufflevector <4 x double> [[LD0]], <4 x double> undef, <4 x i32> zeroinitializer
53-
; SSE-NEXT: [[BCST1:%.*]] = shufflevector <4 x double> [[LD1]], <4 x double> undef, <4 x i32> zeroinitializer
54-
; SSE-NEXT: [[BLEND:%.*]] = shufflevector <4 x double> [[BCST0]], <4 x double> [[BCST1]], <4 x i32> <i32 0, i32 5, i32 6, i32 3>
55-
; SSE-NEXT: ret <4 x double> [[BLEND]]
56-
;
57-
; AVX-LABEL: define <4 x double> @blend_broadcasts_v4f64(
58-
; AVX-SAME: ptr [[P0:%.*]], ptr [[P1:%.*]]) #[[ATTR0]] {
59-
; AVX-NEXT: [[LD0:%.*]] = load <4 x double>, ptr [[P0]], align 32
60-
; AVX-NEXT: [[LD1:%.*]] = load <4 x double>, ptr [[P1]], align 32
61-
; AVX-NEXT: [[BLEND:%.*]] = shufflevector <4 x double> [[LD0]], <4 x double> [[LD1]], <4 x i32> <i32 0, i32 4, i32 4, i32 0>
62-
; AVX-NEXT: ret <4 x double> [[BLEND]]
50+
; CHECK-LABEL: define <4 x double> @blend_broadcasts_v4f64(
51+
; CHECK-SAME: ptr [[P0:%.*]], ptr [[P1:%.*]]) #[[ATTR0]] {
52+
; CHECK-NEXT: [[LD0:%.*]] = load <4 x double>, ptr [[P0]], align 32
53+
; CHECK-NEXT: [[LD1:%.*]] = load <4 x double>, ptr [[P1]], align 32
54+
; CHECK-NEXT: [[BCST0:%.*]] = shufflevector <4 x double> [[LD0]], <4 x double> undef, <4 x i32> zeroinitializer
55+
; CHECK-NEXT: [[BCST1:%.*]] = shufflevector <4 x double> [[LD1]], <4 x double> undef, <4 x i32> zeroinitializer
56+
; CHECK-NEXT: [[BLEND:%.*]] = shufflevector <4 x double> [[BCST0]], <4 x double> [[BCST1]], <4 x i32> <i32 0, i32 5, i32 6, i32 3>
57+
; CHECK-NEXT: ret <4 x double> [[BLEND]]
6358
;
6459
%ld0 = load <4 x double>, ptr %p0, align 32
6560
%ld1 = load <4 x double>, ptr %p1, align 32

0 commit comments

Comments
 (0)