
Commit 138e6c1

[AArch64][TTI] Improve LegalVF when gather loads are scalarized
In SLP, after determining the cost of the loads that could not be coalesced into `VectorizedLoads`, the cost of a gather-vectorized load is computed. Favour a potentially high, but valid, cost when the type of a group of loads, a vector whose width depends on `VF`, may be legalized into a scalar value. Fixes: #68953.
1 parent 6e863c4 commit 138e6c1
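
A minimal sketch of the caller-side pattern this change enables: when the target now reports the gather cost as invalid (e.g. for <2 x i128>, which legalizes to scalars), the caller falls back to a scalarized estimate. The helper name costGroupedLoads, its parameters, and the fallback formula are illustrative assumptions, not code taken from the SLP vectorizer.

// Illustrative sketch only; not the SLP vectorizer's actual logic.
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/IR/Instruction.h"
#include "llvm/Support/Alignment.h"
#include "llvm/Support/InstructionCost.h"

using namespace llvm;

// Cost a group of loads either as one masked gather or, when the target
// reports the gather as invalid, as NumLoads individual scalar loads.
static InstructionCost costGroupedLoads(const TargetTransformInfo &TTI,
                                        Type *VecTy, Type *EltTy,
                                        const Value *PtrOp, Align Alignment,
                                        unsigned NumLoads) {
  InstructionCost GatherCost = TTI.getGatherScatterOpCost(
      Instruction::Load, VecTy, PtrOp, /*VariableMask=*/true, Alignment,
      TargetTransformInfo::TCK_RecipThroughput);
  if (GatherCost.isValid())
    return GatherCost;
  // Potentially high, but valid: one scalar load per element.
  InstructionCost ScalarCost =
      TTI.getMemoryOpCost(Instruction::Load, EltTy, Alignment,
                          /*AddressSpace=*/0,
                          TargetTransformInfo::TCK_RecipThroughput);
  return ScalarCost * NumLoads;
}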

3 files changed (+78, -1 lines changed)

llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp

Lines changed: 5 additions & 1 deletion
@@ -2996,14 +2996,18 @@ static unsigned getSVEGatherScatterOverhead(unsigned Opcode) {
 InstructionCost AArch64TTIImpl::getGatherScatterOpCost(
     unsigned Opcode, Type *DataTy, const Value *Ptr, bool VariableMask,
     Align Alignment, TTI::TargetCostKind CostKind, const Instruction *I) {
-  if (useNeonVector(DataTy))
+  if (useNeonVector(DataTy) || !isLegalMaskedGatherScatter(DataTy))
     return BaseT::getGatherScatterOpCost(Opcode, DataTy, Ptr, VariableMask,
                                          Alignment, CostKind, I);
   auto *VT = cast<VectorType>(DataTy);
   auto LT = getTypeLegalizationCost(DataTy);
   if (!LT.first.isValid())
     return InstructionCost::getInvalid();
 
+  if (!LT.second.isVector() ||
+      !isElementTypeLegalForScalableVector(VT->getElementType()))
+    return InstructionCost::getInvalid();
+
   // The code-generator is currently not able to handle scalable vectors
   // of <vscale x 1 x eltty> yet, so return an invalid cost to avoid selecting
   // it. This change will be removed when code-generation for these types is
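
The guard above works because of how InstructionCost propagates validity; the following standalone snippet (a hypothetical example, not part of this patch or its tests) shows that an invalid cost stays invalid through arithmetic, so a scalarized gather can never masquerade as a cheap one.

// Hypothetical, standalone illustration of InstructionCost validity.
#include "llvm/Support/InstructionCost.h"
#include <cassert>

int main() {
  llvm::InstructionCost Cheap = 4;
  llvm::InstructionCost Invalid = llvm::InstructionCost::getInvalid();

  // Arithmetic on valid costs stays valid.
  assert((Cheap + 2).isValid());
  // Anything combined with an invalid cost is itself invalid, so callers
  // must check isValid() before treating the result as a number.
  assert(!(Invalid + Cheap).isValid());
  assert(!Invalid.isValid());
  return 0;
}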

llvm/test/Analysis/CostModel/AArch64/sve-gather.ll

Lines changed: 19 additions & 0 deletions
@@ -105,9 +105,27 @@ define void @masked_gathers_no_vscale_range() #2 {
   ret void
 }
 
+define <2 x i128> @masked_gather_v1i128(<2 x ptr> %ld, <2 x i1> %masks, <2 x i128> %passthru) #3 {
+; CHECK-LABEL: 'masked_gather_v1i128'
+; CHECK-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %res = call <2 x i128> @llvm.masked.gather.v2i128.v2p0(<2 x ptr> %ld, i32 0, <2 x i1> %masks, <2 x i128> %passthru)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <2 x i128> %res
+;
+; CHECK-VSCALE-2-LABEL: 'masked_gather_v1i128'
+; CHECK-VSCALE-2-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %res = call <2 x i128> @llvm.masked.gather.v2i128.v2p0(<2 x ptr> %ld, i32 0, <2 x i1> %masks, <2 x i128> %passthru)
+; CHECK-VSCALE-2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <2 x i128> %res
+;
+; CHECK-VSCALE-1-LABEL: 'masked_gather_v1i128'
+; CHECK-VSCALE-1-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %res = call <2 x i128> @llvm.masked.gather.v2i128.v2p0(<2 x ptr> %ld, i32 0, <2 x i1> %masks, <2 x i128> %passthru)
+; CHECK-VSCALE-1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <2 x i128> %res
+;
+  %res = call <2 x i128> @llvm.masked.gather.v2i128.v2p0(<2 x ptr> %ld, i32 0, <2 x i1> %masks, <2 x i128> %passthru)
+  ret <2 x i128> %res
+}
+
 attributes #0 = { "target-features"="+sve" vscale_range(1, 8) }
 attributes #1 = { "target-features"="+sve" vscale_range(1, 16) "tune-cpu"="generic" }
 attributes #2 = { "target-features"="+sve" }
+attributes #3 = { "target-features"="+sve" vscale_range(2, 2) }
 
 declare <vscale x 4 x i32> @llvm.masked.gather.nxv4i32(<vscale x 4 x ptr>, i32, <vscale x 4 x i1>, <vscale x 4 x i32>)
 declare <vscale x 8 x i32> @llvm.masked.gather.nxv8i32(<vscale x 8 x ptr>, i32, <vscale x 8 x i1>, <vscale x 8 x i32>)
@@ -120,3 +138,4 @@ declare <vscale x 2 x float> @llvm.masked.gather.nxv2f32(<vscale x 2 x ptr>, i32
 declare <vscale x 16 x i16> @llvm.masked.gather.nxv16i16(<vscale x 16 x ptr>, i32, <vscale x 16 x i1>, <vscale x 16 x i16>)
 declare <vscale x 8 x i16> @llvm.masked.gather.nxv8i16(<vscale x 8 x ptr>, i32, <vscale x 8 x i1>, <vscale x 8 x i16>)
 declare <vscale x 4 x i16> @llvm.masked.gather.nxv4i16(<vscale x 4 x ptr>, i32, <vscale x 4 x i1>, <vscale x 4 x i16>)
+declare <2 x i128> @llvm.masked.gather.v2i128.v2p0(<2 x ptr>, i32, <2 x i1>, <2 x i128>)
Lines changed: 54 additions & 0 deletions
@@ -0,0 +1,54 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt -S -passes=slp-vectorizer -mtriple=aarch64-unknown-linux-gnu -mcpu=neoverse-512tvb < %s | FileCheck %s
+
+define void @gather_load_fp128(ptr %arg) #0 {
+; CHECK-LABEL: @gather_load_fp128(
+; CHECK-NEXT:    [[GEP:%.*]] = getelementptr i8, ptr [[ARG:%.*]], i64 16
+; CHECK-NEXT:    [[LOAD0:%.*]] = load fp128, ptr [[ARG]], align 1
+; CHECK-NEXT:    [[LOAD1:%.*]] = load fp128, ptr [[GEP]], align 1
+; CHECK-NEXT:    [[LOAD2:%.*]] = load fp128, ptr null, align 1
+; CHECK-NEXT:    [[LOAD3:%.*]] = load fp128, ptr null, align 1
+; CHECK-NEXT:    [[FCMP0:%.*]] = fcmp oeq fp128 [[LOAD0]], 0xL00000000000000000000000000000000
+; CHECK-NEXT:    [[FCMP1:%.*]] = fcmp oeq fp128 [[LOAD1]], 0xL00000000000000000000000000000000
+; CHECK-NEXT:    [[FCMP2:%.*]] = fcmp oeq fp128 [[LOAD2]], 0xL00000000000000000000000000000000
+; CHECK-NEXT:    [[FCMP3:%.*]] = fcmp oeq fp128 [[LOAD3]], 0xL00000000000000000000000000000000
+; CHECK-NEXT:    ret void
+;
+  %gep = getelementptr i8, ptr %arg, i64 16
+  %load0 = load fp128, ptr %arg, align 1
+  %load1 = load fp128, ptr %gep, align 1
+  %load2 = load fp128, ptr null, align 1
+  %load3 = load fp128, ptr null, align 1
+  %fcmp0 = fcmp oeq fp128 %load0, 0xL0
+  %fcmp1 = fcmp oeq fp128 %load1, 0xL0
+  %fcmp2 = fcmp oeq fp128 %load2, 0xL0
+  %fcmp3 = fcmp oeq fp128 %load3, 0xL0
+  ret void
+}
+
+define void @gather_load_i128(ptr %arg) #0 {
+; CHECK-LABEL: @gather_load_i128(
+; CHECK-NEXT:    [[GEP:%.*]] = getelementptr i8, ptr [[ARG:%.*]], i64 16
+; CHECK-NEXT:    [[LOAD0:%.*]] = load i128, ptr [[ARG]], align 1
+; CHECK-NEXT:    [[LOAD1:%.*]] = load i128, ptr [[GEP]], align 1
+; CHECK-NEXT:    [[LOAD2:%.*]] = load i128, ptr null, align 1
+; CHECK-NEXT:    [[LOAD3:%.*]] = load i128, ptr null, align 1
+; CHECK-NEXT:    [[CMP0:%.*]] = icmp eq i128 [[LOAD0]], 0
+; CHECK-NEXT:    [[CMP1:%.*]] = icmp eq i128 [[LOAD1]], 0
+; CHECK-NEXT:    [[CMP2:%.*]] = icmp eq i128 [[LOAD2]], 0
+; CHECK-NEXT:    [[CMP3:%.*]] = icmp eq i128 [[LOAD3]], 0
+; CHECK-NEXT:    ret void
+;
+  %gep = getelementptr i8, ptr %arg, i64 16
+  %load0 = load i128, ptr %arg, align 1
+  %load1 = load i128, ptr %gep, align 1
+  %load2 = load i128, ptr null, align 1
+  %load3 = load i128, ptr null, align 1
+  %cmp0 = icmp eq i128 %load0, 0
+  %cmp1 = icmp eq i128 %load1, 0
+  %cmp2 = icmp eq i128 %load2, 0
+  %cmp3 = icmp eq i128 %load3, 0
+  ret void
+}
+
+attributes #0 = { vscale_range(2,2) }
