
Commit 138e6c1

[AArch64][TTI] Improve LegalVF when gather loads are scalarized
In SLP, after determining the cost of the loads that could not be coalesced into `VectorizedLoads`, the cost of a gather-vectorized load is computed. Favour a potentially high, but valid, cost when the type of a group of loads, a vector whose width depends on `VF`, may be legalized into a scalar value. Fixes: #68953.
1 parent 6e863c4 commit 138e6c1
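
A minimal sketch of the caller-side pattern this change enables: when the target now reports the gather cost as invalid (e.g. for <2 x i128>, which legalizes to scalars), the caller falls back to a scalarized estimate. The helper name costGroupedLoads, its parameters, and the fallback formula are illustrative assumptions, not code taken from the SLP vectorizer.

// Illustrative sketch only; not the SLP vectorizer's actual logic.
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/IR/Instruction.h"
#include "llvm/Support/Alignment.h"
#include "llvm/Support/InstructionCost.h"

using namespace llvm;

// Cost a group of loads either as one masked gather or, when the target
// reports the gather as invalid, as NumLoads individual scalar loads.
static InstructionCost costGroupedLoads(const TargetTransformInfo &TTI,
                                        Type *VecTy, Type *EltTy,
                                        const Value *PtrOp, Align Alignment,
                                        unsigned NumLoads) {
  InstructionCost GatherCost = TTI.getGatherScatterOpCost(
      Instruction::Load, VecTy, PtrOp, /*VariableMask=*/true, Alignment,
      TargetTransformInfo::TCK_RecipThroughput);
  if (GatherCost.isValid())
    return GatherCost;
  // Potentially high, but valid: one scalar load per element.
  InstructionCost ScalarCost =
      TTI.getMemoryOpCost(Instruction::Load, EltTy, Alignment,
                          /*AddressSpace=*/0,
                          TargetTransformInfo::TCK_RecipThroughput);
  return ScalarCost * NumLoads;
}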

3 files changed (+78, -1 lines changed)

llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp

Lines changed: 5 additions & 1 deletion
@@ -2996,14 +2996,18 @@ static unsigned getSVEGatherScatterOverhead(unsigned Opcode) {
 InstructionCost AArch64TTIImpl::getGatherScatterOpCost(
     unsigned Opcode, Type *DataTy, const Value *Ptr, bool VariableMask,
     Align Alignment, TTI::TargetCostKind CostKind, const Instruction *I) {
-  if (useNeonVector(DataTy))
+  if (useNeonVector(DataTy) || !isLegalMaskedGatherScatter(DataTy))
     return BaseT::getGatherScatterOpCost(Opcode, DataTy, Ptr, VariableMask,
                                          Alignment, CostKind, I);
   auto *VT = cast<VectorType>(DataTy);
   auto LT = getTypeLegalizationCost(DataTy);
   if (!LT.first.isValid())
     return InstructionCost::getInvalid();
 
+  if (!LT.second.isVector() ||
+      !isElementTypeLegalForScalableVector(VT->getElementType()))
+    return InstructionCost::getInvalid();
+
   // The code-generator is currently not able to handle scalable vectors
   // of <vscale x 1 x eltty> yet, so return an invalid cost to avoid selecting
   // it. This change will be removed when code-generation for these types is
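
The guard above works because of how InstructionCost propagates validity; the following standalone snippet (a hypothetical example, not part of this patch or its tests) shows that an invalid cost stays invalid through arithmetic, so a scalarized gather can never masquerade as a cheap one.

// Hypothetical, standalone illustration of InstructionCost validity.
#include "llvm/Support/InstructionCost.h"
#include <cassert>

int main() {
  llvm::InstructionCost Cheap = 4;
  llvm::InstructionCost Invalid = llvm::InstructionCost::getInvalid();

  // Arithmetic on valid costs stays valid.
  assert((Cheap + 2).isValid());
  // Anything combined with an invalid cost is itself invalid, so callers
  // must check isValid() before treating the result as a number.
  assert(!(Invalid + Cheap).isValid());
  assert(!Invalid.isValid());
  return 0;
}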

llvm/test/Analysis/CostModel/AArch64/sve-gather.ll

Lines changed: 19 additions & 0 deletions
@@ -105,9 +105,27 @@ define void @masked_gathers_no_vscale_range() #2 {
   ret void
 }
 
+define <2 x i128> @masked_gather_v1i128(<2 x ptr> %ld, <2 x i1> %masks, <2 x i128> %passthru) #3 {
+; CHECK-LABEL: 'masked_gather_v1i128'
+; CHECK-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %res = call <2 x i128> @llvm.masked.gather.v2i128.v2p0(<2 x ptr> %ld, i32 0, <2 x i1> %masks, <2 x i128> %passthru)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <2 x i128> %res
+;
+; CHECK-VSCALE-2-LABEL: 'masked_gather_v1i128'
+; CHECK-VSCALE-2-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %res = call <2 x i128> @llvm.masked.gather.v2i128.v2p0(<2 x ptr> %ld, i32 0, <2 x i1> %masks, <2 x i128> %passthru)
+; CHECK-VSCALE-2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <2 x i128> %res
+;
+; CHECK-VSCALE-1-LABEL: 'masked_gather_v1i128'
+; CHECK-VSCALE-1-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %res = call <2 x i128> @llvm.masked.gather.v2i128.v2p0(<2 x ptr> %ld, i32 0, <2 x i1> %masks, <2 x i128> %passthru)
+; CHECK-VSCALE-1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <2 x i128> %res
+;
+  %res = call <2 x i128> @llvm.masked.gather.v2i128.v2p0(<2 x ptr> %ld, i32 0, <2 x i1> %masks, <2 x i128> %passthru)
+  ret <2 x i128> %res
+}
+
 attributes #0 = { "target-features"="+sve" vscale_range(1, 8) }
 attributes #1 = { "target-features"="+sve" vscale_range(1, 16) "tune-cpu"="generic" }
 attributes #2 = { "target-features"="+sve" }
+attributes #3 = { "target-features"="+sve" vscale_range(2, 2) }
 
 declare <vscale x 4 x i32> @llvm.masked.gather.nxv4i32(<vscale x 4 x ptr>, i32, <vscale x 4 x i1>, <vscale x 4 x i32>)
 declare <vscale x 8 x i32> @llvm.masked.gather.nxv8i32(<vscale x 8 x ptr>, i32, <vscale x 8 x i1>, <vscale x 8 x i32>)
@@ -120,3 +138,4 @@ declare <vscale x 2 x float> @llvm.masked.gather.nxv2f32(<vscale x 2 x ptr>, i32
 declare <vscale x 16 x i16> @llvm.masked.gather.nxv16i16(<vscale x 16 x ptr>, i32, <vscale x 16 x i1>, <vscale x 16 x i16>)
 declare <vscale x 8 x i16> @llvm.masked.gather.nxv8i16(<vscale x 8 x ptr>, i32, <vscale x 8 x i1>, <vscale x 8 x i16>)
 declare <vscale x 4 x i16> @llvm.masked.gather.nxv4i16(<vscale x 4 x ptr>, i32, <vscale x 4 x i1>, <vscale x 4 x i16>)
+declare <2 x i128> @llvm.masked.gather.v2i128.v2p0(<2 x ptr>, i32, <2 x i1>, <2 x i128>)
Lines changed: 54 additions & 0 deletions
@@ -0,0 +1,54 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt -S -passes=slp-vectorizer -mtriple=aarch64-unknown-linux-gnu -mcpu=neoverse-512tvb < %s | FileCheck %s
+
+define void @gather_load_fp128(ptr %arg) #0 {
+; CHECK-LABEL: @gather_load_fp128(
+; CHECK-NEXT:    [[GEP:%.*]] = getelementptr i8, ptr [[ARG:%.*]], i64 16
+; CHECK-NEXT:    [[LOAD0:%.*]] = load fp128, ptr [[ARG]], align 1
+; CHECK-NEXT:    [[LOAD1:%.*]] = load fp128, ptr [[GEP]], align 1
+; CHECK-NEXT:    [[LOAD2:%.*]] = load fp128, ptr null, align 1
+; CHECK-NEXT:    [[LOAD3:%.*]] = load fp128, ptr null, align 1
+; CHECK-NEXT:    [[FCMP0:%.*]] = fcmp oeq fp128 [[LOAD0]], 0xL00000000000000000000000000000000
+; CHECK-NEXT:    [[FCMP1:%.*]] = fcmp oeq fp128 [[LOAD1]], 0xL00000000000000000000000000000000
+; CHECK-NEXT:    [[FCMP2:%.*]] = fcmp oeq fp128 [[LOAD2]], 0xL00000000000000000000000000000000
+; CHECK-NEXT:    [[FCMP3:%.*]] = fcmp oeq fp128 [[LOAD3]], 0xL00000000000000000000000000000000
+; CHECK-NEXT:    ret void
+;
+  %gep = getelementptr i8, ptr %arg, i64 16
+  %load0 = load fp128, ptr %arg, align 1
+  %load1 = load fp128, ptr %gep, align 1
+  %load2 = load fp128, ptr null, align 1
+  %load3 = load fp128, ptr null, align 1
+  %fcmp0 = fcmp oeq fp128 %load0, 0xL0
+  %fcmp1 = fcmp oeq fp128 %load1, 0xL0
+  %fcmp2 = fcmp oeq fp128 %load2, 0xL0
+  %fcmp3 = fcmp oeq fp128 %load3, 0xL0
+  ret void
+}
+
+define void @gather_load_i128(ptr %arg) #0 {
+; CHECK-LABEL: @gather_load_i128(
+; CHECK-NEXT:    [[GEP:%.*]] = getelementptr i8, ptr [[ARG:%.*]], i64 16
+; CHECK-NEXT:    [[LOAD0:%.*]] = load i128, ptr [[ARG]], align 1
+; CHECK-NEXT:    [[LOAD1:%.*]] = load i128, ptr [[GEP]], align 1
+; CHECK-NEXT:    [[LOAD2:%.*]] = load i128, ptr null, align 1
+; CHECK-NEXT:    [[LOAD3:%.*]] = load i128, ptr null, align 1
+; CHECK-NEXT:    [[CMP0:%.*]] = icmp eq i128 [[LOAD0]], 0
+; CHECK-NEXT:    [[CMP1:%.*]] = icmp eq i128 [[LOAD1]], 0
+; CHECK-NEXT:    [[CMP2:%.*]] = icmp eq i128 [[LOAD2]], 0
+; CHECK-NEXT:    [[CMP3:%.*]] = icmp eq i128 [[LOAD3]], 0
+; CHECK-NEXT:    ret void
+;
+  %gep = getelementptr i8, ptr %arg, i64 16
+  %load0 = load i128, ptr %arg, align 1
+  %load1 = load i128, ptr %gep, align 1
+  %load2 = load i128, ptr null, align 1
+  %load3 = load i128, ptr null, align 1
+  %cmp0 = icmp eq i128 %load0, 0
+  %cmp1 = icmp eq i128 %load1, 0
+  %cmp2 = icmp eq i128 %load2, 0
+  %cmp3 = icmp eq i128 %load3, 0
+  ret void
+}
+
+attributes #0 = { vscale_range(2,2) }
