Commit 6d7bb66

Address comments, add tests for additional dependences.

1 parent f3232c6 commit 6d7bb66

File tree: 6 files changed, +176 −32 lines

llvm/include/llvm/Transforms/Vectorize/LoopVectorizationLegality.h
Lines changed: 4 additions & 6 deletions

@@ -414,9 +414,7 @@ class LoopVectorizationLegality {
   }
 
   /// Returns a list of all known histogram operations in the loop.
-  const SmallVectorImpl<HistogramInfo> &getHistograms() const {
-    return Histograms;
-  }
+  bool hasHistograms() const { return !Histograms.empty(); }
 
   PredicatedScalarEvolution *getPredicatedScalarEvolution() const {
     return &PSE;
@@ -467,9 +465,9 @@ class LoopVectorizationLegality {
   bool canVectorizeMemory();
 
   /// If LAA cannot determine whether all dependences are safe, we may be able
-  /// to further analyse some unknown dependences and if they match a certain
-  /// pattern (like a histogram) then we may still be able to vectorize.
-  bool canVectorizeUncheckedDependences();
+  /// to further analyse some IndirectUnsafe dependences and if they match a
+  /// certain pattern (like a histogram) then we may still be able to vectorize.
+  bool canVectorizeIndirectUnsafeDependences();
 
   /// Return true if we can vectorize this loop using the IF-conversion
   /// transformation.

llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp
Lines changed: 22 additions & 13 deletions

@@ -1129,29 +1129,38 @@ static bool findHistogram(LoadInst *LI, StoreInst *HSt, Loop *TheLoop,
   return true;
 }
 
-bool LoopVectorizationLegality::canVectorizeUncheckedDependences() {
-  // For now, we only support an unknown dependency that calculates a histogram
+bool LoopVectorizationLegality::canVectorizeIndirectUnsafeDependences() {
+  // For now, we only support an IndirectUnsafe dependency that calculates
+  // a histogram
   if (!EnableHistogramVectorization)
     return false;
 
-  // FIXME: Support more than one unchecked dependence, and check to see if some
-  // are handled by runtime checks before looking for histograms.
+  // Find a single IndirectUnsafe dependency.
   LAI = &LAIs.getInfo(*TheLoop);
+  const MemoryDepChecker::Dependence *IUDep = nullptr;
   const MemoryDepChecker &DepChecker = LAI->getDepChecker();
   const auto *Deps = DepChecker.getDependences();
-  if (!Deps || Deps->size() != 1)
-    return false;
+  for (const MemoryDepChecker::Dependence &Dep : *Deps) {
+    // Ignore dependencies that are either known to be safe or can be
+    // checked at runtime.
+    if (MemoryDepChecker::Dependence::isSafeForVectorization(Dep.Type) !=
+        MemoryDepChecker::VectorizationSafetyStatus::Unsafe)
+      continue;
 
-  const MemoryDepChecker::Dependence &Dep = (*Deps).front();
+    // We're only interested in IndirectUnsafe dependencies here, where the
+    // address might come from a load from memory. We also only want to handle
+    // one such dependency, at least for now.
+    if (Dep.Type != MemoryDepChecker::Dependence::IndirectUnsafe || IUDep)
+      return false;
 
-  // We're only interested in Unknown or IndirectUnsafe dependences.
-  if (Dep.Type != MemoryDepChecker::Dependence::Unknown &&
-      Dep.Type != MemoryDepChecker::Dependence::IndirectUnsafe)
+    IUDep = &Dep;
+  }
+  if (!IUDep)
     return false;
 
   // For now only normal loads and stores are supported.
-  LoadInst *LI = dyn_cast<LoadInst>(Dep.getSource(DepChecker));
-  StoreInst *SI = dyn_cast<StoreInst>(Dep.getDestination(DepChecker));
+  LoadInst *LI = dyn_cast<LoadInst>(IUDep->getSource(DepChecker));
+  StoreInst *SI = dyn_cast<StoreInst>(IUDep->getDestination(DepChecker));
 
   if (!LI || !SI)
     return false;
@@ -1171,7 +1180,7 @@ bool LoopVectorizationLegality::canVectorizeMemory() {
   }
 
   if (!LAI->canVectorizeMemory())
-    return canVectorizeUncheckedDependences();
+    return canVectorizeIndirectUnsafeDependences();
 
   if (LAI->hasLoadStoreDependenceInvolvingLoopInvariantAddress()) {
     reportVectorizationFailure("We don't allow storing to uniform addresses",
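For context, the shape of loop this legality hook targets looks roughly like the following hand-written sketch (illustrative only, not part of the commit; the names are hypothetical). The store address depends on a value loaded from memory, so LAA cannot bound the distance between the bucket load and the bucket store, and records the dependence as IndirectUnsafe:

    // A scalar histogram loop of the kind findHistogram() tries to match.
    void histogram(int *buckets, const int *indices, long n) {
      for (long i = 0; i < n; ++i)
        buckets[indices[i]] += 1; // two iterations may hit the same bucket
    }

The rewritten function above now scans all recorded dependences instead of requiring exactly one: dependences that are safe or runtime-checkable are skipped, and at most one IndirectUnsafe dependence is accepted for the histogram pattern match.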

llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
Lines changed: 2 additions & 2 deletions

@@ -4653,7 +4653,7 @@ bool LoopVectorizationPlanner::isCandidateForEpilogueVectorization(
     return false;
 
   // Loops containing histograms are not currently supported.
-  return Legal->getHistograms().empty();
+  return !Legal->hasHistograms();
 }
 
 bool LoopVectorizationCostModel::isEpilogueVectorizationProfitable(
@@ -10056,7 +10056,7 @@ bool LoopVectorizePass::processLoop(Loop *L) {
   // If there is a histogram in the loop, do not just interleave without
   // vectorizing. The order of operations will be incorrect without the
   // histogram intrinsics, which are only used for recipes with VF > 1.
-  if (!VectorizeLoop && InterleaveLoop && !LVL.getHistograms().empty()) {
+  if (!VectorizeLoop && InterleaveLoop && LVL.hasHistograms()) {
     LLVM_DEBUG(dbgs() << "LV: Not interleaving without vectorization due "
                       << "to histogram operations.\n");
     IntDiagMsg = std::make_pair(

llvm/lib/Transforms/Vectorize/VPlan.h
Lines changed: 1 addition & 3 deletions

@@ -1619,9 +1619,7 @@ class VPHistogramRecipe : public VPRecipeBase {
   /// Return the mask operand if one was provided, or a null pointer if all
   /// lanes should be executed unconditionally.
   VPValue *getMask() const {
-    if (getNumOperands() == 3)
-      return getOperand(2);
-    return nullptr;
+    return getNumOperands() == 3 ? getOperand(2) : nullptr;
   }
 
 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)

llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
Lines changed: 2 additions & 8 deletions

@@ -967,7 +967,7 @@ void VPHistogramRecipe::execute(VPTransformState &State) {
 
   for (unsigned Part = 0; Part < State.UF; ++Part) {
     Value *Address = State.get(getOperand(0), Part);
-    Value *IncVec = State.get(getOperand(1), Part);
+    Value *IncAmt = State.get(getOperand(1), Part, /*IsScalar=*/true);
     VectorType *VTy = cast<VectorType>(Address->getType());
 
     // The histogram intrinsic requires a mask even if the recipe doesn't;
@@ -980,18 +980,12 @@ void VPHistogramRecipe::execute(VPTransformState &State) {
       Mask = Builder.CreateVectorSplat(
           VTy->getElementCount(), ConstantInt::getTrue(Builder.getInt1Ty()));
 
-    // Not sure how to make IncAmt stay scalar yet. For now just extract the
-    // first element and tidy up later.
-    // FIXME: Do we actually want this to be scalar? We just splat it in the
-    // backend anyway...
-    Value *IncAmt = Builder.CreateExtractElement(IncVec, Builder.getInt64(0));
-
     // If this is a subtract, we want to invert the increment amount. We may
    // add a separate intrinsic in future, but for now we'll try this.
     if (Opcode == Instruction::Sub)
       IncAmt = Builder.CreateNeg(IncAmt);
     else
-      assert(Opcode == Instruction::Add);
+      assert(Opcode == Instruction::Add && "only add or sub supported for now");
 
     State.Builder.CreateIntrinsic(Intrinsic::experimental_vector_histogram_add,
                                   {VTy, IncAmt->getType()},
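As a rough guide to what the emitted call does, here is a scalarised sketch of the histogram intrinsic's per-lane behaviour (my reading of the intrinsic, not LLVM's implementation; all names are hypothetical):

    // Per-lane effect of llvm.experimental.vector.histogram.add: each active
    // lane adds IncAmt to the bucket it points at, and lanes pointing at the
    // same bucket all take effect rather than losing updates to each other.
    void histogramAddSemantics(int *Addresses[], const bool Mask[],
                               unsigned NumLanes, int IncAmt) {
      for (unsigned Lane = 0; Lane < NumLanes; ++Lane)
        if (Mask[Lane])
          *Addresses[Lane] += IncAmt; // for Sub recipes, IncAmt is negated above
    }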

llvm/test/Transforms/LoopVectorize/AArch64/sve2-histcnt.ll
Lines changed: 145 additions & 0 deletions

@@ -208,6 +208,17 @@ target triple = "aarch64-unknown-linux-gnu"
 ; CHECK-NEXT: No successors
 ; CHECK-NEXT: }
 
+;; Confirm cost calculation for runtime checks
+; CHECK-LABEL: LV: Checking a loop in 'simple_histogram_rtdepcheck'
+; CHECK: Calculating cost of runtime checks:
+; CHECK: Total cost of runtime checks:
+; CHECK: LV: Minimum required TC for runtime checks to be profitable:
+
+;; Confirm inability to vectorize with potential alias to buckets
+; CHECK-LABEL: LV: Checking a loop in 'simple_histogram_unsafe_alias'
+; CHECK: LV: Can't vectorize due to memory conflicts
+; CHECK-NEXT: LV: Not vectorizing: Cannot prove legality.
+
 define void @simple_histogram(ptr noalias %buckets, ptr readonly %indices, i64 %N) #0 {
 ; CHECK-LABEL: define void @simple_histogram(
 ; CHECK-SAME: ptr noalias [[BUCKETS:%.*]], ptr readonly [[INDICES:%.*]], i64 [[N:%.*]]) #[[ATTR0:[0-9]+]] {
@@ -817,6 +828,140 @@ for.exit:
   ret void
 }
 
+;; Check that we can still vectorize a histogram when LAA found another dependency
+;; that doesn't conflict with the buckets.
+define void @simple_histogram_rtdepcheck(ptr noalias %buckets, ptr %array, ptr %indices, i64 %N) #0 {
+; CHECK-LABEL: define void @simple_histogram_rtdepcheck(
+; CHECK-SAME: ptr noalias [[BUCKETS:%.*]], ptr [[ARRAY:%.*]], ptr [[INDICES:%.*]], i64 [[N:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT: [[TMP1:%.*]] = shl nuw nsw i64 [[TMP0]], 2
+; CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.umax.i64(i64 [[TMP1]], i64 8)
+; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ugt i64 [[TMP2]], [[N]]
+; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_MEMCHECK:%.*]]
+; CHECK: vector.memcheck:
+; CHECK-NEXT: [[ARRAY1:%.*]] = ptrtoint ptr [[ARRAY]] to i64
+; CHECK-NEXT: [[INDICES2:%.*]] = ptrtoint ptr [[INDICES]] to i64
+; CHECK-NEXT: [[TMP3:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT: [[TMP4:%.*]] = shl nuw nsw i64 [[TMP3]], 4
+; CHECK-NEXT: [[TMP5:%.*]] = sub i64 [[ARRAY1]], [[INDICES2]]
+; CHECK-NEXT: [[DIFF_CHECK:%.*]] = icmp ult i64 [[TMP5]], [[TMP4]]
+; CHECK-NEXT: br i1 [[DIFF_CHECK]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]]
+; CHECK: vector.ph:
+; CHECK-NEXT: [[TMP6:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT: [[DOTNEG:%.*]] = mul nsw i64 [[TMP6]], -4
+; CHECK-NEXT: [[N_VEC:%.*]] = and i64 [[DOTNEG]], [[N]]
+; CHECK-NEXT: [[TMP7:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT: [[TMP8:%.*]] = shl nuw nsw i64 [[TMP7]], 2
+; CHECK-NEXT: [[TMP9:%.*]] = call <vscale x 4 x i32> @llvm.experimental.stepvector.nxv4i32()
+; CHECK-NEXT: [[TMP10:%.*]] = call i32 @llvm.vscale.i32()
+; CHECK-NEXT: [[TMP11:%.*]] = shl nuw nsw i32 [[TMP10]], 2
+; CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 4 x i32> poison, i32 [[TMP11]], i64 0
+; CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <vscale x 4 x i32> [[DOTSPLATINSERT]], <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
+; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
+; CHECK: vector.body:
+; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT: [[VEC_IND:%.*]] = phi <vscale x 4 x i32> [ [[TMP9]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds i32, ptr [[INDICES]], i64 [[INDEX]]
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 4 x i32>, ptr [[TMP12]], align 4
+; CHECK-NEXT: [[TMP13:%.*]] = zext <vscale x 4 x i32> [[WIDE_LOAD]] to <vscale x 4 x i64>
+; CHECK-NEXT: [[TMP14:%.*]] = getelementptr inbounds i32, ptr [[BUCKETS]], <vscale x 4 x i64> [[TMP13]]
+; CHECK-NEXT: call void @llvm.experimental.vector.histogram.add.nxv4p0.i32(<vscale x 4 x ptr> [[TMP14]], i32 1, <vscale x 4 x i1> shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer))
+; CHECK-NEXT: [[TMP15:%.*]] = getelementptr inbounds i32, ptr [[ARRAY]], i64 [[INDEX]]
+; CHECK-NEXT: store <vscale x 4 x i32> [[VEC_IND]], ptr [[TMP15]], align 4
+; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP8]]
+; CHECK-NEXT: [[VEC_IND_NEXT]] = add <vscale x 4 x i32> [[VEC_IND]], [[DOTSPLAT]]
+; CHECK-NEXT: [[TMP16:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT: br i1 [[TMP16]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP16:![0-9]+]]
+; CHECK: middle.block:
+; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N_VEC]], [[N]]
+; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_EXIT:%.*]], label [[SCALAR_PH]]
+; CHECK: scalar.ph:
+; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ], [ 0, [[VECTOR_MEMCHECK]] ]
+; CHECK-NEXT: br label [[FOR_BODY:%.*]]
+; CHECK: for.body:
+; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
+; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[INDICES]], i64 [[IV]]
+; CHECK-NEXT: [[TMP17:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
+; CHECK-NEXT: [[IDXPROM1:%.*]] = zext i32 [[TMP17]] to i64
+; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, ptr [[BUCKETS]], i64 [[IDXPROM1]]
+; CHECK-NEXT: [[TMP18:%.*]] = load i32, ptr [[ARRAYIDX2]], align 4
+; CHECK-NEXT: [[INC:%.*]] = add nsw i32 [[TMP18]], 1
+; CHECK-NEXT: store i32 [[INC]], ptr [[ARRAYIDX2]], align 4
+; CHECK-NEXT: [[IDX_ADDR:%.*]] = getelementptr inbounds i32, ptr [[ARRAY]], i64 [[IV]]
+; CHECK-NEXT: [[IV_TRUNC:%.*]] = trunc i64 [[IV]] to i32
+; CHECK-NEXT: store i32 [[IV_TRUNC]], ptr [[IDX_ADDR]], align 4
+; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
+; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
+; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_EXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP17:![0-9]+]]
+; CHECK: for.exit:
+; CHECK-NEXT: ret void
+;
+entry:
+  br label %for.body
+
+for.body:
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+  %arrayidx = getelementptr inbounds i32, ptr %indices, i64 %iv
+  %0 = load i32, ptr %arrayidx, align 4
+  %idxprom1 = zext i32 %0 to i64
+  %arrayidx2 = getelementptr inbounds i32, ptr %buckets, i64 %idxprom1
+  %1 = load i32, ptr %arrayidx2, align 4
+  %inc = add nsw i32 %1, 1
+  store i32 %inc, ptr %arrayidx2, align 4
+  %idx.addr = getelementptr inbounds i32, ptr %array, i64 %iv
+  %iv.trunc = trunc i64 %iv to i32
+  store i32 %iv.trunc, ptr %idx.addr, align 4
+  %iv.next = add nuw nsw i64 %iv, 1
+  %exitcond = icmp eq i64 %iv.next, %N
+  br i1 %exitcond, label %for.exit, label %for.body
+
+for.exit:
+  ret void
+}
+
+;; Make sure we don't vectorize if there's a potential alias between buckets
+;; and indices.
+define void @simple_histogram_unsafe_alias(ptr %buckets, ptr %indices, i64 %N) #0 {
+; CHECK-LABEL: define void @simple_histogram_unsafe_alias(
+; CHECK-SAME: ptr [[BUCKETS:%.*]], ptr [[INDICES:%.*]], i64 [[N:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: entry:
+; CHECK-NEXT: br label [[FOR_BODY:%.*]]
+; CHECK: for.body:
+; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
+; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[INDICES]], i64 [[IV]]
+; CHECK-NEXT: [[TMP12:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
+; CHECK-NEXT: [[IDXPROM1:%.*]] = zext i32 [[TMP12]] to i64
+; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, ptr [[BUCKETS]], i64 [[IDXPROM1]]
+; CHECK-NEXT: [[TMP13:%.*]] = load i32, ptr [[ARRAYIDX2]], align 4
+; CHECK-NEXT: [[INC:%.*]] = add nsw i32 [[TMP13]], 1
+; CHECK-NEXT: store i32 [[INC]], ptr [[ARRAYIDX2]], align 4
+; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
+; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
+; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_EXIT:%.*]], label [[FOR_BODY]]
+; CHECK: for.exit:
+; CHECK-NEXT: ret void
+;
+entry:
+  br label %for.body
+
+for.body:
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+  %arrayidx = getelementptr inbounds i32, ptr %indices, i64 %iv
+  %0 = load i32, ptr %arrayidx, align 4
+  %idxprom1 = zext i32 %0 to i64
+  %arrayidx2 = getelementptr inbounds i32, ptr %buckets, i64 %idxprom1
+  %1 = load i32, ptr %arrayidx2, align 4
+  %inc = add nsw i32 %1, 1
+  store i32 %inc, ptr %arrayidx2, align 4
+  %iv.next = add nuw nsw i64 %iv, 1
+  %exitcond = icmp eq i64 %iv.next, %N
+  br i1 %exitcond, label %for.exit, label %for.body
+
+for.exit:
+  ret void
+}
+
 attributes #0 = { "target-features"="+sve2" vscale_range(1,16) }
 
 !0 = distinct !{!0, !1}
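In source terms, the two new tests correspond roughly to the following loops (hand-written C++ equivalents of the IR above, not part of the commit; __restrict stands in for the IR's noalias attribute):

    // simple_histogram_rtdepcheck: the extra indices/array dependence can be
    // covered by a runtime overlap check (the vector.memcheck block), so the
    // histogram loop is still vectorized behind that guard.
    void rtdepcheck(int *__restrict buckets, int *array, int *indices,
                    long n) {
      for (long i = 0; i < n; ++i) {
        buckets[indices[i]]++;
        array[i] = (int)i;
      }
    }

    // simple_histogram_unsafe_alias: buckets is not noalias, and the bucket
    // accesses are indirectly addressed, so no runtime bounds check can be
    // generated; vectorization is rejected with "memory conflicts".
    void unsafeAlias(int *buckets, int *indices, long n) {
      for (long i = 0; i < n; ++i)
        buckets[indices[i]]++;
    }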
