Skip to content

Commit 594e7ab

Browse files
david-arm authored and frederik-h committed
[LV][NFC] Refactor code for extracting first active element (llvm#131118)
Refactor the code to extract the first active element of a vector in the early exit block, in preparation for PR llvm#130766. I've replaced the VPInstruction::ExtractFirstActive nodes with a combination of a new VPInstruction::FirstActiveLane node and an Instruction::ExtractElement node.
1 parent a0e5c9f commit 594e7ab

File tree

5 files changed

+49
-30
lines changed

5 files changed

+49
-30
lines changed

llvm/lib/Transforms/Vectorize/VPlan.h

Lines changed: 2 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -877,9 +877,8 @@ class VPInstruction : public VPRecipeWithIRFlags,
877877
// Returns a scalar boolean value, which is true if any lane of its (only
878878
// boolean) vector operand is true.
879879
AnyOf,
880-
// Extracts the first active lane of a vector, where the first operand is
881-
// the predicate, and the second operand is the vector to extract.
882-
ExtractFirstActive,
880+
// Calculates the first active lane index of the vector predicate operand.
881+
FirstActiveLane,
883882
};
884883

885884
private:

llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp

Lines changed: 4 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -50,6 +50,8 @@ Type *VPTypeAnalysis::inferScalarTypeForRecipe(const VPInstruction *R) {
5050
return SetResultTyFromOp();
5151

5252
switch (Opcode) {
53+
case Instruction::ExtractElement:
54+
return inferScalarType(R->getOperand(0));
5355
case Instruction::Select: {
5456
Type *ResTy = inferScalarType(R->getOperand(1));
5557
VPValue *OtherV = R->getOperand(2);
@@ -82,7 +84,8 @@ Type *VPTypeAnalysis::inferScalarTypeForRecipe(const VPInstruction *R) {
8284
case VPInstruction::CanonicalIVIncrementForPart:
8385
case VPInstruction::AnyOf:
8486
return SetResultTyFromOp();
85-
case VPInstruction::ExtractFirstActive:
87+
case VPInstruction::FirstActiveLane:
88+
return Type::getIntNTy(Ctx, 64);
8689
case VPInstruction::ExtractFromEnd: {
8790
Type *BaseTy = inferScalarType(R->getOperand(0));
8891
if (auto *VecTy = dyn_cast<VectorType>(BaseTy))

llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp

Lines changed: 27 additions & 18 deletions
Original file line number | Diff line number | Diff line change
@@ -468,6 +468,12 @@ Value *VPInstruction::generate(VPTransformState &State) {
468468
Value *A = State.get(getOperand(0));
469469
return Builder.CreateNot(A, Name);
470470
}
471+
case Instruction::ExtractElement: {
472+
assert(State.VF.isVector() && "Only extract elements from vectors");
473+
Value *Vec = State.get(getOperand(0));
474+
Value *Idx = State.get(getOperand(1), /*IsScalar=*/true);
475+
return Builder.CreateExtractElement(Vec, Idx, Name);
476+
}
471477
case Instruction::ICmp: {
472478
bool OnlyFirstLaneUsed = vputils::onlyFirstLaneUsed(this);
473479
Value *A = State.get(getOperand(0), OnlyFirstLaneUsed);
@@ -723,12 +729,10 @@ Value *VPInstruction::generate(VPTransformState &State) {
723729
Value *A = State.get(getOperand(0));
724730
return Builder.CreateOrReduce(A);
725731
}
726-
case VPInstruction::ExtractFirstActive: {
727-
Value *Vec = State.get(getOperand(0));
728-
Value *Mask = State.get(getOperand(1));
729-
Value *Ctz = Builder.CreateCountTrailingZeroElems(
730-
Builder.getInt64Ty(), Mask, true, "first.active.lane");
731-
return Builder.CreateExtractElement(Vec, Ctz, "early.exit.value");
732+
case VPInstruction::FirstActiveLane: {
733+
Value *Mask = State.get(getOperand(0));
734+
return Builder.CreateCountTrailingZeroElems(Builder.getInt64Ty(), Mask,
735+
true, Name);
732736
}
733737
default:
734738
llvm_unreachable("Unsupported opcode for instruction");
@@ -755,22 +759,24 @@ InstructionCost VPInstruction::computeCost(ElementCount VF,
755759
}
756760

757761
switch (getOpcode()) {
762+
case Instruction::ExtractElement: {
763+
// Add on the cost of extracting the element.
764+
auto *VecTy = toVectorTy(Ctx.Types.inferScalarType(getOperand(0)), VF);
765+
return Ctx.TTI.getVectorInstrCost(Instruction::ExtractElement, VecTy,
766+
Ctx.CostKind);
767+
}
758768
case VPInstruction::AnyOf: {
759769
auto *VecTy = toVectorTy(Ctx.Types.inferScalarType(this), VF);
760770
return Ctx.TTI.getArithmeticReductionCost(
761771
Instruction::Or, cast<VectorType>(VecTy), std::nullopt, Ctx.CostKind);
762772
}
763-
case VPInstruction::ExtractFirstActive: {
773+
case VPInstruction::FirstActiveLane: {
764774
// Calculate the cost of determining the lane index.
765-
auto *PredTy = toVectorTy(Ctx.Types.inferScalarType(getOperand(1)), VF);
775+
auto *PredTy = toVectorTy(Ctx.Types.inferScalarType(getOperand(0)), VF);
766776
IntrinsicCostAttributes Attrs(Intrinsic::experimental_cttz_elts,
767777
Type::getInt64Ty(Ctx.LLVMCtx),
768778
{PredTy, Type::getInt1Ty(Ctx.LLVMCtx)});
769-
InstructionCost Cost = Ctx.TTI.getIntrinsicInstrCost(Attrs, Ctx.CostKind);
770-
// Add on the cost of extracting the element.
771-
auto *VecTy = toVectorTy(Ctx.Types.inferScalarType(getOperand(0)), VF);
772-
return Cost + Ctx.TTI.getVectorInstrCost(Instruction::ExtractElement, VecTy,
773-
Ctx.CostKind);
779+
return Ctx.TTI.getIntrinsicInstrCost(Attrs, Ctx.CostKind);
774780
}
775781
case VPInstruction::FirstOrderRecurrenceSplice: {
776782
assert(VF.isVector() && "Scalar FirstOrderRecurrenceSplice?");
@@ -793,7 +799,8 @@ InstructionCost VPInstruction::computeCost(ElementCount VF,
793799

794800
bool VPInstruction::isVectorToScalar() const {
795801
return getOpcode() == VPInstruction::ExtractFromEnd ||
796-
getOpcode() == VPInstruction::ExtractFirstActive ||
802+
getOpcode() == Instruction::ExtractElement ||
803+
getOpcode() == VPInstruction::FirstActiveLane ||
797804
getOpcode() == VPInstruction::ComputeReductionResult ||
798805
getOpcode() == VPInstruction::AnyOf;
799806
}
@@ -853,13 +860,14 @@ bool VPInstruction::opcodeMayReadOrWriteFromMemory() const {
853860
if (Instruction::isBinaryOp(getOpcode()))
854861
return false;
855862
switch (getOpcode()) {
863+
case Instruction::ExtractElement:
856864
case Instruction::ICmp:
857865
case Instruction::Select:
858866
case VPInstruction::AnyOf:
859867
case VPInstruction::CalculateTripCountMinusVF:
860868
case VPInstruction::CanonicalIVIncrementForPart:
861869
case VPInstruction::ExtractFromEnd:
862-
case VPInstruction::ExtractFirstActive:
870+
case VPInstruction::FirstActiveLane:
863871
case VPInstruction::FirstOrderRecurrenceSplice:
864872
case VPInstruction::LogicalAnd:
865873
case VPInstruction::Not:
@@ -878,6 +886,8 @@ bool VPInstruction::onlyFirstLaneUsed(const VPValue *Op) const {
878886
switch (getOpcode()) {
879887
default:
880888
return false;
889+
case Instruction::ExtractElement:
890+
return Op == getOperand(1);
881891
case Instruction::PHI:
882892
return true;
883893
case Instruction::ICmp:
@@ -970,7 +980,6 @@ void VPInstruction::print(raw_ostream &O, const Twine &Indent,
970980
case VPInstruction::Broadcast:
971981
O << "broadcast";
972982
break;
973-
974983
case VPInstruction::ExtractFromEnd:
975984
O << "extract-from-end";
976985
break;
@@ -986,8 +995,8 @@ void VPInstruction::print(raw_ostream &O, const Twine &Indent,
986995
case VPInstruction::AnyOf:
987996
O << "any-of";
988997
break;
989-
case VPInstruction::ExtractFirstActive:
990-
O << "extract-first-active";
998+
case VPInstruction::FirstActiveLane:
999+
O << "first-active-lane";
9911000
break;
9921001
default:
9931002
O << Instruction::getOpcodeName(getOpcode());

llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp

Lines changed: 8 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -2158,10 +2158,14 @@ void VPlanTransforms::handleUncountableEarlyExit(
21582158
ExitIRI->extractLastLaneOfOperand(MiddleBuilder);
21592159
}
21602160
// Add the incoming value from the early exit.
2161-
if (!IncomingFromEarlyExit->isLiveIn())
2162-
IncomingFromEarlyExit =
2163-
EarlyExitB.createNaryOp(VPInstruction::ExtractFirstActive,
2164-
{IncomingFromEarlyExit, EarlyExitTakenCond});
2161+
if (!IncomingFromEarlyExit->isLiveIn()) {
2162+
VPValue *FirstActiveLane = EarlyExitB.createNaryOp(
2163+
VPInstruction::FirstActiveLane, {EarlyExitTakenCond}, nullptr,
2164+
"first.active.lane");
2165+
IncomingFromEarlyExit = EarlyExitB.createNaryOp(
2166+
Instruction::ExtractElement, {IncomingFromEarlyExit, FirstActiveLane},
2167+
nullptr, "early.exit.value");
2168+
}
21652169
ExitIRI->addOperand(IncomingFromEarlyExit);
21662170
}
21672171
MiddleBuilder.createNaryOp(VPInstruction::BranchOnCond, {IsEarlyExitTaken});

llvm/test/Transforms/LoopVectorize/AArch64/early_exit_costs.ll

Lines changed: 8 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -11,8 +11,10 @@ define i64 @same_exit_block_pre_inc_use1_sve() #1 {
1111
; CHECK-LABEL: LV: Checking a loop in 'same_exit_block_pre_inc_use1_sve'
1212
; CHECK: LV: Selecting VF: vscale x 16
1313
; CHECK: Calculating cost of work in exit block vector.early.exit
14-
; CHECK-NEXT: Cost of 6 for VF vscale x 16: EMIT vp<{{.*}}> = extract-first-active
15-
; CHECK-NEXT: Cost of 6 for VF vscale x 16: EMIT vp<{{.*}}> = extract-first-active
14+
; CHECK-NEXT: Cost of 4 for VF vscale x 16: EMIT vp<{{.*}}> = first-active-lane vp<{{.*}}>
15+
; CHECK-NEXT: Cost of 2 for VF vscale x 16: EMIT vp<{{.*}}> = extractelement ir<{{.*}}>, vp<{{.*}}>
16+
; CHECK-NEXT: Cost of 4 for VF vscale x 16: EMIT vp<{{.*}}>.1 = first-active-lane vp<{{.*}}>
17+
; CHECK-NEXT: Cost of 2 for VF vscale x 16: EMIT vp<{{.*}}>.1 = extractelement ir<{{.*}}>, vp<{{.*}}>.1
1618
; CHECK: LV: Minimum required TC for runtime checks to be profitable:32
1719
entry:
1820
%p1 = alloca [1024 x i8]
@@ -48,8 +50,10 @@ define i64 @same_exit_block_pre_inc_use1_nosve() {
4850
; CHECK-LABEL: LV: Checking a loop in 'same_exit_block_pre_inc_use1_nosve'
4951
; CHECK: LV: Selecting VF: 16
5052
; CHECK: Calculating cost of work in exit block vector.early.exit
51-
; CHECK-NEXT: Cost of 50 for VF 16: EMIT vp<{{.*}}> = extract-first-active
52-
; CHECK-NEXT: Cost of 50 for VF 16: EMIT vp<{{.*}}> = extract-first-active
53+
; CHECK-NEXT: Cost of 48 for VF 16: EMIT vp<{{.*}}> = first-active-lane vp<{{.*}}>
54+
; CHECK-NEXT: Cost of 2 for VF 16: EMIT vp<{{.*}}> = extractelement ir<{{.*}}>, vp<{{.*}}>
55+
; CHECK-NEXT: Cost of 48 for VF 16: EMIT vp<{{.*}}>.1 = first-active-lane vp<{{.*}}>
56+
; CHECK-NEXT: Cost of 2 for VF 16: EMIT vp<{{.*}}>.1 = extractelement ir<{{.*}}>, vp<{{.*}}>.1
5357
; CHECK: LV: Minimum required TC for runtime checks to be profitable:176
5458
; CHECK-NEXT: LV: Vectorization is not beneficial: expected trip count < minimum profitable VF (64 < 176)
5559
; CHECK-NEXT: LV: Too many memory checks needed.

0 commit comments

Comments
 (0)