[RISCV] Use vsetvli instead of vlenb in Prologue/Epilogue #113756
Conversation
@llvm/pr-subscribers-backend-risc-v

Author: Kito Cheng (kito-cheng)

Changes

Currently, we use `csrr` with `vlenb` to obtain the `VLEN`, but this is not the only option. We can also use `vsetvli` with `e8`/`m1` to get `VLENMAX`, which is equal to the VLEN. This method is preferable on some microarchitectures and makes it easier to obtain values like `VLEN * 2`, `VLEN * 4`, or `VLEN * 8`, reducing the number of instructions needed to calculate VLEN multiples.

However, this approach is NOT always interchangeable, as it changes the state of `VTYPE` and `VL`, which can alter the behavior of vector instructions, potentially causing incorrect code generation if applied after a vsetvli insertion. Therefore, we limit its use to the prologue/epilogue for now, as there are no vector operations within the prologue/epilogue sequence.

With further analysis, we may extend this approach beyond the prologue/epilogue in the future, but starting here should be a good first step.

Patch is 560.26 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/113756.diff

159 Files Affected:
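To make the trade-off concrete, here is the shape of the change as it shows up in the test updates below (a sketch only; the register choice is illustrative). With rd != x0 and rs1 = x0, `vsetvli` writes VLMAX = LMUL × VLEN / SEW to rd, so `e8` with `m1`/`m2`/`m4`/`m8` yields 1×/2×/4×/8× vlenb without a separate shift:

```asm
# Before: read vlenb (VLEN/8 bytes), then scale by 8 for an 8-register area.
csrr    a0, vlenb
slli    a0, a0, 3
sub     sp, sp, a0

# After: one vsetvli computes 8 * vlenb directly (e8, m8 => VLMAX = 8 * VLEN/8),
# at the cost of clobbering VL and VTYPE.
vsetvli a0, zero, e8, m8, ta, ma
sub     sp, sp, a0
```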
diff --git a/llvm/lib/Target/RISCV/RISCVExpandPseudoInsts.cpp b/llvm/lib/Target/RISCV/RISCVExpandPseudoInsts.cpp
index 5dcec078856ead..299537e5047d2b 100644
--- a/llvm/lib/Target/RISCV/RISCVExpandPseudoInsts.cpp
+++ b/llvm/lib/Target/RISCV/RISCVExpandPseudoInsts.cpp
@@ -56,6 +56,8 @@ class RISCVExpandPseudo : public MachineFunctionPass {
MachineBasicBlock::iterator MBBI);
bool expandRV32ZdinxLoad(MachineBasicBlock &MBB,
MachineBasicBlock::iterator MBBI);
+ bool expandPseudoReadMulVLENB(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MBBI);
#ifndef NDEBUG
unsigned getInstSizeInBytes(const MachineFunction &MF) const {
unsigned Size = 0;
@@ -164,6 +166,8 @@ bool RISCVExpandPseudo::expandMI(MachineBasicBlock &MBB,
case RISCV::PseudoVMSET_M_B64:
// vmset.m vd => vmxnor.mm vd, vd, vd
return expandVMSET_VMCLR(MBB, MBBI, RISCV::VMXNOR_MM);
+ case RISCV::PseudoReadMulVLENB:
+ return expandPseudoReadMulVLENB(MBB, MBBI);
}
return false;
@@ -410,6 +414,39 @@ bool RISCVExpandPseudo::expandRV32ZdinxLoad(MachineBasicBlock &MBB,
return true;
}
+bool RISCVExpandPseudo::expandPseudoReadMulVLENB(
+ MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI) {
+ DebugLoc DL = MBBI->getDebugLoc();
+ Register Dst = MBBI->getOperand(0).getReg();
+ unsigned Mul = MBBI->getOperand(1).getImm();
+ RISCVII::VLMUL VLMUL = RISCVII::VLMUL::LMUL_1;
+ switch (Mul) {
+ case 1:
+ VLMUL = RISCVII::VLMUL::LMUL_1;
+ break;
+ case 2:
+ VLMUL = RISCVII::VLMUL::LMUL_2;
+ break;
+ case 4:
+ VLMUL = RISCVII::VLMUL::LMUL_4;
+ break;
+ case 8:
+ VLMUL = RISCVII::VLMUL::LMUL_8;
+ break;
+ default:
+ llvm_unreachable("Unexpected VLENB value");
+ }
+ unsigned VTypeImm = RISCVVType::encodeVTYPE(
+ VLMUL, /*SEW*/ 8, /*TailAgnostic*/ true, /*MaskAgnostic*/ true);
+
+ BuildMI(MBB, MBBI, DL, TII->get(RISCV::VSETVLI), Dst)
+ .addReg(RISCV::X0)
+ .addImm(VTypeImm);
+
+ MBBI->eraseFromParent();
+ return true;
+}
+
class RISCVPreRAExpandPseudo : public MachineFunctionPass {
public:
const RISCVSubtarget *STI;
diff --git a/llvm/lib/Target/RISCV/RISCVFrameLowering.cpp b/llvm/lib/Target/RISCV/RISCVFrameLowering.cpp
index b49cbab1876d79..b76b8e1df9996e 100644
--- a/llvm/lib/Target/RISCV/RISCVFrameLowering.cpp
+++ b/llvm/lib/Target/RISCV/RISCVFrameLowering.cpp
@@ -436,8 +436,8 @@ void RISCVFrameLowering::adjustStackForRVV(MachineFunction &MF,
const RISCVRegisterInfo &RI = *STI.getRegisterInfo();
// We must keep the stack pointer aligned through any intermediate
// updates.
- RI.adjustReg(MBB, MBBI, DL, SPReg, SPReg, Offset,
- Flag, getStackAlign());
+ RI.adjustReg(MBB, MBBI, DL, SPReg, SPReg, Offset, Flag, getStackAlign(),
+ /*IsPrologueOrEpilogue*/ true);
}
static void appendScalableVectorExpression(const TargetRegisterInfo &TRI,
@@ -621,7 +621,7 @@ void RISCVFrameLowering::emitPrologue(MachineFunction &MF,
// Allocate space on the stack if necessary.
RI->adjustReg(MBB, MBBI, DL, SPReg, SPReg,
StackOffset::getFixed(-StackSize), MachineInstr::FrameSetup,
- getStackAlign());
+ getStackAlign(), /*IsPrologueOrEpilogue*/ true);
}
// Emit ".cfi_def_cfa_offset RealStackSize"
@@ -666,9 +666,11 @@ void RISCVFrameLowering::emitPrologue(MachineFunction &MF,
// The frame pointer does need to be reserved from register allocation.
assert(MF.getRegInfo().isReserved(FPReg) && "FP not reserved");
- RI->adjustReg(MBB, MBBI, DL, FPReg, SPReg,
- StackOffset::getFixed(RealStackSize - RVFI->getVarArgsSaveSize()),
- MachineInstr::FrameSetup, getStackAlign());
+ RI->adjustReg(
+ MBB, MBBI, DL, FPReg, SPReg,
+ StackOffset::getFixed(RealStackSize - RVFI->getVarArgsSaveSize()),
+ MachineInstr::FrameSetup, getStackAlign(),
+ /*IsPrologueOrEpilogue*/ true);
// Emit ".cfi_def_cfa $fp, RVFI->getVarArgsSaveSize()"
unsigned CFIIndex = MF.addFrameInst(MCCFIInstruction::cfiDefCfa(
@@ -686,7 +688,8 @@ void RISCVFrameLowering::emitPrologue(MachineFunction &MF,
"SecondSPAdjustAmount should be greater than zero");
RI->adjustReg(MBB, MBBI, DL, SPReg, SPReg,
StackOffset::getFixed(-SecondSPAdjustAmount),
- MachineInstr::FrameSetup, getStackAlign());
+ MachineInstr::FrameSetup, getStackAlign(),
+ /*IsPrologueOrEpilogue*/ true);
// If we are using a frame-pointer, and thus emitted ".cfi_def_cfa fp, 0",
// don't emit an sp-based .cfi_def_cfa_offset
@@ -765,7 +768,8 @@ void RISCVFrameLowering::deallocateStack(MachineFunction &MF,
Register SPReg = getSPReg(STI);
RI->adjustReg(MBB, MBBI, DL, SPReg, SPReg, StackOffset::getFixed(StackSize),
- MachineInstr::FrameDestroy, getStackAlign());
+ MachineInstr::FrameDestroy, getStackAlign(),
+ /*IsPrologueOrEpilogue*/ true);
}
void RISCVFrameLowering::emitEpilogue(MachineFunction &MF,
@@ -839,7 +843,8 @@ void RISCVFrameLowering::emitEpilogue(MachineFunction &MF,
if (!RestoreFP)
RI->adjustReg(MBB, LastFrameDestroy, DL, SPReg, SPReg,
StackOffset::getFixed(SecondSPAdjustAmount),
- MachineInstr::FrameDestroy, getStackAlign());
+ MachineInstr::FrameDestroy, getStackAlign(),
+ /*IsPrologueOrEpilogue*/ true);
}
// Restore the stack pointer using the value of the frame pointer. Only
@@ -857,7 +862,7 @@ void RISCVFrameLowering::emitEpilogue(MachineFunction &MF,
RI->adjustReg(MBB, LastFrameDestroy, DL, SPReg, FPReg,
StackOffset::getFixed(-FPOffset), MachineInstr::FrameDestroy,
- getStackAlign());
+ getStackAlign(), /*IsPrologueOrEpilogue*/ true);
}
bool ApplyPop = RVFI->isPushable(MF) && MBBI != MBB.end() &&
@@ -1348,7 +1353,8 @@ MachineBasicBlock::iterator RISCVFrameLowering::eliminateCallFramePseudoInstr(
const RISCVRegisterInfo &RI = *STI.getRegisterInfo();
RI.adjustReg(MBB, MI, DL, SPReg, SPReg, StackOffset::getFixed(Amount),
- MachineInstr::NoFlags, getStackAlign());
+ MachineInstr::NoFlags, getStackAlign(),
+ /*IsPrologueOrEpilogue*/ true);
}
}
diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoVPseudos.td b/llvm/lib/Target/RISCV/RISCVInstrInfoVPseudos.td
index 6b308bc8c9aa0f..fa8cde09be696b 100644
--- a/llvm/lib/Target/RISCV/RISCVInstrInfoVPseudos.td
+++ b/llvm/lib/Target/RISCV/RISCVInstrInfoVPseudos.td
@@ -6096,6 +6096,11 @@ let hasSideEffects = 0, mayLoad = 0, mayStore = 0, isCodeGenOnly = 1 in {
[(set GPR:$rd, (riscv_read_vlenb))]>,
PseudoInstExpansion<(CSRRS GPR:$rd, SysRegVLENB.Encoding, X0)>,
Sched<[WriteRdVLENB]>;
+ let Defs = [VL, VTYPE] in {
+ def PseudoReadMulVLENB : Pseudo<(outs GPR:$rd), (ins uimm5:$shamt),
+ []>,
+ Sched<[WriteVSETVLI, ReadVSETVLI]>;
+ }
}
let hasSideEffects = 0, mayLoad = 0, mayStore = 0, isCodeGenOnly = 1,
diff --git a/llvm/lib/Target/RISCV/RISCVRegisterInfo.cpp b/llvm/lib/Target/RISCV/RISCVRegisterInfo.cpp
index 26195ef721db39..b37899b148c283 100644
--- a/llvm/lib/Target/RISCV/RISCVRegisterInfo.cpp
+++ b/llvm/lib/Target/RISCV/RISCVRegisterInfo.cpp
@@ -175,7 +175,8 @@ void RISCVRegisterInfo::adjustReg(MachineBasicBlock &MBB,
const DebugLoc &DL, Register DestReg,
Register SrcReg, StackOffset Offset,
MachineInstr::MIFlag Flag,
- MaybeAlign RequiredAlign) const {
+ MaybeAlign RequiredAlign,
+ bool IsPrologueOrEpilogue) const {
if (DestReg == SrcReg && !Offset.getFixed() && !Offset.getScalable())
return;
@@ -205,21 +206,43 @@ void RISCVRegisterInfo::adjustReg(MachineBasicBlock &MBB,
assert(isInt<32>(ScalableValue / (RISCV::RVVBitsPerBlock / 8)) &&
"Expect the number of vector registers within 32-bits.");
uint32_t NumOfVReg = ScalableValue / (RISCV::RVVBitsPerBlock / 8);
- BuildMI(MBB, II, DL, TII->get(RISCV::PseudoReadVLENB), ScratchReg)
- .setMIFlag(Flag);
-
- if (ScalableAdjOpc == RISCV::ADD && ST.hasStdExtZba() &&
- (NumOfVReg == 2 || NumOfVReg == 4 || NumOfVReg == 8)) {
- unsigned Opc = NumOfVReg == 2 ? RISCV::SH1ADD :
- (NumOfVReg == 4 ? RISCV::SH2ADD : RISCV::SH3ADD);
- BuildMI(MBB, II, DL, TII->get(Opc), DestReg)
- .addReg(ScratchReg, RegState::Kill).addReg(SrcReg)
+ // Only use vsetvli rather than vlenb if adjusting in the prologue or
+ // epilogue, otherwise it may disturb the VTYPE and VL status.
+ bool UseVsetvliRatherThanVlenb = IsPrologueOrEpilogue;
+ if (UseVsetvliRatherThanVlenb && (NumOfVReg == 1 || NumOfVReg == 2 ||
+ NumOfVReg == 4 || NumOfVReg == 8)) {
+ BuildMI(MBB, II, DL, TII->get(RISCV::PseudoReadMulVLENB), ScratchReg)
+ .addImm(NumOfVReg)
.setMIFlag(Flag);
- } else {
- TII->mulImm(MF, MBB, II, DL, ScratchReg, NumOfVReg, Flag);
BuildMI(MBB, II, DL, TII->get(ScalableAdjOpc), DestReg)
- .addReg(SrcReg).addReg(ScratchReg, RegState::Kill)
+ .addReg(SrcReg)
+ .addReg(ScratchReg, RegState::Kill)
.setMIFlag(Flag);
+ } else {
+ if (UseVsetvliRatherThanVlenb)
+ BuildMI(MBB, II, DL, TII->get(RISCV::PseudoReadMulVLENB), ScratchReg)
+ .addImm(1)
+ .setMIFlag(Flag);
+ else
+ BuildMI(MBB, II, DL, TII->get(RISCV::PseudoReadVLENB), ScratchReg)
+ .setMIFlag(Flag);
+
+ if (ScalableAdjOpc == RISCV::ADD && ST.hasStdExtZba() &&
+ (NumOfVReg == 2 || NumOfVReg == 4 || NumOfVReg == 8)) {
+ unsigned Opc = NumOfVReg == 2
+ ? RISCV::SH1ADD
+ : (NumOfVReg == 4 ? RISCV::SH2ADD : RISCV::SH3ADD);
+ BuildMI(MBB, II, DL, TII->get(Opc), DestReg)
+ .addReg(ScratchReg, RegState::Kill)
+ .addReg(SrcReg)
+ .setMIFlag(Flag);
+ } else {
+ TII->mulImm(MF, MBB, II, DL, ScratchReg, NumOfVReg, Flag);
+ BuildMI(MBB, II, DL, TII->get(ScalableAdjOpc), DestReg)
+ .addReg(SrcReg)
+ .addReg(ScratchReg, RegState::Kill)
+ .setMIFlag(Flag);
+ }
}
SrcReg = DestReg;
KillSrcReg = true;
@@ -526,7 +549,8 @@ bool RISCVRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II,
else
DestReg = MRI.createVirtualRegister(&RISCV::GPRRegClass);
adjustReg(*II->getParent(), II, DL, DestReg, FrameReg, Offset,
- MachineInstr::NoFlags, std::nullopt);
+ MachineInstr::NoFlags, std::nullopt,
+ /*IsPrologueOrEpilogue*/ false);
MI.getOperand(FIOperandNum).ChangeToRegister(DestReg, /*IsDef*/false,
/*IsImp*/false,
/*IsKill*/true);
diff --git a/llvm/lib/Target/RISCV/RISCVRegisterInfo.h b/llvm/lib/Target/RISCV/RISCVRegisterInfo.h
index 6ddb1eb9c14d5e..b7aa120935747a 100644
--- a/llvm/lib/Target/RISCV/RISCVRegisterInfo.h
+++ b/llvm/lib/Target/RISCV/RISCVRegisterInfo.h
@@ -72,10 +72,12 @@ struct RISCVRegisterInfo : public RISCVGenRegisterInfo {
// used during frame layout, and we may need to ensure that if we
// split the offset internally that the DestReg is always aligned,
// assuming that source reg was.
+ // If IsPrologueOrEpilogue is true, the function is called during prologue
+ // or epilogue generation.
void adjustReg(MachineBasicBlock &MBB, MachineBasicBlock::iterator II,
const DebugLoc &DL, Register DestReg, Register SrcReg,
StackOffset Offset, MachineInstr::MIFlag Flag,
- MaybeAlign RequiredAlign) const;
+ MaybeAlign RequiredAlign, bool IsPrologueOrEpilogue) const;
bool eliminateFrameIndex(MachineBasicBlock::iterator MI, int SPAdj,
unsigned FIOperandNum,
diff --git a/llvm/test/CodeGen/RISCV/calling-conv-vector-on-stack.ll b/llvm/test/CodeGen/RISCV/calling-conv-vector-on-stack.ll
index 70cdb6cec2449f..28a4d9166b1ef9 100644
--- a/llvm/test/CodeGen/RISCV/calling-conv-vector-on-stack.ll
+++ b/llvm/test/CodeGen/RISCV/calling-conv-vector-on-stack.ll
@@ -11,8 +11,7 @@ define void @bar() nounwind {
; CHECK-NEXT: sd s0, 80(sp) # 8-byte Folded Spill
; CHECK-NEXT: sd s1, 72(sp) # 8-byte Folded Spill
; CHECK-NEXT: addi s0, sp, 96
-; CHECK-NEXT: csrr a0, vlenb
-; CHECK-NEXT: slli a0, a0, 3
+; CHECK-NEXT: vsetvli a0, zero, e8, m8, ta, ma
; CHECK-NEXT: sub sp, sp, a0
; CHECK-NEXT: andi sp, sp, -64
; CHECK-NEXT: mv s1, sp
diff --git a/llvm/test/CodeGen/RISCV/early-clobber-tied-def-subreg-liveness.ll b/llvm/test/CodeGen/RISCV/early-clobber-tied-def-subreg-liveness.ll
index 0c2b809c0be20c..3704d0a5e20edb 100644
--- a/llvm/test/CodeGen/RISCV/early-clobber-tied-def-subreg-liveness.ll
+++ b/llvm/test/CodeGen/RISCV/early-clobber-tied-def-subreg-liveness.ll
@@ -16,7 +16,7 @@ define void @_Z3foov() {
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: addi sp, sp, -16
; CHECK-NEXT: .cfi_def_cfa_offset 16
-; CHECK-NEXT: csrr a0, vlenb
+; CHECK-NEXT: vsetvli a0, zero, e8, m1, ta, ma
; CHECK-NEXT: slli a1, a0, 3
; CHECK-NEXT: add a0, a1, a0
; CHECK-NEXT: sub sp, sp, a0
@@ -82,7 +82,7 @@ define void @_Z3foov() {
; CHECK-NEXT: lui a0, %hi(var_47)
; CHECK-NEXT: addi a0, a0, %lo(var_47)
; CHECK-NEXT: vsseg4e16.v v8, (a0)
-; CHECK-NEXT: csrr a0, vlenb
+; CHECK-NEXT: vsetvli a0, zero, e8, m1, ta, ma
; CHECK-NEXT: slli a1, a0, 3
; CHECK-NEXT: add a0, a1, a0
; CHECK-NEXT: add sp, sp, a0
diff --git a/llvm/test/CodeGen/RISCV/intrinsic-cttz-elts-vscale.ll b/llvm/test/CodeGen/RISCV/intrinsic-cttz-elts-vscale.ll
index 8116d138d288e2..cc426ce3cad1a1 100644
--- a/llvm/test/CodeGen/RISCV/intrinsic-cttz-elts-vscale.ll
+++ b/llvm/test/CodeGen/RISCV/intrinsic-cttz-elts-vscale.ll
@@ -59,8 +59,7 @@ define i64 @ctz_nxv8i1_no_range(<vscale x 8 x i16> %a) {
; RV32-NEXT: .cfi_def_cfa_offset 48
; RV32-NEXT: sw ra, 44(sp) # 4-byte Folded Spill
; RV32-NEXT: .cfi_offset ra, -4
-; RV32-NEXT: csrr a0, vlenb
-; RV32-NEXT: slli a0, a0, 1
+; RV32-NEXT: vsetvli a0, zero, e8, m2, ta, ma
; RV32-NEXT: sub sp, sp, a0
; RV32-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x30, 0x22, 0x11, 0x02, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 48 + 2 * vlenb
; RV32-NEXT: addi a0, sp, 32
@@ -97,8 +96,7 @@ define i64 @ctz_nxv8i1_no_range(<vscale x 8 x i16> %a) {
; RV32-NEXT: sub a1, a1, a4
; RV32-NEXT: sub a1, a1, a3
; RV32-NEXT: sub a0, a0, a2
-; RV32-NEXT: csrr a2, vlenb
-; RV32-NEXT: slli a2, a2, 1
+; RV32-NEXT: vsetvli a2, zero, e8, m2, ta, ma
; RV32-NEXT: add sp, sp, a2
; RV32-NEXT: lw ra, 44(sp) # 4-byte Folded Reload
; RV32-NEXT: addi sp, sp, 48
diff --git a/llvm/test/CodeGen/RISCV/pr69586.ll b/llvm/test/CodeGen/RISCV/pr69586.ll
index 7084c04805be72..3a62d8c2980802 100644
--- a/llvm/test/CodeGen/RISCV/pr69586.ll
+++ b/llvm/test/CodeGen/RISCV/pr69586.ll
@@ -35,7 +35,7 @@ define void @test(ptr %0, ptr %1, i64 %2) {
; NOREMAT-NEXT: .cfi_offset s9, -88
; NOREMAT-NEXT: .cfi_offset s10, -96
; NOREMAT-NEXT: .cfi_offset s11, -104
-; NOREMAT-NEXT: csrr a2, vlenb
+; NOREMAT-NEXT: vsetvli a2, zero, e8, m1, ta, ma
; NOREMAT-NEXT: li a3, 6
; NOREMAT-NEXT: mul a2, a2, a3
; NOREMAT-NEXT: sub sp, sp, a2
@@ -759,7 +759,7 @@ define void @test(ptr %0, ptr %1, i64 %2) {
; NOREMAT-NEXT: vse32.v v8, (a0)
; NOREMAT-NEXT: sf.vc.v.i 2, 0, v8, 0
; NOREMAT-NEXT: sf.vc.v.i 2, 0, v8, 0
-; NOREMAT-NEXT: csrr a0, vlenb
+; NOREMAT-NEXT: vsetvli a0, zero, e8, m1, ta, ma
; NOREMAT-NEXT: li a1, 6
; NOREMAT-NEXT: mul a0, a0, a1
; NOREMAT-NEXT: add sp, sp, a0
diff --git a/llvm/test/CodeGen/RISCV/regalloc-last-chance-recoloring-failure.ll b/llvm/test/CodeGen/RISCV/regalloc-last-chance-recoloring-failure.ll
index 6a0dbbe356a165..8708f766130c6a 100644
--- a/llvm/test/CodeGen/RISCV/regalloc-last-chance-recoloring-failure.ll
+++ b/llvm/test/CodeGen/RISCV/regalloc-last-chance-recoloring-failure.ll
@@ -19,7 +19,7 @@ define void @last_chance_recoloring_failure() {
; CHECK-NEXT: sd s0, 16(sp) # 8-byte Folded Spill
; CHECK-NEXT: .cfi_offset ra, -8
; CHECK-NEXT: .cfi_offset s0, -16
-; CHECK-NEXT: csrr a0, vlenb
+; CHECK-NEXT: vsetvli a0, zero, e8, m1, ta, ma
; CHECK-NEXT: slli a0, a0, 4
; CHECK-NEXT: sub sp, sp, a0
; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x20, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 32 + 16 * vlenb
@@ -59,7 +59,7 @@ define void @last_chance_recoloring_failure() {
; CHECK-NEXT: vsetvli zero, zero, e32, m8, tu, mu
; CHECK-NEXT: vfdiv.vv v8, v24, v8, v0.t
; CHECK-NEXT: vse32.v v8, (a0)
-; CHECK-NEXT: csrr a0, vlenb
+; CHECK-NEXT: vsetvli a0, zero, e8, m1, ta, ma
; CHECK-NEXT: slli a0, a0, 4
; CHECK-NEXT: add sp, sp, a0
; CHECK-NEXT: ld ra, 24(sp) # 8-byte Folded Reload
@@ -75,7 +75,7 @@ define void @last_chance_recoloring_failure() {
; SUBREGLIVENESS-NEXT: sd s0, 16(sp) # 8-byte Folded Spill
; SUBREGLIVENESS-NEXT: .cfi_offset ra, -8
; SUBREGLIVENESS-NEXT: .cfi_offset s0, -16
-; SUBREGLIVENESS-NEXT: csrr a0, vlenb
+; SUBREGLIVENESS-NEXT: vsetvli a0, zero, e8, m1, ta, ma
; SUBREGLIVENESS-NEXT: slli a0, a0, 4
; SUBREGLIVENESS-NEXT: sub sp, sp, a0
; SUBREGLIVENESS-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x20, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 32 + 16 * vlenb
@@ -115,7 +115,7 @@ define void @last_chance_recoloring_failure() {
; SUBREGLIVENESS-NEXT: vsetvli zero, zero, e32, m8, tu, mu
; SUBREGLIVENESS-NEXT: vfdiv.vv v8, v24, v8, v0.t
; SUBREGLIVENESS-NEXT: vse32.v v8, (a0)
-; SUBREGLIVENESS-NEXT: csrr a0, vlenb
+; SUBREGLIVENESS-NEXT: vsetvli a0, zero, e8, m1, ta, ma
; SUBREGLIVENESS-NEXT: slli a0, a0, 4
; SUBREGLIVENESS-NEXT: add sp, sp, a0
; SUBREGLIVENESS-NEXT: ld ra, 24(sp) # 8-byte Folded Reload
diff --git a/llvm/test/CodeGen/RISCV/rvv-cfi-info.ll b/llvm/test/CodeGen/RISCV/rvv-cfi-info.ll
index 225680e846bac7..af88e39f18e195 100644
--- a/llvm/test/CodeGen/RISCV/rvv-cfi-info.ll
+++ b/llvm/test/CodeGen/RISCV/rvv-cfi-info.ll
@@ -9,7 +9,7 @@ define riscv_vector_cc <vscale x 1 x i32> @test_vector_callee_cfi(<vscale x 1 x
; OMIT-FP: # %bb.0: # %entry
; OMIT-FP-NEXT: addi sp, sp, -16
; OMIT-FP-NEXT: .cfi_def_cfa_offset 16
-; OMIT-FP-NEXT: csrr a0, vlenb
+; OMIT-FP-NEXT: vsetvli a0, zero, e8, m1, ta, ma
; OMIT-FP-NEXT: slli a1, a0, 3
; OMIT-FP-NEXT: sub a0, a1, a0
; OMIT-FP-NEXT: sub sp, sp, a0
@@ -49,7 +49,7 @@ define riscv_vector_cc <vscale x 1 x i32> @test_vector_callee_cfi(<vscale x 1 x
; OMIT-FP-NEXT: vl2r.v v2, (a0) # Unknown-size Folded Reload
; OMIT-FP-NEXT: addi a0, sp, 16
; OMIT-FP-NEXT: vl4r.v v4, (a0) # Unknown-size Folded Reload
-; OMIT-FP-NEXT: csrr a0, vlenb
+; OMIT-FP-NEXT: vsetvli a0, zero, e8, m1, ta, ma
; OMIT-FP-NEXT: slli a1, a0, 3
; OMIT-FP-NEXT: sub a0, a1, a0
; OMIT-FP-NEXT: add sp, sp, a0
@@ -66,7 +66,7 @@ define riscv_vector_cc <vscale x 1 x i32> @test_vector_callee_cfi(<vscale x 1 x
; NO-OMIT-FP-NEXT: .cfi_offset s0, -16
; NO-OMIT-FP-NEXT: addi s0, sp, 32
; NO-OMIT-FP-NEXT: .cfi_def_cfa s0, 0
-; NO-OMIT-FP-NEXT: csrr a0, vlenb
+; NO-OMIT-FP-NEXT: vsetvli a0, zero, e8, m1, ta, ma
; NO-OMIT-FP-NEXT: slli a1, a0, 3
; NO-OMIT-FP-NEXT: sub a0, a1, a0
; NO-OMIT-FP-NEXT: sub sp, sp, a0
diff --git a/llvm/test/CodeGen/RISCV/rvv/abs-vp.ll b/llvm/test/CodeGen/RISCV/rvv/abs-vp.ll
index cd2208e31eb6d3..3a8100c57b26f7 100644
--- a/llvm/test/CodeGen/RISCV/rvv/abs-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/abs-vp.ll
@@ -563,7 +563,7 @@ define <vscale x 16 x i64> @vp_abs_nxv16i64(<vscale x 16 x i64> %va, <vscale x 1
; CHECK: # %b...
[truncated]
Is this still profitable even for cases when there's no shift needed, e.g. would a vsetvli be better than a single csrr?
Does this work with shrink wrapping where the prologue might not be at the beginning?
CSR reads in general are serializing. Vlenb needs to be special-cased in the microarchitecture. Some versions of SiFive cores missed this optimization. Have we checked BananaPi?
I did some experimenting and it looks like the F3 is also missing the optimisation.

csrr.S:

```asm
.global start
start:
    li a0, 0
    li a1, 10485760
loop:
    csrr t0, COUNTER
    addi a0, a0, 1
    blt a0, a1, loop
exit:
    li a7, 93
    ecall
```

vsetvli.s:

```asm
.global _start
_start:
    li a0, 0
    li a1, 10485760
loop:
    vsetvli t0, zero, e8, m1, ta, ma
    addi a0, a0, 1
    blt a0, a1, loop
exit:
    li a7, 93
    ecall
```
I guess this could also introduce VL/VTYPE toggles, but I don't have any data as to whether or not that would be an issue in practice given that this is restricted to prologues and epilogues. I also don't know how expensive a VL/VTYPE toggle is in comparison to a CSR read. Maybe it's always worthwhile to avoid the csrr.
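For what it's worth, a toggle microbenchmark in the same style as the loops above could provide that data (purely a hypothetical sketch, nothing measured here): alternate two different vtype settings so each iteration forces a VL/VTYPE change, then compare against the single-vsetvli loop.

```asm
.global _start
_start:
    li a0, 0
    li a1, 10485760
loop:
    vsetvli t0, zero, e8, m1, ta, ma     # vtype A
    vsetvli t0, zero, e32, m2, ta, ma    # vtype B: forces a VL/VTYPE toggle
    addi a0, a0, 1
    blt a0, a1, loop
exit:
    li a7, 93
    ecall
```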
Put a few more checks in.
clang and gcc using csrr vlenb for prolog/epilog code was an amazing unintended feature (Hyrum's Law). Searching the generated assembly for the string "vlenb" is currently the easiest way to identify register spills when compiling intrinsics. You could do that in big codebases and have minimal false positives. I understand that we should try to get the best codegen, and there will be implementations where csrr is slower than vsetvli.
Do you know of ooo implementations implementing a predictor?
Yes, I think so. Stream Computing open-sourced an ooo implementation with CSR speculation on top of BOOM at RISC-V Summit China: https://github.com/riscv-stc/riscv-boom/tree/matrix Their default configuration seems to have 8 entries for vconfig speculation. I was not able to build it with Verilator and contacted the author, who said that they only support VCS, which I don't have access to. Edit: actually, this might just do speculation, but that also requires keeping track of multiple vtypes. I would hope there are a lot of proprietary cores with vtype speculation currently in development as well.
I thought you meant predicting the VL/VTYPE without waiting for the scalar instructions to compute the AVL. This looks like it is just allowing it to speculatively execute across branches that might mispredict.
I think we should make it a feature. For XiangShan, IIUC,
I've considered adding an option or target feature before, but I decided not to include it in the patch in the end. I know that So, why do we still try to replace XiangShan optimized But I do agree that we should have a target feature IF we try to replace
Reverse ping, does GCC currently do this? I think it would be nice to have it as a tuning option for spacemit-x60 + sifive |
Currently, we use `csrr` with `vlenb` to obtain the `VLEN`, but this is not the only option. We can also use `vsetvli` with `e8`/`m1` to get `VLENMAX`, which is equal to the VLEN. This method is preferable on some microarchitectures and makes it easier to obtain values like `VLEN * 2`, `VLEN * 4`, or `VLEN * 8`, reducing the number of instructions needed to calculate VLEN multiples. However, this approach is *NOT* always interchangeable, as it changes the state of `VTYPE` and `VL`, which can alter the behavior of vector instructions, potentially causing incorrect code generation if applied after a vsetvli insertion. Therefore, we limit its use to the prologue/epilogue for now, as there are no vector operations within the prologue/epilogue sequence. With further analysis, we may extend this approach beyond the prologue/epilogue in the future, but starting here should be a good first step. This feature is guarded by the `+prefer-vsetvli-over-read-vlenb` feature, which is disabled by default for now.
Force-pushed from 84036d4 to 7149518.
```
: SubtargetFeature<"prefer-vsetvli-over-read-vlenb",
                   "PreferVsetvliOverReadVLENB",
                   "true",
                   "Prefer vsetvli over read vlenb CSR when calculate VLEN">;
```
"when calculate" -> "to calculate"
```
// Make sure VTYPE and VL are not live-in since we will use vsetvli in the
// prologue to get the VLEN, and that will clobber these registers.
//
// We may do also check the stack has contain for the object with the
```
"has contain for the object with the " -> "contains objects with"
LGTM
Will wait one more day before merging to see if there are any further comments.
LGTM.
LGTM w/minor suggestion.
As a follow-up, I think we should consider a) enabling this by default, given that a large fraction of existing hardware benefits and no one has raised concerns about regressions on the rest, and b) extending this to non-power-of-two factors by using mulImm on the resulting value.
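For (b), a sketch of what the non-power-of-two case could look like for a factor of 6 (the first sequence matches the pr69586.ll output above; the folded-LMUL variant is an assumption about the proposed follow-up, not part of this patch, and needs Zba for sh1add):

```asm
# Current output (cf. pr69586.ll above): vsetvli as a vlenb substitute, then mul.
vsetvli a2, zero, e8, m1, ta, ma     # a2 = vlenb
li      a3, 6
mul     a2, a2, a3                   # a2 = 6 * vlenb

# Possible follow-up: fold the power-of-two factor into LMUL, then mulImm the rest.
vsetvli a2, zero, e8, m2, ta, ma     # a2 = 2 * vlenb
sh1add  a2, a2, a2                   # a2 = (a2 << 1) + a2 = 6 * vlenb (Zba)
```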