Skip to content

[WebAssembly] Enable a limited amount of stackification for debug code #136510

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 2 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion llvm/lib/Target/WebAssembly/WebAssembly.h
Original file line number Diff line number Diff line change
Expand Up @@ -44,7 +44,7 @@ FunctionPass *createWebAssemblyReplacePhysRegs();
FunctionPass *createWebAssemblyNullifyDebugValueLists();
FunctionPass *createWebAssemblyOptimizeLiveIntervals();
FunctionPass *createWebAssemblyMemIntrinsicResults();
FunctionPass *createWebAssemblyRegStackify();
FunctionPass *createWebAssemblyRegStackify(CodeGenOptLevel OptLevel);
FunctionPass *createWebAssemblyRegColoring();
FunctionPass *createWebAssemblyFixBrTableDefaults();
FunctionPass *createWebAssemblyFixIrreducibleControlFlow();
Expand Down
138 changes: 89 additions & 49 deletions llvm/lib/Target/WebAssembly/WebAssemblyRegStackify.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -41,14 +41,18 @@ using namespace llvm;

namespace {
class WebAssemblyRegStackify final : public MachineFunctionPass {
bool Optimize;

StringRef getPassName() const override {
return "WebAssembly Register Stackify";
}

void getAnalysisUsage(AnalysisUsage &AU) const override {
AU.setPreservesCFG();
AU.addRequired<MachineDominatorTreeWrapperPass>();
AU.addRequired<LiveIntervalsWrapperPass>();
if (Optimize) {
AU.addRequired<LiveIntervalsWrapperPass>();
AU.addRequired<MachineDominatorTreeWrapperPass>();
}
AU.addPreserved<MachineBlockFrequencyInfoWrapperPass>();
AU.addPreserved<SlotIndexesWrapperPass>();
AU.addPreserved<LiveIntervalsWrapperPass>();
Expand All @@ -61,7 +65,9 @@ class WebAssemblyRegStackify final : public MachineFunctionPass {

public:
static char ID; // Pass identification, replacement for typeid
WebAssemblyRegStackify() : MachineFunctionPass(ID) {}
WebAssemblyRegStackify(CodeGenOptLevel OptLevel)
: MachineFunctionPass(ID), Optimize(OptLevel != CodeGenOptLevel::None) {}
WebAssemblyRegStackify() : WebAssemblyRegStackify(CodeGenOptLevel::Default) {}
};
} // end anonymous namespace

Expand All @@ -70,8 +76,8 @@ INITIALIZE_PASS(WebAssemblyRegStackify, DEBUG_TYPE,
"Reorder instructions to use the WebAssembly value stack",
false, false)

FunctionPass *llvm::createWebAssemblyRegStackify() {
return new WebAssemblyRegStackify();
FunctionPass *llvm::createWebAssemblyRegStackify(CodeGenOptLevel OptLevel) {
return new WebAssemblyRegStackify(OptLevel);
}

// Decorate the given instruction with implicit operands that enforce the
Expand All @@ -96,8 +102,7 @@ static void imposeStackOrdering(MachineInstr *MI) {
static void convertImplicitDefToConstZero(MachineInstr *MI,
MachineRegisterInfo &MRI,
const TargetInstrInfo *TII,
MachineFunction &MF,
LiveIntervals &LIS) {
MachineFunction &MF) {
assert(MI->getOpcode() == TargetOpcode::IMPLICIT_DEF);

const auto *RegClass = MRI.getRegClass(MI->getOperand(0).getReg());
Expand Down Expand Up @@ -262,36 +267,53 @@ static bool shouldRematerialize(const MachineInstr &Def,
// LiveIntervals to handle complex cases.
static MachineInstr *getVRegDef(unsigned Reg, const MachineInstr *Insert,
const MachineRegisterInfo &MRI,
const LiveIntervals &LIS) {
const LiveIntervals *LIS) {
// Most registers are in SSA form here so we try a quick MRI query first.
if (MachineInstr *Def = MRI.getUniqueVRegDef(Reg))
return Def;

// MRI doesn't know what the Def is. Try asking LIS.
if (const VNInfo *ValNo = LIS.getInterval(Reg).getVNInfoBefore(
LIS.getInstructionIndex(*Insert)))
return LIS.getInstructionFromIndex(ValNo->def);
if (LIS != nullptr) {
SlotIndex InstIndex = LIS->getInstructionIndex(*Insert);
if (const VNInfo *ValNo = LIS->getInterval(Reg).getVNInfoBefore(InstIndex))
return LIS->getInstructionFromIndex(ValNo->def);
}

return nullptr;
}

// Test whether Reg, as defined at Def, has exactly one use. This is a
// generalization of MachineRegisterInfo::hasOneNonDBGUse that uses
// LiveIntervals to handle complex cases.
static bool hasOneNonDBGUse(unsigned Reg, MachineInstr *Def,
MachineRegisterInfo &MRI, MachineDominatorTree &MDT,
LiveIntervals &LIS) {
// LiveIntervals to handle complex cases in optimized code.
static bool hasSingleUse(unsigned Reg, MachineRegisterInfo &MRI,
WebAssemblyFunctionInfo &MFI, bool Optimize,
MachineInstr *Def, LiveIntervals *LIS) {
if (!Optimize) {
// Use "hasOneUse" instead of "hasOneNonDBGUse" here because we don't want
// to stackify DBG_VALUE operands: WASM stack locations are less useful and
// less widely supported than WASM local locations.
if (!MRI.hasOneUse(Reg))
return false;
// The frame base always has an implicit DBG use as DW_AT_frame_base.
if (MFI.isFrameBaseVirtual() && MFI.getFrameBaseVreg() == Reg)
return false;
return true;
}

// Most registers are in SSA form here so we try a quick MRI query first.
if (MRI.hasOneNonDBGUse(Reg))
return true;

if (LIS == nullptr)
return false;

bool HasOne = false;
const LiveInterval &LI = LIS.getInterval(Reg);
const LiveInterval &LI = LIS->getInterval(Reg);
const VNInfo *DefVNI =
LI.getVNInfoAt(LIS.getInstructionIndex(*Def).getRegSlot());
LI.getVNInfoAt(LIS->getInstructionIndex(*Def).getRegSlot());
assert(DefVNI);
for (auto &I : MRI.use_nodbg_operands(Reg)) {
const auto &Result = LI.Query(LIS.getInstructionIndex(*I.getParent()));
const auto &Result = LI.Query(LIS->getInstructionIndex(*I.getParent()));
if (Result.valueIn() == DefVNI) {
if (!Result.isKill())
return false;
Expand All @@ -311,7 +333,7 @@ static bool hasOneNonDBGUse(unsigned Reg, MachineInstr *Def,
static bool isSafeToMove(const MachineOperand *Def, const MachineOperand *Use,
const MachineInstr *Insert,
const WebAssemblyFunctionInfo &MFI,
const MachineRegisterInfo &MRI) {
const MachineRegisterInfo &MRI, bool Optimize) {
const MachineInstr *DefI = Def->getParent();
const MachineInstr *UseI = Use->getParent();
assert(DefI->getParent() == Insert->getParent());
Expand Down Expand Up @@ -357,6 +379,12 @@ static bool isSafeToMove(const MachineOperand *Def, const MachineOperand *Use,
if (NextI == Insert)
return true;

// When not optimizing, we only handle the trivial case above
// to guarantee no impact on debugging and to avoid spending
// compile time.
if (!Optimize)
return false;

// 'catch' and 'catch_all' should be the first instruction of a BB and cannot
// move.
if (WebAssembly::isCatch(DefI->getOpcode()))
Expand Down Expand Up @@ -520,14 +548,15 @@ static void shrinkToUses(LiveInterval &LI, LiveIntervals &LIS) {
/// dependencies; move the def down and nest it with the current instruction.
static MachineInstr *moveForSingleUse(unsigned Reg, MachineOperand &Op,
MachineInstr *Def, MachineBasicBlock &MBB,
MachineInstr *Insert, LiveIntervals &LIS,
MachineInstr *Insert, LiveIntervals *LIS,
WebAssemblyFunctionInfo &MFI,
MachineRegisterInfo &MRI) {
LLVM_DEBUG(dbgs() << "Move for single use: "; Def->dump());

WebAssemblyDebugValueManager DefDIs(Def);
DefDIs.sink(Insert);
LIS.handleMove(*Def);
if (LIS != nullptr)
LIS->handleMove(*Def);

if (MRI.hasOneDef(Reg) && MRI.hasOneNonDBGUse(Reg)) {
// No one else is using this register for anything so we can just stackify
Expand All @@ -540,17 +569,18 @@ static MachineInstr *moveForSingleUse(unsigned Reg, MachineOperand &Op,
Op.setReg(NewReg);
DefDIs.updateReg(NewReg);

// Tell LiveIntervals about the new register.
LIS.createAndComputeVirtRegInterval(NewReg);
if (LIS != nullptr) {
// Tell LiveIntervals about the new register.
LIS->createAndComputeVirtRegInterval(NewReg);

// Tell LiveIntervals about the changes to the old register.
LiveInterval &LI = LIS.getInterval(Reg);
LI.removeSegment(LIS.getInstructionIndex(*Def).getRegSlot(),
LIS.getInstructionIndex(*Op.getParent()).getRegSlot(),
/*RemoveDeadValNo=*/true);
// Tell LiveIntervals about the changes to the old register.
LiveInterval &LI = LIS->getInterval(Reg);
LI.removeSegment(LIS->getInstructionIndex(*Def).getRegSlot(),
LIS->getInstructionIndex(*Op.getParent()).getRegSlot(),
/*RemoveDeadValNo=*/true);
}

MFI.stackifyVReg(MRI, NewReg);

LLVM_DEBUG(dbgs() << " - Replaced register: "; Def->dump());
}

Expand All @@ -567,11 +597,12 @@ static MachineInstr *getPrevNonDebugInst(MachineInstr *MI) {

/// A trivially cloneable instruction; clone it and nest the new copy with the
/// current instruction.
static MachineInstr *rematerializeCheapDef(
unsigned Reg, MachineOperand &Op, MachineInstr &Def, MachineBasicBlock &MBB,
MachineBasicBlock::instr_iterator Insert, LiveIntervals &LIS,
WebAssemblyFunctionInfo &MFI, MachineRegisterInfo &MRI,
const WebAssemblyInstrInfo *TII, const WebAssemblyRegisterInfo *TRI) {
static MachineInstr *
rematerializeCheapDef(unsigned Reg, MachineOperand &Op, MachineInstr &Def,
MachineBasicBlock::instr_iterator Insert,
LiveIntervals &LIS, WebAssemblyFunctionInfo &MFI,
MachineRegisterInfo &MRI,
const WebAssemblyInstrInfo *TII) {
LLVM_DEBUG(dbgs() << "Rematerializing cheap def: "; Def.dump());
LLVM_DEBUG(dbgs() << " - for use in "; Op.getParent()->dump());

Expand Down Expand Up @@ -811,9 +842,12 @@ bool WebAssemblyRegStackify::runOnMachineFunction(MachineFunction &MF) {
MachineRegisterInfo &MRI = MF.getRegInfo();
WebAssemblyFunctionInfo &MFI = *MF.getInfo<WebAssemblyFunctionInfo>();
const auto *TII = MF.getSubtarget<WebAssemblySubtarget>().getInstrInfo();
const auto *TRI = MF.getSubtarget<WebAssemblySubtarget>().getRegisterInfo();
auto &MDT = getAnalysis<MachineDominatorTreeWrapperPass>().getDomTree();
auto &LIS = getAnalysis<LiveIntervalsWrapperPass>().getLIS();
MachineDominatorTree *MDT = nullptr;
LiveIntervals *LIS = nullptr;
if (Optimize) {
MDT = &getAnalysis<MachineDominatorTreeWrapperPass>().getDomTree();
LIS = &getAnalysis<LiveIntervalsWrapperPass>().getLIS();
}

// Walk the instructions from the bottom up. Currently we don't look past
// block boundaries, and the blocks aren't ordered so the block visitation
Expand Down Expand Up @@ -876,23 +910,28 @@ bool WebAssemblyRegStackify::runOnMachineFunction(MachineFunction &MF) {
// supports intra-block moves) and it's MachineSink's job to catch all
// the sinking opportunities anyway.
bool SameBlock = DefI->getParent() == &MBB;
bool CanMove = SameBlock && isSafeToMove(Def, &Use, Insert, MFI, MRI) &&
bool CanMove = SameBlock &&
isSafeToMove(Def, &Use, Insert, MFI, MRI, Optimize) &&
!TreeWalker.isOnStack(Reg);
if (CanMove && hasOneNonDBGUse(Reg, DefI, MRI, MDT, LIS)) {
if (CanMove && hasSingleUse(Reg, MRI, MFI, Optimize, DefI, LIS)) {
Insert = moveForSingleUse(Reg, Use, DefI, MBB, Insert, LIS, MFI, MRI);

// If we are removing the frame base reg completely, remove the debug
// info as well.
// TODO: Encode this properly as a stackified value.
if (MFI.isFrameBaseVirtual() && MFI.getFrameBaseVreg() == Reg)
if (MFI.isFrameBaseVirtual() && MFI.getFrameBaseVreg() == Reg) {
assert(
Optimize &&
"Stackifying away frame base in unoptimized code not expected");
MFI.clearFrameBaseVreg();
} else if (shouldRematerialize(*DefI, TII)) {
Insert =
rematerializeCheapDef(Reg, Use, *DefI, MBB, Insert->getIterator(),
LIS, MFI, MRI, TII, TRI);
} else if (CanMove && oneUseDominatesOtherUses(Reg, Use, MBB, MRI, MDT,
LIS, MFI)) {
Insert = moveAndTeeForMultiUse(Reg, Use, DefI, MBB, Insert, LIS, MFI,
}
} else if (Optimize && shouldRematerialize(*DefI, TII)) {
Insert = rematerializeCheapDef(Reg, Use, *DefI, Insert->getIterator(),
*LIS, MFI, MRI, TII);
} else if (Optimize && CanMove &&
oneUseDominatesOtherUses(Reg, Use, MBB, MRI, *MDT, *LIS,
MFI)) {
Insert = moveAndTeeForMultiUse(Reg, Use, DefI, MBB, Insert, *LIS, MFI,
MRI, TII);
} else {
// We failed to stackify the operand. If the problem was ordering
Expand All @@ -915,7 +954,8 @@ bool WebAssemblyRegStackify::runOnMachineFunction(MachineFunction &MF) {
Register DefReg = SubsequentDef->getReg();
Register UseReg = SubsequentUse->getReg();
// TODO: This single-use restriction could be relaxed by using tees
if (DefReg != UseReg || !MRI.hasOneNonDBGUse(DefReg))
if (DefReg != UseReg ||
!hasSingleUse(DefReg, MRI, MFI, Optimize, nullptr, nullptr))
break;
MFI.stackifyVReg(MRI, DefReg);
++SubsequentDef;
Expand All @@ -926,7 +966,7 @@ bool WebAssemblyRegStackify::runOnMachineFunction(MachineFunction &MF) {
// to a constant 0 so that the def is explicit, and the push/pop
// correspondence is maintained.
if (Insert->getOpcode() == TargetOpcode::IMPLICIT_DEF)
convertImplicitDefToConstZero(Insert, MRI, TII, MF, LIS);
convertImplicitDefToConstZero(Insert, MRI, TII, MF);

// We stackified an operand. Add the defining instruction's operands to
// the worklist stack now to continue to build an ever deeper tree.
Expand Down
14 changes: 8 additions & 6 deletions llvm/lib/Target/WebAssembly/WebAssemblyTargetMachine.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -603,14 +603,16 @@ void WebAssemblyPassConfig::addPreEmitPass() {

// Prepare memory intrinsic calls for register stackifying.
addPass(createWebAssemblyMemIntrinsicResults());
}

// Mark registers as representing wasm's value stack. This is a key
// code-compression technique in WebAssembly. We run this pass (and
// MemIntrinsicResults above) very late, so that it sees as much code as
// possible, including code emitted by PEI and expanded by late tail
// duplication.
addPass(createWebAssemblyRegStackify());
// Mark registers as representing wasm's value stack. This is a key
// code-compression technique in WebAssembly. We run this pass (and
// MemIntrinsicResults above) very late, so that it sees as much code as
// possible, including code emitted by PEI and expanded by late tail
// duplication.
addPass(createWebAssemblyRegStackify(getOptLevel()));

if (getOptLevel() != CodeGenOptLevel::None) {
// Run the register coloring pass to reduce the total number of registers.
// This runs after stackification so that it doesn't consider registers
// that become stackified.
Expand Down
6 changes: 3 additions & 3 deletions llvm/test/CodeGen/WebAssembly/PR40172.ll
Original file line number Diff line number Diff line change
Expand Up @@ -10,9 +10,9 @@ target triple = "wasm32-unknown-unknown"

; CHECK: i32.sub $[[BASE:[0-9]+]]=,
; CHECK: local.copy $[[ARG:[0-9]+]]=, $0{{$}}
; CHECK: i32.const $[[A0:[0-9]+]]=, 1{{$}}
; CHECK: i32.and $[[A1:[0-9]+]]=, $[[ARG]], $[[A0]]{{$}}
; CHECK: i32.store8 8($[[BASE]]), $[[A1]]{{$}}
; CHECK: i32.const $push[[A0:[0-9]+]]=, 1{{$}}
; CHECK: i32.and $push[[A1:[0-9]+]]=, $[[ARG]], $pop[[A0]]{{$}}
; CHECK: i32.store8 8($[[BASE]]), $pop[[A1]]{{$}}

define void @test(i8 %byte) {
%t = alloca { i8, i8 }, align 8
Expand Down
16 changes: 8 additions & 8 deletions llvm/test/CodeGen/WebAssembly/PR41841.ll
Original file line number Diff line number Diff line change
Expand Up @@ -6,9 +6,9 @@ declare void @foo(i128)

; CHECK-LABEL: test_zext:
; CHECK-NEXT: .functype test_zext (i32) -> (){{$}}
; CHECK-NEXT: i64.extend_i32_u $[[TMP3:[0-9]+]]=, $0{{$}}
; CHECK-NEXT: i64.const $[[TMP4:[0-9]+]]=, 1{{$}}
; CHECK-NEXT: i64.and $[[TMP1:[0-9]+]]=, $[[TMP3]], $[[TMP4]]{{$}}
; CHECK-NEXT: i64.extend_i32_u $push[[TMP3:[0-9]+]]=, $0{{$}}
; CHECK-NEXT: i64.const $push[[TMP4:[0-9]+]]=, 1{{$}}
; CHECK-NEXT: i64.and $[[TMP1:[0-9]+]]=, $pop[[TMP3]], $pop[[TMP4]]{{$}}
; CHECK-NEXT: i64.const $[[TMP2:[0-9]+]]=, 0{{$}}
; CHECK-NEXT: call foo, $[[TMP1]], $[[TMP2]]{{$}}
; CHECK-NEXT: return{{$}}
Expand All @@ -23,11 +23,11 @@ next: ; preds = %start

; CHECK-LABEL: test_sext:
; CHECK-NEXT:.functype test_sext (i32) -> (){{$}}
; CHECK-NEXT: i64.extend_i32_u $[[TMP3:[0-9]+]]=, $0{{$}}
; CHECK-NEXT: i64.const $[[TMP4:[0-9]+]]=, 1{{$}}
; CHECK-NEXT: i64.and $[[TMP5:[0-9]+]]=, $[[TMP3]], $[[TMP4]]{{$}}
; CHECK-NEXT: i64.const $[[TMP6:[0-9]+]]=, 0{{$}}
; CHECK-NEXT: i64.sub $[[TMP1:[0-9]+]]=, $[[TMP6]], $[[TMP5]]{{$}}
; CHECK-NEXT: i64.extend_i32_u $push[[TMP3:[0-9]+]]=, $0{{$}}
; CHECK-NEXT: i64.const $push[[TMP4:[0-9]+]]=, 1{{$}}
; CHECK-NEXT: i64.and $[[TMP5:[0-9]+]]=, $pop[[TMP3]], $pop[[TMP4]]{{$}}
; CHECK-NEXT: i64.const $push[[TMP6:[0-9]+]]=, 0{{$}}
; CHECK-NEXT: i64.sub $[[TMP1:[0-9]+]]=, $pop[[TMP6]], $[[TMP5]]{{$}}
; CHECK-NEXT: local.copy $[[TMP2:[0-9]+]]=, $[[TMP1]]{{$}}
; CHECK-NEXT: call foo, $[[TMP1]], $[[TMP2]]{{$}}
; CHECK-NEXT: return{{$}}
Expand Down
Loading
Loading