Skip to content

Commit d1a0605

Browse files
authored
[X86][CodeGen] Support using NF instructions for flag copy lowering (#93508)
1 parent 627463d commit d1a0605

File tree

3 files changed

+584
-18
lines changed

3 files changed

+584
-18
lines changed

llvm/lib/Target/X86/X86FlagsCopyLowering.cpp

Lines changed: 109 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -65,6 +65,7 @@ STATISTIC(NumCopiesEliminated, "Number of copies of EFLAGS eliminated");
6565
STATISTIC(NumSetCCsInserted, "Number of setCC instructions inserted");
6666
STATISTIC(NumTestsInserted, "Number of test instructions inserted");
6767
STATISTIC(NumAddsInserted, "Number of adds instructions inserted");
68+
STATISTIC(NumNFsConvertedTo, "Number of NF instructions converted to");
6869

6970
namespace {
7071

@@ -235,6 +236,19 @@ static MachineBasicBlock &splitBlock(MachineBasicBlock &MBB,
235236
return NewMBB;
236237
}
237238

239+
enum EFLAGSClobber { NoClobber, EvitableClobber, InevitableClobber };
240+
241+
static EFLAGSClobber getClobberType(const MachineInstr &MI) {
242+
const MachineOperand *FlagDef =
243+
MI.findRegisterDefOperand(X86::EFLAGS, /*TRI=*/nullptr);
244+
if (!FlagDef)
245+
return NoClobber;
246+
if (FlagDef->isDead() && X86::getNFVariant(MI.getOpcode()))
247+
return EvitableClobber;
248+
249+
return InevitableClobber;
250+
}
251+
238252
bool X86FlagsCopyLoweringPass::runOnMachineFunction(MachineFunction &MF) {
239253
LLVM_DEBUG(dbgs() << "********** " << getPassName() << " : " << MF.getName()
240254
<< " **********\n");
@@ -254,14 +268,107 @@ bool X86FlagsCopyLoweringPass::runOnMachineFunction(MachineFunction &MF) {
254268
// turn copied again we visit the first one first. This ensures we can find
255269
// viable locations for testing the original EFLAGS that dominate all the
256270
// uses across complex CFGs.
257-
SmallVector<MachineInstr *, 4> Copies;
271+
SmallSetVector<MachineInstr *, 4> Copies;
258272
ReversePostOrderTraversal<MachineFunction *> RPOT(&MF);
259273
for (MachineBasicBlock *MBB : RPOT)
260274
for (MachineInstr &MI : *MBB)
261275
if (MI.getOpcode() == TargetOpcode::COPY &&
262276
MI.getOperand(0).getReg() == X86::EFLAGS)
263-
Copies.push_back(&MI);
277+
Copies.insert(&MI);
278+
279+
  // Try to eliminate the copies by transforming the instructions between copy and
280+
// copydef to the NF (no flags update) variants, e.g.
281+
//
282+
// %1:gr64 = COPY $eflags
283+
// OP1 implicit-def dead $eflags
284+
// $eflags = COPY %1
285+
// OP2 cc, implicit $eflags
286+
//
287+
// ->
288+
//
289+
// OP1_NF
290+
// OP2 implicit $eflags
291+
if (Subtarget->hasNF()) {
292+
SmallSetVector<MachineInstr *, 4> RemovedCopies;
293+
// CopyIIt may be invalidated by removing copies.
294+
auto CopyIIt = Copies.begin(), CopyIEnd = Copies.end();
295+
while (CopyIIt != CopyIEnd) {
296+
auto NCopyIIt = std::next(CopyIIt);
297+
SmallSetVector<MachineInstr *, 4> EvitableClobbers;
298+
MachineInstr *CopyI = *CopyIIt;
299+
MachineOperand &VOp = CopyI->getOperand(1);
300+
MachineInstr *CopyDefI = MRI->getVRegDef(VOp.getReg());
301+
MachineBasicBlock *CopyIMBB = CopyI->getParent();
302+
MachineBasicBlock *CopyDefIMBB = CopyDefI->getParent();
303+
// Walk all basic blocks reachable in depth-first iteration on the inverse
304+
// CFG from CopyIMBB to CopyDefIMBB. These blocks are all the blocks that
305+
// may be executed between the execution of CopyDefIMBB and CopyIMBB. On
306+
// all execution paths, instructions from CopyDefI to CopyI (exclusive)
307+
// has to be NF-convertible if it clobbers flags.
308+
for (auto BI = idf_begin(CopyIMBB), BE = idf_end(CopyDefIMBB); BI != BE;
309+
++BI) {
310+
MachineBasicBlock *MBB = *BI;
311+
for (auto I = (MBB != CopyDefIMBB)
312+
? MBB->begin()
313+
: std::next(MachineBasicBlock::iterator(CopyDefI)),
314+
E = (MBB != CopyIMBB) ? MBB->end()
315+
: MachineBasicBlock::iterator(CopyI);
316+
I != E; ++I) {
317+
MachineInstr &MI = *I;
318+
EFLAGSClobber ClobberType = getClobberType(MI);
319+
if (ClobberType == NoClobber)
320+
continue;
321+
322+
if (ClobberType == InevitableClobber)
323+
goto ProcessNextCopyI;
324+
325+
assert(ClobberType == EvitableClobber && "unexpected workflow");
326+
EvitableClobbers.insert(&MI);
327+
}
328+
}
329+
      // Convert evitable clobbers into NF variants and remove the copies.
330+
RemovedCopies.insert(CopyI);
331+
CopyI->eraseFromParent();
332+
if (MRI->use_nodbg_empty(CopyDefI->getOperand(0).getReg())) {
333+
RemovedCopies.insert(CopyDefI);
334+
CopyDefI->eraseFromParent();
335+
}
336+
++NumCopiesEliminated;
337+
for (auto *Clobber : EvitableClobbers) {
338+
unsigned NewOpc = X86::getNFVariant(Clobber->getOpcode());
339+
assert(NewOpc && "evitable clobber must have a NF variant");
340+
Clobber->setDesc(TII->get(NewOpc));
341+
Clobber->removeOperand(
342+
Clobber->findRegisterDefOperand(X86::EFLAGS, /*TRI=*/nullptr)
343+
->getOperandNo());
344+
++NumNFsConvertedTo;
345+
}
346+
// Update liveins for basic blocks in the path
347+
for (auto BI = idf_begin(CopyIMBB), BE = idf_end(CopyDefIMBB); BI != BE;
348+
++BI)
349+
if (*BI != CopyDefIMBB)
350+
BI->addLiveIn(X86::EFLAGS);
351+
ProcessNextCopyI:
352+
CopyIIt = NCopyIIt;
353+
}
354+
Copies.set_subtract(RemovedCopies);
355+
}
264356

357+
// For the rest of copies that cannot be eliminated by NF transform, we use
358+
// setcc to preserve the flags in GPR32 before OP1, and recheck its value
359+
// before using the flags, e.g.
360+
//
361+
// %1:gr64 = COPY $eflags
362+
// OP1 implicit-def dead $eflags
363+
// $eflags = COPY %1
364+
// OP2 cc, implicit $eflags
365+
//
366+
// ->
367+
//
368+
// %1:gr8 = SETCCr cc, implicit $eflags
369+
// OP1 implicit-def dead $eflags
370+
// TEST8rr %1, %1, implicit-def $eflags
371+
// OP2 ne, implicit $eflags
265372
for (MachineInstr *CopyI : Copies) {
266373
MachineBasicBlock &MBB = *CopyI->getParent();
267374

Lines changed: 90 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,90 @@
1+
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
2+
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+nf | FileCheck %s
3+
4+
define i32 @flag_copy_1(i32 %x, i32 %y, ptr %pz) nounwind {
5+
; CHECK-LABEL: flag_copy_1:
6+
; CHECK: # %bb.0:
7+
; CHECK-NEXT: movq %rdx, %rcx
8+
; CHECK-NEXT: movl %edi, %eax
9+
; CHECK-NEXT: mull %esi
10+
; CHECK-NEXT: movl (%rcx), %ecx
11+
; CHECK-NEXT: {nf} addl %eax, %ecx
12+
; CHECK-NEXT: cmovol %ecx, %eax
13+
; CHECK-NEXT: retq
14+
%o = tail call { i32, i1 } @llvm.umul.with.overflow.i32(i32 %x, i32 %y)
15+
%v1 = extractvalue { i32, i1 } %o, 1
16+
%v2 = extractvalue { i32, i1 } %o, 0
17+
%z = load i32, ptr %pz
18+
%a = add i32 %v2, %z
19+
%r = select i1 %v1, i32 %a, i32 %v2
20+
ret i32 %r
21+
}
22+
23+
declare <2 x i128> @llvm.ssub.sat.v2i128(<2 x i128>, <2 x i128>)
24+
25+
define <2 x i128> @flag_copy_2(<2 x i128> %x, <2 x i128> %y) nounwind {
26+
; CHECK-LABEL: flag_copy_2:
27+
; CHECK: # %bb.0:
28+
; CHECK-NEXT: movq %rdi, %rax
29+
; CHECK-NEXT: subq {{[0-9]+}}(%rsp), %rcx
30+
; CHECK-NEXT: sbbq {{[0-9]+}}(%rsp), %r8
31+
; CHECK-NEXT: movq %r8, %rdi
32+
; CHECK-NEXT: {nf} sarq $63, %rdi
33+
; CHECK-NEXT: cmovoq %rdi, %rcx
34+
; CHECK-NEXT: movabsq $-9223372036854775808, %r9 # imm = 0x8000000000000000
35+
; CHECK-NEXT: {nf} xorq %r9, %rdi
36+
; CHECK-NEXT: cmovnoq %r8, %rdi
37+
; CHECK-NEXT: subq {{[0-9]+}}(%rsp), %rsi
38+
; CHECK-NEXT: sbbq {{[0-9]+}}(%rsp), %rdx
39+
; CHECK-NEXT: movq %rdx, %r8
40+
; CHECK-NEXT: {nf} sarq $63, %r8
41+
; CHECK-NEXT: cmovoq %r8, %rsi
42+
; CHECK-NEXT: {nf} xorq %r9, %r8
43+
; CHECK-NEXT: cmovnoq %rdx, %r8
44+
; CHECK-NEXT: movq %rcx, 16(%rax)
45+
; CHECK-NEXT: movq %rsi, (%rax)
46+
; CHECK-NEXT: movq %rdi, 24(%rax)
47+
; CHECK-NEXT: movq %r8, 8(%rax)
48+
; CHECK-NEXT: retq
49+
%z = call <2 x i128> @llvm.ssub.sat.v2i128(<2 x i128> %x, <2 x i128> %y)
50+
ret <2 x i128> %z
51+
}
52+
53+
; TODO: Remove the 2nd cmpl by using NF imul.
54+
define void @flag_copy_3(i32 %x, i32 %y, ptr %pa, ptr %pb, ptr %pc) nounwind {
55+
; CHECK-LABEL: flag_copy_3:
56+
; CHECK: # %bb.0: # %entry
57+
; CHECK-NEXT: # kill: def $esi killed $esi def $rsi
58+
; CHECK-NEXT: cmpl $2, %edi
59+
; CHECK-NEXT: jl .LBB2_2
60+
; CHECK-NEXT: # %bb.1: # %bb1
61+
; CHECK-NEXT: movl %edi, %eax
62+
; CHECK-NEXT: imull %esi, %eax
63+
; CHECK-NEXT: movl %eax, (%rdx)
64+
; CHECK-NEXT: jmp .LBB2_3
65+
; CHECK-NEXT: .LBB2_2: # %bb2
66+
; CHECK-NEXT: leal -2(%rsi), %eax
67+
; CHECK-NEXT: movl %eax, (%rcx)
68+
; CHECK-NEXT: .LBB2_3: # %bb3
69+
; CHECK-NEXT: cmpl $2, %edi
70+
; CHECK-NEXT: cmovgel %edi, %esi
71+
; CHECK-NEXT: movl %esi, (%r8)
72+
; CHECK-NEXT: retq
73+
entry:
74+
%cmp = icmp sgt i32 %x, 1
75+
br i1 %cmp, label %bb1, label %bb2
76+
bb1:
77+
%add = mul nuw nsw i32 %x, %y
78+
store i32 %add, ptr %pa
79+
br label %bb3
80+
81+
bb2:
82+
%sub = sub nuw nsw i32 %y, 2
83+
store i32 %sub, ptr %pb
84+
br label %bb3
85+
86+
bb3:
87+
%s = select i1 %cmp, i32 %x, i32 %y
88+
store i32 %s, ptr %pc
89+
ret void
90+
}

0 commit comments

Comments
 (0)