Skip to content

Commit 8dc0fab

Browse files
committed
[CodeGenPrepare] Unfold slow ctpop when used in power-of-two test
The DAG combiner already performs this transformation, but in some cases it does not get the chance because either CodeGenPrepare or SelectionDAGBuilder moves the icmp to a different basic block.
1 parent 23e3237 commit 8dc0fab

File tree

7 files changed

+79
-182
lines changed

7 files changed

+79
-182
lines changed

llvm/lib/CodeGen/CodeGenPrepare.cpp

Lines changed: 38 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -474,6 +474,7 @@ class CodeGenPrepare {
474474
bool optimizeURem(Instruction *Rem);
475475
bool combineToUSubWithOverflow(CmpInst *Cmp, ModifyDT &ModifiedDT);
476476
bool combineToUAddWithOverflow(CmpInst *Cmp, ModifyDT &ModifiedDT);
477+
bool unfoldPow2Test(CmpInst *Cmp);
477478
void verifyBFIUpdates(Function &F);
478479
bool _run(Function &F);
479480
};
@@ -1762,6 +1763,40 @@ bool CodeGenPrepare::combineToUSubWithOverflow(CmpInst *Cmp,
17621763
return true;
17631764
}
17641765

1766+
// Decanonicalizes icmp+ctpop power-of-two test if ctpop is slow.
1767+
bool CodeGenPrepare::unfoldPow2Test(CmpInst *Cmp) {
1768+
CmpPredicate Pred;
1769+
Value *X;
1770+
uint64_t C;
1771+
1772+
if (!match(Cmp, m_ICmp(Pred, m_Intrinsic<Intrinsic::ctpop>(m_Value(X)),
1773+
m_ConstantInt(C))))
1774+
return false;
1775+
1776+
Type *Ty = X->getType();
1777+
if (Ty->isVectorTy() || TTI->getPopcntSupport(Ty->getIntegerBitWidth()) ==
1778+
TargetTransformInfo::PSK_FastHardware)
1779+
return false;
1780+
1781+
// (ctpop x) u< 2 -> (x & (x - 1)) == 0
1782+
// (ctpop x) u> 1 -> (x & (x - 1)) != 0
1783+
if ((Pred == CmpInst::ICMP_ULT && C == 2) ||
1784+
(Pred == CmpInst::ICMP_UGT && C == 1)) {
1785+
IRBuilder<> Builder(Cmp);
1786+
Value *Sub = Builder.CreateAdd(X, Constant::getAllOnesValue(Ty));
1787+
Value *And = Builder.CreateAnd(X, Sub);
1788+
CmpInst::Predicate NewPred =
1789+
Pred == CmpInst::ICMP_ULT ? CmpInst::ICMP_EQ : CmpInst::ICMP_NE;
1790+
Value *NewCmp =
1791+
Builder.CreateICmp(NewPred, And, ConstantInt::getNullValue(Ty));
1792+
Cmp->replaceAllUsesWith(NewCmp);
1793+
RecursivelyDeleteTriviallyDeadInstructions(Cmp);
1794+
return true;
1795+
}
1796+
1797+
return false;
1798+
}
1799+
17651800
/// Sink the given CmpInst into user blocks to reduce the number of virtual
17661801
/// registers that must be created and coalesced. This is a clear win except on
17671802
/// targets with multiple condition code registers (PowerPC), where it might
@@ -2183,6 +2218,9 @@ bool CodeGenPrepare::optimizeCmp(CmpInst *Cmp, ModifyDT &ModifiedDT) {
21832218
if (combineToUSubWithOverflow(Cmp, ModifiedDT))
21842219
return true;
21852220

2221+
if (unfoldPow2Test(Cmp))
2222+
return true;
2223+
21862224
if (foldICmpWithDominatingICmp(Cmp, *TLI))
21872225
return true;
21882226

llvm/test/CodeGen/RISCV/GlobalISel/rv32zbb.ll

Lines changed: 16 additions & 87 deletions
Original file line numberDiff line numberDiff line change
@@ -357,49 +357,14 @@ define i64 @ctpop_i64(i64 %a) nounwind {
357357
define i1 @ctpop_i64_ugt_two(i64 %a) nounwind {
358358
; RV32I-LABEL: ctpop_i64_ugt_two:
359359
; RV32I: # %bb.0:
360-
; RV32I-NEXT: j .LBB6_2
361-
; RV32I-NEXT: # %bb.1:
362-
; RV32I-NEXT: sltiu a0, zero, 0
363-
; RV32I-NEXT: ret
364-
; RV32I-NEXT: .LBB6_2:
365-
; RV32I-NEXT: srli a2, a0, 1
366-
; RV32I-NEXT: lui a3, 349525
367-
; RV32I-NEXT: lui a4, 209715
368-
; RV32I-NEXT: srli a5, a1, 1
369-
; RV32I-NEXT: addi a3, a3, 1365
370-
; RV32I-NEXT: and a2, a2, a3
371-
; RV32I-NEXT: and a3, a5, a3
372-
; RV32I-NEXT: lui a5, 61681
373-
; RV32I-NEXT: addi a4, a4, 819
374-
; RV32I-NEXT: addi a5, a5, -241
375-
; RV32I-NEXT: sub a0, a0, a2
376-
; RV32I-NEXT: sub a1, a1, a3
377-
; RV32I-NEXT: srli a2, a0, 2
378-
; RV32I-NEXT: and a0, a0, a4
379-
; RV32I-NEXT: srli a3, a1, 2
380-
; RV32I-NEXT: and a1, a1, a4
381-
; RV32I-NEXT: and a2, a2, a4
382-
; RV32I-NEXT: and a3, a3, a4
383-
; RV32I-NEXT: add a0, a2, a0
384-
; RV32I-NEXT: add a1, a3, a1
385-
; RV32I-NEXT: srli a2, a0, 4
386-
; RV32I-NEXT: srli a3, a1, 4
387-
; RV32I-NEXT: add a0, a2, a0
388-
; RV32I-NEXT: add a1, a3, a1
389-
; RV32I-NEXT: and a0, a0, a5
390-
; RV32I-NEXT: and a1, a1, a5
391-
; RV32I-NEXT: slli a2, a0, 8
392-
; RV32I-NEXT: slli a3, a1, 8
393-
; RV32I-NEXT: add a0, a0, a2
394-
; RV32I-NEXT: add a1, a1, a3
395-
; RV32I-NEXT: slli a2, a0, 16
396-
; RV32I-NEXT: slli a3, a1, 16
397-
; RV32I-NEXT: add a0, a0, a2
398-
; RV32I-NEXT: add a1, a1, a3
399-
; RV32I-NEXT: srli a0, a0, 24
400-
; RV32I-NEXT: srli a1, a1, 24
401-
; RV32I-NEXT: add a0, a1, a0
402-
; RV32I-NEXT: sltiu a0, a0, 2
360+
; RV32I-NEXT: addi a2, a0, -1
361+
; RV32I-NEXT: addi a3, a1, -1
362+
; RV32I-NEXT: sltiu a4, a2, -1
363+
; RV32I-NEXT: add a3, a3, a4
364+
; RV32I-NEXT: and a0, a0, a2
365+
; RV32I-NEXT: and a1, a1, a3
366+
; RV32I-NEXT: or a0, a0, a1
367+
; RV32I-NEXT: seqz a0, a0
403368
; RV32I-NEXT: ret
404369
;
405370
; RV32ZBB-LABEL: ctpop_i64_ugt_two:
@@ -422,50 +387,14 @@ define i1 @ctpop_i64_ugt_two(i64 %a) nounwind {
422387
define i1 @ctpop_i64_ugt_one(i64 %a) nounwind {
423388
; RV32I-LABEL: ctpop_i64_ugt_one:
424389
; RV32I: # %bb.0:
425-
; RV32I-NEXT: j .LBB7_2
426-
; RV32I-NEXT: # %bb.1:
427-
; RV32I-NEXT: snez a0, zero
428-
; RV32I-NEXT: ret
429-
; RV32I-NEXT: .LBB7_2:
430-
; RV32I-NEXT: srli a2, a0, 1
431-
; RV32I-NEXT: lui a3, 349525
432-
; RV32I-NEXT: lui a4, 209715
433-
; RV32I-NEXT: srli a5, a1, 1
434-
; RV32I-NEXT: addi a3, a3, 1365
435-
; RV32I-NEXT: and a2, a2, a3
436-
; RV32I-NEXT: and a3, a5, a3
437-
; RV32I-NEXT: lui a5, 61681
438-
; RV32I-NEXT: addi a4, a4, 819
439-
; RV32I-NEXT: addi a5, a5, -241
440-
; RV32I-NEXT: sub a0, a0, a2
441-
; RV32I-NEXT: sub a1, a1, a3
442-
; RV32I-NEXT: srli a2, a0, 2
443-
; RV32I-NEXT: and a0, a0, a4
444-
; RV32I-NEXT: srli a3, a1, 2
445-
; RV32I-NEXT: and a1, a1, a4
446-
; RV32I-NEXT: and a2, a2, a4
447-
; RV32I-NEXT: and a3, a3, a4
448-
; RV32I-NEXT: add a0, a2, a0
449-
; RV32I-NEXT: add a1, a3, a1
450-
; RV32I-NEXT: srli a2, a0, 4
451-
; RV32I-NEXT: srli a3, a1, 4
452-
; RV32I-NEXT: add a0, a2, a0
453-
; RV32I-NEXT: add a1, a3, a1
454-
; RV32I-NEXT: and a0, a0, a5
455-
; RV32I-NEXT: and a1, a1, a5
456-
; RV32I-NEXT: slli a2, a0, 8
457-
; RV32I-NEXT: slli a3, a1, 8
458-
; RV32I-NEXT: add a0, a0, a2
459-
; RV32I-NEXT: add a1, a1, a3
460-
; RV32I-NEXT: slli a2, a0, 16
461-
; RV32I-NEXT: slli a3, a1, 16
462-
; RV32I-NEXT: add a0, a0, a2
463-
; RV32I-NEXT: add a1, a1, a3
464-
; RV32I-NEXT: srli a0, a0, 24
465-
; RV32I-NEXT: srli a1, a1, 24
466-
; RV32I-NEXT: add a0, a1, a0
467-
; RV32I-NEXT: sltiu a0, a0, 2
468-
; RV32I-NEXT: xori a0, a0, 1
390+
; RV32I-NEXT: addi a2, a0, -1
391+
; RV32I-NEXT: addi a3, a1, -1
392+
; RV32I-NEXT: sltiu a4, a2, -1
393+
; RV32I-NEXT: add a3, a3, a4
394+
; RV32I-NEXT: and a0, a0, a2
395+
; RV32I-NEXT: and a1, a1, a3
396+
; RV32I-NEXT: or a0, a0, a1
397+
; RV32I-NEXT: snez a0, a0
469398
; RV32I-NEXT: ret
470399
;
471400
; RV32ZBB-LABEL: ctpop_i64_ugt_one:

llvm/test/CodeGen/RISCV/GlobalISel/rv64zbb.ll

Lines changed: 3 additions & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -701,31 +701,9 @@ define signext i32 @ctpop_i32(i32 signext %a) nounwind {
701701
define i1 @ctpop_i32_ult_two(i32 signext %a) nounwind {
702702
; RV64I-LABEL: ctpop_i32_ult_two:
703703
; RV64I: # %bb.0:
704-
; RV64I-NEXT: addi sp, sp, -16
705-
; RV64I-NEXT: sd ra, 8(sp) # 8-byte Folded Spill
706-
; RV64I-NEXT: srliw a1, a0, 1
707-
; RV64I-NEXT: lui a2, 349525
708-
; RV64I-NEXT: addi a2, a2, 1365
709-
; RV64I-NEXT: and a1, a1, a2
710-
; RV64I-NEXT: lui a2, 209715
711-
; RV64I-NEXT: addi a2, a2, 819
712-
; RV64I-NEXT: subw a0, a0, a1
713-
; RV64I-NEXT: srliw a1, a0, 2
714-
; RV64I-NEXT: and a0, a0, a2
715-
; RV64I-NEXT: and a1, a1, a2
716-
; RV64I-NEXT: lui a2, 61681
717-
; RV64I-NEXT: add a0, a1, a0
718-
; RV64I-NEXT: sraiw a1, a0, 4
719-
; RV64I-NEXT: addw a0, a1, a0
720-
; RV64I-NEXT: lui a1, 4112
721-
; RV64I-NEXT: addiw a2, a2, -241
722-
; RV64I-NEXT: and a0, a0, a2
723-
; RV64I-NEXT: addiw a1, a1, 257
724-
; RV64I-NEXT: call __muldi3
725-
; RV64I-NEXT: srliw a0, a0, 24
726-
; RV64I-NEXT: sltiu a0, a0, 2
727-
; RV64I-NEXT: ld ra, 8(sp) # 8-byte Folded Reload
728-
; RV64I-NEXT: addi sp, sp, 16
704+
; RV64I-NEXT: addiw a1, a0, -1
705+
; RV64I-NEXT: and a0, a0, a1
706+
; RV64I-NEXT: seqz a0, a0
729707
; RV64I-NEXT: ret
730708
;
731709
; RV64ZBB-LABEL: ctpop_i32_ult_two:

llvm/test/CodeGen/RISCV/pr101786.ll

Lines changed: 2 additions & 30 deletions
Original file line numberDiff line numberDiff line change
@@ -8,37 +8,9 @@ define i64 @test(i64 %x, ptr %p) {
88
; CHECK-NEXT: li a0, 0
99
; CHECK-NEXT: bgtz a2, .LBB0_3
1010
; CHECK-NEXT: # %bb.1: # %entry
11-
; CHECK-NEXT: srli a3, a2, 1
12-
; CHECK-NEXT: lui a4, 349525
13-
; CHECK-NEXT: addiw a4, a4, 1365
14-
; CHECK-NEXT: slli a5, a4, 32
15-
; CHECK-NEXT: add a4, a4, a5
16-
; CHECK-NEXT: and a3, a3, a4
17-
; CHECK-NEXT: sub a2, a2, a3
18-
; CHECK-NEXT: lui a3, 209715
19-
; CHECK-NEXT: addiw a3, a3, 819
20-
; CHECK-NEXT: slli a4, a3, 32
21-
; CHECK-NEXT: add a3, a3, a4
22-
; CHECK-NEXT: and a4, a2, a3
23-
; CHECK-NEXT: srli a2, a2, 2
11+
; CHECK-NEXT: addi a3, a2, -1
2412
; CHECK-NEXT: and a2, a2, a3
25-
; CHECK-NEXT: add a2, a4, a2
26-
; CHECK-NEXT: srli a3, a2, 4
27-
; CHECK-NEXT: add a2, a2, a3
28-
; CHECK-NEXT: lui a3, 61681
29-
; CHECK-NEXT: addiw a3, a3, -241
30-
; CHECK-NEXT: slli a4, a3, 32
31-
; CHECK-NEXT: add a3, a3, a4
32-
; CHECK-NEXT: and a2, a2, a3
33-
; CHECK-NEXT: slli a3, a2, 8
34-
; CHECK-NEXT: add a2, a2, a3
35-
; CHECK-NEXT: slli a3, a2, 16
36-
; CHECK-NEXT: add a2, a2, a3
37-
; CHECK-NEXT: slli a3, a2, 32
38-
; CHECK-NEXT: add a2, a2, a3
39-
; CHECK-NEXT: srli a2, a2, 56
40-
; CHECK-NEXT: li a3, 1
41-
; CHECK-NEXT: bltu a3, a2, .LBB0_3
13+
; CHECK-NEXT: bnez a2, .LBB0_3
4214
; CHECK-NEXT: # %bb.2: # %if.else
4315
; CHECK-NEXT: ld a0, 0(a1)
4416
; CHECK-NEXT: .LBB0_3: # %if.end

llvm/test/CodeGen/RISCV/rv32zbb.ll

Lines changed: 12 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -571,12 +571,12 @@ define i64 @ctpop_i64(i64 %a) nounwind {
571571
define i1 @ctpop_i64_ugt_two(i64 %a) nounwind {
572572
; RV32I-LABEL: ctpop_i64_ugt_two:
573573
; RV32I: # %bb.0:
574-
; RV32I-NEXT: addi a2, a0, -1
575-
; RV32I-NEXT: and a2, a0, a2
576-
; RV32I-NEXT: seqz a0, a0
577-
; RV32I-NEXT: sub a0, a1, a0
578-
; RV32I-NEXT: and a0, a1, a0
579-
; RV32I-NEXT: or a0, a2, a0
574+
; RV32I-NEXT: seqz a2, a0
575+
; RV32I-NEXT: addi a3, a0, -1
576+
; RV32I-NEXT: sub a2, a1, a2
577+
; RV32I-NEXT: and a0, a0, a3
578+
; RV32I-NEXT: and a1, a1, a2
579+
; RV32I-NEXT: or a0, a0, a1
580580
; RV32I-NEXT: seqz a0, a0
581581
; RV32I-NEXT: ret
582582
;
@@ -595,12 +595,12 @@ define i1 @ctpop_i64_ugt_two(i64 %a) nounwind {
595595
define i1 @ctpop_i64_ugt_one(i64 %a) nounwind {
596596
; RV32I-LABEL: ctpop_i64_ugt_one:
597597
; RV32I: # %bb.0:
598-
; RV32I-NEXT: addi a2, a0, -1
599-
; RV32I-NEXT: and a2, a0, a2
600-
; RV32I-NEXT: seqz a0, a0
601-
; RV32I-NEXT: sub a0, a1, a0
602-
; RV32I-NEXT: and a0, a1, a0
603-
; RV32I-NEXT: or a0, a2, a0
598+
; RV32I-NEXT: seqz a2, a0
599+
; RV32I-NEXT: addi a3, a0, -1
600+
; RV32I-NEXT: sub a2, a1, a2
601+
; RV32I-NEXT: and a0, a0, a3
602+
; RV32I-NEXT: and a1, a1, a2
603+
; RV32I-NEXT: or a0, a0, a1
604604
; RV32I-NEXT: snez a0, a0
605605
; RV32I-NEXT: ret
606606
;

llvm/test/CodeGen/X86/pr94829.ll

Lines changed: 2 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -4,30 +4,8 @@
44
define ptr @test(i64 %x) {
55
; CHECK-LABEL: test:
66
; CHECK: # %bb.0: # %entry
7-
; CHECK-NEXT: movq %rdi, %rax
8-
; CHECK-NEXT: shrq %rax
9-
; CHECK-NEXT: movabsq $6148914691236517205, %rcx # imm = 0x5555555555555555
10-
; CHECK-NEXT: andq %rax, %rcx
11-
; CHECK-NEXT: subq %rcx, %rdi
12-
; CHECK-NEXT: movabsq $3689348814741910323, %rax # imm = 0x3333333333333333
13-
; CHECK-NEXT: movq %rdi, %rcx
14-
; CHECK-NEXT: andq %rax, %rcx
15-
; CHECK-NEXT: shrq $2, %rdi
16-
; CHECK-NEXT: andq %rax, %rdi
17-
; CHECK-NEXT: addq %rcx, %rdi
18-
; CHECK-NEXT: movq %rdi, %rax
19-
; CHECK-NEXT: shrq $4, %rax
20-
; CHECK-NEXT: addq %rdi, %rax
21-
; CHECK-NEXT: movabsq $1085102592571150095, %rcx # imm = 0xF0F0F0F0F0F0F0F
22-
; CHECK-NEXT: andq %rax, %rcx
23-
; CHECK-NEXT: movabsq $72340172838076673, %rax # imm = 0x101010101010101
24-
; CHECK-NEXT: imulq %rcx, %rax
25-
; CHECK-NEXT: shrq $56, %rax
26-
; CHECK-NEXT: cmpq $2, %rax
27-
; CHECK-NEXT: jb .LBB0_2
28-
; CHECK-NEXT: # %bb.1: # %if.else
29-
; CHECK-NEXT: cmpl $2, %eax
30-
; CHECK-NEXT: .LBB0_2: # %exit1
7+
; CHECK-NEXT: leaq -1(%rdi), %rax
8+
; CHECK-NEXT: testq %rax, %rdi
319
; CHECK-NEXT: xorl %eax, %eax
3210
; CHECK-NEXT: retq
3311
entry:

llvm/test/Transforms/CodeGenPrepare/unfold-pow2-test.ll

Lines changed: 6 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -12,8 +12,9 @@ define i64 @test_ult_2(i64 %x, i64 %y, i64 %a, i64 %b) {
1212
; SLOW-LABEL: define i64 @test_ult_2(
1313
; SLOW-SAME: i64 [[X:%.*]], i64 [[Y:%.*]], i64 [[A:%.*]], i64 [[B:%.*]]) {
1414
; SLOW-NEXT: [[ENTRY:.*]]:
15-
; SLOW-NEXT: [[CTPOP:%.*]] = call i64 @llvm.ctpop.i64(i64 [[X]])
16-
; SLOW-NEXT: [[CMP1:%.*]] = icmp ugt i64 [[CTPOP]], 1
15+
; SLOW-NEXT: [[TMP0:%.*]] = add i64 [[X]], -1
16+
; SLOW-NEXT: [[TMP1:%.*]] = and i64 [[X]], [[TMP0]]
17+
; SLOW-NEXT: [[CMP1:%.*]] = icmp ne i64 [[TMP1]], 0
1718
; SLOW-NEXT: [[CMP2:%.*]] = icmp sgt i64 [[Y]], 0
1819
; SLOW-NEXT: [[CMP:%.*]] = or i1 [[CMP2]], [[CMP1]]
1920
; SLOW-NEXT: br i1 [[CMP]], label %[[IF_THEN:.*]], label %[[IF_END:.*]]
@@ -56,8 +57,9 @@ define i64 @test_ugt_1(i64 %x, i64 %y, i64 %a, i64 %b) {
5657
; SLOW-LABEL: define i64 @test_ugt_1(
5758
; SLOW-SAME: i64 [[X:%.*]], i64 [[Y:%.*]], i64 [[A:%.*]], i64 [[B:%.*]]) {
5859
; SLOW-NEXT: [[ENTRY:.*]]:
59-
; SLOW-NEXT: [[CTPOP:%.*]] = call i64 @llvm.ctpop.i64(i64 [[X]])
60-
; SLOW-NEXT: [[CMP1:%.*]] = icmp ugt i64 [[CTPOP]], 1
60+
; SLOW-NEXT: [[TMP0:%.*]] = add i64 [[X]], -1
61+
; SLOW-NEXT: [[TMP1:%.*]] = and i64 [[X]], [[TMP0]]
62+
; SLOW-NEXT: [[CMP1:%.*]] = icmp ne i64 [[TMP1]], 0
6163
; SLOW-NEXT: [[CMP2:%.*]] = icmp sgt i64 [[Y]], 0
6264
; SLOW-NEXT: [[CMP:%.*]] = or i1 [[CMP2]], [[CMP1]]
6365
; SLOW-NEXT: br i1 [[CMP]], label %[[IF_THEN:.*]], label %[[IF_END:.*]]

0 commit comments

Comments
 (0)