Skip to content

Commit 38a44bd

Browse files
authored
[CodeGenPrepare] Reverse the canonicalization of isInf/isNanOrInf (#81572)
In commit 2b58244, we canonicalized the isInf/isNanOrInf idiom into fabs+fcmp for better analysis/codegen (see also the discussion in #76338). This patch reverses that canonicalization in CodeGenPrepare, converting the fabs+fcmp pattern back into `is.fpclass`. If `is.fpclass` is not supported by the target, it will be expanded by TLI. Fixes the regression introduced by 2b58244 and #80414 (comment).
1 parent f362e12 commit 38a44bd

File tree

8 files changed

+642
-202
lines changed

8 files changed

+642
-202
lines changed

llvm/lib/CodeGen/CodeGenPrepare.cpp

Lines changed: 36 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1943,6 +1943,39 @@ static bool swapICmpOperandsToExposeCSEOpportunities(CmpInst *Cmp) {
19431943
return false;
19441944
}
19451945

1946+
/// Turn an fcmp that encodes an "is infinity" or "is infinity or NaN" test
/// (or the complement of either) back into an is.fpclass call, undoing the
/// fabs+fcmp canonicalization done earlier in the pipeline.
///
/// \param Cmp the compare to inspect; anything that is not an FCmpInst is
///            left untouched.
/// \param TLI queried for whether fabs is free and whether the compare's
///            condition code is legal for the operand type.
/// \param DL  used to map the operand's IR type to a value type.
/// \returns true if \p Cmp was replaced by an is.fpclass call.
static bool foldFCmpToFPClassTest(CmpInst *Cmp, const TargetLowering &TLI,
                                  const DataLayout &DL) {
  FCmpInst *FCmp = dyn_cast<FCmpInst>(Cmp);
  if (!FCmp)
    return false;

  // Don't fold if the target offers free fabs and the predicate is legal.
  // For vector compares, ask about the element type: the fabs/compare cost is
  // a per-element property, and getSimpleVT() below must only be called on a
  // simple type, which a scalar FP type is but an arbitrary vector type need
  // not be.
  EVT VT = TLI.getValueType(DL, Cmp->getOperand(0)->getType());
  if (VT.isVector())
    VT = VT.getVectorElementType();
  if (TLI.isFAbsFree(VT) &&
      TLI.isCondCodeLegal(getFCmpCondCode(FCmp->getPredicate()),
                          VT.getSimpleVT()))
    return false;

  // Reverse the canonicalization if it is a FP class test. Only the isInf and
  // isNanOrInf masks are of interest here.
  auto ShouldReverseTransform = [](FPClassTest ClassTest) {
    return ClassTest == fcInf || ClassTest == (fcInf | fcNan);
  };
  auto [ClassVal, ClassTest] =
      fcmpToClassTest(FCmp->getPredicate(), *FCmp->getParent()->getParent(),
                      FCmp->getOperand(0), FCmp->getOperand(1));
  // A null value means the compare could not be expressed as a class test.
  if (!ClassVal)
    return false;

  // Accept the mask itself or its complement, so negated forms of the tests
  // are handled as well.
  if (!ShouldReverseTransform(ClassTest) && !ShouldReverseTransform(~ClassTest))
    return false;

  // Emit the class test right before the compare, redirect all users to it,
  // and clean up the compare (plus anything that became dead with it).
  IRBuilder<> Builder(Cmp);
  Value *IsFPClass = Builder.createIsFPClass(ClassVal, ClassTest);
  Cmp->replaceAllUsesWith(IsFPClass);
  RecursivelyDeleteTriviallyDeadInstructions(Cmp);
  return true;
}
1978+
19461979
bool CodeGenPrepare::optimizeCmp(CmpInst *Cmp, ModifyDT &ModifiedDT) {
19471980
if (sinkCmpExpression(Cmp, *TLI))
19481981
return true;
@@ -1959,6 +1992,9 @@ bool CodeGenPrepare::optimizeCmp(CmpInst *Cmp, ModifyDT &ModifiedDT) {
19591992
if (swapICmpOperandsToExposeCSEOpportunities(Cmp))
19601993
return true;
19611994

1995+
if (foldFCmpToFPClassTest(Cmp, *TLI, *DL))
1996+
return true;
1997+
19621998
return false;
19631999
}
19642000

llvm/test/CodeGen/AArch64/isinf.ll

Lines changed: 7 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -58,22 +58,14 @@ define i32 @replace_isinf_call_f64(double %x) {
5858
define i32 @replace_isinf_call_f128(fp128 %x) {
5959
; CHECK-LABEL: replace_isinf_call_f128:
6060
; CHECK: // %bb.0:
61-
; CHECK-NEXT: sub sp, sp, #32
62-
; CHECK-NEXT: str x30, [sp, #16] // 8-byte Folded Spill
63-
; CHECK-NEXT: .cfi_def_cfa_offset 32
64-
; CHECK-NEXT: .cfi_offset w30, -16
65-
; CHECK-NEXT: str q0, [sp]
66-
; CHECK-NEXT: ldrb w8, [sp, #15]
67-
; CHECK-NEXT: and w8, w8, #0x7f
68-
; CHECK-NEXT: strb w8, [sp, #15]
69-
; CHECK-NEXT: adrp x8, .LCPI3_0
70-
; CHECK-NEXT: ldr q0, [sp]
71-
; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI3_0]
72-
; CHECK-NEXT: bl __eqtf2
73-
; CHECK-NEXT: cmp w0, #0
74-
; CHECK-NEXT: ldr x30, [sp, #16] // 8-byte Folded Reload
61+
; CHECK-NEXT: str q0, [sp, #-16]!
62+
; CHECK-NEXT: .cfi_def_cfa_offset 16
63+
; CHECK-NEXT: ldp x9, x8, [sp], #16
64+
; CHECK-NEXT: and x8, x8, #0x7fffffffffffffff
65+
; CHECK-NEXT: eor x8, x8, #0x7fff000000000000
66+
; CHECK-NEXT: orr x8, x9, x8
67+
; CHECK-NEXT: cmp x8, #0
7568
; CHECK-NEXT: cset w0, eq
76-
; CHECK-NEXT: add sp, sp, #32
7769
; CHECK-NEXT: ret
7870
%abs = tail call fp128 @llvm.fabs.f128(fp128 %x)
7971
%cmpinf = fcmp oeq fp128 %abs, 0xL00000000000000007FFF000000000000

llvm/test/CodeGen/AMDGPU/fp-classify.ll

Lines changed: 33 additions & 27 deletions
Original file line numberDiff line numberDiff line change
@@ -618,16 +618,16 @@ define amdgpu_kernel void @test_not_isfinite_pattern_4_wrong_ord_test(ptr addrsp
618618
define amdgpu_kernel void @test_isinf_pattern_f16(ptr addrspace(1) nocapture %out, half %x) #0 {
619619
; SI-LABEL: test_isinf_pattern_f16:
620620
; SI: ; %bb.0:
621-
; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9
622-
; SI-NEXT: s_load_dword s0, s[0:1], 0xb
623-
; SI-NEXT: s_mov_b32 s7, 0xf000
624-
; SI-NEXT: s_mov_b32 s6, -1
625-
; SI-NEXT: s_mov_b32 s1, 0x7f800000
621+
; SI-NEXT: s_load_dword s4, s[0:1], 0xb
622+
; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
623+
; SI-NEXT: s_mov_b32 s3, 0xf000
624+
; SI-NEXT: s_mov_b32 s2, -1
626625
; SI-NEXT: s_waitcnt lgkmcnt(0)
627-
; SI-NEXT: v_cvt_f32_f16_e64 v0, |s0|
628-
; SI-NEXT: v_cmp_eq_f32_e32 vcc, s1, v0
629-
; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
630-
; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0
626+
; SI-NEXT: s_and_b32 s4, s4, 0x7fff
627+
; SI-NEXT: s_cmpk_eq_i32 s4, 0x7c00
628+
; SI-NEXT: s_cselect_b64 s[4:5], -1, 0
629+
; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5]
630+
; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
631631
; SI-NEXT: s_endpgm
632632
;
633633
; VI-LABEL: test_isinf_pattern_f16:
@@ -667,16 +667,19 @@ define amdgpu_kernel void @test_isinf_pattern_f16(ptr addrspace(1) nocapture %ou
667667
define amdgpu_kernel void @test_isfinite_pattern_0_f16(ptr addrspace(1) nocapture %out, half %x) #0 {
668668
; SI-LABEL: test_isfinite_pattern_0_f16:
669669
; SI: ; %bb.0:
670-
; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9
671-
; SI-NEXT: s_load_dword s0, s[0:1], 0xb
672-
; SI-NEXT: s_mov_b32 s7, 0xf000
673-
; SI-NEXT: s_mov_b32 s6, -1
674-
; SI-NEXT: s_movk_i32 s1, 0x1f8
670+
; SI-NEXT: s_load_dword s4, s[0:1], 0xb
671+
; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
672+
; SI-NEXT: s_mov_b32 s3, 0xf000
673+
; SI-NEXT: s_mov_b32 s2, -1
675674
; SI-NEXT: s_waitcnt lgkmcnt(0)
676-
; SI-NEXT: v_cvt_f32_f16_e32 v0, s0
677-
; SI-NEXT: v_cmp_class_f32_e64 s[0:1], v0, s1
678-
; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1]
679-
; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0
675+
; SI-NEXT: v_cvt_f32_f16_e32 v0, s4
676+
; SI-NEXT: s_and_b32 s4, s4, 0x7fff
677+
; SI-NEXT: v_cmp_o_f32_e32 vcc, v0, v0
678+
; SI-NEXT: s_cmpk_lg_i32 s4, 0x7c00
679+
; SI-NEXT: s_cselect_b64 s[4:5], -1, 0
680+
; SI-NEXT: s_and_b64 s[4:5], vcc, s[4:5]
681+
; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5]
682+
; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
680683
; SI-NEXT: s_endpgm
681684
;
682685
; VI-LABEL: test_isfinite_pattern_0_f16:
@@ -718,16 +721,19 @@ define amdgpu_kernel void @test_isfinite_pattern_0_f16(ptr addrspace(1) nocaptur
718721
define amdgpu_kernel void @test_isfinite_pattern_4_f16(ptr addrspace(1) nocapture %out, half %x) #0 {
719722
; SI-LABEL: test_isfinite_pattern_4_f16:
720723
; SI: ; %bb.0:
721-
; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9
722-
; SI-NEXT: s_load_dword s0, s[0:1], 0xb
723-
; SI-NEXT: s_mov_b32 s7, 0xf000
724-
; SI-NEXT: s_mov_b32 s6, -1
725-
; SI-NEXT: s_movk_i32 s1, 0x1f8
724+
; SI-NEXT: s_load_dword s4, s[0:1], 0xb
725+
; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
726+
; SI-NEXT: s_mov_b32 s3, 0xf000
727+
; SI-NEXT: s_mov_b32 s2, -1
726728
; SI-NEXT: s_waitcnt lgkmcnt(0)
727-
; SI-NEXT: v_cvt_f32_f16_e32 v0, s0
728-
; SI-NEXT: v_cmp_class_f32_e64 s[0:1], v0, s1
729-
; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1]
730-
; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0
729+
; SI-NEXT: v_cvt_f32_f16_e32 v0, s4
730+
; SI-NEXT: s_and_b32 s4, s4, 0x7fff
731+
; SI-NEXT: v_cmp_o_f32_e32 vcc, v0, v0
732+
; SI-NEXT: s_cmpk_lt_i32 s4, 0x7c00
733+
; SI-NEXT: s_cselect_b64 s[4:5], -1, 0
734+
; SI-NEXT: s_and_b64 s[4:5], vcc, s[4:5]
735+
; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5]
736+
; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
731737
; SI-NEXT: s_endpgm
732738
;
733739
; VI-LABEL: test_isfinite_pattern_4_f16:

0 commit comments

Comments
 (0)