-
Notifications
You must be signed in to change notification settings - Fork 13.6k
[SPARC] Use lzcnt to implement CTLZ when we have VIS3 #135715
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
[SPARC] Use lzcnt to implement CTLZ when we have VIS3 #135715
Conversation
Created using spr 1.3.5 [skip ci]
Created using spr 1.3.5
@llvm/pr-subscribers-backend-sparc Author: Koakuma (koachan) ChangesFull diff: https://github.com/llvm/llvm-project/pull/135715.diff 3 Files Affected:
diff --git a/llvm/lib/Target/Sparc/SparcISelLowering.cpp b/llvm/lib/Target/Sparc/SparcISelLowering.cpp
index 0ad261135651f..c34a55bb2881b 100644
--- a/llvm/lib/Target/Sparc/SparcISelLowering.cpp
+++ b/llvm/lib/Target/Sparc/SparcISelLowering.cpp
@@ -1753,7 +1753,8 @@ SparcTargetLowering::SparcTargetLowering(const TargetMachine &TM,
setOperationAction(ISD::CTPOP, MVT::i64,
Subtarget->usePopc() ? Legal : Expand);
setOperationAction(ISD::CTTZ , MVT::i64, Expand);
- setOperationAction(ISD::CTLZ , MVT::i64, Expand);
+ setOperationAction(ISD::CTLZ, MVT::i64,
+ Subtarget->isVIS3() ? Legal : Expand);
setOperationAction(ISD::BSWAP, MVT::i64, Expand);
setOperationAction(ISD::ROTL , MVT::i64, Expand);
setOperationAction(ISD::ROTR , MVT::i64, Expand);
@@ -1815,7 +1816,7 @@ SparcTargetLowering::SparcTargetLowering(const TargetMachine &TM,
setOperationAction(ISD::FREM , MVT::f32, Expand);
setOperationAction(ISD::FMA , MVT::f32, Expand);
setOperationAction(ISD::CTTZ , MVT::i32, Expand);
- setOperationAction(ISD::CTLZ , MVT::i32, Expand);
+ setOperationAction(ISD::CTLZ, MVT::i32, Subtarget->isVIS3() ? Legal : Expand);
setOperationAction(ISD::ROTL , MVT::i32, Expand);
setOperationAction(ISD::ROTR , MVT::i32, Expand);
setOperationAction(ISD::BSWAP, MVT::i32, Expand);
diff --git a/llvm/lib/Target/Sparc/SparcInstrVIS.td b/llvm/lib/Target/Sparc/SparcInstrVIS.td
index 925bcdc9070fa..241d6bc11e963 100644
--- a/llvm/lib/Target/Sparc/SparcInstrVIS.td
+++ b/llvm/lib/Target/Sparc/SparcInstrVIS.td
@@ -303,4 +303,10 @@ def : Pat<(i64 (mulhs i64:$lhs, i64:$rhs)),
(SUBrr (UMULXHI $lhs, $rhs),
(ADDrr (ANDrr (SRAXri $lhs, 63), $rhs),
(ANDrr (SRAXri $rhs, 63), $lhs)))>;
+
+def : Pat<(i64 (ctlz i64:$src)), (LZCNT $src)>;
+// 32-bit LZCNT.
+// The zero extension will leave us with 32 extra leading zeros,
+// so we need to compensate for it.
+def : Pat<(i32 (ctlz i32:$src)), (ADDri (LZCNT (SRLri $src, 0)), (i32 -32))>;
} // Predicates = [HasVIS3]
diff --git a/llvm/test/CodeGen/SPARC/ctlz.ll b/llvm/test/CodeGen/SPARC/ctlz.ll
new file mode 100644
index 0000000000000..3b2fc0dbfd4a3
--- /dev/null
+++ b/llvm/test/CodeGen/SPARC/ctlz.ll
@@ -0,0 +1,171 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=sparcv9 | FileCheck %s -check-prefix=V9
+; RUN: llc < %s -mtriple=sparcv9 -mattr=popc | FileCheck %s -check-prefix=POPC
+; RUN: llc < %s -mtriple=sparcv9 -mattr=vis3 | FileCheck %s -check-prefix=VIS3
+
+define i32 @f(i32 %x) nounwind {
+; V9-LABEL: f:
+; V9: ! %bb.0: ! %entry
+; V9-NEXT: srl %o0, 1, %o1
+; V9-NEXT: or %o0, %o1, %o1
+; V9-NEXT: srl %o1, 2, %o2
+; V9-NEXT: or %o1, %o2, %o1
+; V9-NEXT: srl %o1, 4, %o2
+; V9-NEXT: or %o1, %o2, %o1
+; V9-NEXT: srl %o1, 8, %o2
+; V9-NEXT: or %o1, %o2, %o1
+; V9-NEXT: srl %o1, 16, %o2
+; V9-NEXT: or %o1, %o2, %o1
+; V9-NEXT: xor %o1, -1, %o1
+; V9-NEXT: srl %o1, 1, %o2
+; V9-NEXT: sethi 1398101, %o3
+; V9-NEXT: or %o3, 341, %o3
+; V9-NEXT: and %o2, %o3, %o2
+; V9-NEXT: sub %o1, %o2, %o1
+; V9-NEXT: sethi 838860, %o2
+; V9-NEXT: or %o2, 819, %o2
+; V9-NEXT: and %o1, %o2, %o3
+; V9-NEXT: srl %o1, 2, %o1
+; V9-NEXT: and %o1, %o2, %o1
+; V9-NEXT: add %o3, %o1, %o1
+; V9-NEXT: srl %o1, 4, %o2
+; V9-NEXT: add %o1, %o2, %o1
+; V9-NEXT: sethi 246723, %o2
+; V9-NEXT: or %o2, 783, %o2
+; V9-NEXT: and %o1, %o2, %o1
+; V9-NEXT: sll %o1, 8, %o2
+; V9-NEXT: add %o1, %o2, %o1
+; V9-NEXT: sll %o1, 16, %o2
+; V9-NEXT: add %o1, %o2, %o1
+; V9-NEXT: srl %o1, 24, %o1
+; V9-NEXT: cmp %o0, 0
+; V9-NEXT: move %icc, 0, %o1
+; V9-NEXT: retl
+; V9-NEXT: mov %o1, %o0
+;
+; POPC-LABEL: f:
+; POPC: ! %bb.0: ! %entry
+; POPC-NEXT: srl %o0, 1, %o1
+; POPC-NEXT: or %o0, %o1, %o1
+; POPC-NEXT: srl %o1, 2, %o2
+; POPC-NEXT: or %o1, %o2, %o1
+; POPC-NEXT: srl %o1, 4, %o2
+; POPC-NEXT: or %o1, %o2, %o1
+; POPC-NEXT: srl %o1, 8, %o2
+; POPC-NEXT: or %o1, %o2, %o1
+; POPC-NEXT: srl %o1, 16, %o2
+; POPC-NEXT: or %o1, %o2, %o1
+; POPC-NEXT: xor %o1, -1, %o1
+; POPC-NEXT: srl %o1, 0, %o1
+; POPC-NEXT: popc %o1, %o1
+; POPC-NEXT: cmp %o0, 0
+; POPC-NEXT: move %icc, 0, %o1
+; POPC-NEXT: retl
+; POPC-NEXT: mov %o1, %o0
+;
+; VIS3-LABEL: f:
+; VIS3: ! %bb.0: ! %entry
+; VIS3-NEXT: srl %o0, 0, %o1
+; VIS3-NEXT: lzcnt %o1, %o1
+; VIS3-NEXT: add %o1, -32, %o1
+; VIS3-NEXT: cmp %o0, 0
+; VIS3-NEXT: move %icc, 0, %o1
+; VIS3-NEXT: retl
+; VIS3-NEXT: mov %o1, %o0
+entry:
+ %0 = call i32 @llvm.ctlz.i32(i32 %x, i1 true)
+ %1 = icmp eq i32 %x, 0
+ %2 = select i1 %1, i32 0, i32 %0
+ %3 = trunc i32 %2 to i8
+ %conv = zext i8 %3 to i32
+ ret i32 %conv
+}
+
+define i64 @g(i64 %x) nounwind {
+; V9-LABEL: g:
+; V9: ! %bb.0: ! %entry
+; V9-NEXT: srlx %o0, 1, %o1
+; V9-NEXT: or %o0, %o1, %o1
+; V9-NEXT: srlx %o1, 2, %o2
+; V9-NEXT: or %o1, %o2, %o1
+; V9-NEXT: srlx %o1, 4, %o2
+; V9-NEXT: or %o1, %o2, %o1
+; V9-NEXT: srlx %o1, 8, %o2
+; V9-NEXT: or %o1, %o2, %o1
+; V9-NEXT: srlx %o1, 16, %o2
+; V9-NEXT: or %o1, %o2, %o1
+; V9-NEXT: srlx %o1, 32, %o2
+; V9-NEXT: or %o1, %o2, %o1
+; V9-NEXT: xor %o1, -1, %o1
+; V9-NEXT: srlx %o1, 1, %o2
+; V9-NEXT: sethi 1398101, %o3
+; V9-NEXT: or %o3, 341, %o3
+; V9-NEXT: sllx %o3, 32, %o4
+; V9-NEXT: or %o4, %o3, %o3
+; V9-NEXT: and %o2, %o3, %o2
+; V9-NEXT: sub %o1, %o2, %o1
+; V9-NEXT: sethi 838860, %o2
+; V9-NEXT: or %o2, 819, %o2
+; V9-NEXT: sllx %o2, 32, %o3
+; V9-NEXT: or %o3, %o2, %o2
+; V9-NEXT: and %o1, %o2, %o3
+; V9-NEXT: srlx %o1, 2, %o1
+; V9-NEXT: and %o1, %o2, %o1
+; V9-NEXT: add %o3, %o1, %o1
+; V9-NEXT: srlx %o1, 4, %o2
+; V9-NEXT: add %o1, %o2, %o1
+; V9-NEXT: sethi 246723, %o2
+; V9-NEXT: or %o2, 783, %o2
+; V9-NEXT: sllx %o2, 32, %o3
+; V9-NEXT: or %o3, %o2, %o2
+; V9-NEXT: and %o1, %o2, %o1
+; V9-NEXT: sethi 16448, %o2
+; V9-NEXT: or %o2, 257, %o2
+; V9-NEXT: sllx %o2, 32, %o3
+; V9-NEXT: or %o3, %o2, %o2
+; V9-NEXT: mulx %o1, %o2, %o1
+; V9-NEXT: srlx %o1, 56, %o1
+; V9-NEXT: movrz %o0, 0, %o1
+; V9-NEXT: retl
+; V9-NEXT: mov %o1, %o0
+;
+; POPC-LABEL: g:
+; POPC: ! %bb.0: ! %entry
+; POPC-NEXT: srlx %o0, 1, %o1
+; POPC-NEXT: or %o0, %o1, %o1
+; POPC-NEXT: srlx %o1, 2, %o2
+; POPC-NEXT: or %o1, %o2, %o1
+; POPC-NEXT: srlx %o1, 4, %o2
+; POPC-NEXT: or %o1, %o2, %o1
+; POPC-NEXT: srlx %o1, 8, %o2
+; POPC-NEXT: or %o1, %o2, %o1
+; POPC-NEXT: srlx %o1, 16, %o2
+; POPC-NEXT: or %o1, %o2, %o1
+; POPC-NEXT: srlx %o1, 32, %o2
+; POPC-NEXT: or %o1, %o2, %o1
+; POPC-NEXT: xor %o1, -1, %o1
+; POPC-NEXT: popc %o1, %o1
+; POPC-NEXT: movrz %o0, 0, %o1
+; POPC-NEXT: retl
+; POPC-NEXT: mov %o1, %o0
+;
+; VIS3-LABEL: g:
+; VIS3: ! %bb.0: ! %entry
+; VIS3-NEXT: lzcnt %o0, %o1
+; VIS3-NEXT: movrz %o0, 0, %o1
+; VIS3-NEXT: retl
+; VIS3-NEXT: mov %o1, %o0
+entry:
+ %0 = call i64 @llvm.ctlz.i64(i64 %x, i1 true)
+ %1 = icmp eq i64 %x, 0
+ %2 = select i1 %1, i64 0, i64 %0
+ %3 = trunc i64 %2 to i32
+ %conv = zext i32 %3 to i64
+ ret i64 %conv
+}
+
+; Function Attrs: nocallback nofree nosync nounwind readnone speculatable willreturn
+declare i32 @llvm.ctlz.i32(i32, i1 immarg) #0
+declare i64 @llvm.ctlz.i64(i64, i1 immarg) #0
+
+attributes #0 = { nocallback nofree nosync nounwind readnone speculatable willreturn }
|
Created using spr 1.3.5 [skip ci]
Created using spr 1.3.5
Created using spr 1.3.5 [skip ci]
Created using spr 1.3.5
Created using spr 1.3.5 [skip ci]
Created using spr 1.3.5
Created using spr 1.3.5
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Looks great, I left some final comments (few of them are actionable).
There is also a couple of methods in TargetLoweringBase that should be implemented:
isCheapToSpeculateCtlz()
, isCtlzFast()
. I think they could improve codegen a bit.
Not required in this PR though.
llvm/test/CodeGen/SPARC/ctlz.ll
Outdated
; VIS3-LABEL: i32_nopoison: | ||
; VIS3: ! %bb.0: | ||
; VIS3-NEXT: cmp %o0, 0 | ||
; VIS3-NEXT: be %icc, .LBB0_2 |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
The branch is redundant, but I guess there is nothing we can do with it right now. It must've been created too early.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Those turns out to be controlled by the codegen parameters you mentioned above, it seems.
llvm/test/CodeGen/SPARC/ctlz.ll
Outdated
; V9-NEXT: call __clzdi2 | ||
; V9-NEXT: sllx %i0, 32, %o0 | ||
; V9-NEXT: ret | ||
; V9-NEXT: restore %g0, %o0, %o0 |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I wonder why gcc produced another shift and an add...
llvm/test/CodeGen/SPARC/ctlz.ll
Outdated
; | ||
; VIS3-LABEL: i64_nopoison: | ||
; VIS3: ! %bb.0: | ||
; VIS3-NEXT: brz %o0, .LBB2_2 |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
This is odd, I'd expect no branches here.
Created using spr 1.3.5
Created using spr 1.3.5
Created using spr 1.3.5
Reviewers: s-barannikov, brad0, rorth Reviewed By: s-barannikov Pull Request: llvm/llvm-project#135715
Unfortunately, this patch causes a bunch of new failures:
However, the are only seen in a 2-stage build and only in a When spot-checking a few of the failing tests, the cause seemed to be always the same: the tests run into infinite recursion in |
There is a special hack in #if !defined(__clang__) && \
((defined(__sparc__) && defined(__arch64__)) || defined(__mips64) || \
(defined(__riscv) && __SIZEOF_POINTER__ >= 8))
// On 64-bit architectures with neither a native clz instruction nor a native
// ctz instruction, gcc resolves __builtin_clz to __clzdi2 rather than
// __clzsi2, leading to infinite recursion.
#define __builtin_clz(a) __clzsi2(a)
extern int __clzsi2(si_int);
#endif It is for gcc, but it looks like we've discovered the same issue with clang? |
It does. I'm attaching disassembly without and with this patch. |
IIRC at least on sparc64 it's because GCC's runtime doesn't seem to ship with
Hmmm so upon reading the clzdi2.c source, it seems to rely on
And now that |
It appears that removing the |
Reviewers: s-barannikov, brad0, rorth Reviewed By: s-barannikov Pull Request: llvm#135715
Reviewers: s-barannikov, brad0, rorth Reviewed By: s-barannikov Pull Request: llvm#135715
No description provided.