-
Notifications
You must be signed in to change notification settings - Fork 13.6k
[llvm][SPIRV] Expose fast popcnt
support for SPIR-V targets
#109845
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Conversation
@llvm/pr-subscribers-backend-amdgpu @llvm/pr-subscribers-backend-spir-v Author: Alex Voicu (AlexVlx) ChangesThis adds the TTI predicate for conveying the availability of fast Full diff: https://github.com/llvm/llvm-project/pull/109845.diff 4 Files Affected:
diff --git a/llvm/lib/Target/SPIRV/SPIRVTargetTransformInfo.h b/llvm/lib/Target/SPIRV/SPIRVTargetTransformInfo.h
index 2fbb4381da2637..7a64aa81218a89 100644
--- a/llvm/lib/Target/SPIRV/SPIRVTargetTransformInfo.h
+++ b/llvm/lib/Target/SPIRV/SPIRVTargetTransformInfo.h
@@ -24,6 +24,7 @@
namespace llvm {
class SPIRVTTIImpl : public BasicTTIImplBase<SPIRVTTIImpl> {
using BaseT = BasicTTIImplBase<SPIRVTTIImpl>;
+ using TTI = TargetTransformInfo;
friend BaseT;
@@ -37,6 +38,15 @@ class SPIRVTTIImpl : public BasicTTIImplBase<SPIRVTTIImpl> {
explicit SPIRVTTIImpl(const SPIRVTargetMachine *TM, const Function &F)
: BaseT(TM, F.getDataLayout()), ST(TM->getSubtargetImpl(F)),
TLI(ST->getTargetLowering()) {}
+
+ TTI::PopcntSupportKind getPopcntSupport(unsigned TyWidth) {
+ // SPIR-V natively supports OpBitcount, per 3.53.14 in the spec, as such it
+ // is reasonable to assume the Op is fast / preferable to the expanded loop.
+ // Furthermore, this prevents information being lost if transforms are
+ // applied to SPIR-V before lowering to a concrete target.
+ assert(isPowerOf2_32(TyWidth) && "Ty width must be power of 2");
+ return TTI::PSK_FastHardware;
+ }
};
} // namespace llvm
diff --git a/llvm/test/Transforms/LoopIdiom/AMDGPU/popcnt.ll b/llvm/test/Transforms/LoopIdiom/AMDGPU/popcnt.ll
index 5f49d6c05ae4ed..f030d188b890ef 100644
--- a/llvm/test/Transforms/LoopIdiom/AMDGPU/popcnt.ll
+++ b/llvm/test/Transforms/LoopIdiom/AMDGPU/popcnt.ll
@@ -1,4 +1,5 @@
; RUN: opt -passes=loop-idiom -mtriple=amdgcn-- -S < %s | FileCheck %s
+; RUN: opt -passes=loop-idiom -mtriple=spirv64-amd-amdhsa -S < %s | FileCheck %s
; Mostly copied from x86 version.
diff --git a/llvm/test/Transforms/LoopIdiom/SPIRV/lit.local.cfg b/llvm/test/Transforms/LoopIdiom/SPIRV/lit.local.cfg
new file mode 100644
index 00000000000000..78dd74cd6dc634
--- /dev/null
+++ b/llvm/test/Transforms/LoopIdiom/SPIRV/lit.local.cfg
@@ -0,0 +1,2 @@
+if not "SPIRV" in config.root.targets:
+ config.unsupported = True
diff --git a/llvm/test/Transforms/LoopIdiom/SPIRV/popcnt.ll b/llvm/test/Transforms/LoopIdiom/SPIRV/popcnt.ll
new file mode 100644
index 00000000000000..661e40dd96ddf8
--- /dev/null
+++ b/llvm/test/Transforms/LoopIdiom/SPIRV/popcnt.ll
@@ -0,0 +1,128 @@
+; RUN: opt -passes=loop-idiom -mtriple=spirv32-- -S < %s | FileCheck %s
+; RUN: opt -passes=loop-idiom -mtriple=spirv64-- -S < %s | FileCheck %s
+
+; Mostly copied from x86 version.
+
+;To recognize this pattern:
+;int popcount(unsigned long long a) {
+; int c = 0;
+; while (a) {
+; c++;
+; a &= a - 1;
+; }
+; return c;
+;}
+;
+
+; CHECK-LABEL: @popcount_i64
+; CHECK: entry
+; CHECK: llvm.ctpop.i64
+; CHECK: ret
+define i32 @popcount_i64(i64 %a) nounwind uwtable readnone ssp {
+entry:
+ %tobool3 = icmp eq i64 %a, 0
+ br i1 %tobool3, label %while.end, label %while.body
+
+while.body: ; preds = %entry, %while.body
+ %c.05 = phi i32 [ %inc, %while.body ], [ 0, %entry ]
+ %a.addr.04 = phi i64 [ %and, %while.body ], [ %a, %entry ]
+ %inc = add nsw i32 %c.05, 1
+ %sub = add i64 %a.addr.04, -1
+ %and = and i64 %sub, %a.addr.04
+ %tobool = icmp eq i64 %and, 0
+ br i1 %tobool, label %while.end, label %while.body
+
+while.end: ; preds = %while.body, %entry
+ %c.0.lcssa = phi i32 [ 0, %entry ], [ %inc, %while.body ]
+ ret i32 %c.0.lcssa
+}
+
+; CHECK-LABEL: @popcount_i32
+; CHECK: entry
+; CHECK: llvm.ctpop.i32
+; CHECK: ret
+define i32 @popcount_i32(i32 %a) nounwind uwtable readnone ssp {
+entry:
+ %tobool3 = icmp eq i32 %a, 0
+ br i1 %tobool3, label %while.end, label %while.body
+
+while.body: ; preds = %entry, %while.body
+ %c.05 = phi i32 [ %inc, %while.body ], [ 0, %entry ]
+ %a.addr.04 = phi i32 [ %and, %while.body ], [ %a, %entry ]
+ %inc = add nsw i32 %c.05, 1
+ %sub = add i32 %a.addr.04, -1
+ %and = and i32 %sub, %a.addr.04
+ %tobool = icmp eq i32 %and, 0
+ br i1 %tobool, label %while.end, label %while.body
+
+while.end: ; preds = %while.body, %entry
+ %c.0.lcssa = phi i32 [ 0, %entry ], [ %inc, %while.body ]
+ ret i32 %c.0.lcssa
+}
+
+; CHECK-LABEL: @popcount_i128
+; CHECK: entry
+; CHECK: llvm.ctpop.i128
+; CHECK: ret
+define i32 @popcount_i128(i128 %a) nounwind uwtable readnone ssp {
+entry:
+ %tobool3 = icmp eq i128 %a, 0
+ br i1 %tobool3, label %while.end, label %while.body
+
+while.body: ; preds = %entry, %while.body
+ %c.05 = phi i32 [ %inc, %while.body ], [ 0, %entry ]
+ %a.addr.04 = phi i128 [ %and, %while.body ], [ %a, %entry ]
+ %inc = add nsw i32 %c.05, 1
+ %sub = add i128 %a.addr.04, -1
+ %and = and i128 %sub, %a.addr.04
+ %tobool = icmp eq i128 %and, 0
+ br i1 %tobool, label %while.end, label %while.body
+
+while.end: ; preds = %while.body, %entry
+ %c.0.lcssa = phi i32 [ 0, %entry ], [ %inc, %while.body ]
+ ret i32 %c.0.lcssa
+}
+
+; To recognize this pattern:
+;int popcount(unsigned long long a, int mydata1, int mydata2) {
+; int c = 0;
+; while (a) {
+; c++;
+; a &= a - 1;
+; mydata1 *= c;
+; mydata2 *= (int)a;
+; }
+; return c + mydata1 + mydata2;
+;}
+
+; CHECK-LABEL: @popcount2
+; CHECK: entry
+; CHECK: llvm.ctpop.i64
+; CHECK: ret
+define i32 @popcount2(i64 %a, i32 %mydata1, i32 %mydata2) nounwind uwtable readnone ssp {
+entry:
+ %tobool9 = icmp eq i64 %a, 0
+ br i1 %tobool9, label %while.end, label %while.body
+
+while.body: ; preds = %entry, %while.body
+ %c.013 = phi i32 [ %inc, %while.body ], [ 0, %entry ]
+ %mydata2.addr.012 = phi i32 [ %mul1, %while.body ], [ %mydata2, %entry ]
+ %mydata1.addr.011 = phi i32 [ %mul, %while.body ], [ %mydata1, %entry ]
+ %a.addr.010 = phi i64 [ %and, %while.body ], [ %a, %entry ]
+ %inc = add nsw i32 %c.013, 1
+ %sub = add i64 %a.addr.010, -1
+ %and = and i64 %sub, %a.addr.010
+ %mul = mul nsw i32 %inc, %mydata1.addr.011
+ %conv = trunc i64 %and to i32
+ %mul1 = mul nsw i32 %conv, %mydata2.addr.012
+ %tobool = icmp eq i64 %and, 0
+ br i1 %tobool, label %while.end, label %while.body
+
+while.end: ; preds = %while.body, %entry
+ %c.0.lcssa = phi i32 [ 0, %entry ], [ %inc, %while.body ]
+ %mydata2.addr.0.lcssa = phi i32 [ %mydata2, %entry ], [ %mul1, %while.body ]
+ %mydata1.addr.0.lcssa = phi i32 [ %mydata1, %entry ], [ %mul, %while.body ]
+ %add = add i32 %mydata2.addr.0.lcssa, %mydata1.addr.0.lcssa
+ %add2 = add i32 %add, %c.0.lcssa
+ ret i32 %add2
+}
|
@@ -37,6 +38,15 @@ class SPIRVTTIImpl : public BasicTTIImplBase<SPIRVTTIImpl> { | |||
explicit SPIRVTTIImpl(const SPIRVTargetMachine *TM, const Function &F) | |||
: BaseT(TM, F.getDataLayout()), ST(TM->getSubtargetImpl(F)), | |||
TLI(ST->getTargetLowering()) {} | |||
|
|||
TTI::PopcntSupportKind getPopcntSupport(unsigned TyWidth) { |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
TTI probably should have a better default checking if the operation is legal. It's also not great that TargetLowering also has isCtpopFast
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
LGTM, but really we should fix the defaults in the BasicTTIImpl
The first concern coming to my mind is that we would need to have a test case with LIT CHECK's of the output SPIR-V to see/validate how ctpop and G_CTPOP are being lowered. We have test/CodeGen/SPIRV/llvm-intrinsics/ctpop.ll, but I wonder would it make sense to add a full-blown test case from unoptimized llvm ir to spir-v, what do you think? |
I think this makes sense, can never have too many tests; we can probably mirror the |
I'll pick your brains over this because I hadn't thought about this at all before you mentioned it:) |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
The first concern coming to my mind is that we would need to have a test case with LIT CHECK's of the output SPIR-V to see/validate how ctpop and G_CTPOP are being lowered. We have test/CodeGen/SPIRV/llvm-intrinsics/ctpop.ll, but I wonder would it make sense to add a full-blown test case from unoptimized llvm ir to spir-v, what do you think?
Please see llvm/test/CodeGen/SPIRV/optimizations/recognize-popcnt-loop.ll
for an attempt to do this.
@@ -0,0 +1,114 @@ | |||
; RUN: opt -O3 -mtriple=spirv32-- %s -o - | llc -O3 -mtriple=spirv32-- -o - | FileCheck %s | |||
; RUN: %if spirv-tools %{ opt -O3 -mtriple=spirv32-- %s -o - | llc -O3 -mtriple=spirv32-- -o - -filetype=obj | spirv-val %} |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
As a nit, it'd be useful to run this as llc -verify-machineinstrs ...
.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
This is actually a great idea, because it tickles something that looks like a notable bug - that insertion is incorrect, we should skip over PHIs if the successor is a PHI. However, doing that doesn't quite solve it, as vreg lifetimes get messed up and verification still fails. I'll need to spend a bit more time to understand the transform. This can be verified, independently from, this patch, by using -verify-machineinstrs
on e.g. /llvm/test/CodeGen/SPIRV/phi-spvintrinsic-dominate.ll
. So this doesn't make things worse - we could merge it and deal with the issue separately, I reckon.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
that insertion is incorrect
yes, this has already been addressed in #109660 (
llvm-project/llvm/lib/Target/SPIRV/SPIRVPreLegalizer.cpp
Lines 344 to 366 in 01c7776
static void setInsertPtAfterDef(MachineIRBuilder &MIB, MachineInstr *Def) { | |
MachineBasicBlock &MBB = *Def->getParent(); | |
MachineBasicBlock::iterator DefIt = | |
Def->getNextNode() ? Def->getNextNode()->getIterator() : MBB.end(); | |
// Skip all the PHI and debug instructions. | |
while (DefIt != MBB.end() && | |
(DefIt->isPHI() || DefIt->isDebugOrPseudoInstr())) | |
DefIt = std::next(DefIt); | |
MIB.setInsertPt(MBB, DefIt); | |
} | |
// Insert ASSIGN_TYPE instuction between Reg and its definition, set NewReg as | |
// a dst of the definition, assign SPIRVType to both registers. If SpvType is | |
// provided, use it as SPIRVType in ASSIGN_TYPE, otherwise create it from Ty. | |
// It's used also in SPIRVBuiltins.cpp. | |
// TODO: maybe move to SPIRVUtils. | |
namespace llvm { | |
Register insertAssignInstr(Register Reg, Type *Ty, SPIRVType *SpvType, | |
SPIRVGlobalRegistry *GR, MachineIRBuilder &MIB, | |
MachineRegisterInfo &MRI) { | |
assert((Ty || SpvType) && "Either LLVM or SPIRV type is expected."); | |
MachineInstr *Def = MRI.getVRegDef(Reg); | |
setInsertPtAfterDef(MIB, Def); |
vreg lifetimes get messed up and verification still fails
I didn't see the case to be sure, but I'd guess that it's the same problem as we're discussing currently within #110019
LGTM overall, however, did you check failed Github action logs? I noticed that fails are related to this PR: |
I have looked at the failure, it's rather odd; the log would suggest that the host's |
Please don't. The codegen test should only cover codegen of the intrinsic. Adding in the extra steps of pattern matching the intrinsic in a far removed optimization pass does not improve coverage |
This makes sense as well, thank you. I'm ok with any of options. |
LLVM Buildbot has detected a new failure on builder Full details are available at: https://lab.llvm.org/buildbot/#/builders/51/builds/4464 Here is the relevant piece of the build log for the reference
|
Adds the following patches AMDGPU: Remove wavefrontsize64 feature from dummy target llvm#117410 [LLVM][NFC] Use used's element type if available llvm#116804 [llvm][AMDGPU] Fold llvm.amdgcn.wavefrontsize early llvm#114481 [clang][Driver][HIP] Add support for mixing AMDGCNSPIRV & concrete offload-archs. llvm#113509 [clang][llvm][SPIR-V] Explicitly encode native integer widths for SPIR-V llvm#110695 [llvm][opt][Transforms] Replacement calloc should match replaced malloc llvm#110524 [clang][HIP] Don't use the OpenCLKernel CC when targeting AMDGCNSPIRV llvm#110447 [cuda][HIP] constant should imply constant llvm#110182 [llvm][SPIRV] Expose fast popcnt support for SPIR-V targets llvm#109845 [clang][CodeGen][SPIR-V] Fix incorrect SYCL usage, implement missing interface llvm#109415 [SPIRV][RFC] Rework / extend support for memory scopes llvm#106429 [clang][CodeGen][SPIR-V][AMDGPU] Tweak AMDGCNSPIRV ABI to allow for the correct handling of aggregates passed to kernels / functions. llvm#102776 Change-Id: I2b9ab54aba1c9345b9b0eb84409e6ed6c3cdb6cd
This adds the TTI predicate for conveying the availability of fast
popcnt
, which subsequently allows passes likeLoopIdiomRecognize
to collapse known popcount patterns. Since SPIR-V natively exposesOpBitcount
, it seems preferable to compress the resulting code, and retain the information, even if a concrete target might have to expand back into a loop structure.