-
Notifications
You must be signed in to change notification settings - Fork 13.6k
[NVPTX] remove store.params of undef #96940
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Conversation
@llvm/pr-subscribers-backend-nvptx Author: Alex MacLean (AlexMaclean) ChangesExtend the logic for Full diff: https://github.com/llvm/llvm-project/pull/96940.diff 2 Files Affected:
diff --git a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
index e3201516e4e7f..476a532db0a37 100644
--- a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
@@ -5345,15 +5345,26 @@ PerformFADDCombineWithOperands(SDNode *N, SDValue N0, SDValue N1,
return SDValue();
}
+static SDValue PerformStoreCombineHelper(SDNode *N, std::size_t Front,
+ std::size_t Back) {
+ if (all_of(N->ops().drop_front(Front).drop_back(Back),
+ [](const SDUse &U) { return U.get()->isUndef(); }))
+ // Operand 0 is the previous value in the chain. Cannot return EntryToken
+ // as the previous value will become unused and eliminated later.
+ return N->getOperand(0);
+
+ return SDValue();
+}
+
+static SDValue PerformStoreParamCombine(SDNode *N) {
+ // Operands from the 3rd to the 2nd last one are the values to be stored.
+ // {Chain, ArgID, Offset, Val, Glue}
+ return PerformStoreCombineHelper(N, 3, 1);
+}
+
static SDValue PerformStoreRetvalCombine(SDNode *N) {
// Operands from the 2nd to the last one are the values to be stored
- for (std::size_t I = 2, OpsCount = N->ops().size(); I != OpsCount; ++I)
- if (!N->getOperand(I).isUndef())
- return SDValue();
-
- // Operand 0 is the previous value in the chain. Cannot return EntryToken
- // as the previous value will become unused and eliminated later.
- return N->getOperand(0);
+ return PerformStoreCombineHelper(N, 2, 0);
}
/// PerformADDCombine - Target-specific dag combine xforms for ISD::ADD.
@@ -5822,6 +5833,10 @@ static SDValue PerformEXTRACTCombine(SDNode *N,
VectorVT == MVT::v4i8 || VectorVT == MVT::v8i8)
return SDValue();
+ // Don't mess with undef values as sra may be simplified to 0, not undef.
+ if (Vector->isUndef() || ISD::allOperandsUndef(Vector.getNode()))
+ return SDValue();
+
uint64_t VectorBits = VectorVT.getSizeInBits();
// We only handle the types we can extract in-register.
if (!(VectorBits == 16 || VectorBits == 32 || VectorBits == 64))
@@ -5950,6 +5965,10 @@ SDValue NVPTXTargetLowering::PerformDAGCombine(SDNode *N,
case NVPTXISD::StoreRetvalV2:
case NVPTXISD::StoreRetvalV4:
return PerformStoreRetvalCombine(N);
+ case NVPTXISD::StoreParam:
+ case NVPTXISD::StoreParamV2:
+ case NVPTXISD::StoreParamV4:
+ return PerformStoreParamCombine(N);
case ISD::EXTRACT_VECTOR_ELT:
return PerformEXTRACTCombine(N, DCI);
case ISD::VSELECT:
diff --git a/llvm/test/CodeGen/NVPTX/store-undef.ll b/llvm/test/CodeGen/NVPTX/store-undef.ll
new file mode 100644
index 0000000000000..e281017e802d1
--- /dev/null
+++ b/llvm/test/CodeGen/NVPTX/store-undef.ll
@@ -0,0 +1,93 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc < %s -march=nvptx64 | FileCheck %s
+; RUN: %if ptxas %{ llc < %s -march=nvptx64 | %ptxas-verify %}
+
+target triple = "nvptx64-nvidia-cuda"
+
+%struct.T = type { i64, <2 x i32>, <4 x i32> }
+
+declare void @test_call(%struct.T)
+
+define void @test_store_param_undef() {
+; CHECK-LABEL: test_store_param_undef(
+; CHECK: {
+; CHECK-NEXT: .reg .b32 %r<2>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: { // callseq 0, 0
+; CHECK-NEXT: .param .align 16 .b8 param0[32];
+; CHECK-NEXT: st.param.v2.b32 [param0+8], {%r1, 0};
+; CHECK-NEXT: call.uni
+; CHECK-NEXT: test_call,
+; CHECK-NEXT: (
+; CHECK-NEXT: param0
+; CHECK-NEXT: );
+; CHECK-NEXT: } // callseq 0
+; CHECK-NEXT: ret;
+ call void @test_call(%struct.T undef)
+ ret void
+}
+
+define void @test_store_param_def(i64 %param0, i32 %param1) {
+; CHECK-LABEL: test_store_param_def(
+; CHECK: {
+; CHECK-NEXT: .reg .b32 %r<6>;
+; CHECK-NEXT: .reg .b64 %rd<2>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.u64 %rd1, [test_store_param_def_param_0];
+; CHECK-NEXT: ld.param.u32 %r1, [test_store_param_def_param_1];
+; CHECK-NEXT: { // callseq 1, 0
+; CHECK-NEXT: .param .align 16 .b8 param0[32];
+; CHECK-NEXT: st.param.b64 [param0+0], %rd1;
+; CHECK-NEXT: st.param.v2.b32 [param0+8], {%r2, %r1};
+; CHECK-NEXT: st.param.v4.b32 [param0+16], {%r3, %r1, %r4, %r5};
+; CHECK-NEXT: call.uni
+; CHECK-NEXT: test_call,
+; CHECK-NEXT: (
+; CHECK-NEXT: param0
+; CHECK-NEXT: );
+; CHECK-NEXT: } // callseq 1
+; CHECK-NEXT: ret;
+ %V2 = insertelement <2 x i32> undef, i32 %param1, i32 1
+ %V4 = insertelement <4 x i32> undef, i32 %param1, i32 1
+ %S0 = insertvalue %struct.T undef, i64 %param0, 0
+ %S1 = insertvalue %struct.T %S0, <2 x i32> %V2, 1
+ %S2 = insertvalue %struct.T %S1, <4 x i32> %V4, 2
+ call void @test_call(%struct.T %S2)
+ ret void
+}
+
+define void @test_store_undef(ptr %out) {
+; CHECK-LABEL: test_store_undef(
+; CHECK: {
+; CHECK-EMPTY:
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ret;
+ store %struct.T undef, ptr %out
+ ret void
+}
+
+define void @test_store_def(i64 %param0, i32 %param1, ptr %out) {
+; CHECK-LABEL: test_store_def(
+; CHECK: {
+; CHECK-NEXT: .reg .b32 %r<6>;
+; CHECK-NEXT: .reg .b64 %rd<3>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.u64 %rd1, [test_store_def_param_0];
+; CHECK-NEXT: ld.param.u32 %r1, [test_store_def_param_1];
+; CHECK-NEXT: ld.param.u64 %rd2, [test_store_def_param_2];
+; CHECK-NEXT: st.v4.u32 [%rd2+16], {%r2, %r1, %r3, %r4};
+; CHECK-NEXT: st.v2.u32 [%rd2+8], {%r5, %r1};
+; CHECK-NEXT: st.u64 [%rd2], %rd1;
+; CHECK-NEXT: ret;
+ %V2 = insertelement <2 x i32> undef, i32 %param1, i32 1
+ %V4 = insertelement <4 x i32> undef, i32 %param1, i32 1
+ %S0 = insertvalue %struct.T undef, i64 %param0, 0
+ %S1 = insertvalue %struct.T %S0, <2 x i32> %V2, 1
+ %S2 = insertvalue %struct.T %S1, <4 x i32> %V4, 2
+ store %struct.T %S2, ptr %out
+ ret void
+}
|
@@ -0,0 +1,93 @@ | |||
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Up to you, but personally I feel like, we might as well write the tests manually instead of using this tool? Or, use the tool, and then delete the lines in the tests that are not relevant?
Like we don't care about the CHECK-EMPTY's and the lines containing comments.
The advantage of this is that, if we ever change the codegen format (and it does change sometimes!), we won't have to regenerate this test and then figure out if the changes are "meaningful".
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
That's funny, @Artem-B has frequently requested that I use this tool. If it's alright with you I'll keep this test as is, though it would be nice to have consensus on what type of tests we prefer, I wonder if the auto generation script could be updated to make the checks that are generated more flexible.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Heh, okay, I defer to Art. :)
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Auto-generator is not perfect, but it has two major benefits:
- it's automatic, so the author only need to write the input code, not the checks
- it includes checks for everything, so when something goes wrong I can reason about it without guessing whether the changed output is OK or not, based on hand-written checks that tend to be spotty.
Human-written tests can be good, but usually aren't. The effort of writing a good test falls somewhere between tedious and infeasible. I'll take a good enough automatic test, even if it's overly verbose. It also lowers the review burden as we no longer need to argue about the quality of the checks in the test. Just concentrate on what we're testing and that the generated code is valid.
; CHECK-NEXT: // %bb.0: | ||
; CHECK-NEXT: { // callseq 0, 0 | ||
; CHECK-NEXT: .param .align 16 .b8 param0[32]; | ||
; CHECK-NEXT: st.param.v2.b32 [param0+8], {%r1, 0}; |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I'm unclear on what this one is testing. If the parameter is undef then why do we store into part of it?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Apologies, this test was out of date, I forgot to add the latest version to the commit, the new version of this test does not include this line.
; CHECK-NEXT: { // callseq 1, 0 | ||
; CHECK-NEXT: .param .align 16 .b8 param0[32]; | ||
; CHECK-NEXT: st.param.b64 [param0+0], %rd1; | ||
; CHECK-NEXT: st.param.v2.b32 [param0+8], {%r2, %r1}; |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Similarly I'm unclear on why we expect to have a v2.b32 and v4.b32 store here? It seems like only one value in each of these struct fields has a non-undef value.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
That is correct, this is similar to the existing DAG combining behavior for normal stores: only when the value is completely undef is the store removed. I agree the logic could be improved to narrow the store but I think this is beyond the scope of this change.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
sg
; CHECK-NEXT: ld.param.u64 %rd2, [test_store_def_param_2]; | ||
; CHECK-NEXT: st.v4.u32 [%rd2+16], {%r2, %r1, %r3, %r4}; | ||
; CHECK-NEXT: st.v2.u32 [%rd2+8], {%r5, %r1}; | ||
; CHECK-NEXT: st.u64 [%rd2], %rd1; |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Same here. Why do we store the full struct when most of it is undef?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
See above.
6fb7068
to
41bf310
Compare
LLVM Buildbot has detected a new failure on builder Full details are available at: https://lab.llvm.org/buildbot/#/builders/154/builds/677 Here is the relevant piece of the build log for the reference:
|
Extend the logic for `NVPTXISD::StoreRetval` to `NVPTXISD::StoreParam` to remove stores of undef values.
Extend the logic for
NVPTXISD::StoreRetval
toNVPTXISD::StoreParam
to remove stores of undef values.