Skip to content

Commit b083340

Browse files
authored
[llvm][NVPTX] Don't reorder MIs that construct a PTX function call (#116522)
With `-enable-misched`, the MachineScheduler can reorder MIs that must stay together (in their originally set order) to produce legal PTX code for a function call. When generating PTX code for the attached test (using LLVM before this revision), the following invalid PTX code is generated:
```
{ // callseq 0, 0
.param .b64 param0;
st.param.f64 [param0], 0d0000000000000000;
.param .b64 retval0;
call.uni (retval0),
mul.lo.s32 %r7, %r10, %r3;
or.b32 %r8, %r4, %r7;
mul.lo.s32 %r9, %r2, %r8;
cvt.rn.f64.s32 %fd3, %r9;
quux,
(
param0
);
ld.param.f64 %fd1, [retval0];
} // callseq 0
```
Note that unrelated arithmetic instructions have been scheduled between `call.uni (retval0),` and the callee name `quux,`, splitting the call apart.
1 parent ad9c0b3 commit b083340

File tree

3 files changed

+82
-0
lines changed

3 files changed

+82
-0
lines changed

llvm/lib/Target/NVPTX/NVPTXInstrInfo.cpp

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -199,3 +199,23 @@ unsigned NVPTXInstrInfo::insertBranch(MachineBasicBlock &MBB,
199199
BuildMI(&MBB, DL, get(NVPTX::GOTO)).addMBB(FBB);
200200
return 2;
201201
}
202+
203+
bool NVPTXInstrInfo::isSchedulingBoundary(const MachineInstr &MI,
204+
const MachineBasicBlock *MBB,
205+
const MachineFunction &MF) const {
206+
// Prevent the scheduler from reordering & splitting up MachineInstrs
207+
// which must stick together (in initially set order) to
208+
// comprise a valid PTX function call sequence.
209+
switch (MI.getOpcode()) {
210+
case NVPTX::CallUniPrintCallRetInst1:
211+
case NVPTX::CallArgBeginInst:
212+
case NVPTX::CallArgI32imm:
213+
case NVPTX::CallArgParam:
214+
case NVPTX::LastCallArgI32imm:
215+
case NVPTX::LastCallArgParam:
216+
case NVPTX::CallArgEndInst1:
217+
return true;
218+
}
219+
220+
return TargetInstrInfo::isSchedulingBoundary(MI, MBB, MF);
221+
}

llvm/lib/Target/NVPTX/NVPTXInstrInfo.h

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -67,6 +67,9 @@ class NVPTXInstrInfo : public NVPTXGenInstrInfo {
6767
MachineBasicBlock *FBB, ArrayRef<MachineOperand> Cond,
6868
const DebugLoc &DL,
6969
int *BytesAdded = nullptr) const override;
70+
bool isSchedulingBoundary(const MachineInstr &MI,
71+
const MachineBasicBlock *MBB,
72+
const MachineFunction &MF) const override;
7073
};
7174

7275
} // namespace llvm
Lines changed: 59 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,59 @@
1+
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
2+
; RUN: llc -O3 -march=nvptx64 -enable-misched %s -o - | FileCheck %s
3+
4+
target datalayout = "e-i64:64-i128:128-v16:16-v32:32-n16:32:64"
5+
target triple = "nvptx64-nvidia-cuda"
6+
7+
; Regression test for scheduling around a PTX function call. The RUN line
; enables the machine scheduler (-enable-misched); the assertions below
; require that the pieces of the call to quux (`call.uni (retval0),`,
; `quux,`, `(`, `param0`, `);`) are printed on consecutive lines, with the
; unrelated arithmetic appearing only after the closing `);`.
define ptx_kernel void @wombat(i32 %arg, i32 %arg1, i32 %arg2) {
8+
; CHECK-LABEL: wombat(
9+
; CHECK: {
10+
; CHECK-NEXT: .reg .b32 %r<11>;
11+
; CHECK-NEXT: .reg .b64 %rd<2>;
12+
; CHECK-NEXT: .reg .f64 %fd<6>;
13+
; CHECK-EMPTY:
14+
; CHECK-NEXT: // %bb.0: // %bb
15+
; CHECK-NEXT: ld.param.u32 %r4, [wombat_param_2];
16+
; CHECK-NEXT: ld.param.u32 %r3, [wombat_param_1];
17+
; CHECK-NEXT: ld.param.u32 %r2, [wombat_param_0];
18+
; CHECK-NEXT: mov.b32 %r10, 0;
19+
; CHECK-NEXT: mov.u64 %rd1, 0;
20+
; CHECK-NEXT: mov.b32 %r6, 1;
21+
; CHECK-NEXT: $L__BB0_1: // %bb3
22+
; CHECK-NEXT: // =>This Inner Loop Header: Depth=1
23+
; CHECK-NEXT: { // callseq 0, 0
24+
; CHECK-NEXT: .param .b64 param0;
25+
; CHECK-NEXT: st.param.f64 [param0], 0d0000000000000000;
26+
; CHECK-NEXT: .param .b64 retval0;
27+
; CHECK-NEXT: call.uni (retval0),
28+
; CHECK-NEXT: quux,
29+
; CHECK-NEXT: (
30+
; CHECK-NEXT: param0
31+
; CHECK-NEXT: );
32+
; CHECK-NEXT: mul.lo.s32 %r7, %r10, %r3;
33+
; CHECK-NEXT: or.b32 %r8, %r4, %r7;
34+
; CHECK-NEXT: mul.lo.s32 %r9, %r2, %r8;
35+
; CHECK-NEXT: cvt.rn.f64.s32 %fd3, %r9;
36+
; CHECK-NEXT: ld.param.f64 %fd1, [retval0];
37+
; CHECK-NEXT: } // callseq 0
38+
; CHECK-NEXT: cvt.rn.f64.u32 %fd4, %r10;
39+
; CHECK-NEXT: add.rn.f64 %fd5, %fd4, %fd3;
40+
; CHECK-NEXT: st.global.f64 [%rd1], %fd5;
41+
; CHECK-NEXT: mov.u32 %r10, %r6;
42+
; CHECK-NEXT: bra.uni $L__BB0_1;
43+
bb:
44+
br label %bb3
45+
46+
bb3: ; preds = %bb3, %bb
47+
%phi = phi i32 [ 0, %bb ], [ 1, %bb3 ]
48+
; The call whose PTX call sequence must stay contiguous.
%call = tail call double @quux(double 0.000000e+00)
49+
; None of the values below use %call, so nothing in the IR ties their
; position to the call.
%mul = mul i32 %phi, %arg1
50+
%or = or i32 %arg2, %mul
51+
%mul4 = mul i32 %arg, %or
52+
%sitofp = sitofp i32 %mul4 to double
53+
%uitofp = uitofp i32 %phi to double
54+
%fadd = fadd double %uitofp, %sitofp
55+
store double %fadd, ptr addrspace(1) null, align 8
56+
br label %bb3
57+
}
58+
59+
declare double @quux(double)

0 commit comments

Comments
 (0)