Skip to content

Commit 817cd72

Browse files
authored
[AMDGPU] Fix folding clamp into pseudo scalar instructions (#100568)
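Clamp is canonically a v_max* instruction with a VGPR destination. Pseudo scalar instructions, however, define an SGPR, so folding the clamp into one and then replacing the v_max* result with that SGPR changes the register bank seen by the clamp's users, which can cause issues. We fix this by inserting a COPY from the SGPR result into the clamp's original VGPR destination instead of replacing the register; the non-SGPR case still uses replaceRegWith as before.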
1 parent 3295d37 commit 817cd72

File tree

2 files changed

+38
-1
lines changed

2 files changed

+38
-1
lines changed

llvm/lib/Target/AMDGPU/SIFoldOperands.cpp

Lines changed: 12 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1581,7 +1581,18 @@ bool SIFoldOperands::tryFoldClamp(MachineInstr &MI) {
15811581

15821582
// Clamp is applied after omod, so it is OK if omod is set.
15831583
DefClamp->setImm(1);
1584-
MRI->replaceRegWith(MI.getOperand(0).getReg(), Def->getOperand(0).getReg());
1584+
1585+
Register DefReg = Def->getOperand(0).getReg();
1586+
Register MIDstReg = MI.getOperand(0).getReg();
1587+
if (TRI->isSGPRReg(*MRI, DefReg)) {
1588+
// Pseudo scalar instructions have a SGPR for dst and clamp is a v_max*
1589+
// instruction with a VGPR dst.
1590+
BuildMI(*MI.getParent(), MI, MI.getDebugLoc(), TII->get(AMDGPU::COPY),
1591+
MIDstReg)
1592+
.addReg(DefReg);
1593+
} else {
1594+
MRI->replaceRegWith(MIDstReg, DefReg);
1595+
}
15851596
MI.eraseFromParent();
15861597

15871598
// Use of output modifiers forces VOP3 encoding for a VOP2 mac/fmac
Lines changed: 26 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,26 @@
1+
# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 5
2+
# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1200 -run-pass=si-fold-operands -verify-machineinstrs -o - %s | FileCheck %s
3+
---
4+
name: test
5+
tracksRegLiveness: true
6+
body: |
7+
bb.0:
8+
liveins: $sgpr0
9+
10+
; CHECK-LABEL: name: test
11+
; CHECK: liveins: $sgpr0
12+
; CHECK-NEXT: {{ $}}
13+
; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0
14+
; CHECK-NEXT: [[V_S_RSQ_F32_e64_:%[0-9]+]]:sgpr_32 = nofpexcept V_S_RSQ_F32_e64 0, [[COPY]], 1, 0, implicit $mode, implicit $exec
15+
; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[V_S_RSQ_F32_e64_]]
16+
; CHECK-NEXT: [[V_ADD_F32_e32_:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F32_e32 [[COPY1]], [[COPY1]], implicit $mode, implicit $exec
17+
; CHECK-NEXT: $vgpr0 = COPY [[V_ADD_F32_e32_]]
18+
; CHECK-NEXT: S_ENDPGM 0
19+
%0:sgpr_32 = COPY $sgpr0
20+
%1:sgpr_32 = nofpexcept V_S_RSQ_F32_e64 0, %0, 0, 0, implicit $mode, implicit $exec
21+
%2:vgpr_32 = nofpexcept V_MAX_F32_e64 0, %1, 0, %1, -1, 0, implicit $mode, implicit $exec
22+
%3:vgpr_32 = nofpexcept V_ADD_F32_e32 %2:vgpr_32, %2:vgpr_32, implicit $mode, implicit $exec
23+
$vgpr0 = COPY %3
24+
S_ENDPGM 0
25+
26+
...

0 commit comments

Comments
 (0)