Skip to content

Commit c18eebf

Browse files
committed
[InitUndef] Enable the InitUndef pass on all targets
The InitUndef pass works around a register allocation issue in which undef operands can be allocated to the same register as early-clobber result operands. This may lead to ISA constraint violations, where certain input and output registers are not allowed to overlap. The pass was originally implemented for RISC-V and then extended to ARM in llvm#77770. I've since removed the target-specific parts of the pass in llvm#106744 and llvm#107885. This PR now enables the pass for all targets. The motivating case is the one in arm64-ldxr-stxr.ll for the AArch64 target, where we were previously incorrectly allocating a stxp input and output to the same register.
1 parent 37cf39f commit c18eebf

17 files changed

+247
-304
lines changed

llvm/include/llvm/CodeGen/TargetRegisterInfo.h

Lines changed: 0 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -1203,19 +1203,6 @@ class TargetRegisterInfo : public MCRegisterInfo {
12031203
virtual bool isNonallocatableRegisterCalleeSave(MCRegister Reg) const {
12041204
return false;
12051205
}
1206-
1207-
/// Returns if the architecture being targeted has the required Pseudo
1208-
/// Instructions for initializing the register. By default this returns false,
1209-
/// but where it is overriden for an architecture, the behaviour will be
1210-
/// different. This can either be a check to ensure the Register Class is
1211-
/// present, or to return true as an indication the architecture supports the
1212-
/// pass. If using the method that does not check for the Register Class, it
1213-
/// is imperative to ensure all required Pseudo Instructions are implemented,
1214-
/// otherwise compilation may fail with an `Unexpected register class` error.
1215-
virtual bool
1216-
doesRegClassHavePseudoInitUndef(const TargetRegisterClass *RC) const {
1217-
return false;
1218-
}
12191206
};
12201207

12211208
//===----------------------------------------------------------------------===//

llvm/include/llvm/CodeGen/TargetSubtargetInfo.h

Lines changed: 0 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -332,12 +332,6 @@ class TargetSubtargetInfo : public MCSubtargetInfo {
332332

333333
/// Get the list of MacroFusion predicates.
334334
virtual std::vector<MacroFusionPredTy> getMacroFusions() const { return {}; };
335-
336-
/// supportsInitUndef is used to determine if an architecture supports
337-
/// the Init Undef Pass. By default, it is assumed that it will not support
338-
/// the pass, with architecture specific overrides providing the information
339-
/// where they are implemented.
340-
virtual bool supportsInitUndef() const { return false; }
341335
};
342336

343337
} // end namespace llvm

llvm/lib/CodeGen/InitUndef.cpp

Lines changed: 0 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -120,8 +120,6 @@ bool InitUndef::handleReg(MachineInstr *MI) {
120120
continue;
121121
if (!UseMO.getReg().isVirtual())
122122
continue;
123-
if (!TRI->doesRegClassHavePseudoInitUndef(MRI->getRegClass(UseMO.getReg())))
124-
continue;
125123

126124
if (UseMO.isUndef() || findImplictDefMIFromReg(UseMO.getReg(), MRI))
127125
Changed |= fixupIllOperand(MI, UseMO);
@@ -140,8 +138,6 @@ bool InitUndef::handleSubReg(MachineFunction &MF, MachineInstr &MI,
140138
continue;
141139
if (UseMO.isTied())
142140
continue;
143-
if (!TRI->doesRegClassHavePseudoInitUndef(MRI->getRegClass(UseMO.getReg())))
144-
continue;
145141

146142
Register Reg = UseMO.getReg();
147143
if (NewRegs.count(Reg))
@@ -245,16 +241,6 @@ bool InitUndef::processBasicBlock(MachineFunction &MF, MachineBasicBlock &MBB,
245241

246242
bool InitUndef::runOnMachineFunction(MachineFunction &MF) {
247243
ST = &MF.getSubtarget();
248-
249-
// supportsInitUndef is implemented to reflect if an architecture has support
250-
// for the InitUndef pass. Support comes from having the relevant Pseudo
251-
// instructions that can be used to initialize the register. The function
252-
// returns false by default so requires an implementation per architecture.
253-
// Support can be added by overriding the function in a way that best fits
254-
// the architecture.
255-
if (!ST->supportsInitUndef())
256-
return false;
257-
258244
MRI = &MF.getRegInfo();
259245
TII = ST->getInstrInfo();
260246
TRI = MRI->getTargetRegisterInfo();

llvm/lib/Target/ARM/ARMBaseRegisterInfo.h

Lines changed: 0 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -240,20 +240,6 @@ class ARMBaseRegisterInfo : public ARMGenRegisterInfo {
240240
unsigned SrcSubReg) const override;
241241

242242
int getSEHRegNum(unsigned i) const { return getEncodingValue(i); }
243-
244-
bool doesRegClassHavePseudoInitUndef(
245-
const TargetRegisterClass *RC) const override {
246-
(void)RC;
247-
// For the ARM Architecture we want to always return true because all
248-
// required PseudoInitUndef types have been added. If compilation fails due
249-
// to `Unexpected register class`, this is likely to be because the specific
250-
// register being used is not support by Init Undef and needs the Pseudo
251-
// Instruction adding to ARMInstrInfo.td. If this is implemented as a
252-
// conditional check, this could create a false positive where Init Undef is
253-
// not running, skipping the instruction and moving to the next. This could
254-
// lead to illegal instructions being generated by the register allocator.
255-
return true;
256-
}
257243
};
258244

259245
} // end namespace llvm

llvm/lib/Target/ARM/ARMSubtarget.h

Lines changed: 0 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -209,13 +209,6 @@ class ARMSubtarget : public ARMGenSubtargetInfo {
209209
return &InstrInfo->getRegisterInfo();
210210
}
211211

212-
/// The correct instructions have been implemented to initialize undef
213-
/// registers, therefore the ARM Architecture is supported by the Init Undef
214-
/// Pass. This will return true as the pass needs to be supported for all
215-
/// types of instructions. The pass will then perform more checks to ensure it
216-
/// should be applying the Pseudo Instructions.
217-
bool supportsInitUndef() const override { return true; }
218-
219212
const CallLowering *getCallLowering() const override;
220213
InstructionSelector *getInstructionSelector() const override;
221214
const LegalizerInfo *getLegalizerInfo() const override;

llvm/lib/Target/RISCV/RISCVRegisterInfo.h

Lines changed: 0 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -130,11 +130,6 @@ struct RISCVRegisterInfo : public RISCVGenRegisterInfo {
130130
const MachineFunction &MF, const VirtRegMap *VRM,
131131
const LiveRegMatrix *Matrix) const override;
132132

133-
bool doesRegClassHavePseudoInitUndef(
134-
const TargetRegisterClass *RC) const override {
135-
return isVRRegClass(RC);
136-
}
137-
138133
static bool isVRRegClass(const TargetRegisterClass *RC) {
139134
return RISCVRI::isVRegClass(RC->TSFlags) &&
140135
RISCVRI::getNF(RC->TSFlags) == 1;

llvm/lib/Target/RISCV/RISCVSubtarget.h

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -306,8 +306,6 @@ class RISCVSubtarget : public RISCVGenSubtargetInfo {
306306
unsigned getTailDupAggressiveThreshold() const {
307307
return TuneInfo->TailDupAggressiveThreshold;
308308
}
309-
310-
bool supportsInitUndef() const override { return hasVInstructions(); }
311309
};
312310
} // End llvm namespace
313311

llvm/test/CodeGen/AArch64/arm64-ldxr-stxr.ll

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -354,11 +354,10 @@ define dso_local i32 @test_store_release_i64(i32, i64 %val, ptr %addr) {
354354
}
355355

356356
; The stxp result cannot be allocated to the same register as the inputs.
357-
; FIXME: This is a miscompile.
358357
define dso_local i32 @test_stxp_undef(ptr %p, i64 %x) nounwind {
359358
; CHECK-LABEL: test_stxp_undef:
360359
; CHECK: // %bb.0:
361-
; CHECK-NEXT: stxp w8, x8, x1, [x0]
360+
; CHECK-NEXT: stxp w8, x9, x1, [x0]
362361
; CHECK-NEXT: mov w0, w8
363362
; CHECK-NEXT: ret
364363
%res = call i32 @llvm.aarch64.stxp(i64 undef, i64 %x, ptr %p)

llvm/test/CodeGen/AMDGPU/GlobalISel/mul-known-bits.i64.ll

Lines changed: 28 additions & 30 deletions
Original file line numberDiff line numberDiff line change
@@ -33,11 +33,9 @@ define amdgpu_kernel void @v_mul_i64_no_zext(ptr addrspace(1) %out, ptr addrspac
3333
; GFX11-NEXT: global_load_b64 v[2:3], v9, s[2:3]
3434
; GFX11-NEXT: s_waitcnt vmcnt(0)
3535
; GFX11-NEXT: v_mad_u64_u32 v[4:5], null, v0, v2, 0
36-
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
37-
; GFX11-NEXT: v_mad_u64_u32 v[6:7], null, v0, v3, v[5:6]
38-
; GFX11-NEXT: v_mad_u64_u32 v[7:8], null, v1, v2, v[6:7]
39-
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
40-
; GFX11-NEXT: v_mov_b32_e32 v5, v7
36+
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
37+
; GFX11-NEXT: v_mad_u64_u32 v[7:8], null, v0, v3, v[5:6]
38+
; GFX11-NEXT: v_mad_u64_u32 v[5:6], null, v1, v2, v[7:8]
4139
; GFX11-NEXT: global_store_b64 v9, v[4:5], s[2:3]
4240
; GFX11-NEXT: s_nop 0
4341
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
@@ -85,13 +83,13 @@ define amdgpu_kernel void @v_mul_i64_zext_src1(ptr addrspace(1) %out, ptr addrsp
8583
; GFX11-NEXT: v_lshlrev_b32_e32 v2, 2, v0
8684
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
8785
; GFX11-NEXT: global_load_b64 v[0:1], v1, s[6:7]
88-
; GFX11-NEXT: global_load_b32 v5, v2, s[0:1]
86+
; GFX11-NEXT: global_load_b32 v7, v2, s[0:1]
8987
; GFX11-NEXT: s_waitcnt vmcnt(0)
90-
; GFX11-NEXT: v_mad_u64_u32 v[2:3], null, v0, v5, 0
91-
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
92-
; GFX11-NEXT: v_mov_b32_e32 v0, v3
93-
; GFX11-NEXT: v_mad_u64_u32 v[3:4], null, v1, v5, v[0:1]
88+
; GFX11-NEXT: v_mad_u64_u32 v[2:3], null, v0, v7, 0
9489
; GFX11-NEXT: v_mov_b32_e32 v0, 0
90+
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
91+
; GFX11-NEXT: v_mad_u64_u32 v[5:6], null, v1, v7, v[3:4]
92+
; GFX11-NEXT: v_mov_b32_e32 v3, v5
9593
; GFX11-NEXT: global_store_b64 v0, v[2:3], s[4:5]
9694
; GFX11-NEXT: s_nop 0
9795
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
@@ -138,14 +136,14 @@ define amdgpu_kernel void @v_mul_i64_zext_src0(ptr addrspace(1) %out, ptr addrsp
138136
; GFX11-NEXT: v_lshlrev_b32_e32 v1, 2, v0
139137
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 3, v0
140138
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
141-
; GFX11-NEXT: global_load_b32 v5, v1, s[6:7]
139+
; GFX11-NEXT: global_load_b32 v7, v1, s[6:7]
142140
; GFX11-NEXT: global_load_b64 v[0:1], v0, s[0:1]
143141
; GFX11-NEXT: s_waitcnt vmcnt(0)
144-
; GFX11-NEXT: v_mad_u64_u32 v[2:3], null, v5, v0, 0
145-
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
146-
; GFX11-NEXT: v_mov_b32_e32 v0, v3
147-
; GFX11-NEXT: v_mad_u64_u32 v[3:4], null, v5, v1, v[0:1]
142+
; GFX11-NEXT: v_mad_u64_u32 v[2:3], null, v7, v0, 0
148143
; GFX11-NEXT: v_mov_b32_e32 v0, 0
144+
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
145+
; GFX11-NEXT: v_mad_u64_u32 v[5:6], null, v7, v1, v[3:4]
146+
; GFX11-NEXT: v_mov_b32_e32 v3, v5
149147
; GFX11-NEXT: global_store_b64 v0, v[2:3], s[4:5]
150148
; GFX11-NEXT: s_nop 0
151149
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
@@ -241,14 +239,14 @@ define amdgpu_kernel void @v_mul_i64_masked_src0_hi(ptr addrspace(1) %out, ptr a
241239
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 3, v0
242240
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
243241
; GFX11-NEXT: s_clause 0x1
244-
; GFX11-NEXT: global_load_b32 v5, v0, s[6:7]
242+
; GFX11-NEXT: global_load_b32 v7, v0, s[6:7]
245243
; GFX11-NEXT: global_load_b64 v[0:1], v0, s[0:1]
246244
; GFX11-NEXT: s_waitcnt vmcnt(0)
247-
; GFX11-NEXT: v_mad_u64_u32 v[2:3], null, v5, v0, 0
248-
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
249-
; GFX11-NEXT: v_mov_b32_e32 v0, v3
250-
; GFX11-NEXT: v_mad_u64_u32 v[3:4], null, v5, v1, v[0:1]
245+
; GFX11-NEXT: v_mad_u64_u32 v[2:3], null, v7, v0, 0
251246
; GFX11-NEXT: v_mov_b32_e32 v0, 0
247+
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
248+
; GFX11-NEXT: v_mad_u64_u32 v[5:6], null, v7, v1, v[3:4]
249+
; GFX11-NEXT: v_mov_b32_e32 v3, v5
252250
; GFX11-NEXT: global_store_b64 v0, v[2:3], s[4:5]
253251
; GFX11-NEXT: s_nop 0
254252
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
@@ -436,16 +434,14 @@ define amdgpu_kernel void @v_mul_i64_partially_masked_src0(ptr addrspace(1) %out
436434
; GFX11-NEXT: global_load_b64 v[0:1], v2, s[6:7]
437435
; GFX11-NEXT: global_load_b64 v[2:3], v2, s[0:1]
438436
; GFX11-NEXT: s_waitcnt vmcnt(1)
439-
; GFX11-NEXT: v_and_b32_e32 v7, 0xfff00000, v0
437+
; GFX11-NEXT: v_and_b32_e32 v0, 0xfff00000, v0
440438
; GFX11-NEXT: s_waitcnt vmcnt(0)
441439
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
442-
; GFX11-NEXT: v_mad_u64_u32 v[4:5], null, v7, v2, 0
443-
; GFX11-NEXT: v_mov_b32_e32 v0, v5
444-
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
445-
; GFX11-NEXT: v_mad_u64_u32 v[5:6], null, v7, v3, v[0:1]
440+
; GFX11-NEXT: v_mad_u64_u32 v[4:5], null, v0, v2, 0
441+
; GFX11-NEXT: v_mad_u64_u32 v[7:8], null, v0, v3, v[5:6]
446442
; GFX11-NEXT: v_and_b32_e32 v3, 0xf00f, v1
447-
; GFX11-NEXT: v_mad_u64_u32 v[0:1], null, v3, v2, v[5:6]
448-
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
443+
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
444+
; GFX11-NEXT: v_mad_u64_u32 v[0:1], null, v3, v2, v[7:8]
449445
; GFX11-NEXT: v_dual_mov_b32 v5, v0 :: v_dual_mov_b32 v0, 0
450446
; GFX11-NEXT: global_store_b64 v0, v[4:5], s[4:5]
451447
; GFX11-NEXT: s_nop 0
@@ -568,10 +564,12 @@ define amdgpu_kernel void @v_mul64_masked_before_and_in_branch(ptr addrspace(1)
568564
; GFX11-NEXT: s_waitcnt vmcnt(0)
569565
; GFX11-NEXT: v_mad_u64_u32 v[0:1], null, v2, v4, 0
570566
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
571-
; GFX11-NEXT: v_mad_u64_u32 v[3:4], null, v2, v5, v[1:2]
572-
; GFX11-NEXT: ; implicit-def: $vgpr4_vgpr5
573-
; GFX11-NEXT: v_mov_b32_e32 v1, v3
567+
; GFX11-NEXT: v_mov_b32_e32 v3, v1
568+
; GFX11-NEXT: v_mad_u64_u32 v[6:7], null, v2, v5, v[3:4]
574569
; GFX11-NEXT: ; implicit-def: $vgpr2_vgpr3
570+
; GFX11-NEXT: ; implicit-def: $vgpr4_vgpr5
571+
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
572+
; GFX11-NEXT: v_mov_b32_e32 v1, v6
575573
; GFX11-NEXT: .LBB10_2: ; %Flow
576574
; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0
577575
; GFX11-NEXT: s_cbranch_execz .LBB10_4

llvm/test/CodeGen/AMDGPU/GlobalISel/mul.ll

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -667,8 +667,8 @@ define i96 @v_mul_i96(i96 %num, i96 %den) {
667667
; GFX11-NEXT: v_mul_lo_u32 v0, v6, v5
668668
; GFX11-NEXT: v_mad_u64_u32 v[8:9], null, v7, v4, v[0:1]
669669
; GFX11-NEXT: v_mad_u64_u32 v[0:1], null, v6, v3, 0
670-
; GFX11-NEXT: v_mad_u64_u32 v[9:10], null, v2, v3, v[8:9]
671-
; GFX11-NEXT: v_mov_b32_e32 v2, v9
670+
; GFX11-NEXT: v_mad_u64_u32 v[10:11], null, v2, v3, v[8:9]
671+
; GFX11-NEXT: v_mov_b32_e32 v2, v10
672672
; GFX11-NEXT: v_mad_u64_u32 v[1:2], null, v6, v4, v[1:2]
673673
; GFX11-NEXT: v_mad_u64_u32 v[1:2], null, v7, v3, v[1:2]
674674
; GFX11-NEXT: s_setpc_b64 s[30:31]
@@ -2691,9 +2691,9 @@ define amdgpu_ps void @s_mul_u64_sext_with_vregs(ptr addrspace(1) %out, ptr addr
26912691
; GFX11-NEXT: global_load_b32 v4, v[2:3], off
26922692
; GFX11-NEXT: s_waitcnt vmcnt(0)
26932693
; GFX11-NEXT: v_mad_u64_u32 v[2:3], null, 0x50, v4, 0
2694-
; GFX11-NEXT: v_ashrrev_i32_e32 v6, 31, v4
2695-
; GFX11-NEXT: v_mad_u64_u32 v[4:5], null, 0x50, v6, v[3:4]
2696-
; GFX11-NEXT: v_mov_b32_e32 v3, v4
2694+
; GFX11-NEXT: v_ashrrev_i32_e32 v7, 31, v4
2695+
; GFX11-NEXT: v_mad_u64_u32 v[5:6], null, 0x50, v7, v[3:4]
2696+
; GFX11-NEXT: v_mov_b32_e32 v3, v5
26972697
; GFX11-NEXT: global_store_b64 v[0:1], v[2:3], off
26982698
; GFX11-NEXT: s_nop 0
26992699
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)

llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll

Lines changed: 12 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -550,7 +550,6 @@ define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, ptr addrspace(
550550
; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
551551
; GFX1164-NEXT: s_mov_b32 s7, 0x31016000
552552
; GFX1164-NEXT: s_mov_b32 s6, -1
553-
; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1)
554553
; GFX1164-NEXT: v_mad_u64_u32 v[1:2], null, s2, v0, s[0:1]
555554
; GFX1164-NEXT: buffer_store_b32 v1, off, s[4:7], 0
556555
; GFX1164-NEXT: s_nop 0
@@ -588,7 +587,6 @@ define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, ptr addrspace(
588587
; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
589588
; GFX1132-NEXT: s_mov_b32 s7, 0x31016000
590589
; GFX1132-NEXT: s_mov_b32 s6, -1
591-
; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1)
592590
; GFX1132-NEXT: v_mad_u64_u32 v[1:2], null, s0, v0, s[2:3]
593591
; GFX1132-NEXT: buffer_store_b32 v1, off, s[4:7], 0
594592
; GFX1132-NEXT: s_nop 0
@@ -2219,11 +2217,12 @@ define amdgpu_kernel void @add_i64_uniform(ptr addrspace(1) %out, ptr addrspace(
22192217
; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
22202218
; GFX1164-NEXT: s_mov_b32 s7, 0x31016000
22212219
; GFX1164-NEXT: s_mov_b32 s6, -1
2222-
; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
22232220
; GFX1164-NEXT: v_mad_u64_u32 v[0:1], null, s0, v2, s[2:3]
2224-
; GFX1164-NEXT: v_mad_u64_u32 v[3:4], null, s1, v2, v[1:2]
2221+
; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
2222+
; GFX1164-NEXT: v_mov_b32_e32 v3, v1
2223+
; GFX1164-NEXT: v_mad_u64_u32 v[5:6], null, s1, v2, v[3:4]
22252224
; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1)
2226-
; GFX1164-NEXT: v_mov_b32_e32 v1, v3
2225+
; GFX1164-NEXT: v_mov_b32_e32 v1, v5
22272226
; GFX1164-NEXT: buffer_store_b64 v[0:1], off, s[4:7], 0
22282227
; GFX1164-NEXT: s_nop 0
22292228
; GFX1164-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
@@ -2265,11 +2264,12 @@ define amdgpu_kernel void @add_i64_uniform(ptr addrspace(1) %out, ptr addrspace(
22652264
; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
22662265
; GFX1132-NEXT: s_mov_b32 s7, 0x31016000
22672266
; GFX1132-NEXT: s_mov_b32 s6, -1
2268-
; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
22692267
; GFX1132-NEXT: v_mad_u64_u32 v[0:1], null, s0, v2, s[2:3]
2270-
; GFX1132-NEXT: v_mad_u64_u32 v[3:4], null, s1, v2, v[1:2]
2268+
; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
2269+
; GFX1132-NEXT: v_mov_b32_e32 v3, v1
2270+
; GFX1132-NEXT: v_mad_u64_u32 v[5:6], null, s1, v2, v[3:4]
22712271
; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1)
2272-
; GFX1132-NEXT: v_mov_b32_e32 v1, v3
2272+
; GFX1132-NEXT: v_mov_b32_e32 v1, v5
22732273
; GFX1132-NEXT: buffer_store_b64 v[0:1], off, s[4:7], 0
22742274
; GFX1132-NEXT: s_nop 0
22752275
; GFX1132-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
@@ -5918,11 +5918,11 @@ define amdgpu_kernel void @sub_i64_uniform(ptr addrspace(1) %out, ptr addrspace(
59185918
; GFX1164-NEXT: s_mov_b32 s7, 0x31016000
59195919
; GFX1164-NEXT: s_mov_b32 s6, -1
59205920
; GFX1164-NEXT: s_waitcnt_depctr 0xfff
5921-
; GFX1164-NEXT: v_mad_u64_u32 v[5:6], null, s1, v2, v[4:5]
5921+
; GFX1164-NEXT: v_mad_u64_u32 v[6:7], null, s1, v2, v[4:5]
59225922
; GFX1164-NEXT: v_readfirstlane_b32 s1, v1
59235923
; GFX1164-NEXT: v_sub_co_u32 v0, vcc, s0, v3
59245924
; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
5925-
; GFX1164-NEXT: v_mov_b32_e32 v1, v5
5925+
; GFX1164-NEXT: v_mov_b32_e32 v1, v6
59265926
; GFX1164-NEXT: v_sub_co_ci_u32_e32 v1, vcc, s1, v1, vcc
59275927
; GFX1164-NEXT: buffer_store_b64 v[0:1], off, s[4:7], 0
59285928
; GFX1164-NEXT: s_nop 0
@@ -5966,11 +5966,11 @@ define amdgpu_kernel void @sub_i64_uniform(ptr addrspace(1) %out, ptr addrspace(
59665966
; GFX1132-NEXT: s_mov_b32 s7, 0x31016000
59675967
; GFX1132-NEXT: s_mov_b32 s6, -1
59685968
; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_4)
5969-
; GFX1132-NEXT: v_mad_u64_u32 v[5:6], null, s1, v2, v[4:5]
5969+
; GFX1132-NEXT: v_mad_u64_u32 v[6:7], null, s1, v2, v[4:5]
59705970
; GFX1132-NEXT: v_readfirstlane_b32 s1, v1
59715971
; GFX1132-NEXT: v_sub_co_u32 v0, vcc_lo, s0, v3
59725972
; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
5973-
; GFX1132-NEXT: v_mov_b32_e32 v1, v5
5973+
; GFX1132-NEXT: v_mov_b32_e32 v1, v6
59745974
; GFX1132-NEXT: v_sub_co_ci_u32_e32 v1, vcc_lo, s1, v1, vcc_lo
59755975
; GFX1132-NEXT: buffer_store_b64 v[0:1], off, s[4:7], 0
59765976
; GFX1132-NEXT: s_nop 0

0 commit comments

Comments
 (0)