Skip to content

Commit 7c24041

Browse files
authored
[AMDGPU][True16][CodeGen] reopen "FLAT_load using D16 pseudo instruction" (#127673)
The previous patch (#114500) was merged, hit a buildbot failure, and was therefore reverted. It seems that AMDGPU::OpName::OPERAND_LAST was removed in the meantime, around when the previous patch was merged, and that caused the compile error. This is fixed, and the patch is reopened here.
1 parent 9bf582f commit 7c24041

File tree

8 files changed

+217
-54
lines changed

8 files changed

+217
-54
lines changed

llvm/lib/Target/AMDGPU/AMDGPUMCInstLower.cpp

Lines changed: 58 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -114,9 +114,63 @@ bool AMDGPUMCInstLower::lowerOperand(const MachineOperand &MO,
114114
llvm_unreachable("unknown operand type");
115115
}
116116

117-
void AMDGPUMCInstLower::lower(const MachineInstr *MI, MCInst &OutMI) const {
117+
// Lower a true16 D16 pseudo instruction to its d16_lo or d16_hi MCInst,
118+
// selected by the .l/.h half of the Dst/Data register operand.
// NOTE(review): the table entry (Info) is dereferenced below without a null
// check; the call site in lower() only calls this helper after
// getT16D16Helper(Opcode) returned non-null.
119+
void AMDGPUMCInstLower::lowerT16D16Helper(const MachineInstr *MI,
120+
MCInst &OutMI) const {
118121
unsigned Opcode = MI->getOpcode();
119122
const auto *TII = static_cast<const SIInstrInfo*>(ST.getInstrInfo());
123+
const SIRegisterInfo &TRI = TII->getRegisterInfo();
124+
const auto *Info = AMDGPU::getT16D16Helper(Opcode);
125+
126+
llvm::AMDGPU::OpName OpName;
127+
// DS instructions: a load uses the vdst operand, a store uses data0.
if (TII->isDS(Opcode)) {
128+
if (MI->mayLoad())
129+
OpName = llvm::AMDGPU::OpName::vdst;
130+
else if (MI->mayStore())
131+
OpName = llvm::AMDGPU::OpName::data0;
132+
else
133+
llvm_unreachable("LDS load or store expected");
134+
} else {
135+
// Non-DS instructions: use vdata when the opcode has it, otherwise vdst.
OpName = AMDGPU::hasNamedOperand(Opcode, llvm::AMDGPU::OpName::vdata)
136+
? llvm::AMDGPU::OpName::vdata
137+
: llvm::AMDGPU::OpName::vdst;
138+
}
139+
140+
// Locate the Dst/Data operand. NOTE(review): the index returned by
// getNamedOperandIdx is used without validation.
141+
int VDstOrVDataIdx = AMDGPU::getNamedOperandIdx(Opcode, OpName);
142+
const MachineOperand &MIVDstOrVData = MI->getOperand(VDstOrVDataIdx);
143+
144+
// Choose the HiOp or LoOp opcode from the table entry according to whether
// the Dst/Data register is a hi 16-bit register.
145+
bool IsHi = AMDGPU::isHi16Reg(MIVDstOrVData.getReg(), TRI);
146+
Opcode = IsHi ? Info->HiOp : Info->LoOp;
147+
148+
int MCOpcode = TII->pseudoToMCOpcode(Opcode);
149+
assert(MCOpcode != -1 &&
150+
"Pseudo instruction doesn't have a target-specific version");
151+
OutMI.setOpcode(MCOpcode);
152+
153+
// Lower the explicit operands; the Dst/Data operand is emitted as the
// register returned by TRI.get32BitRegister instead of being lowered as-is.
154+
for (int I = 0, E = MI->getNumExplicitOperands(); I < E; I++) {
155+
const MachineOperand &MO = MI->getOperand(I);
156+
MCOperand MCOp;
157+
if (I == VDstOrVDataIdx)
158+
MCOp = MCOperand::createReg(TRI.get32BitRegister(MIVDstOrVData.getReg()));
159+
else
160+
lowerOperand(MO, MCOp);
161+
OutMI.addOperand(MCOp);
162+
}
163+
164+
// If the selected MC opcode has a vdst_in operand, append the lowered
// (unwidened) Dst/Data operand as one extra trailing operand.
if (AMDGPU::hasNamedOperand(MCOpcode, AMDGPU::OpName::vdst_in)) {
165+
MCOperand MCOp;
166+
lowerOperand(MIVDstOrVData, MCOp);
167+
OutMI.addOperand(MCOp);
168+
}
169+
}
170+
171+
void AMDGPUMCInstLower::lower(const MachineInstr *MI, MCInst &OutMI) const {
172+
unsigned Opcode = MI->getOpcode();
173+
const auto *TII = static_cast<const SIInstrInfo *>(ST.getInstrInfo());
120174

121175
// FIXME: Should be able to handle this with lowerPseudoInstExpansion. We
122176
// need to select it to the subtarget specific version, and there's no way to
@@ -137,6 +191,9 @@ void AMDGPUMCInstLower::lower(const MachineInstr *MI, MCInst &OutMI) const {
137191
Opcode == AMDGPU::SI_TCRETURN_GFX) {
138192
// TODO: How to use branch immediate and avoid register+add?
139193
Opcode = AMDGPU::S_SETPC_B64;
194+
} else if (AMDGPU::getT16D16Helper(Opcode)) {
195+
lowerT16D16Helper(MI, OutMI);
196+
return;
140197
}
141198

142199
int MCOpcode = TII->pseudoToMCOpcode(Opcode);

llvm/lib/Target/AMDGPU/AMDGPUMCInstLower.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -39,6 +39,8 @@ class AMDGPUMCInstLower {
3939

4040
/// Lower a MachineInstr to an MCInst
4141
void lower(const MachineInstr *MI, MCInst &OutMI) const;
42+
43+
void lowerT16D16Helper(const MachineInstr *MI, MCInst &OutMI) const;
4244
};
4345

4446
namespace {

llvm/lib/Target/AMDGPU/FLATInstructions.td

Lines changed: 45 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,12 @@ let WantsRoot = true in {
1616
def ScratchSVAddr : ComplexPattern<iPTR, 3, "SelectScratchSVAddr", [], [], -10>;
1717
}
1818

19+
// Mixin for a true16 D16 pseudo: records the instruction being defined (NAME)
// as T16Op together with the instructions named by hiOp and loOp. Records of
// this class are gathered by the True16D16Table GenericTable (FilterClass).
class True16D16Table <string hiOp, string loOp> {
20+
Instruction T16Op = !cast<Instruction>(NAME);
21+
Instruction HiOp = !cast<Instruction>(hiOp);
22+
Instruction LoOp = !cast<Instruction>(loOp);
23+
}
24+
1925
//===----------------------------------------------------------------------===//
2026
// FLAT classes
2127
//===----------------------------------------------------------------------===//
@@ -226,6 +232,12 @@ class FLAT_Load_Pseudo <string opName, RegisterClass regClass,
226232
let DisableEncoding = !if(HasTiedOutput, "$vdst_in", "");
227233
}
228234

235+
// Defines two pseudos for a D16 flat load: the unsuffixed VGPR_32 form and a
// `_t16` form on VGPR_16 that is predicated on UseRealTrue16Insts. The `_t16`
// record also carries a True16D16Table entry whose hi instruction is
// NAME#"_HI" and whose lo instruction is NAME.
multiclass FLAT_Load_Pseudo_t16<string opName> {
236+
def "" : FLAT_Load_Pseudo<opName, VGPR_32, 1>;
237+
let True16Predicate = UseRealTrue16Insts in
238+
def _t16 : FLAT_Load_Pseudo<opName#"_t16", VGPR_16>, True16D16Table<NAME#"_HI", NAME>;
239+
}
240+
229241
class FLAT_Store_Pseudo <string opName, RegisterClass vdataClass,
230242
bit HasSaddr = 0, bit EnableSaddr = 0> : FLAT_Pseudo<
231243
opName,
@@ -662,12 +674,12 @@ def FLAT_STORE_DWORDX3 : FLAT_Store_Pseudo <"flat_store_dwordx3", VReg_96>;
662674

663675
let SubtargetPredicate = HasD16LoadStore in {
664676
let TiedSourceNotRead = 1 in {
665-
def FLAT_LOAD_UBYTE_D16 : FLAT_Load_Pseudo <"flat_load_ubyte_d16", VGPR_32, 1>;
666677
def FLAT_LOAD_UBYTE_D16_HI : FLAT_Load_Pseudo <"flat_load_ubyte_d16_hi", VGPR_32, 1>;
667-
def FLAT_LOAD_SBYTE_D16 : FLAT_Load_Pseudo <"flat_load_sbyte_d16", VGPR_32, 1>;
678+
defm FLAT_LOAD_UBYTE_D16 : FLAT_Load_Pseudo_t16 <"flat_load_ubyte_d16">;
668679
def FLAT_LOAD_SBYTE_D16_HI : FLAT_Load_Pseudo <"flat_load_sbyte_d16_hi", VGPR_32, 1>;
669-
def FLAT_LOAD_SHORT_D16 : FLAT_Load_Pseudo <"flat_load_short_d16", VGPR_32, 1>;
680+
defm FLAT_LOAD_SBYTE_D16 : FLAT_Load_Pseudo_t16 <"flat_load_sbyte_d16">;
670681
def FLAT_LOAD_SHORT_D16_HI : FLAT_Load_Pseudo <"flat_load_short_d16_hi", VGPR_32, 1>;
682+
defm FLAT_LOAD_SHORT_D16 : FLAT_Load_Pseudo_t16 <"flat_load_short_d16">;
671683
}
672684

673685
def FLAT_STORE_BYTE_D16_HI : FLAT_Store_Pseudo <"flat_store_byte_d16_hi", VGPR_32>;
@@ -1049,6 +1061,11 @@ class FlatLoadPat_D16 <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> :
10491061
(inst $vaddr, $offset, 0, $in)
10501062
>;
10511063

1064+
// Selection pattern for the true16 D16 load pseudos: matches a load of type
// vt through FlatOffset and emits inst with $vaddr, $offset and an (i32 0)
// third operand. Unlike the FlatLoadPat_D16 output pattern, no $in operand is
// passed.
class FlatLoadPat_D16_t16 <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> : GCNPat <
1065+
(vt (node (FlatOffset (i64 VReg_64:$vaddr), i32:$offset))),
1066+
(inst $vaddr, $offset, (i32 0))
1067+
>;
1068+
10521069
class FlatSignedLoadPat_D16 <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> : GCNPat <
10531070
(node (GlobalOffset (i64 VReg_64:$vaddr), i32:$offset), vt:$in),
10541071
(inst $vaddr, $offset, 0, $in)
@@ -1371,16 +1388,29 @@ def : FlatLoadPat <FLAT_LOAD_UBYTE, zextloadi8_flat, i32>;
13711388
def : FlatLoadPat <FLAT_LOAD_SBYTE, sextloadi8_flat, i32>;
13721389
def : FlatLoadPat <FLAT_LOAD_SBYTE, atomic_load_sext_8_flat, i32>;
13731390
def : FlatLoadPat <FLAT_LOAD_SBYTE, atomic_load_sext_8_flat, i16>;
1374-
def : FlatLoadPat <FLAT_LOAD_UBYTE, extloadi8_flat, i16>;
1375-
def : FlatLoadPat <FLAT_LOAD_UBYTE, zextloadi8_flat, i16>;
1376-
def : FlatLoadPat <FLAT_LOAD_SBYTE, sextloadi8_flat, i16>;
13771391
def : FlatLoadPat <FLAT_LOAD_USHORT, extloadi16_flat, i32>;
13781392
def : FlatLoadPat <FLAT_LOAD_USHORT, zextloadi16_flat, i32>;
1379-
def : FlatLoadPat <FLAT_LOAD_USHORT, load_flat, i16>;
13801393
def : FlatLoadPat <FLAT_LOAD_SSHORT, sextloadi16_flat, i32>;
13811394
def : FlatLoadPat <FLAT_LOAD_SSHORT, atomic_load_sext_16_flat, i32>;
13821395
def : FlatLoadPat <FLAT_LOAD_DWORDX3, load_flat, v3i32>;
13831396

1397+
foreach p = [NotHasTrue16BitInsts, UseFakeTrue16Insts] in
1398+
let True16Predicate = p in {
1399+
def : FlatLoadPat <FLAT_LOAD_UBYTE, extloadi8_flat, i16>;
1400+
def : FlatLoadPat <FLAT_LOAD_UBYTE, zextloadi8_flat, i16>;
1401+
def : FlatLoadPat <FLAT_LOAD_SBYTE, sextloadi8_flat, i16>;
1402+
def : FlatLoadPat <FLAT_LOAD_USHORT, load_flat, i16>;
1403+
def : FlatStorePat <FLAT_STORE_BYTE, truncstorei8_flat, i16>;
1404+
def : FlatStorePat <FLAT_STORE_SHORT, store_flat, i16>;
1405+
}
1406+
1407+
let OtherPredicates = [D16PreservesUnusedBits, HasFlatAddressSpace], True16Predicate = UseRealTrue16Insts in {
1408+
def : FlatLoadPat_D16_t16<FLAT_LOAD_UBYTE_D16_t16, extloadi8_flat, i16>;
1409+
def : FlatLoadPat_D16_t16<FLAT_LOAD_UBYTE_D16_t16, zextloadi8_flat, i16>;
1410+
def : FlatLoadPat_D16_t16<FLAT_LOAD_SBYTE_D16_t16, sextloadi8_flat, i16>;
1411+
def : FlatLoadPat_D16_t16<FLAT_LOAD_SHORT_D16_t16, load_flat, i16>;
1412+
} // End let OtherPredicates = [D16PreservesUnusedBits, HasFlatAddressSpace], True16Predicate = UseRealTrue16Insts
1413+
13841414
def : FlatLoadPat <FLAT_LOAD_DWORD, atomic_load_32_flat, i32>;
13851415
def : FlatLoadPat <FLAT_LOAD_DWORDX2, atomic_load_64_flat, i64>;
13861416

@@ -2761,3 +2791,11 @@ defm SCRATCH_STORE_SHORT_D16_HI : VSCRATCH_Real_AllAddr_gfx12<0x25, "scratch_
27612791

27622792
defm SCRATCH_LOAD_BLOCK : VSCRATCH_Real_AllAddr_gfx12<0x53>;
27632793
defm SCRATCH_STORE_BLOCK : VSCRATCH_Real_AllAddr_gfx12<0x54>;
2794+
2795+
// Searchable table built from all records of class True16D16Table. Rows have
// the C++ type True16D16Info and are looked up by T16Op through the generated
// function getT16D16Helper.
def True16D16Table : GenericTable {
2796+
let FilterClass = "True16D16Table";
2797+
let CppTypeName = "True16D16Info";
2798+
let Fields = ["T16Op", "HiOp", "LoOp"];
2799+
let PrimaryKey = ["T16Op"];
2800+
let PrimaryKeyName = "getT16D16Helper";
2801+
}

llvm/lib/Target/AMDGPU/SIInstrInfo.td

Lines changed: 8 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2483,8 +2483,15 @@ class getHasExt <int NumSrcArgs, ValueType DstVT = i32, ValueType Src0VT = i32,
24832483

24842484
// Return an AGPR+VGPR operand class for the given VGPR register class.
24852485
class getLdStRegisterOperand<RegisterClass RC> {
2486+
// This type of operands is only used in pseudo instructions helping
2487+
// code generation and thus doesn't need encoding and decoding methods.
2488+
// It also doesn't need to support AGPRs, because GFX908/A/40 do not
2489+
// support True16.
2490+
defvar VLdSt_16 = RegisterOperand<VGPR_16>;
2491+
24862492
RegisterOperand ret =
2487-
!cond(!eq(RC.Size, 32) : AVLdSt_32,
2493+
!cond(!eq(RC.Size, 16) : VLdSt_16,
2494+
!eq(RC.Size, 32) : AVLdSt_32,
24882495
!eq(RC.Size, 64) : AVLdSt_64,
24892496
!eq(RC.Size, 96) : AVLdSt_96,
24902497
!eq(RC.Size, 128) : AVLdSt_128,

llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -430,6 +430,7 @@ struct FP4FP8DstByteSelInfo {
430430
#define GET_VOPDPairs_IMPL
431431
#define GET_VOPTrue16Table_DECL
432432
#define GET_VOPTrue16Table_IMPL
433+
#define GET_True16D16Table_IMPL
433434
#define GET_WMMAOpcode2AddrMappingTable_DECL
434435
#define GET_WMMAOpcode2AddrMappingTable_IMPL
435436
#define GET_WMMAOpcode3AddrMappingTable_DECL

llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -113,6 +113,12 @@ struct CvtScaleF32_F32F16ToF8F4_Info {
113113
unsigned Opcode;
114114
};
115115

116+
// Row type of the TableGen-generated True16D16Table: T16Op is the true16 D16
// pseudo opcode used as the lookup key, and HiOp/LoOp are the opcodes it is
// lowered to for a hi or lo 16-bit register respectively.
struct True16D16Info {
117+
unsigned T16Op;
118+
unsigned HiOp;
119+
unsigned LoOp;
120+
};
121+
116122
#define GET_MIMGBaseOpcode_DECL
117123
#define GET_MIMGDim_DECL
118124
#define GET_MIMGEncoding_DECL
@@ -123,6 +129,7 @@ struct CvtScaleF32_F32F16ToF8F4_Info {
123129
#define GET_MAIInstInfoTable_DECL
124130
#define GET_isMFMA_F8F6F4Table_DECL
125131
#define GET_isCvtScaleF32_F32F16ToF8F4Table_DECL
132+
#define GET_True16D16Table_DECL
126133
#include "AMDGPUGenSearchableTables.inc"
127134

128135
namespace IsaInfo {

llvm/test/CodeGen/AMDGPU/chain-hi-to-lo.ll

Lines changed: 88 additions & 41 deletions
Original file line numberDiff line numberDiff line change
@@ -3,8 +3,10 @@
33
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -verify-machineinstrs -mattr=-unaligned-access-mode -mattr=+enable-flat-scratch < %s | FileCheck -check-prefixes=GCN,FLATSCR %s
44
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -verify-machineinstrs -mattr=-unaligned-access-mode < %s | FileCheck -check-prefixes=GFX10,GFX10_DEFAULT %s
55
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -verify-machineinstrs -mattr=-unaligned-access-mode -mattr=+enable-flat-scratch < %s | FileCheck -check-prefixes=GFX10,FLATSCR_GFX10 %s
6-
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -verify-machineinstrs -mattr=-unaligned-access-mode < %s | FileCheck -check-prefixes=GFX11 %s
7-
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -verify-machineinstrs -mattr=-unaligned-access-mode -mattr=+enable-flat-scratch < %s | FileCheck -check-prefixes=GFX11 %s
6+
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -verify-machineinstrs -mattr=-unaligned-access-mode,+real-true16 < %s | FileCheck -check-prefixes=GFX11,GFX11-TRUE16 %s
7+
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -verify-machineinstrs -mattr=-unaligned-access-mode,-real-true16 < %s | FileCheck -check-prefixes=GFX11,GFX11-FAKE16 %s
8+
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -verify-machineinstrs -mattr=-unaligned-access-mode -mattr=+enable-flat-scratch,+real-true16 < %s | FileCheck -check-prefixes=GFX11,GFX11-TRUE16 %s
9+
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -verify-machineinstrs -mattr=-unaligned-access-mode -mattr=+enable-flat-scratch,-real-true16 < %s | FileCheck -check-prefixes=GFX11,GFX11-FAKE16 %s
810

911
define <2 x half> @chain_hi_to_lo_private() {
1012
; GFX900-LABEL: chain_hi_to_lo_private:
@@ -156,14 +158,23 @@ define <2 x half> @chain_hi_to_lo_arithmatic(ptr addrspace(5) %base, half %in) {
156158
; FLATSCR_GFX10-NEXT: v_mov_b32_e32 v0, v1
157159
; FLATSCR_GFX10-NEXT: s_setpc_b64 s[30:31]
158160
;
159-
; GFX11-LABEL: chain_hi_to_lo_arithmatic:
160-
; GFX11: ; %bb.0: ; %bb
161-
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
162-
; GFX11-NEXT: v_add_f16_e32 v1, 1.0, v1
163-
; GFX11-NEXT: scratch_load_d16_hi_b16 v1, v0, off
164-
; GFX11-NEXT: s_waitcnt vmcnt(0)
165-
; GFX11-NEXT: v_mov_b32_e32 v0, v1
166-
; GFX11-NEXT: s_setpc_b64 s[30:31]
161+
; GFX11-TRUE16-LABEL: chain_hi_to_lo_arithmatic:
162+
; GFX11-TRUE16: ; %bb.0: ; %bb
163+
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
164+
; GFX11-TRUE16-NEXT: v_add_f16_e32 v1.l, 1.0, v1.l
165+
; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v1, v0, off
166+
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
167+
; GFX11-TRUE16-NEXT: v_mov_b32_e32 v0, v1
168+
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
169+
;
170+
; GFX11-FAKE16-LABEL: chain_hi_to_lo_arithmatic:
171+
; GFX11-FAKE16: ; %bb.0: ; %bb
172+
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
173+
; GFX11-FAKE16-NEXT: v_add_f16_e32 v1, 1.0, v1
174+
; GFX11-FAKE16-NEXT: scratch_load_d16_hi_b16 v1, v0, off
175+
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
176+
; GFX11-FAKE16-NEXT: v_mov_b32_e32 v0, v1
177+
; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
167178
bb:
168179
%arith_lo = fadd half %in, 1.0
169180
%load_hi = load half, ptr addrspace(5) %base
@@ -361,18 +372,31 @@ define <2 x half> @chain_hi_to_lo_flat() {
361372
; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
362373
; GFX10-NEXT: s_setpc_b64 s[30:31]
363374
;
364-
; GFX11-LABEL: chain_hi_to_lo_flat:
365-
; GFX11: ; %bb.0: ; %bb
366-
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
367-
; GFX11-NEXT: v_mov_b32_e32 v0, 2
368-
; GFX11-NEXT: v_mov_b32_e32 v1, 0
369-
; GFX11-NEXT: flat_load_u16 v0, v[0:1]
370-
; GFX11-NEXT: v_mov_b32_e32 v1, 0
371-
; GFX11-NEXT: v_mov_b32_e32 v2, 0
372-
; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
373-
; GFX11-NEXT: flat_load_d16_hi_b16 v0, v[1:2]
374-
; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
375-
; GFX11-NEXT: s_setpc_b64 s[30:31]
375+
; GFX11-TRUE16-LABEL: chain_hi_to_lo_flat:
376+
; GFX11-TRUE16: ; %bb.0: ; %bb
377+
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
378+
; GFX11-TRUE16-NEXT: v_mov_b32_e32 v0, 2
379+
; GFX11-TRUE16-NEXT: v_mov_b32_e32 v1, 0
380+
; GFX11-TRUE16-NEXT: flat_load_d16_b16 v0, v[0:1]
381+
; GFX11-TRUE16-NEXT: v_mov_b32_e32 v1, 0
382+
; GFX11-TRUE16-NEXT: v_mov_b32_e32 v2, 0
383+
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
384+
; GFX11-TRUE16-NEXT: flat_load_d16_hi_b16 v0, v[1:2]
385+
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
386+
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
387+
;
388+
; GFX11-FAKE16-LABEL: chain_hi_to_lo_flat:
389+
; GFX11-FAKE16: ; %bb.0: ; %bb
390+
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
391+
; GFX11-FAKE16-NEXT: v_mov_b32_e32 v0, 2
392+
; GFX11-FAKE16-NEXT: v_mov_b32_e32 v1, 0
393+
; GFX11-FAKE16-NEXT: flat_load_u16 v0, v[0:1]
394+
; GFX11-FAKE16-NEXT: v_mov_b32_e32 v1, 0
395+
; GFX11-FAKE16-NEXT: v_mov_b32_e32 v2, 0
396+
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
397+
; GFX11-FAKE16-NEXT: flat_load_d16_hi_b16 v0, v[1:2]
398+
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
399+
; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
376400
bb:
377401
%gep_lo = getelementptr inbounds half, ptr null, i64 1
378402
%load_lo = load half, ptr %gep_lo
@@ -403,14 +427,23 @@ define <2 x half> @chain_hi_to_lo_flat_different_bases(ptr %base_lo, ptr %base_h
403427
; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
404428
; GFX10-NEXT: s_setpc_b64 s[30:31]
405429
;
406-
; GFX11-LABEL: chain_hi_to_lo_flat_different_bases:
407-
; GFX11: ; %bb.0: ; %bb
408-
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
409-
; GFX11-NEXT: flat_load_u16 v0, v[0:1]
410-
; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
411-
; GFX11-NEXT: flat_load_d16_hi_b16 v0, v[2:3]
412-
; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
413-
; GFX11-NEXT: s_setpc_b64 s[30:31]
430+
; GFX11-TRUE16-LABEL: chain_hi_to_lo_flat_different_bases:
431+
; GFX11-TRUE16: ; %bb.0: ; %bb
432+
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
433+
; GFX11-TRUE16-NEXT: flat_load_d16_b16 v0, v[0:1]
434+
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
435+
; GFX11-TRUE16-NEXT: flat_load_d16_hi_b16 v0, v[2:3]
436+
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
437+
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
438+
;
439+
; GFX11-FAKE16-LABEL: chain_hi_to_lo_flat_different_bases:
440+
; GFX11-FAKE16: ; %bb.0: ; %bb
441+
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
442+
; GFX11-FAKE16-NEXT: flat_load_u16 v0, v[0:1]
443+
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
444+
; GFX11-FAKE16-NEXT: flat_load_d16_hi_b16 v0, v[2:3]
445+
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
446+
; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
414447
bb:
415448
%load_lo = load half, ptr %base_lo
416449
%load_hi = load half, ptr %base_hi
@@ -864,17 +897,31 @@ define <2 x i16> @chain_hi_to_lo_flat_other_dep(ptr addrspace(0) %ptr) {
864897
; GFX10-NEXT: v_bfi_b32 v0, 0xffff, v2, v0
865898
; GFX10-NEXT: s_setpc_b64 s[30:31]
866899
;
867-
; GFX11-LABEL: chain_hi_to_lo_flat_other_dep:
868-
; GFX11: ; %bb.0: ; %bb
869-
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
870-
; GFX11-NEXT: flat_load_u16 v2, v[0:1] offset:2 glc dlc
871-
; GFX11-NEXT: s_waitcnt vmcnt(0)
872-
; GFX11-NEXT: flat_load_d16_hi_b16 v0, v[0:1] glc dlc
873-
; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
874-
; GFX11-NEXT: v_pk_add_u16 v0, v0, 12 op_sel_hi:[1,0]
875-
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
876-
; GFX11-NEXT: v_bfi_b32 v0, 0xffff, v2, v0
877-
; GFX11-NEXT: s_setpc_b64 s[30:31]
900+
; GFX11-TRUE16-LABEL: chain_hi_to_lo_flat_other_dep:
901+
; GFX11-TRUE16: ; %bb.0: ; %bb
902+
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
903+
; GFX11-TRUE16-NEXT: flat_load_d16_b16 v2, v[0:1] offset:2 glc dlc
904+
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
905+
; GFX11-TRUE16-NEXT: flat_load_d16_hi_b16 v0, v[0:1] glc dlc
906+
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(1)
907+
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, v2.l
908+
; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
909+
; GFX11-TRUE16-NEXT: v_pk_add_u16 v0, v0, 12 op_sel_hi:[1,0]
910+
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
911+
; GFX11-TRUE16-NEXT: v_bfi_b32 v0, 0xffff, v1, v0
912+
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
913+
;
914+
; GFX11-FAKE16-LABEL: chain_hi_to_lo_flat_other_dep:
915+
; GFX11-FAKE16: ; %bb.0: ; %bb
916+
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
917+
; GFX11-FAKE16-NEXT: flat_load_u16 v2, v[0:1] offset:2 glc dlc
918+
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
919+
; GFX11-FAKE16-NEXT: flat_load_d16_hi_b16 v0, v[0:1] glc dlc
920+
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
921+
; GFX11-FAKE16-NEXT: v_pk_add_u16 v0, v0, 12 op_sel_hi:[1,0]
922+
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
923+
; GFX11-FAKE16-NEXT: v_bfi_b32 v0, 0xffff, v2, v0
924+
; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
878925
bb:
879926
%gep_lo = getelementptr inbounds i16, ptr addrspace(0) %ptr, i64 1
880927
%load_lo = load volatile i16, ptr addrspace(0) %gep_lo

0 commit comments

Comments
 (0)