Skip to content

Commit b576a6b

Browse files
authored
[X86][AMX] Fix a bug after #83628 (#91207)
We need to check whether `GR64Cand` is a valid register before using it. A test is not needed since this is covered in llvm-test-suite. Fixes #90954
1 parent 4527adc commit b576a6b

File tree

2 files changed

+155
-2
lines changed

2 files changed

+155
-2
lines changed

llvm/lib/Target/X86/X86LowerTileCopy.cpp

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -146,7 +146,7 @@ bool X86LowerTileCopy::runOnMachineFunction(MachineFunction &MF) {
146146
addFrameReference(BuildMI(MBB, MI, DL, TII->get(Opc)), TileSS)
147147
.addReg(SrcReg, getKillRegState(SrcMO.isKill()));
148148
MachineOperand &MO = NewMI->getOperand(2);
149-
MO.setReg(GR64Cand);
149+
MO.setReg(GR64Cand ? GR64Cand : X86::RAX);
150150
MO.setIsKill(true);
151151
// tileloadd (%sp, %idx), %tmm
152152
Opc = GET_EGPR_IF_ENABLED(X86::TILELOADD);
@@ -157,7 +157,7 @@ bool X86LowerTileCopy::runOnMachineFunction(MachineFunction &MF) {
157157
// restore %rax
158158
// mov (%sp) %rax
159159
addFrameReference(
160-
BuildMI(MBB, MI, DL, TII->get(X86::MOV64rm), GR64Cand), StrideSS);
160+
BuildMI(MBB, MI, DL, TII->get(X86::MOV64rm), X86::RAX), StrideSS);
161161
}
162162
MI.eraseFromParent();
163163
Changed = true;

llvm/test/CodeGen/X86/AMX/amx-tile-basic.ll

Lines changed: 153 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -51,3 +51,156 @@ declare x86_amx @llvm.x86.tdpbusd.internal(i16, i16, i16, x86_amx, x86_amx, x86_
5151
declare x86_amx @llvm.x86.tdpbuud.internal(i16, i16, i16, x86_amx, x86_amx, x86_amx)
5252
declare x86_amx @llvm.x86.tdpbf16ps.internal(i16, i16, i16, x86_amx, x86_amx, x86_amx)
5353
declare void @llvm.x86.tilestored64.internal(i16, i16, ptr, i64, x86_amx)
54+
55+
define void @PR90954(ptr %0, ptr %1, i32 %2) {
56+
; CHECK-LABEL: PR90954:
57+
; CHECK: # %bb.0:
58+
; CHECK-NEXT: pushq %rbp
59+
; CHECK-NEXT: .cfi_def_cfa_offset 16
60+
; CHECK-NEXT: .cfi_offset %rbp, -16
61+
; CHECK-NEXT: movq %rsp, %rbp
62+
; CHECK-NEXT: .cfi_def_cfa_register %rbp
63+
; CHECK-NEXT: pushq %r15
64+
; CHECK-NEXT: pushq %r14
65+
; CHECK-NEXT: pushq %r13
66+
; CHECK-NEXT: pushq %r12
67+
; CHECK-NEXT: pushq %rbx
68+
; CHECK-NEXT: andq $-1024, %rsp # imm = 0xFC00
69+
; CHECK-NEXT: subq $5120, %rsp # imm = 0x1400
70+
; CHECK-NEXT: .cfi_offset %rbx, -56
71+
; CHECK-NEXT: .cfi_offset %r12, -48
72+
; CHECK-NEXT: .cfi_offset %r13, -40
73+
; CHECK-NEXT: .cfi_offset %r14, -32
74+
; CHECK-NEXT: .cfi_offset %r15, -24
75+
; CHECK-NEXT: vxorps %xmm0, %xmm0, %xmm0
76+
; CHECK-NEXT: vmovups %zmm0, {{[0-9]+}}(%rsp)
77+
; CHECK-NEXT: movb $1, {{[0-9]+}}(%rsp)
78+
; CHECK-NEXT: movb $16, {{[0-9]+}}(%rsp)
79+
; CHECK-NEXT: movw $64, {{[0-9]+}}(%rsp)
80+
; CHECK-NEXT: movb $16, {{[0-9]+}}(%rsp)
81+
; CHECK-NEXT: movw $64, {{[0-9]+}}(%rsp)
82+
; CHECK-NEXT: movb $16, {{[0-9]+}}(%rsp)
83+
; CHECK-NEXT: movw $64, {{[0-9]+}}(%rsp)
84+
; CHECK-NEXT: ldtilecfg {{[0-9]+}}(%rsp)
85+
; CHECK-NEXT: shll $4, %edx
86+
; CHECK-NEXT: xorl %eax, %eax
87+
; CHECK-NEXT: movw $64, %cx
88+
; CHECK-NEXT: movw $16, %di
89+
; CHECK-NEXT: movb $1, %r8b
90+
; CHECK-NEXT: movl $64, %r9d
91+
; CHECK-NEXT: leaq {{[0-9]+}}(%rsp), %r10
92+
; CHECK-NEXT: leaq {{[0-9]+}}(%rsp), %r11
93+
; CHECK-NEXT: xorl %ebx, %ebx
94+
; CHECK-NEXT: xorl %r14d, %r14d
95+
; CHECK-NEXT: jmp .LBB1_1
96+
; CHECK-NEXT: .p2align 4, 0x90
97+
; CHECK-NEXT: .LBB1_5: # in Loop: Header=BB1_1 Depth=1
98+
; CHECK-NEXT: incq %r14
99+
; CHECK-NEXT: addl %edx, %ebx
100+
; CHECK-NEXT: .LBB1_1: # =>This Loop Header: Depth=1
101+
; CHECK-NEXT: # Child Loop BB1_2 Depth 2
102+
; CHECK-NEXT: movslq %ebx, %r15
103+
; CHECK-NEXT: leaq (%rsi,%r15,4), %r15
104+
; CHECK-NEXT: xorl %r12d, %r12d
105+
; CHECK-NEXT: xorl %r13d, %r13d
106+
; CHECK-NEXT: jmp .LBB1_2
107+
; CHECK-NEXT: .p2align 4, 0x90
108+
; CHECK-NEXT: .LBB1_4: # in Loop: Header=BB1_2 Depth=2
109+
; CHECK-NEXT: tilestored %tmm1, (%r15,%rax)
110+
; CHECK-NEXT: incq %r13
111+
; CHECK-NEXT: addq $64, %r15
112+
; CHECK-NEXT: decq %r12
113+
; CHECK-NEXT: je .LBB1_5
114+
; CHECK-NEXT: .LBB1_2: # Parent Loop BB1_1 Depth=1
115+
; CHECK-NEXT: # => This Inner Loop Header: Depth=2
116+
; CHECK-NEXT: tilezero %tmm0
117+
; CHECK-NEXT: tilezero %tmm1
118+
; CHECK-NEXT: testb %r8b, %r8b
119+
; CHECK-NEXT: jne .LBB1_4
120+
; CHECK-NEXT: # %bb.3: # in Loop: Header=BB1_2 Depth=2
121+
; CHECK-NEXT: vmovaps %zmm0, {{[0-9]+}}(%rsp)
122+
; CHECK-NEXT: vmovaps %zmm0, {{[0-9]+}}(%rsp)
123+
; CHECK-NEXT: vmovaps %zmm0, {{[0-9]+}}(%rsp)
124+
; CHECK-NEXT: vmovaps %zmm0, {{[0-9]+}}(%rsp)
125+
; CHECK-NEXT: vmovaps %zmm0, {{[0-9]+}}(%rsp)
126+
; CHECK-NEXT: vmovaps %zmm0, {{[0-9]+}}(%rsp)
127+
; CHECK-NEXT: vmovaps %zmm0, {{[0-9]+}}(%rsp)
128+
; CHECK-NEXT: vmovaps %zmm0, {{[0-9]+}}(%rsp)
129+
; CHECK-NEXT: vmovaps %zmm0, {{[0-9]+}}(%rsp)
130+
; CHECK-NEXT: vmovaps %zmm0, {{[0-9]+}}(%rsp)
131+
; CHECK-NEXT: vmovaps %zmm0, {{[0-9]+}}(%rsp)
132+
; CHECK-NEXT: vmovaps %zmm0, {{[0-9]+}}(%rsp)
133+
; CHECK-NEXT: vmovaps %zmm0, {{[0-9]+}}(%rsp)
134+
; CHECK-NEXT: vmovaps %zmm0, {{[0-9]+}}(%rsp)
135+
; CHECK-NEXT: vmovaps %zmm0, {{[0-9]+}}(%rsp)
136+
; CHECK-NEXT: vmovaps %zmm0, {{[0-9]+}}(%rsp)
137+
; CHECK-NEXT: tileloadd (%r10,%r9), %tmm1
138+
; CHECK-NEXT: vmovaps %zmm0, {{[0-9]+}}(%rsp)
139+
; CHECK-NEXT: vmovaps %zmm0, {{[0-9]+}}(%rsp)
140+
; CHECK-NEXT: vmovaps %zmm0, {{[0-9]+}}(%rsp)
141+
; CHECK-NEXT: vmovaps %zmm0, {{[0-9]+}}(%rsp)
142+
; CHECK-NEXT: vmovaps %zmm0, {{[0-9]+}}(%rsp)
143+
; CHECK-NEXT: vmovaps %zmm0, {{[0-9]+}}(%rsp)
144+
; CHECK-NEXT: vmovaps %zmm0, {{[0-9]+}}(%rsp)
145+
; CHECK-NEXT: vmovaps %zmm0, {{[0-9]+}}(%rsp)
146+
; CHECK-NEXT: vmovaps %zmm0, {{[0-9]+}}(%rsp)
147+
; CHECK-NEXT: vmovaps %zmm0, {{[0-9]+}}(%rsp)
148+
; CHECK-NEXT: vmovaps %zmm0, {{[0-9]+}}(%rsp)
149+
; CHECK-NEXT: vmovaps %zmm0, {{[0-9]+}}(%rsp)
150+
; CHECK-NEXT: vmovaps %zmm0, {{[0-9]+}}(%rsp)
151+
; CHECK-NEXT: vmovaps %zmm0, {{[0-9]+}}(%rsp)
152+
; CHECK-NEXT: vmovaps %zmm0, {{[0-9]+}}(%rsp)
153+
; CHECK-NEXT: vmovaps %zmm0, {{[0-9]+}}(%rsp)
154+
; CHECK-NEXT: tileloadd (%r11,%r9), %tmm2
155+
; CHECK-NEXT: tdpbf16ps %tmm2, %tmm1, %tmm0
156+
; CHECK-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
157+
; CHECK-NEXT: movabsq $64, %rax
158+
; CHECK-NEXT: tilestored %tmm0, 3072(%rsp,%rax) # 1024-byte Folded Spill
159+
; CHECK-NEXT: tileloadd {{[-0-9]+}}(%r{{[sb]}}p), %tmm1 # 1024-byte Folded Reload
160+
; CHECK-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
161+
; CHECK-NEXT: jmp .LBB1_4
162+
%4 = shl i32 %2, 4
163+
%5 = icmp eq i64 0, 0
164+
br label %6
165+
166+
6: ; preds = %31, %3
167+
%7 = phi i64 [ 0, %3 ], [ %32, %31 ]
168+
%8 = trunc nuw nsw i64 %7 to i32
169+
%9 = mul i32 %4, %8
170+
%10 = mul i32 0, %8
171+
%11 = sext i32 %9 to i64
172+
%12 = getelementptr inbounds i32, ptr %1, i64 %11
173+
br label %13
174+
175+
13: ; preds = %25, %6
176+
%14 = phi i64 [ %29, %25 ], [ 0, %6 ]
177+
%15 = tail call x86_amx @llvm.x86.tilezero.internal(i16 16, i16 64)
178+
%16 = tail call <256 x i32> @llvm.x86.cast.tile.to.vector.v256i32(x86_amx %15)
179+
%17 = shl nsw i64 %14, 4
180+
%18 = getelementptr i32, ptr %0, i64 %17
181+
br i1 %5, label %25, label %19
182+
183+
19: ; preds = %13
184+
%20 = tail call x86_amx @llvm.x86.cast.vector.to.tile.v256i32(<256 x i32> %16)
185+
%21 = tail call x86_amx @llvm.x86.cast.vector.to.tile.v256i32(<256 x i32> zeroinitializer)
186+
%22 = tail call x86_amx @llvm.x86.cast.vector.to.tile.v256i32(<256 x i32> zeroinitializer)
187+
%23 = tail call x86_amx @llvm.x86.tdpbf16ps.internal(i16 16, i16 64, i16 64, x86_amx %20, x86_amx %21, x86_amx %22)
188+
%24 = tail call noundef <256 x i32> @llvm.x86.cast.tile.to.vector.v256i32(x86_amx %23)
189+
br label %25
190+
191+
25: ; preds = %19, %13
192+
%26 = phi <256 x i32> [ undef, %13 ], [ %24, %19 ]
193+
%27 = getelementptr inbounds i32, ptr %12, i64 %17
194+
%28 = tail call x86_amx @llvm.x86.cast.vector.to.tile.v256i32(<256 x i32> %26)
195+
tail call void @llvm.x86.tilestored64.internal(i16 16, i16 64, ptr %27, i64 0, x86_amx %28)
196+
%29 = add nuw nsw i64 %14, 1
197+
%30 = icmp eq i64 %29, 0
198+
br i1 %30, label %31, label %13
199+
200+
31: ; preds = %25
201+
%32 = add nuw nsw i64 %7, 1
202+
br label %6
203+
}
204+
205+
declare x86_amx @llvm.x86.cast.vector.to.tile.v256i32(<256 x i32>)
206+
declare <256 x i32> @llvm.x86.cast.tile.to.vector.v256i32(x86_amx)

0 commit comments

Comments
 (0)