@@ -51,3 +51,156 @@ declare x86_amx @llvm.x86.tdpbusd.internal(i16, i16, i16, x86_amx, x86_amx, x86_
51
51
declare x86_amx @llvm.x86.tdpbuud.internal (i16 , i16 , i16 , x86_amx, x86_amx, x86_amx)
52
52
declare x86_amx @llvm.x86.tdpbf16ps.internal (i16 , i16 , i16 , x86_amx, x86_amx, x86_amx)
53
53
declare void @llvm.x86.tilestored64.internal (i16 , i16 , ptr , i64 , x86_amx)
54
+
55
+ define void @PR90954 (ptr %0 , ptr %1 , i32 %2 ) { ; test function: two nested loops around AMX tile intrinsics; the assertion comments below pin its expected x86 assembly
56
+ ; CHECK-LABEL: PR90954:
57
+ ; CHECK: # %bb.0:
58
+ ; CHECK-NEXT: pushq %rbp
59
+ ; CHECK-NEXT: .cfi_def_cfa_offset 16
60
+ ; CHECK-NEXT: .cfi_offset %rbp, -16
61
+ ; CHECK-NEXT: movq %rsp, %rbp
62
+ ; CHECK-NEXT: .cfi_def_cfa_register %rbp
63
+ ; CHECK-NEXT: pushq %r15
64
+ ; CHECK-NEXT: pushq %r14
65
+ ; CHECK-NEXT: pushq %r13
66
+ ; CHECK-NEXT: pushq %r12
67
+ ; CHECK-NEXT: pushq %rbx
68
+ ; CHECK-NEXT: andq $-1024, %rsp # imm = 0xFC00
69
+ ; CHECK-NEXT: subq $5120, %rsp # imm = 0x1400
70
+ ; CHECK-NEXT: .cfi_offset %rbx, -56
71
+ ; CHECK-NEXT: .cfi_offset %r12, -48
72
+ ; CHECK-NEXT: .cfi_offset %r13, -40
73
+ ; CHECK-NEXT: .cfi_offset %r14, -32
74
+ ; CHECK-NEXT: .cfi_offset %r15, -24
75
+ ; CHECK-NEXT: vxorps %xmm0, %xmm0, %xmm0
76
+ ; CHECK-NEXT: vmovups %zmm0, {{[0-9]+}}(%rsp)
77
+ ; CHECK-NEXT: movb $1, {{[0-9]+}}(%rsp)
78
+ ; CHECK-NEXT: movb $16, {{[0-9]+}}(%rsp)
79
+ ; CHECK-NEXT: movw $64, {{[0-9]+}}(%rsp)
80
+ ; CHECK-NEXT: movb $16, {{[0-9]+}}(%rsp)
81
+ ; CHECK-NEXT: movw $64, {{[0-9]+}}(%rsp)
82
+ ; CHECK-NEXT: movb $16, {{[0-9]+}}(%rsp)
83
+ ; CHECK-NEXT: movw $64, {{[0-9]+}}(%rsp)
84
+ ; CHECK-NEXT: ldtilecfg {{[0-9]+}}(%rsp)
85
+ ; CHECK-NEXT: shll $4, %edx
86
+ ; CHECK-NEXT: xorl %eax, %eax
87
+ ; CHECK-NEXT: movw $64, %cx
88
+ ; CHECK-NEXT: movw $16, %di
89
+ ; CHECK-NEXT: movb $1, %r8b
90
+ ; CHECK-NEXT: movl $64, %r9d
91
+ ; CHECK-NEXT: leaq {{[0-9]+}}(%rsp), %r10
92
+ ; CHECK-NEXT: leaq {{[0-9]+}}(%rsp), %r11
93
+ ; CHECK-NEXT: xorl %ebx, %ebx
94
+ ; CHECK-NEXT: xorl %r14d, %r14d
95
+ ; CHECK-NEXT: jmp .LBB1_1
96
+ ; CHECK-NEXT: .p2align 4, 0x90
97
+ ; CHECK-NEXT: .LBB1_5: # in Loop: Header=BB1_1 Depth=1
98
+ ; CHECK-NEXT: incq %r14
99
+ ; CHECK-NEXT: addl %edx, %ebx
100
+ ; CHECK-NEXT: .LBB1_1: # =>This Loop Header: Depth=1
101
+ ; CHECK-NEXT: # Child Loop BB1_2 Depth 2
102
+ ; CHECK-NEXT: movslq %ebx, %r15
103
+ ; CHECK-NEXT: leaq (%rsi,%r15,4), %r15
104
+ ; CHECK-NEXT: xorl %r12d, %r12d
105
+ ; CHECK-NEXT: xorl %r13d, %r13d
106
+ ; CHECK-NEXT: jmp .LBB1_2
107
+ ; CHECK-NEXT: .p2align 4, 0x90
108
+ ; CHECK-NEXT: .LBB1_4: # in Loop: Header=BB1_2 Depth=2
109
+ ; CHECK-NEXT: tilestored %tmm1, (%r15,%rax)
110
+ ; CHECK-NEXT: incq %r13
111
+ ; CHECK-NEXT: addq $64, %r15
112
+ ; CHECK-NEXT: decq %r12
113
+ ; CHECK-NEXT: je .LBB1_5
114
+ ; CHECK-NEXT: .LBB1_2: # Parent Loop BB1_1 Depth=1
115
+ ; CHECK-NEXT: # => This Inner Loop Header: Depth=2
116
+ ; CHECK-NEXT: tilezero %tmm0
117
+ ; CHECK-NEXT: tilezero %tmm1
118
+ ; CHECK-NEXT: testb %r8b, %r8b
119
+ ; CHECK-NEXT: jne .LBB1_4
120
+ ; CHECK-NEXT: # %bb.3: # in Loop: Header=BB1_2 Depth=2
121
+ ; CHECK-NEXT: vmovaps %zmm0, {{[0-9]+}}(%rsp)
122
+ ; CHECK-NEXT: vmovaps %zmm0, {{[0-9]+}}(%rsp)
123
+ ; CHECK-NEXT: vmovaps %zmm0, {{[0-9]+}}(%rsp)
124
+ ; CHECK-NEXT: vmovaps %zmm0, {{[0-9]+}}(%rsp)
125
+ ; CHECK-NEXT: vmovaps %zmm0, {{[0-9]+}}(%rsp)
126
+ ; CHECK-NEXT: vmovaps %zmm0, {{[0-9]+}}(%rsp)
127
+ ; CHECK-NEXT: vmovaps %zmm0, {{[0-9]+}}(%rsp)
128
+ ; CHECK-NEXT: vmovaps %zmm0, {{[0-9]+}}(%rsp)
129
+ ; CHECK-NEXT: vmovaps %zmm0, {{[0-9]+}}(%rsp)
130
+ ; CHECK-NEXT: vmovaps %zmm0, {{[0-9]+}}(%rsp)
131
+ ; CHECK-NEXT: vmovaps %zmm0, {{[0-9]+}}(%rsp)
132
+ ; CHECK-NEXT: vmovaps %zmm0, {{[0-9]+}}(%rsp)
133
+ ; CHECK-NEXT: vmovaps %zmm0, {{[0-9]+}}(%rsp)
134
+ ; CHECK-NEXT: vmovaps %zmm0, {{[0-9]+}}(%rsp)
135
+ ; CHECK-NEXT: vmovaps %zmm0, {{[0-9]+}}(%rsp)
136
+ ; CHECK-NEXT: vmovaps %zmm0, {{[0-9]+}}(%rsp)
137
+ ; CHECK-NEXT: tileloadd (%r10,%r9), %tmm1
138
+ ; CHECK-NEXT: vmovaps %zmm0, {{[0-9]+}}(%rsp)
139
+ ; CHECK-NEXT: vmovaps %zmm0, {{[0-9]+}}(%rsp)
140
+ ; CHECK-NEXT: vmovaps %zmm0, {{[0-9]+}}(%rsp)
141
+ ; CHECK-NEXT: vmovaps %zmm0, {{[0-9]+}}(%rsp)
142
+ ; CHECK-NEXT: vmovaps %zmm0, {{[0-9]+}}(%rsp)
143
+ ; CHECK-NEXT: vmovaps %zmm0, {{[0-9]+}}(%rsp)
144
+ ; CHECK-NEXT: vmovaps %zmm0, {{[0-9]+}}(%rsp)
145
+ ; CHECK-NEXT: vmovaps %zmm0, {{[0-9]+}}(%rsp)
146
+ ; CHECK-NEXT: vmovaps %zmm0, {{[0-9]+}}(%rsp)
147
+ ; CHECK-NEXT: vmovaps %zmm0, {{[0-9]+}}(%rsp)
148
+ ; CHECK-NEXT: vmovaps %zmm0, {{[0-9]+}}(%rsp)
149
+ ; CHECK-NEXT: vmovaps %zmm0, {{[0-9]+}}(%rsp)
150
+ ; CHECK-NEXT: vmovaps %zmm0, {{[0-9]+}}(%rsp)
151
+ ; CHECK-NEXT: vmovaps %zmm0, {{[0-9]+}}(%rsp)
152
+ ; CHECK-NEXT: vmovaps %zmm0, {{[0-9]+}}(%rsp)
153
+ ; CHECK-NEXT: vmovaps %zmm0, {{[0-9]+}}(%rsp)
154
+ ; CHECK-NEXT: tileloadd (%r11,%r9), %tmm2
155
+ ; CHECK-NEXT: tdpbf16ps %tmm2, %tmm1, %tmm0
156
+ ; CHECK-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
157
+ ; CHECK-NEXT: movabsq $64, %rax
158
+ ; CHECK-NEXT: tilestored %tmm0, 3072(%rsp,%rax) # 1024-byte Folded Spill
159
+ ; CHECK-NEXT: tileloadd {{[-0-9]+}}(%r{{[sb]}}p), %tmm1 # 1024-byte Folded Reload
160
+ ; CHECK-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
161
+ ; CHECK-NEXT: jmp .LBB1_4
162
+ %4 = shl i32 %2 , 4 ; %4 = %2 * 16
163
+ %5 = icmp eq i64 0 , 0 ; comparison of two constants (0 == 0); used as the branch condition in block %13
164
+ br label %6
165
+
166
+ 6 : ; preds = %31, %3 ; outer loop header; its only back-edge (from %31) is unconditional
167
+ %7 = phi i64 [ 0 , %3 ], [ %32 , %31 ] ; outer counter: 0 on entry, %7 + 1 on each back-edge
168
+ %8 = trunc nuw nsw i64 %7 to i32
169
+ %9 = mul i32 %4 , %8 ; element offset for this outer iteration: (%2 * 16) * counter
170
+ %10 = mul i32 0 , %8 ; result is not referenced anywhere else in this function
171
+ %11 = sext i32 %9 to i64
172
+ %12 = getelementptr inbounds i32 , ptr %1 , i64 %11 ; base address inside %1 for this outer iteration (i32 elements)
173
+ br label %13
174
+
175
+ 13 : ; preds = %25, %6 ; inner loop header
176
+ %14 = phi i64 [ %29 , %25 ], [ 0 , %6 ] ; inner counter: 0 when entered from %6, %14 + 1 on each back-edge
177
+ %15 = tail call x86_amx @llvm.x86.tilezero.internal (i16 16 , i16 64 ) ; fresh zeroed tile on every inner iteration
178
+ %16 = tail call <256 x i32 > @llvm.x86.cast.tile.to.vector.v256i32 (x86_amx %15 ) ; vector view of the zeroed tile; only consumed in block %19
179
+ %17 = shl nsw i64 %14 , 4 ; inner counter * 16, used as an element index below
180
+ %18 = getelementptr i32 , ptr %0 , i64 %17 ; result is not referenced anywhere else in this function
181
+ br i1 %5 , label %25 , label %19 ; %25 is the true target, %19 the false target
182
+
183
+ 19 : ; preds = %13 ; conditional block: performs the tdpbf16ps dot-product
184
+ %20 = tail call x86_amx @llvm.x86.cast.vector.to.tile.v256i32 (<256 x i32 > %16 ) ; zeroed tile from %15, converted back from its vector form
185
+ %21 = tail call x86_amx @llvm.x86.cast.vector.to.tile.v256i32 (<256 x i32 > zeroinitializer ) ; all-zero vector as a tile
186
+ %22 = tail call x86_amx @llvm.x86.cast.vector.to.tile.v256i32 (<256 x i32 > zeroinitializer ) ; all-zero vector as a tile
187
+ %23 = tail call x86_amx @llvm.x86.tdpbf16ps.internal (i16 16 , i16 64 , i16 64 , x86_amx %20 , x86_amx %21 , x86_amx %22 )
188
+ %24 = tail call noundef <256 x i32 > @llvm.x86.cast.tile.to.vector.v256i32 (x86_amx %23 ) ; result leaves this block as a vector so it can feed the phi in %25
189
+ br label %25
190
+
191
+ 25 : ; preds = %19, %13 ; merge block: stores a tile and advances the inner loop
192
+ %26 = phi <256 x i32 > [ undef , %13 ], [ %24 , %19 ] ; undef when arriving directly from %13, the dot-product result when arriving from %19
193
+ %27 = getelementptr inbounds i32 , ptr %12 , i64 %17 ; store address: outer base %12 plus inner index %17
194
+ %28 = tail call x86_amx @llvm.x86.cast.vector.to.tile.v256i32 (<256 x i32 > %26 )
195
+ tail call void @llvm.x86.tilestored64.internal (i16 16 , i16 64 , ptr %27 , i64 0 , x86_amx %28 ) ; store the merged tile to %27
196
+ %29 = add nuw nsw i64 %14 , 1
197
+ %30 = icmp eq i64 %29 , 0 ; inner loop exits to %31 only when the incremented counter equals 0
198
+ br i1 %30 , label %31 , label %13
199
+
200
+ 31 : ; preds = %25 ; outer loop latch
201
+ %32 = add nuw nsw i64 %7 , 1
202
+ br label %6 ; unconditional back-edge: no block in this function returns
203
+ }
204
+
205
+ declare x86_amx @llvm.x86.cast.vector.to.tile.v256i32 (<256 x i32 >) ; takes a <256 x i32> vector, returns an x86_amx tile value
206
+ declare <256 x i32 > @llvm.x86.cast.tile.to.vector.v256i32 (x86_amx) ; takes an x86_amx tile value, returns a <256 x i32> vector
0 commit comments