@@ -16,13 +16,15 @@ define amdgpu_kernel void @test_mfma_f32_32x32x4bf16_1k(ptr addrspace(1) %arg) #
16
16
; GCN-NEXT: s_load_dwordx2 s[34:35], s[4:5], 0x24
17
17
; GCN-NEXT: s_mov_b64 s[36:37], 1
18
18
; GCN-NEXT: v_pk_mov_b32 v[0:1], s[36:37], s[36:37] op_sel:[0,1]
19
- ; GCN-NEXT: s_mov_b32 s36 , 2
20
- ; GCN-NEXT: v_pk_mov_b32 v[2:3], s[36:37], s[36:37] op_sel:[0,1]
19
+ ; GCN-NEXT: s_mov_b32 s38 , 2
20
+ ; GCN-NEXT: s_mov_b32 s39, s37
21
21
; GCN-NEXT: s_waitcnt lgkmcnt(0)
22
22
; GCN-NEXT: s_load_dwordx16 s[0:15], s[34:35], 0x0
23
23
; GCN-NEXT: s_load_dwordx16 s[16:31], s[34:35], 0x40
24
+ ; GCN-NEXT: v_pk_mov_b32 v[2:3], s[38:39], s[38:39] op_sel:[0,1]
24
25
; GCN-NEXT: s_waitcnt lgkmcnt(0)
25
26
; GCN-NEXT: v_accvgpr_write_b32 a0, s0
27
+ ; GCN-NEXT: v_accvgpr_write_b32 a16, s16
26
28
; GCN-NEXT: v_accvgpr_write_b32 a1, s1
27
29
; GCN-NEXT: v_accvgpr_write_b32 a2, s2
28
30
; GCN-NEXT: v_accvgpr_write_b32 a3, s3
@@ -38,7 +40,6 @@ define amdgpu_kernel void @test_mfma_f32_32x32x4bf16_1k(ptr addrspace(1) %arg) #
38
40
; GCN-NEXT: v_accvgpr_write_b32 a13, s13
39
41
; GCN-NEXT: v_accvgpr_write_b32 a14, s14
40
42
; GCN-NEXT: v_accvgpr_write_b32 a15, s15
41
- ; GCN-NEXT: v_accvgpr_write_b32 a16, s16
42
43
; GCN-NEXT: v_accvgpr_write_b32 a17, s17
43
44
; GCN-NEXT: v_accvgpr_write_b32 a18, s18
44
45
; GCN-NEXT: v_accvgpr_write_b32 a19, s19
@@ -317,31 +318,29 @@ bb:
317
318
define amdgpu_kernel void @test_mfma_f64_16x16x4f64_imm (ptr addrspace (1 ) %arg , double %a , double %b ) #0 {
318
319
; GCN-LABEL: test_mfma_f64_16x16x4f64_imm:
319
320
; GCN: ; %bb.0: ; %bb
320
- ; GCN-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x24
321
- ; GCN-NEXT: s_load_dwordx2 s[12:13], s[4:5], 0x34
322
- ; GCN-NEXT: s_mov_b64 s[0:1], 0
321
+ ; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
322
+ ; GCN-NEXT: s_load_dwordx2 s[10:11], s[4:5], 0x34
323
323
; GCN-NEXT: s_mov_b64 s[6:7], 1.0
324
- ; GCN-NEXT: s_mov_b64 s[2:3], s[0:1]
324
+ ; GCN-NEXT: s_mov_b64 s[8:9], 0
325
+ ; GCN-NEXT: v_accvgpr_write_b32 a0, s8
325
326
; GCN-NEXT: s_waitcnt lgkmcnt(0)
326
- ; GCN-NEXT: v_pk_mov_b32 v[0:1], s[10:11], s[10:11] op_sel:[0,1]
327
- ; GCN-NEXT: s_mov_b64 s[4:5], s[0:1]
328
- ; GCN-NEXT: v_accvgpr_write_b32 a0, s0
329
- ; GCN-NEXT: v_accvgpr_write_b32 a1, s1
330
- ; GCN-NEXT: v_accvgpr_write_b32 a2, s2
331
- ; GCN-NEXT: v_accvgpr_write_b32 a3, s3
332
- ; GCN-NEXT: v_accvgpr_write_b32 a4, s4
333
- ; GCN-NEXT: v_accvgpr_write_b32 a5, s5
327
+ ; GCN-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
328
+ ; GCN-NEXT: v_accvgpr_write_b32 a2, s8
329
+ ; GCN-NEXT: v_accvgpr_write_b32 a4, s8
334
330
; GCN-NEXT: v_accvgpr_write_b32 a6, s6
331
+ ; GCN-NEXT: v_accvgpr_write_b32 a1, s9
332
+ ; GCN-NEXT: v_accvgpr_write_b32 a3, s9
333
+ ; GCN-NEXT: v_accvgpr_write_b32 a5, s9
335
334
; GCN-NEXT: v_accvgpr_write_b32 a7, s7
336
- ; GCN-NEXT: v_pk_mov_b32 v[2:3], s[12:13 ], s[12:13 ] op_sel:[0,1]
335
+ ; GCN-NEXT: v_pk_mov_b32 v[2:3], s[10:11 ], s[10:11 ] op_sel:[0,1]
337
336
; GCN-NEXT: s_nop 1
338
337
; GCN-NEXT: v_mfma_f64_16x16x4f64 a[0:7], v[0:1], v[2:3], a[0:7]
339
338
; GCN-NEXT: v_mov_b32_e32 v0, 0
340
339
; GCN-NEXT: s_nop 7
341
340
; GCN-NEXT: s_nop 7
342
341
; GCN-NEXT: s_nop 0
343
- ; GCN-NEXT: global_store_dwordx4 v0, a[0:3], s[8:9 ]
344
- ; GCN-NEXT: global_store_dwordx4 v0, a[4:7], s[8:9 ] offset:16
342
+ ; GCN-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1 ]
343
+ ; GCN-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1 ] offset:16
345
344
; GCN-NEXT: s_endpgm
346
345
bb:
347
346
%mai.1 = tail call <4 x double > @llvm.amdgcn.mfma.f64.16x16x4f64 (double %a , double %b , <4 x double > <double 0 .0 , double 0 .0 , double 0 .0 , double 1 .0 >, i32 0 , i32 0 , i32 0 )
@@ -352,32 +351,29 @@ bb:
352
351
define amdgpu_kernel void @test_mfma_f64_16x16x4f64_splat_lit (ptr addrspace (1 ) %arg , double %a , double %b ) #0 {
353
352
; GCN-LABEL: test_mfma_f64_16x16x4f64_splat_lit:
354
353
; GCN: ; %bb.0: ; %bb
355
- ; GCN-NEXT: s_load_dwordx4 s[8:11 ], s[4:5], 0x24
356
- ; GCN-NEXT: s_load_dwordx2 s[12:13 ], s[4:5], 0x34
357
- ; GCN-NEXT: s_mov_b32 s0 , 0
358
- ; GCN-NEXT: s_mov_b32 s1 , 0x405ec000
359
- ; GCN-NEXT: s_mov_b64 s[2:3], s[0:1]
354
+ ; GCN-NEXT: s_load_dwordx4 s[0:3 ], s[4:5], 0x24
355
+ ; GCN-NEXT: s_load_dwordx2 s[8:9 ], s[4:5], 0x34
356
+ ; GCN-NEXT: s_mov_b32 s6 , 0
357
+ ; GCN-NEXT: s_mov_b32 s7 , 0x405ec000
358
+ ; GCN-NEXT: v_accvgpr_write_b32 a0, s6
360
359
; GCN-NEXT: s_waitcnt lgkmcnt(0)
361
- ; GCN-NEXT: v_pk_mov_b32 v[0:1], s[10:11], s[10:11] op_sel:[0,1]
362
- ; GCN-NEXT: s_mov_b64 s[4:5], s[0:1]
363
- ; GCN-NEXT: s_mov_b64 s[6:7], s[0:1]
364
- ; GCN-NEXT: v_accvgpr_write_b32 a0, s0
365
- ; GCN-NEXT: v_accvgpr_write_b32 a1, s1
366
- ; GCN-NEXT: v_accvgpr_write_b32 a2, s2
367
- ; GCN-NEXT: v_accvgpr_write_b32 a3, s3
368
- ; GCN-NEXT: v_accvgpr_write_b32 a4, s4
369
- ; GCN-NEXT: v_accvgpr_write_b32 a5, s5
360
+ ; GCN-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
361
+ ; GCN-NEXT: v_accvgpr_write_b32 a2, s6
362
+ ; GCN-NEXT: v_accvgpr_write_b32 a4, s6
370
363
; GCN-NEXT: v_accvgpr_write_b32 a6, s6
364
+ ; GCN-NEXT: v_accvgpr_write_b32 a1, s7
365
+ ; GCN-NEXT: v_accvgpr_write_b32 a3, s7
366
+ ; GCN-NEXT: v_accvgpr_write_b32 a5, s7
371
367
; GCN-NEXT: v_accvgpr_write_b32 a7, s7
372
- ; GCN-NEXT: v_pk_mov_b32 v[2:3], s[12:13 ], s[12:13 ] op_sel:[0,1]
368
+ ; GCN-NEXT: v_pk_mov_b32 v[2:3], s[8:9 ], s[8:9 ] op_sel:[0,1]
373
369
; GCN-NEXT: s_nop 1
374
370
; GCN-NEXT: v_mfma_f64_16x16x4f64 a[0:7], v[0:1], v[2:3], a[0:7]
375
371
; GCN-NEXT: v_mov_b32_e32 v0, 0
376
372
; GCN-NEXT: s_nop 7
377
373
; GCN-NEXT: s_nop 7
378
374
; GCN-NEXT: s_nop 0
379
- ; GCN-NEXT: global_store_dwordx4 v0, a[0:3], s[8:9 ]
380
- ; GCN-NEXT: global_store_dwordx4 v0, a[4:7], s[8:9 ] offset:16
375
+ ; GCN-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1 ]
376
+ ; GCN-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1 ] offset:16
381
377
; GCN-NEXT: s_endpgm
382
378
bb:
383
379
%mai.1 = tail call <4 x double > @llvm.amdgcn.mfma.f64.16x16x4f64 (double %a , double %b , <4 x double > <double 123 .0 , double 123 .0 , double 123 .0 , double 123 .0 >, i32 0 , i32 0 , i32 0 )
0 commit comments