@@ -1315,5 +1315,203 @@ define amdgpu_kernel void @test_mfma_i32_32x32x32_i8__vgprcd_mac_flags(<4 x i32>
1315
1315
ret void
1316
1316
}
1317
1317
1318
+ ; --------------------------------------------------------------------
1319
+ ; llvm.amdgcn.mfma.f32.16x16x32.bf16
1320
+ ; --------------------------------------------------------------------
1321
+
1322
; Intrinsic under test: two <8 x bfloat> sources, a <4 x float> third source,
; and three i32 immediate operands; returns <4 x float>.
; The tests below show the three immediates printed, in order, as the
; cbsz, abid and blgp modifiers on v_mfma_f32_16x16x32_bf16.
+ declare <4 x float > @llvm.amdgcn.mfma.f32.16x16x32.bf16 (<8 x bfloat>, <8 x bfloat>, <4 x float >, i32 immarg, i32 immarg, i32 immarg)
1323
+
1324
; Non-kernel test: calls llvm.amdgcn.mfma.f32.16x16x32.bf16 with all three
; immediate operands set to 0 and returns the <4 x float> result.
; Expected output for both selectors: v8-v11 are copied into a0-a3, the MFMA
; is emitted on a[0:3] with no cbsz/abid/blgp modifiers, and a0-a3 are read
; back into v0-v3. The GISEL output additionally carries a
; v_lshrrev_b32 / v_mov_b32_sdwa sequence over v0-v7 ahead of the MFMA.
+ define <4 x float > @test_mfma_f32_16x16x32_bf16 (<8 x bfloat> %arg0 , <8 x bfloat> %arg1 , <4 x float > %arg2 ) {
1325
+ ; SDAG-LABEL: test_mfma_f32_16x16x32_bf16:
1326
+ ; SDAG: ; %bb.0:
1327
+ ; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1328
+ ; SDAG-NEXT: v_accvgpr_write_b32 a0, v8
1329
+ ; SDAG-NEXT: v_accvgpr_write_b32 a1, v9
1330
+ ; SDAG-NEXT: v_accvgpr_write_b32 a2, v10
1331
+ ; SDAG-NEXT: v_accvgpr_write_b32 a3, v11
1332
+ ; SDAG-NEXT: s_nop 1
1333
+ ; SDAG-NEXT: v_mfma_f32_16x16x32_bf16 a[0:3], v[0:3], v[4:7], a[0:3]
1334
+ ; SDAG-NEXT: s_nop 6
1335
+ ; SDAG-NEXT: v_accvgpr_read_b32 v0, a0
1336
+ ; SDAG-NEXT: v_accvgpr_read_b32 v1, a1
1337
+ ; SDAG-NEXT: v_accvgpr_read_b32 v2, a2
1338
+ ; SDAG-NEXT: v_accvgpr_read_b32 v3, a3
1339
+ ; SDAG-NEXT: s_setpc_b64 s[30:31]
1340
+ ;
1341
+ ; GISEL-LABEL: test_mfma_f32_16x16x32_bf16:
1342
+ ; GISEL: ; %bb.0:
1343
+ ; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1344
+ ; GISEL-NEXT: v_lshrrev_b32_e32 v12, 16, v0
1345
+ ; GISEL-NEXT: v_lshrrev_b32_e32 v13, 16, v1
1346
+ ; GISEL-NEXT: v_lshrrev_b32_e32 v14, 16, v2
1347
+ ; GISEL-NEXT: v_lshrrev_b32_e32 v15, 16, v3
1348
+ ; GISEL-NEXT: v_mov_b32_sdwa v0, v12 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
1349
+ ; GISEL-NEXT: v_mov_b32_sdwa v1, v13 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
1350
+ ; GISEL-NEXT: v_mov_b32_sdwa v2, v14 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
1351
+ ; GISEL-NEXT: v_mov_b32_sdwa v3, v15 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
1352
+ ; GISEL-NEXT: v_lshrrev_b32_e32 v12, 16, v4
1353
+ ; GISEL-NEXT: v_lshrrev_b32_e32 v13, 16, v5
1354
+ ; GISEL-NEXT: v_lshrrev_b32_e32 v14, 16, v6
1355
+ ; GISEL-NEXT: v_lshrrev_b32_e32 v15, 16, v7
1356
+ ; GISEL-NEXT: v_accvgpr_write_b32 a0, v8
1357
+ ; GISEL-NEXT: v_mov_b32_sdwa v4, v12 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
1358
+ ; GISEL-NEXT: v_mov_b32_sdwa v5, v13 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
1359
+ ; GISEL-NEXT: v_mov_b32_sdwa v6, v14 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
1360
+ ; GISEL-NEXT: v_mov_b32_sdwa v7, v15 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
1361
+ ; GISEL-NEXT: v_accvgpr_write_b32 a1, v9
1362
+ ; GISEL-NEXT: v_accvgpr_write_b32 a2, v10
1363
+ ; GISEL-NEXT: v_accvgpr_write_b32 a3, v11
1364
+ ; GISEL-NEXT: s_nop 1
1365
+ ; GISEL-NEXT: v_mfma_f32_16x16x32_bf16 a[0:3], v[0:3], v[4:7], a[0:3]
1366
+ ; GISEL-NEXT: s_nop 6
1367
+ ; GISEL-NEXT: v_accvgpr_read_b32 v0, a0
1368
+ ; GISEL-NEXT: v_accvgpr_read_b32 v1, a1
1369
+ ; GISEL-NEXT: v_accvgpr_read_b32 v2, a2
1370
+ ; GISEL-NEXT: v_accvgpr_read_b32 v3, a3
1371
+ ; GISEL-NEXT: s_setpc_b64 s[30:31]
1372
+ %result = call <4 x float > @llvm.amdgcn.mfma.f32.16x16x32.bf16 (<8 x bfloat> %arg0 , <8 x bfloat> %arg1 , <4 x float > %arg2 , i32 0 , i32 0 , i32 0 )
1373
+ ret <4 x float > %result
1374
+ }
1375
+
1376
; Non-kernel test with non-zero immediates: calls
; llvm.amdgcn.mfma.f32.16x16x32.bf16 with the three immediate operands set to
; 1, 1, 1 and returns the <4 x float> result.
; Expected output for both selectors has the same register flow as the
; zero-immediate variant (v8-v11 -> a0-a3, MFMA on a[0:3], a0-a3 -> v0-v3),
; with the MFMA now carrying "cbsz:1 abid:1 blgp:1".
+ define <4 x float > @test_mfma_f32_16x16x32_bf16__flags (<8 x bfloat> %arg0 , <8 x bfloat> %arg1 , <4 x float > %arg2 ) {
1377
+ ; SDAG-LABEL: test_mfma_f32_16x16x32_bf16__flags:
1378
+ ; SDAG: ; %bb.0:
1379
+ ; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1380
+ ; SDAG-NEXT: v_accvgpr_write_b32 a0, v8
1381
+ ; SDAG-NEXT: v_accvgpr_write_b32 a1, v9
1382
+ ; SDAG-NEXT: v_accvgpr_write_b32 a2, v10
1383
+ ; SDAG-NEXT: v_accvgpr_write_b32 a3, v11
1384
+ ; SDAG-NEXT: s_nop 1
1385
+ ; SDAG-NEXT: v_mfma_f32_16x16x32_bf16 a[0:3], v[0:3], v[4:7], a[0:3] cbsz:1 abid:1 blgp:1
1386
+ ; SDAG-NEXT: s_nop 6
1387
+ ; SDAG-NEXT: v_accvgpr_read_b32 v0, a0
1388
+ ; SDAG-NEXT: v_accvgpr_read_b32 v1, a1
1389
+ ; SDAG-NEXT: v_accvgpr_read_b32 v2, a2
1390
+ ; SDAG-NEXT: v_accvgpr_read_b32 v3, a3
1391
+ ; SDAG-NEXT: s_setpc_b64 s[30:31]
1392
+ ;
1393
+ ; GISEL-LABEL: test_mfma_f32_16x16x32_bf16__flags:
1394
+ ; GISEL: ; %bb.0:
1395
+ ; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1396
+ ; GISEL-NEXT: v_lshrrev_b32_e32 v12, 16, v0
1397
+ ; GISEL-NEXT: v_lshrrev_b32_e32 v13, 16, v1
1398
+ ; GISEL-NEXT: v_lshrrev_b32_e32 v14, 16, v2
1399
+ ; GISEL-NEXT: v_lshrrev_b32_e32 v15, 16, v3
1400
+ ; GISEL-NEXT: v_mov_b32_sdwa v0, v12 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
1401
+ ; GISEL-NEXT: v_mov_b32_sdwa v1, v13 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
1402
+ ; GISEL-NEXT: v_mov_b32_sdwa v2, v14 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
1403
+ ; GISEL-NEXT: v_mov_b32_sdwa v3, v15 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
1404
+ ; GISEL-NEXT: v_lshrrev_b32_e32 v12, 16, v4
1405
+ ; GISEL-NEXT: v_lshrrev_b32_e32 v13, 16, v5
1406
+ ; GISEL-NEXT: v_lshrrev_b32_e32 v14, 16, v6
1407
+ ; GISEL-NEXT: v_lshrrev_b32_e32 v15, 16, v7
1408
+ ; GISEL-NEXT: v_accvgpr_write_b32 a0, v8
1409
+ ; GISEL-NEXT: v_mov_b32_sdwa v4, v12 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
1410
+ ; GISEL-NEXT: v_mov_b32_sdwa v5, v13 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
1411
+ ; GISEL-NEXT: v_mov_b32_sdwa v6, v14 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
1412
+ ; GISEL-NEXT: v_mov_b32_sdwa v7, v15 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
1413
+ ; GISEL-NEXT: v_accvgpr_write_b32 a1, v9
1414
+ ; GISEL-NEXT: v_accvgpr_write_b32 a2, v10
1415
+ ; GISEL-NEXT: v_accvgpr_write_b32 a3, v11
1416
+ ; GISEL-NEXT: s_nop 1
1417
+ ; GISEL-NEXT: v_mfma_f32_16x16x32_bf16 a[0:3], v[0:3], v[4:7], a[0:3] cbsz:1 abid:1 blgp:1
1418
+ ; GISEL-NEXT: s_nop 6
1419
+ ; GISEL-NEXT: v_accvgpr_read_b32 v0, a0
1420
+ ; GISEL-NEXT: v_accvgpr_read_b32 v1, a1
1421
+ ; GISEL-NEXT: v_accvgpr_read_b32 v2, a2
1422
+ ; GISEL-NEXT: v_accvgpr_read_b32 v3, a3
1423
+ ; GISEL-NEXT: s_setpc_b64 s[30:31]
1424
+ %result = call <4 x float > @llvm.amdgcn.mfma.f32.16x16x32.bf16 (<8 x bfloat> %arg0 , <8 x bfloat> %arg1 , <4 x float > %arg2 , i32 1 , i32 1 , i32 1 )
1425
+ ret <4 x float > %result
1426
+ }
1427
+
1428
; Kernel test (amdgpu_kernel, attribute group #0): calls
; llvm.amdgcn.mfma.f32.16x16x32.bf16 with all three immediates set to 0 and
; stores the <4 x float> result through %out.
; Expected output uses only VGPRs for the MFMA -- v[0:3] as destination and
; v[8:11] as the third source -- with no v_accvgpr_* copies and no
; cbsz/abid/blgp modifiers. The kernel arguments are fetched with s_load_*
; and moved into VGPRs with v_mov_b64 before the MFMA.
+ define amdgpu_kernel void @test_mfma_f32_16x16x32_bf16_no_agpr__vgprcd (ptr addrspace (1 ) %out , <8 x bfloat> %arg0 , <8 x bfloat> %arg1 , <4 x float > %arg2 ) #0 {
1429
+ ; SDAG-LABEL: test_mfma_f32_16x16x32_bf16_no_agpr__vgprcd:
1430
+ ; SDAG: ; %bb.0:
1431
+ ; SDAG-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x34
1432
+ ; SDAG-NEXT: s_load_dwordx4 s[12:15], s[0:1], 0x54
1433
+ ; SDAG-NEXT: v_mov_b32_e32 v12, 0
1434
+ ; SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
1435
+ ; SDAG-NEXT: s_waitcnt lgkmcnt(0)
1436
+ ; SDAG-NEXT: v_mov_b64_e32 v[0:1], s[4:5]
1437
+ ; SDAG-NEXT: v_mov_b64_e32 v[2:3], s[6:7]
1438
+ ; SDAG-NEXT: v_mov_b64_e32 v[4:5], s[8:9]
1439
+ ; SDAG-NEXT: v_mov_b64_e32 v[8:9], s[12:13]
1440
+ ; SDAG-NEXT: v_mov_b64_e32 v[6:7], s[10:11]
1441
+ ; SDAG-NEXT: v_mov_b64_e32 v[10:11], s[14:15]
1442
+ ; SDAG-NEXT: s_nop 1
1443
+ ; SDAG-NEXT: v_mfma_f32_16x16x32_bf16 v[0:3], v[0:3], v[4:7], v[8:11]
1444
+ ; SDAG-NEXT: s_nop 6
1445
+ ; SDAG-NEXT: global_store_dwordx4 v12, v[0:3], s[0:1]
1446
+ ; SDAG-NEXT: s_endpgm
1447
+ ;
1448
+ ; GISEL-LABEL: test_mfma_f32_16x16x32_bf16_no_agpr__vgprcd:
1449
+ ; GISEL: ; %bb.0:
1450
+ ; GISEL-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x34
1451
+ ; GISEL-NEXT: s_load_dwordx4 s[12:15], s[0:1], 0x54
1452
+ ; GISEL-NEXT: s_waitcnt lgkmcnt(0)
1453
+ ; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[4:5]
1454
+ ; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[6:7]
1455
+ ; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[8:9]
1456
+ ; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[12:13]
1457
+ ; GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
1458
+ ; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[10:11]
1459
+ ; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[14:15]
1460
+ ; GISEL-NEXT: s_nop 1
1461
+ ; GISEL-NEXT: v_mfma_f32_16x16x32_bf16 v[0:3], v[0:3], v[4:7], v[8:11]
1462
+ ; GISEL-NEXT: v_mov_b32_e32 v4, 0
1463
+ ; GISEL-NEXT: s_waitcnt lgkmcnt(0)
1464
+ ; GISEL-NEXT: s_nop 4
1465
+ ; GISEL-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1]
1466
+ ; GISEL-NEXT: s_endpgm
1467
+ %result = call <4 x float > @llvm.amdgcn.mfma.f32.16x16x32.bf16 (<8 x bfloat> %arg0 , <8 x bfloat> %arg1 , <4 x float > %arg2 , i32 0 , i32 0 , i32 0 )
1468
+ store <4 x float > %result , ptr addrspace (1 ) %out
1469
+ ret void
1470
+ }
1471
+
1472
; Kernel test (amdgpu_kernel, attribute group #0) with distinct non-zero
; immediates: calls llvm.amdgcn.mfma.f32.16x16x32.bf16 with the immediate
; operands 3, 2, 1 and stores the <4 x float> result through %out.
; Expected output keeps the MFMA entirely in VGPRs (v[0:3] destination,
; v[8:11] third source, no v_accvgpr_* copies) and prints the immediates as
; "cbsz:3 abid:2 blgp:1", i.e. in the same order as the intrinsic operands.
+ define amdgpu_kernel void @test_mfma_f32_16x16x32_bf16_no_agpr__vgprcd__flags (ptr addrspace (1 ) %out , <8 x bfloat> %arg0 , <8 x bfloat> %arg1 , <4 x float > %arg2 ) #0 {
1473
+ ; SDAG-LABEL: test_mfma_f32_16x16x32_bf16_no_agpr__vgprcd__flags:
1474
+ ; SDAG: ; %bb.0:
1475
+ ; SDAG-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x34
1476
+ ; SDAG-NEXT: s_load_dwordx4 s[12:15], s[0:1], 0x54
1477
+ ; SDAG-NEXT: v_mov_b32_e32 v12, 0
1478
+ ; SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
1479
+ ; SDAG-NEXT: s_waitcnt lgkmcnt(0)
1480
+ ; SDAG-NEXT: v_mov_b64_e32 v[0:1], s[4:5]
1481
+ ; SDAG-NEXT: v_mov_b64_e32 v[2:3], s[6:7]
1482
+ ; SDAG-NEXT: v_mov_b64_e32 v[4:5], s[8:9]
1483
+ ; SDAG-NEXT: v_mov_b64_e32 v[8:9], s[12:13]
1484
+ ; SDAG-NEXT: v_mov_b64_e32 v[6:7], s[10:11]
1485
+ ; SDAG-NEXT: v_mov_b64_e32 v[10:11], s[14:15]
1486
+ ; SDAG-NEXT: s_nop 1
1487
+ ; SDAG-NEXT: v_mfma_f32_16x16x32_bf16 v[0:3], v[0:3], v[4:7], v[8:11] cbsz:3 abid:2 blgp:1
1488
+ ; SDAG-NEXT: s_nop 6
1489
+ ; SDAG-NEXT: global_store_dwordx4 v12, v[0:3], s[0:1]
1490
+ ; SDAG-NEXT: s_endpgm
1491
+ ;
1492
+ ; GISEL-LABEL: test_mfma_f32_16x16x32_bf16_no_agpr__vgprcd__flags:
1493
+ ; GISEL: ; %bb.0:
1494
+ ; GISEL-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x34
1495
+ ; GISEL-NEXT: s_load_dwordx4 s[12:15], s[0:1], 0x54
1496
+ ; GISEL-NEXT: s_waitcnt lgkmcnt(0)
1497
+ ; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[4:5]
1498
+ ; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[6:7]
1499
+ ; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[8:9]
1500
+ ; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[12:13]
1501
+ ; GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
1502
+ ; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[10:11]
1503
+ ; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[14:15]
1504
+ ; GISEL-NEXT: s_nop 1
1505
+ ; GISEL-NEXT: v_mfma_f32_16x16x32_bf16 v[0:3], v[0:3], v[4:7], v[8:11] cbsz:3 abid:2 blgp:1
1506
+ ; GISEL-NEXT: v_mov_b32_e32 v4, 0
1507
+ ; GISEL-NEXT: s_waitcnt lgkmcnt(0)
1508
+ ; GISEL-NEXT: s_nop 4
1509
+ ; GISEL-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1]
1510
+ ; GISEL-NEXT: s_endpgm
1511
+ %result = call <4 x float > @llvm.amdgcn.mfma.f32.16x16x32.bf16 (<8 x bfloat> %arg0 , <8 x bfloat> %arg1 , <4 x float > %arg2 , i32 3 , i32 2 , i32 1 )
1512
+ store <4 x float > %result , ptr addrspace (1 ) %out
1513
+ ret void
1514
+ }
1515
+
1318
1516
; Attribute group #0: sets the "amdgpu-flat-work-group-size" string attribute
; to "512,512". It is attached to the amdgpu_kernel test functions via "#0".
attributes #0 = { "amdgpu-flat-work-group-size" ="512,512" }
1319
1517
; Attribute group #1: sets "amdgpu-flat-work-group-size" to "1,64".
attributes #1 = { "amdgpu-flat-work-group-size" ="1,64" }
0 commit comments