@@ -1678,7 +1678,8 @@ void test_cvt_scalef32_sr_fp8_f32(global unsigned *out, float src, uint seed, fl
1678
1678
// CHECK-NEXT: store i32 [[TMP3]], ptr addrspace(1) [[TMP4]], align 4
1679
1679
// CHECK-NEXT: ret void
1680
1680
//
1681
// Codegen test: stores the result of __builtin_amdgcn_bitop3_b32(a, b, c, 1)
// through `out`. The body is kept to this single statement because the
// expected-IR directives preceding the function pin the emitted instructions.
// NOTE(review): the trailing literal 1 is presumably an immediate selecting
// the three-input bitwise operation -- confirm against the builtin definition.
- void test_bitop3_b32 (global uint * out , uint a , uint b , uint c ) {
1681
+ void test_bitop3_b32 (global uint * out , uint a , uint b , uint c )
1682
+ {
1682
1683
* out = __builtin_amdgcn_bitop3_b32 (a , b , c , 1 );
1683
1684
}
1684
1685
@@ -1700,6 +1701,67 @@ void test_bitop3_b32(global uint* out, uint a, uint b, uint c) {
1700
1701
// CHECK-NEXT: store i16 [[TMP3]], ptr addrspace(1) [[TMP4]], align 2
1701
1702
// CHECK-NEXT: ret void
1702
1703
//
1703
// Codegen test: stores the result of __builtin_amdgcn_bitop3_b16(a, b, c, 1)
// through `out`, using ushort operands and a ushort destination.
// The body is kept to this single statement because the expected-IR
// directives preceding the function pin the emitted instructions.
// NOTE(review): the trailing literal 1 is presumably an immediate selecting
// the three-input bitwise operation -- confirm against the builtin definition.
- void test_bitop3_b16 (global ushort * out , ushort a , ushort b , ushort c ) {
1704
+ void test_bitop3_b16 (global ushort * out , ushort a , ushort b , ushort c )
1705
+ {
1704
1706
* out = __builtin_amdgcn_bitop3_b16 (a , b , c , 1 );
1705
1707
}
1708
+
1709
+ // CHECK-LABEL: @test_cvt_sr_bf16_f32(
1710
+ // CHECK-NEXT: entry:
1711
+ // CHECK-NEXT: [[OUT_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5)
1712
+ // CHECK-NEXT: [[SRC_ADDR:%.*]] = alloca float, align 4, addrspace(5)
1713
+ // CHECK-NEXT: [[SEED_ADDR:%.*]] = alloca i32, align 4, addrspace(5)
1714
+ // CHECK-NEXT: store ptr addrspace(1) [[OUT:%.*]], ptr addrspace(5) [[OUT_ADDR]], align 8
1715
+ // CHECK-NEXT: store float [[SRC:%.*]], ptr addrspace(5) [[SRC_ADDR]], align 4
1716
+ // CHECK-NEXT: store i32 [[SEED:%.*]], ptr addrspace(5) [[SEED_ADDR]], align 4
1717
+ // CHECK-NEXT: [[TMP0:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
1718
+ // CHECK-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[TMP0]], align 4
1719
+ // CHECK-NEXT: [[TMP2:%.*]] = load float, ptr addrspace(5) [[SRC_ADDR]], align 4
1720
+ // CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr addrspace(5) [[SEED_ADDR]], align 4
1721
+ // CHECK-NEXT: [[TMP4:%.*]] = call <2 x bfloat> @llvm.amdgcn.cvt.sr.bf16.f32(<2 x bfloat> [[TMP1]], float [[TMP2]], i32 [[TMP3]], i1 false)
1722
+ // CHECK-NEXT: [[TMP5:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
1723
+ // CHECK-NEXT: store <2 x bfloat> [[TMP4]], ptr addrspace(1) [[TMP5]], align 4
1724
+ // CHECK-NEXT: [[TMP6:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
1725
+ // CHECK-NEXT: [[TMP7:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[TMP6]], align 4
1726
+ // CHECK-NEXT: [[TMP8:%.*]] = load float, ptr addrspace(5) [[SRC_ADDR]], align 4
1727
+ // CHECK-NEXT: [[TMP9:%.*]] = load i32, ptr addrspace(5) [[SEED_ADDR]], align 4
1728
+ // CHECK-NEXT: [[TMP10:%.*]] = call <2 x bfloat> @llvm.amdgcn.cvt.sr.bf16.f32(<2 x bfloat> [[TMP7]], float [[TMP8]], i32 [[TMP9]], i1 true)
1729
+ // CHECK-NEXT: [[TMP11:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
1730
+ // CHECK-NEXT: store <2 x bfloat> [[TMP10]], ptr addrspace(1) [[TMP11]], align 4
1731
+ // CHECK-NEXT: ret void
1732
+ //
1733
// Codegen test for __builtin_amdgcn_cvt_sr_bf16_f32. Each statement reads the
// current value of `*out`, passes it together with `src` and `seed` to the
// builtin, and writes the returned bfloat2 back through `out`; the second
// call therefore consumes the value stored by the first.
// The builtin is exercised with a final argument of 0 and then 1, which the
// expected IR preceding this function shows as `i1 false` and `i1 true` on
// the llvm.amdgcn.cvt.sr.bf16.f32 intrinsic.
// NOTE(review): that flag presumably selects which element of the 2-element
// result receives the converted value -- confirm against the intrinsic docs.
+ void test_cvt_sr_bf16_f32 (global bfloat2 * out , float src , uint seed )
1734
+ {
1735
+ * out = __builtin_amdgcn_cvt_sr_bf16_f32 (* out , src , seed , 0 );
1736
+ * out = __builtin_amdgcn_cvt_sr_bf16_f32 (* out , src , seed , 1 );
1737
+ }
1738
+
1739
+ // CHECK-LABEL: @test_cvt_sr_f16_f32(
1740
+ // CHECK-NEXT: entry:
1741
+ // CHECK-NEXT: [[OUT_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5)
1742
+ // CHECK-NEXT: [[SRC_ADDR:%.*]] = alloca float, align 4, addrspace(5)
1743
+ // CHECK-NEXT: [[SEED_ADDR:%.*]] = alloca i32, align 4, addrspace(5)
1744
+ // CHECK-NEXT: store ptr addrspace(1) [[OUT:%.*]], ptr addrspace(5) [[OUT_ADDR]], align 8
1745
+ // CHECK-NEXT: store float [[SRC:%.*]], ptr addrspace(5) [[SRC_ADDR]], align 4
1746
+ // CHECK-NEXT: store i32 [[SEED:%.*]], ptr addrspace(5) [[SEED_ADDR]], align 4
1747
+ // CHECK-NEXT: [[TMP0:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
1748
+ // CHECK-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[TMP0]], align 4
1749
+ // CHECK-NEXT: [[TMP2:%.*]] = load float, ptr addrspace(5) [[SRC_ADDR]], align 4
1750
+ // CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr addrspace(5) [[SEED_ADDR]], align 4
1751
+ // CHECK-NEXT: [[TMP4:%.*]] = call <2 x half> @llvm.amdgcn.cvt.sr.f16.f32(<2 x half> [[TMP1]], float [[TMP2]], i32 [[TMP3]], i1 false)
1752
+ // CHECK-NEXT: [[TMP5:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
1753
+ // CHECK-NEXT: store <2 x half> [[TMP4]], ptr addrspace(1) [[TMP5]], align 4
1754
+ // CHECK-NEXT: [[TMP6:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
1755
+ // CHECK-NEXT: [[TMP7:%.*]] = load <2 x half>, ptr addrspace(1) [[TMP6]], align 4
1756
+ // CHECK-NEXT: [[TMP8:%.*]] = load float, ptr addrspace(5) [[SRC_ADDR]], align 4
1757
+ // CHECK-NEXT: [[TMP9:%.*]] = load i32, ptr addrspace(5) [[SEED_ADDR]], align 4
1758
+ // CHECK-NEXT: [[TMP10:%.*]] = call <2 x half> @llvm.amdgcn.cvt.sr.f16.f32(<2 x half> [[TMP7]], float [[TMP8]], i32 [[TMP9]], i1 true)
1759
+ // CHECK-NEXT: [[TMP11:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
1760
+ // CHECK-NEXT: store <2 x half> [[TMP10]], ptr addrspace(1) [[TMP11]], align 4
1761
+ // CHECK-NEXT: ret void
1762
+ //
1763
// Codegen test for __builtin_amdgcn_cvt_sr_f16_f32. Each statement reads the
// current value of `*out`, passes it together with `src` and `seed` to the
// builtin, and writes the returned half2 back through `out`; the second call
// therefore consumes the value stored by the first.
// The builtin is exercised with a final argument of 0 and then 1, which the
// expected IR preceding this function shows as `i1 false` and `i1 true` on
// the llvm.amdgcn.cvt.sr.f16.f32 intrinsic.
// NOTE(review): that flag presumably selects which element of the 2-element
// result receives the converted value -- confirm against the intrinsic docs.
+ void test_cvt_sr_f16_f32 (global half2 * out , float src , uint seed )
1764
+ {
1765
+ * out = __builtin_amdgcn_cvt_sr_f16_f32 (* out , src , seed , 0 );
1766
+ * out = __builtin_amdgcn_cvt_sr_f16_f32 (* out , src , seed , 1 );
1767
+ }
0 commit comments