@@ -1536,8 +1536,7 @@ define amdgpu_kernel void @s_copysign_out_f16_mag_f64_sign_f16(ptr addrspace(1)
1536
1536
; SI-NEXT: s_or_b32 s2, s5, s2
1537
1537
; SI-NEXT: s_cmp_lg_u32 s2, 0
1538
1538
; SI-NEXT: s_cselect_b64 s[4:5], -1, 0
1539
- ; SI-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[4:5]
1540
- ; SI-NEXT: v_readfirstlane_b32 s2, v1
1539
+ ; SI-NEXT: s_bfe_u32 s2, s4, 0x10000
1541
1540
; SI-NEXT: s_bfe_u32 s5, s3, 0xb0014
1542
1541
; SI-NEXT: s_or_b32 s2, s6, s2
1543
1542
; SI-NEXT: s_sub_i32 s6, 0x3f1, s5
@@ -1599,8 +1598,7 @@ define amdgpu_kernel void @s_copysign_out_f16_mag_f64_sign_f16(ptr addrspace(1)
1599
1598
; VI-NEXT: s_or_b32 s0, s1, s6
1600
1599
; VI-NEXT: s_cmp_lg_u32 s0, 0
1601
1600
; VI-NEXT: s_cselect_b64 s[0:1], -1, 0
1602
- ; VI-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[0:1]
1603
- ; VI-NEXT: v_readfirstlane_b32 s0, v2
1601
+ ; VI-NEXT: s_bfe_u32 s0, s0, 0x10000
1604
1602
; VI-NEXT: s_bfe_u32 s1, s7, 0xb0014
1605
1603
; VI-NEXT: v_mov_b32_e32 v0, s4
1606
1604
; VI-NEXT: s_or_b32 s4, s2, s0
@@ -1661,8 +1659,7 @@ define amdgpu_kernel void @s_copysign_out_f16_mag_f64_sign_f16(ptr addrspace(1)
1661
1659
; GFX9-NEXT: s_or_b32 s0, s1, s6
1662
1660
; GFX9-NEXT: s_cmp_lg_u32 s0, 0
1663
1661
; GFX9-NEXT: s_cselect_b64 s[0:1], -1, 0
1664
- ; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[0:1]
1665
- ; GFX9-NEXT: v_readfirstlane_b32 s0, v1
1662
+ ; GFX9-NEXT: s_bfe_u32 s0, s0, 0x10000
1666
1663
; GFX9-NEXT: s_bfe_u32 s1, s7, 0xb0014
1667
1664
; GFX9-NEXT: s_or_b32 s6, s2, s0
1668
1665
; GFX9-NEXT: s_sub_i32 s2, 0x3f1, s1
@@ -1714,63 +1711,61 @@ define amdgpu_kernel void @s_copysign_out_f16_mag_f64_sign_f16(ptr addrspace(1)
1714
1711
; GFX11-NEXT: s_clause 0x1
1715
1712
; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
1716
1713
; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x34
1714
+ ; GFX11-NEXT: v_mov_b32_e32 v1, 0
1717
1715
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
1718
1716
; GFX11-NEXT: s_and_b32 s1, s7, 0x1ff
1719
1717
; GFX11-NEXT: s_lshr_b32 s2, s7, 8
1720
1718
; GFX11-NEXT: s_or_b32 s1, s1, s6
1721
1719
; GFX11-NEXT: s_and_b32 s2, s2, 0xffe
1722
1720
; GFX11-NEXT: s_cmp_lg_u32 s1, 0
1723
1721
; GFX11-NEXT: s_cselect_b32 s1, -1, 0
1724
- ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
1725
- ; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1, s1
1726
- ; GFX11-NEXT: s_bfe_u32 s1, s7, 0xb0014
1727
- ; GFX11-NEXT: s_sub_i32 s3, 0x3f1, s1
1728
- ; GFX11-NEXT: s_addk_i32 s1, 0xfc10
1729
- ; GFX11-NEXT: v_med3_i32 v1, s3, 0, 13
1730
- ; GFX11-NEXT: v_readfirstlane_b32 s3, v0
1731
- ; GFX11-NEXT: s_lshl_b32 s8, s1, 12
1732
- ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
1733
- ; GFX11-NEXT: v_readfirstlane_b32 s6, v1
1734
- ; GFX11-NEXT: s_or_b32 s2, s2, s3
1735
- ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
1736
- ; GFX11-NEXT: s_or_b32 s3, s2, 0x1000
1737
- ; GFX11-NEXT: s_or_b32 s8, s2, s8
1738
- ; GFX11-NEXT: s_lshr_b32 s6, s3, s6
1739
- ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
1740
- ; GFX11-NEXT: v_lshlrev_b32_e64 v0, v1, s6
1741
- ; GFX11-NEXT: v_mov_b32_e32 v1, 0
1742
- ; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, s3, v0
1722
+ ; GFX11-NEXT: s_bfe_u32 s3, s7, 0xb0014
1723
+ ; GFX11-NEXT: s_bfe_u32 s1, s1, 0x10000
1724
+ ; GFX11-NEXT: s_sub_i32 s6, 0x3f1, s3
1725
+ ; GFX11-NEXT: s_or_b32 s1, s2, s1
1726
+ ; GFX11-NEXT: v_med3_i32 v0, s6, 0, 13
1727
+ ; GFX11-NEXT: s_or_b32 s2, s1, 0x1000
1728
+ ; GFX11-NEXT: s_addk_i32 s3, 0xfc10
1729
+ ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
1730
+ ; GFX11-NEXT: s_lshl_b32 s8, s3, 12
1731
+ ; GFX11-NEXT: v_readfirstlane_b32 s6, v0
1732
+ ; GFX11-NEXT: s_or_b32 s8, s1, s8
1733
+ ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
1734
+ ; GFX11-NEXT: s_lshr_b32 s6, s2, s6
1735
+ ; GFX11-NEXT: v_lshlrev_b32_e64 v0, v0, s6
1736
+ ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
1737
+ ; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, s2, v0
1743
1738
; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo
1744
- ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
1745
- ; GFX11-NEXT: v_readfirstlane_b32 s3, v0
1746
- ; GFX11-NEXT: s_or_b32 s3, s6, s3
1747
- ; GFX11-NEXT: s_cmp_lt_i32 s1, 1
1748
- ; GFX11-NEXT: s_cselect_b32 s3, s3, s8
1749
- ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
1750
- ; GFX11-NEXT: s_and_b32 s6, s3, 7
1739
+ ; GFX11-NEXT: v_readfirstlane_b32 s2, v0
1740
+ ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1)
1741
+ ; GFX11-NEXT: s_or_b32 s2, s6, s2
1742
+ ; GFX11-NEXT: s_cmp_lt_i32 s3, 1
1743
+ ; GFX11-NEXT: s_cselect_b32 s2, s2, s8
1744
+ ; GFX11-NEXT: s_and_b32 s6, s2, 7
1745
+ ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
1751
1746
; GFX11-NEXT: s_cmp_gt_i32 s6, 5
1752
1747
; GFX11-NEXT: s_cselect_b32 s8, -1, 0
1753
1748
; GFX11-NEXT: s_cmp_eq_u32 s6, 3
1754
1749
; GFX11-NEXT: s_cselect_b32 s6, -1, 0
1755
- ; GFX11-NEXT: s_lshr_b32 s3, s3, 2
1750
+ ; GFX11-NEXT: s_lshr_b32 s2, s2, 2
1756
1751
; GFX11-NEXT: s_or_b32 s6, s6, s8
1757
1752
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
1758
1753
; GFX11-NEXT: s_cmp_lg_u32 s6, 0
1759
- ; GFX11-NEXT: s_addc_u32 s3, s3, 0
1760
- ; GFX11-NEXT: s_cmp_lt_i32 s1, 31
1761
- ; GFX11-NEXT: s_cselect_b32 s3, s3, 0x7c00
1762
- ; GFX11-NEXT: s_cmp_lg_u32 s2, 0
1763
- ; GFX11-NEXT: s_cselect_b32 s2, -1, 0
1764
- ; GFX11-NEXT: s_cmpk_eq_i32 s1, 0x40f
1765
- ; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1, s2
1754
+ ; GFX11-NEXT: s_addc_u32 s2, s2, 0
1755
+ ; GFX11-NEXT: s_cmp_lt_i32 s3, 31
1756
+ ; GFX11-NEXT: s_cselect_b32 s2, s2, 0x7c00
1757
+ ; GFX11-NEXT: s_cmp_lg_u32 s1, 0
1758
+ ; GFX11-NEXT: s_cselect_b32 s1, -1, 0
1759
+ ; GFX11-NEXT: s_cmpk_eq_i32 s3, 0x40f
1760
+ ; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1, s1
1766
1761
; GFX11-NEXT: s_cselect_b32 vcc_lo, -1, 0
1767
1762
; GFX11-NEXT: s_lshr_b32 s1, s7, 16
1768
1763
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
1769
1764
; GFX11-NEXT: s_and_b32 s1, s1, 0x8000
1770
1765
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 9, v0
1771
1766
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
1772
1767
; GFX11-NEXT: v_or_b32_e32 v0, 0x7c00, v0
1773
- ; GFX11-NEXT: v_cndmask_b32_e32 v0, s3, v0, vcc_lo
1768
+ ; GFX11-NEXT: v_cndmask_b32_e32 v0, s2, v0, vcc_lo
1774
1769
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
1775
1770
; GFX11-NEXT: v_or_b32_e32 v0, s1, v0
1776
1771
; GFX11-NEXT: v_bfi_b32 v0, 0x7fff, v0, s0
0 commit comments