@@ -29,10 +29,12 @@ define amdgpu_kernel void @saddo_i64_zext(ptr addrspace(1) %out, i64 %a, i64 %b)
29
29
; SI-NEXT: s_mov_b32 s0, s4
30
30
; SI-NEXT: s_mov_b32 s1, s5
31
31
; SI-NEXT: s_xor_b64 s[4:5], s[6:7], vcc
32
- ; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5]
33
- ; SI-NEXT: v_mov_b32_e32 v1, s11
34
- ; SI-NEXT: v_add_i32_e32 v0, vcc, s10, v0
35
- ; SI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
32
+ ; SI-NEXT: s_and_b64 s[4:5], s[4:5], exec
33
+ ; SI-NEXT: s_cselect_b64 s[4:5], 1, 0
34
+ ; SI-NEXT: s_add_u32 s4, s10, s4
35
+ ; SI-NEXT: s_addc_u32 s5, s11, s5
36
+ ; SI-NEXT: v_mov_b32_e32 v0, s4
37
+ ; SI-NEXT: v_mov_b32_e32 v1, s5
36
38
; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
37
39
; SI-NEXT: s_endpgm
38
40
;
@@ -45,15 +47,17 @@ define amdgpu_kernel void @saddo_i64_zext(ptr addrspace(1) %out, i64 %a, i64 %b)
45
47
; VI-NEXT: s_add_u32 s2, s6, s0
46
48
; VI-NEXT: v_mov_b32_e32 v2, s7
47
49
; VI-NEXT: s_addc_u32 s3, s7, s1
48
- ; VI-NEXT: v_cmp_lt_i64_e64 s[8:9], s[0:1], 0
49
50
; VI-NEXT: v_cmp_lt_i64_e32 vcc, s[2:3], v[1:2]
50
- ; VI-NEXT: v_mov_b32_e32 v3, s3
51
- ; VI-NEXT: s_xor_b64 s[0:1], s[8:9], vcc
52
- ; VI-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[0:1]
53
- ; VI-NEXT: v_add_u32_e32 v2, vcc, s2, v2
51
+ ; VI-NEXT: v_cmp_lt_i64_e64 s[0:1], s[0:1], 0
54
52
; VI-NEXT: v_mov_b32_e32 v0, s4
53
+ ; VI-NEXT: s_xor_b64 s[0:1], s[0:1], vcc
54
+ ; VI-NEXT: s_and_b64 s[0:1], s[0:1], exec
55
+ ; VI-NEXT: s_cselect_b64 s[0:1], 1, 0
56
+ ; VI-NEXT: s_add_u32 s0, s2, s0
57
+ ; VI-NEXT: s_addc_u32 s1, s3, s1
58
+ ; VI-NEXT: v_mov_b32_e32 v3, s1
55
59
; VI-NEXT: v_mov_b32_e32 v1, s5
56
- ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
60
+ ; VI-NEXT: v_mov_b32_e32 v2, s0
57
61
; VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
58
62
; VI-NEXT: s_endpgm
59
63
;
@@ -67,13 +71,15 @@ define amdgpu_kernel void @saddo_i64_zext(ptr addrspace(1) %out, i64 %a, i64 %b)
67
71
; GFX9-NEXT: s_add_u32 s0, s6, s2
68
72
; GFX9-NEXT: v_mov_b32_e32 v1, s7
69
73
; GFX9-NEXT: s_addc_u32 s1, s7, s3
70
- ; GFX9-NEXT: v_cmp_lt_i64_e64 s[8:9], s[2:3], 0
71
74
; GFX9-NEXT: v_cmp_lt_i64_e32 vcc, s[0:1], v[0:1]
75
+ ; GFX9-NEXT: v_cmp_lt_i64_e64 s[2:3], s[2:3], 0
76
+ ; GFX9-NEXT: s_xor_b64 s[2:3], s[2:3], vcc
77
+ ; GFX9-NEXT: s_and_b64 s[2:3], s[2:3], exec
78
+ ; GFX9-NEXT: s_cselect_b64 s[2:3], 1, 0
79
+ ; GFX9-NEXT: s_add_u32 s0, s0, s2
80
+ ; GFX9-NEXT: s_addc_u32 s1, s1, s3
81
+ ; GFX9-NEXT: v_mov_b32_e32 v0, s0
72
82
; GFX9-NEXT: v_mov_b32_e32 v1, s1
73
- ; GFX9-NEXT: s_xor_b64 s[2:3], s[8:9], vcc
74
- ; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[2:3]
75
- ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s0, v0
76
- ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
77
83
; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5]
78
84
; GFX9-NEXT: s_endpgm
79
85
;
@@ -87,11 +93,14 @@ define amdgpu_kernel void @saddo_i64_zext(ptr addrspace(1) %out, i64 %a, i64 %b)
87
93
; GFX10-NEXT: s_add_u32 s0, s6, s2
88
94
; GFX10-NEXT: s_addc_u32 s1, s7, s3
89
95
; GFX10-NEXT: v_cmp_lt_i64_e64 s2, s[2:3], 0
90
- ; GFX10-NEXT: v_cmp_lt_i64_e64 s3, s[0:1], s[6:7]
91
- ; GFX10-NEXT: s_xor_b32 s2, s2, s3
92
- ; GFX10-NEXT: v_cndmask_b32_e64 v0, 0, 1, s2
93
- ; GFX10-NEXT: v_add_co_u32 v0, s0, s0, v0
94
- ; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s0, s1, 0, s0
96
+ ; GFX10-NEXT: v_cmp_lt_i64_e64 s6, s[0:1], s[6:7]
97
+ ; GFX10-NEXT: s_xor_b32 s2, s2, s6
98
+ ; GFX10-NEXT: s_and_b32 s2, s2, exec_lo
99
+ ; GFX10-NEXT: s_cselect_b64 s[2:3], 1, 0
100
+ ; GFX10-NEXT: s_add_u32 s0, s0, s2
101
+ ; GFX10-NEXT: s_addc_u32 s1, s1, s3
102
+ ; GFX10-NEXT: v_mov_b32_e32 v0, s0
103
+ ; GFX10-NEXT: v_mov_b32_e32 v1, s1
95
104
; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5]
96
105
; GFX10-NEXT: s_endpgm
97
106
;
@@ -100,18 +109,20 @@ define amdgpu_kernel void @saddo_i64_zext(ptr addrspace(1) %out, i64 %a, i64 %b)
100
109
; GFX11-NEXT: s_clause 0x1
101
110
; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
102
111
; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
103
- ; GFX11-NEXT: v_mov_b32_e32 v2, 0
104
112
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
105
113
; GFX11-NEXT: s_add_u32 s2, s6, s0
106
114
; GFX11-NEXT: s_addc_u32 s3, s7, s1
107
115
; GFX11-NEXT: v_cmp_lt_i64_e64 s0, s[0:1], 0
108
- ; GFX11-NEXT: v_cmp_lt_i64_e64 s1, s[2:3], s[6:7]
116
+ ; GFX11-NEXT: v_cmp_lt_i64_e64 s6, s[2:3], s[6:7]
109
117
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
110
- ; GFX11-NEXT: s_xor_b32 s0, s0, s1
111
- ; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0
112
- ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
113
- ; GFX11-NEXT: v_add_co_u32 v0, s0, s2, v0
114
- ; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, s3, 0, s0
118
+ ; GFX11-NEXT: s_xor_b32 s0, s0, s6
119
+ ; GFX11-NEXT: s_and_b32 s0, s0, exec_lo
120
+ ; GFX11-NEXT: s_cselect_b64 s[0:1], 1, 0
121
+ ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
122
+ ; GFX11-NEXT: s_add_u32 s0, s2, s0
123
+ ; GFX11-NEXT: s_addc_u32 s1, s3, s1
124
+ ; GFX11-NEXT: v_mov_b32_e32 v0, s0
125
+ ; GFX11-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s1
115
126
; GFX11-NEXT: global_store_b64 v2, v[0:1], s[4:5]
116
127
; GFX11-NEXT: s_nop 0
117
128
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
0 commit comments