@@ -111,75 +111,65 @@ define amdgpu_ps float @v_test_cvt_f32_bf16_v(float %src) {
111
111
define amdgpu_ps float @v_test_cvt_v2f64_v2bf16_v (<2 x double > %src ) {
112
112
; GFX-942-LABEL: v_test_cvt_v2f64_v2bf16_v:
113
113
; GFX-942: ; %bb.0:
114
- ; GFX-942-NEXT: v_cvt_f32_f64_e64 v6, | v[0:1]|
114
+ ; GFX-942-NEXT: v_cvt_f32_f64_e32 v6, v[0:1]
115
115
; GFX-942-NEXT: v_cvt_f64_f32_e32 v[4:5], v6
116
116
; GFX-942-NEXT: v_and_b32_e32 v7, 1, v6
117
- ; GFX-942-NEXT: v_cmp_gt_f64_e64 s[2:3], |v[0:1]|, v[4:5]
118
- ; GFX-942-NEXT: v_cmp_nlg_f64_e64 s[0:1], | v[0:1]| , v[4:5]
119
- ; GFX-942-NEXT: v_cmp_eq_u32_e32 vcc , 1, v7
117
+ ; GFX-942-NEXT: v_cmp_gt_f64_e64 s[2:3], |v[0:1]|, | v[4:5]|
118
+ ; GFX-942-NEXT: v_cmp_nlg_f64_e32 vcc, v[0:1], v[4:5]
119
+ ; GFX-942-NEXT: v_cmp_eq_u32_e64 s[0:1] , 1, v7
120
120
; GFX-942-NEXT: v_cndmask_b32_e64 v4, -1, 1, s[2:3]
121
121
; GFX-942-NEXT: v_add_u32_e32 v4, v6, v4
122
- ; GFX-942-NEXT: s_or_b64 vcc, s[0:1], vcc
122
+ ; GFX-942-NEXT: s_or_b64 vcc, vcc, s[0:1]
123
123
; GFX-942-NEXT: v_cndmask_b32_e32 v4, v4, v6, vcc
124
- ; GFX-942-NEXT: s_brev_b32 s4, 1
125
- ; GFX-942-NEXT: v_and_or_b32 v5, v1, s4, v4
126
- ; GFX-942-NEXT: v_bfe_u32 v4, v4, 16, 1
127
- ; GFX-942-NEXT: s_movk_i32 s5, 0x7fff
128
- ; GFX-942-NEXT: v_add3_u32 v4, v4, v5, s5
129
- ; GFX-942-NEXT: v_or_b32_e32 v5, 0x400000, v5
124
+ ; GFX-942-NEXT: v_bfe_u32 v5, v4, 16, 1
125
+ ; GFX-942-NEXT: s_movk_i32 s4, 0x7fff
126
+ ; GFX-942-NEXT: v_add3_u32 v5, v5, v4, s4
127
+ ; GFX-942-NEXT: v_or_b32_e32 v4, 0x400000, v4
130
128
; GFX-942-NEXT: v_cmp_u_f64_e32 vcc, v[0:1], v[0:1]
131
129
; GFX-942-NEXT: s_nop 1
132
- ; GFX-942-NEXT: v_cndmask_b32_e32 v4, v4, v5 , vcc
133
- ; GFX-942-NEXT: v_cvt_f32_f64_e64 v5, | v[2:3]|
130
+ ; GFX-942-NEXT: v_cndmask_b32_e32 v4, v5, v4 , vcc
131
+ ; GFX-942-NEXT: v_cvt_f32_f64_e32 v5, v[2:3]
134
132
; GFX-942-NEXT: v_cvt_f64_f32_e32 v[0:1], v5
135
133
; GFX-942-NEXT: v_and_b32_e32 v6, 1, v5
136
- ; GFX-942-NEXT: v_cmp_gt_f64_e64 s[2:3], |v[2:3]|, v[0:1]
137
- ; GFX-942-NEXT: v_cmp_nlg_f64_e64 s[0:1], | v[2:3]| , v[0:1]
138
- ; GFX-942-NEXT: v_cmp_eq_u32_e32 vcc , 1, v6
134
+ ; GFX-942-NEXT: v_cmp_gt_f64_e64 s[2:3], |v[2:3]|, | v[0:1]|
135
+ ; GFX-942-NEXT: v_cmp_nlg_f64_e32 vcc, v[2:3], v[0:1]
136
+ ; GFX-942-NEXT: v_cmp_eq_u32_e64 s[0:1] , 1, v6
139
137
; GFX-942-NEXT: v_cndmask_b32_e64 v0, -1, 1, s[2:3]
140
138
; GFX-942-NEXT: v_add_u32_e32 v0, v5, v0
141
- ; GFX-942-NEXT: s_or_b64 vcc, s[0:1], vcc
139
+ ; GFX-942-NEXT: s_or_b64 vcc, vcc, s[0:1]
142
140
; GFX-942-NEXT: v_cndmask_b32_e32 v0, v0, v5, vcc
143
- ; GFX-942-NEXT: v_and_or_b32 v1, v3, s4, v0
144
- ; GFX-942-NEXT: v_bfe_u32 v0, v0, 16, 1
145
- ; GFX-942-NEXT: v_add3_u32 v0, v0, v1, s5
146
- ; GFX-942-NEXT: v_or_b32_e32 v1, 0x400000, v1
141
+ ; GFX-942-NEXT: v_bfe_u32 v1, v0, 16, 1
142
+ ; GFX-942-NEXT: v_add3_u32 v1, v1, v0, s4
143
+ ; GFX-942-NEXT: v_or_b32_e32 v0, 0x400000, v0
147
144
; GFX-942-NEXT: v_cmp_u_f64_e32 vcc, v[2:3], v[2:3]
148
145
; GFX-942-NEXT: s_mov_b32 s0, 0x7060302
149
146
; GFX-942-NEXT: s_nop 0
150
- ; GFX-942-NEXT: v_cndmask_b32_e32 v0, v0, v1 , vcc
147
+ ; GFX-942-NEXT: v_cndmask_b32_e32 v0, v1, v0 , vcc
151
148
; GFX-942-NEXT: v_perm_b32 v0, v0, v4, s0
152
149
; GFX-942-NEXT: ; return to shader part epilog
153
150
;
154
151
; GFX-950-LABEL: v_test_cvt_v2f64_v2bf16_v:
155
152
; GFX-950: ; %bb.0:
156
- ; GFX-950-NEXT: v_mov_b32_e32 v4, v3
157
- ; GFX-950-NEXT: v_and_b32_e32 v3, 0x7fffffff, v4
158
- ; GFX-950-NEXT: v_mov_b32_e32 v5, v1
159
- ; GFX-950-NEXT: v_cvt_f32_f64_e32 v1, v[2:3]
160
- ; GFX-950-NEXT: v_cvt_f64_f32_e32 v[6:7], v1
161
- ; GFX-950-NEXT: v_and_b32_e32 v8, 1, v1
162
- ; GFX-950-NEXT: v_cmp_gt_f64_e64 s[2:3], v[2:3], v[6:7]
163
- ; GFX-950-NEXT: v_cmp_nlg_f64_e32 vcc, v[2:3], v[6:7]
164
- ; GFX-950-NEXT: v_cmp_eq_u32_e64 s[0:1], 1, v8
153
+ ; GFX-950-NEXT: v_cvt_f32_f64_e32 v6, v[2:3]
154
+ ; GFX-950-NEXT: v_cvt_f64_f32_e32 v[4:5], v6
155
+ ; GFX-950-NEXT: v_and_b32_e32 v7, 1, v6
156
+ ; GFX-950-NEXT: v_cmp_gt_f64_e64 s[2:3], |v[2:3]|, |v[4:5]|
157
+ ; GFX-950-NEXT: v_cmp_nlg_f64_e32 vcc, v[2:3], v[4:5]
158
+ ; GFX-950-NEXT: v_cmp_eq_u32_e64 s[0:1], 1, v7
165
159
; GFX-950-NEXT: v_cndmask_b32_e64 v2, -1, 1, s[2:3]
166
- ; GFX-950-NEXT: v_add_u32_e32 v2, v1 , v2
160
+ ; GFX-950-NEXT: v_add_u32_e32 v2, v6 , v2
167
161
; GFX-950-NEXT: s_or_b64 vcc, vcc, s[0:1]
168
- ; GFX-950-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc
169
- ; GFX-950-NEXT: s_brev_b32 s4, 1
170
- ; GFX-950-NEXT: v_and_or_b32 v4, v4, s4, v1
171
- ; GFX-950-NEXT: v_and_b32_e32 v1, 0x7fffffff, v5
172
- ; GFX-950-NEXT: v_cvt_f32_f64_e32 v6, v[0:1]
173
- ; GFX-950-NEXT: v_cvt_f64_f32_e32 v[2:3], v6
174
- ; GFX-950-NEXT: v_and_b32_e32 v7, 1, v6
175
- ; GFX-950-NEXT: v_cmp_gt_f64_e64 s[2:3], v[0:1], v[2:3]
162
+ ; GFX-950-NEXT: v_cvt_f32_f64_e32 v5, v[0:1]
163
+ ; GFX-950-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc
164
+ ; GFX-950-NEXT: v_cvt_f64_f32_e32 v[2:3], v5
165
+ ; GFX-950-NEXT: v_and_b32_e32 v6, 1, v5
166
+ ; GFX-950-NEXT: v_cmp_gt_f64_e64 s[2:3], |v[0:1]|, |v[2:3]|
176
167
; GFX-950-NEXT: v_cmp_nlg_f64_e32 vcc, v[0:1], v[2:3]
177
- ; GFX-950-NEXT: v_cmp_eq_u32_e64 s[0:1], 1, v7
168
+ ; GFX-950-NEXT: v_cmp_eq_u32_e64 s[0:1], 1, v6
178
169
; GFX-950-NEXT: v_cndmask_b32_e64 v0, -1, 1, s[2:3]
179
- ; GFX-950-NEXT: v_add_u32_e32 v0, v6 , v0
170
+ ; GFX-950-NEXT: v_add_u32_e32 v0, v5 , v0
180
171
; GFX-950-NEXT: s_or_b64 vcc, vcc, s[0:1]
181
- ; GFX-950-NEXT: v_cndmask_b32_e32 v0, v0, v6, vcc
182
- ; GFX-950-NEXT: v_and_or_b32 v0, v5, s4, v0
172
+ ; GFX-950-NEXT: v_cndmask_b32_e32 v0, v0, v5, vcc
183
173
; GFX-950-NEXT: v_cvt_pk_bf16_f32 v0, v0, v4
184
174
; GFX-950-NEXT: ; return to shader part epilog
185
175
%res = fptrunc <2 x double > %src to <2 x bfloat>
@@ -348,42 +338,38 @@ entry:
348
338
define amdgpu_ps void @fptrunc_f64_to_bf16 (double %a , ptr %out ) {
349
339
; GFX-942-LABEL: fptrunc_f64_to_bf16:
350
340
; GFX-942: ; %bb.0: ; %entry
351
- ; GFX-942-NEXT: v_cvt_f32_f64_e64 v6, | v[0:1]|
341
+ ; GFX-942-NEXT: v_cvt_f32_f64_e32 v6, v[0:1]
352
342
; GFX-942-NEXT: v_cvt_f64_f32_e32 v[4:5], v6
353
343
; GFX-942-NEXT: v_and_b32_e32 v7, 1, v6
354
- ; GFX-942-NEXT: v_cmp_gt_f64_e64 s[2:3], |v[0:1]|, v[4:5]
355
- ; GFX-942-NEXT: v_cmp_nlg_f64_e64 s[0:1], | v[0:1]| , v[4:5]
356
- ; GFX-942-NEXT: v_cmp_eq_u32_e32 vcc , 1, v7
344
+ ; GFX-942-NEXT: v_cmp_gt_f64_e64 s[2:3], |v[0:1]|, | v[4:5]|
345
+ ; GFX-942-NEXT: v_cmp_nlg_f64_e32 vcc, v[0:1], v[4:5]
346
+ ; GFX-942-NEXT: v_cmp_eq_u32_e64 s[0:1] , 1, v7
357
347
; GFX-942-NEXT: v_cndmask_b32_e64 v4, -1, 1, s[2:3]
358
348
; GFX-942-NEXT: v_add_u32_e32 v4, v6, v4
359
- ; GFX-942-NEXT: s_or_b64 vcc, s[0:1], vcc
349
+ ; GFX-942-NEXT: s_or_b64 vcc, vcc, s[0:1]
360
350
; GFX-942-NEXT: v_cndmask_b32_e32 v4, v4, v6, vcc
361
- ; GFX-942-NEXT: s_brev_b32 s0, 1
362
- ; GFX-942-NEXT: v_and_or_b32 v5, v1, s0, v4
363
- ; GFX-942-NEXT: v_bfe_u32 v4, v4, 16, 1
351
+ ; GFX-942-NEXT: v_bfe_u32 v5, v4, 16, 1
364
352
; GFX-942-NEXT: s_movk_i32 s0, 0x7fff
365
- ; GFX-942-NEXT: v_add3_u32 v4, v4, v5 , s0
366
- ; GFX-942-NEXT: v_or_b32_e32 v5 , 0x400000, v5
353
+ ; GFX-942-NEXT: v_add3_u32 v5, v5, v4 , s0
354
+ ; GFX-942-NEXT: v_or_b32_e32 v4 , 0x400000, v4
367
355
; GFX-942-NEXT: v_cmp_u_f64_e32 vcc, v[0:1], v[0:1]
368
356
; GFX-942-NEXT: s_nop 1
369
- ; GFX-942-NEXT: v_cndmask_b32_e32 v0, v4, v5 , vcc
357
+ ; GFX-942-NEXT: v_cndmask_b32_e32 v0, v5, v4 , vcc
370
358
; GFX-942-NEXT: flat_store_short_d16_hi v[2:3], v0
371
359
; GFX-942-NEXT: s_endpgm
372
360
;
373
361
; GFX-950-LABEL: fptrunc_f64_to_bf16:
374
362
; GFX-950: ; %bb.0: ; %entry
375
- ; GFX-950-NEXT: v_cvt_f32_f64_e64 v6, | v[0:1]|
363
+ ; GFX-950-NEXT: v_cvt_f32_f64_e32 v6, v[0:1]
376
364
; GFX-950-NEXT: v_cvt_f64_f32_e32 v[4:5], v6
377
365
; GFX-950-NEXT: v_and_b32_e32 v7, 1, v6
378
- ; GFX-950-NEXT: v_cmp_gt_f64_e64 s[2:3], |v[0:1]|, v[4:5]
379
- ; GFX-950-NEXT: v_cmp_nlg_f64_e64 s[0:1], | v[0:1]| , v[4:5]
380
- ; GFX-950-NEXT: v_cmp_eq_u32_e32 vcc , 1, v7
366
+ ; GFX-950-NEXT: v_cmp_gt_f64_e64 s[2:3], |v[0:1]|, | v[4:5]|
367
+ ; GFX-950-NEXT: v_cmp_nlg_f64_e32 vcc, v[0:1], v[4:5]
368
+ ; GFX-950-NEXT: v_cmp_eq_u32_e64 s[0:1] , 1, v7
381
369
; GFX-950-NEXT: v_cndmask_b32_e64 v0, -1, 1, s[2:3]
382
370
; GFX-950-NEXT: v_add_u32_e32 v0, v6, v0
383
- ; GFX-950-NEXT: s_or_b64 vcc, s[0:1], vcc
371
+ ; GFX-950-NEXT: s_or_b64 vcc, vcc, s[0:1]
384
372
; GFX-950-NEXT: v_cndmask_b32_e32 v0, v0, v6, vcc
385
- ; GFX-950-NEXT: s_brev_b32 s0, 1
386
- ; GFX-950-NEXT: v_and_or_b32 v0, v1, s0, v0
387
373
; GFX-950-NEXT: v_cvt_pk_bf16_f32 v0, v0, s0
388
374
; GFX-950-NEXT: flat_store_short v[2:3], v0
389
375
; GFX-950-NEXT: s_endpgm
@@ -396,44 +382,38 @@ entry:
396
382
define amdgpu_ps void @fptrunc_f64_to_bf16_neg (double %a , ptr %out ) {
397
383
; GFX-942-LABEL: fptrunc_f64_to_bf16_neg:
398
384
; GFX-942: ; %bb.0: ; %entry
399
- ; GFX-942-NEXT: v_cvt_f32_f64_e64 v7, | v[0:1]|
400
- ; GFX-942-NEXT: v_cvt_f64_f32_e32 v[4:5], v7
401
- ; GFX-942-NEXT: v_and_b32_e32 v8 , 1, v7
402
- ; GFX-942-NEXT: v_cmp_gt_f64_e64 s[2:3], |v[0:1]|, v[4:5]
403
- ; GFX-942-NEXT: v_cmp_nlg_f64_e64 s[0:1], | v[0:1]| , v[4:5]
404
- ; GFX-942-NEXT: v_cmp_eq_u32_e32 vcc, 1, v8
385
+ ; GFX-942-NEXT: v_cvt_f32_f64_e64 v6, - v[0:1]
386
+ ; GFX-942-NEXT: v_cvt_f64_f32_e32 v[4:5], v6
387
+ ; GFX-942-NEXT: v_and_b32_e32 v7 , 1, v6
388
+ ; GFX-942-NEXT: v_cmp_gt_f64_e64 s[2:3], |v[0:1]|, | v[4:5]|
389
+ ; GFX-942-NEXT: v_cmp_nlg_f64_e64 s[0:1], - v[0:1], v[4:5]
390
+ ; GFX-942-NEXT: v_cmp_eq_u32_e32 vcc, 1, v7
405
391
; GFX-942-NEXT: v_cndmask_b32_e64 v4, -1, 1, s[2:3]
406
- ; GFX-942-NEXT: v_add_u32_e32 v4, v7 , v4
392
+ ; GFX-942-NEXT: v_add_u32_e32 v4, v6 , v4
407
393
; GFX-942-NEXT: s_or_b64 vcc, s[0:1], vcc
408
- ; GFX-942-NEXT: s_brev_b32 s4, 1
409
- ; GFX-942-NEXT: v_xor_b32_e32 v6, 0x80000000, v1
410
- ; GFX-942-NEXT: v_cndmask_b32_e32 v4, v4, v7, vcc
411
- ; GFX-942-NEXT: v_and_or_b32 v5, v6, s4, v4
412
- ; GFX-942-NEXT: v_bfe_u32 v4, v4, 16, 1
394
+ ; GFX-942-NEXT: v_cndmask_b32_e32 v4, v4, v6, vcc
395
+ ; GFX-942-NEXT: v_bfe_u32 v5, v4, 16, 1
413
396
; GFX-942-NEXT: s_movk_i32 s0, 0x7fff
414
- ; GFX-942-NEXT: v_add3_u32 v4, v4, v5 , s0
415
- ; GFX-942-NEXT: v_or_b32_e32 v5 , 0x400000, v5
397
+ ; GFX-942-NEXT: v_add3_u32 v5, v5, v4 , s0
398
+ ; GFX-942-NEXT: v_or_b32_e32 v4 , 0x400000, v4
416
399
; GFX-942-NEXT: v_cmp_u_f64_e64 vcc, -v[0:1], -v[0:1]
417
400
; GFX-942-NEXT: s_nop 1
418
- ; GFX-942-NEXT: v_cndmask_b32_e32 v0, v4, v5 , vcc
401
+ ; GFX-942-NEXT: v_cndmask_b32_e32 v0, v5, v4 , vcc
419
402
; GFX-942-NEXT: flat_store_short_d16_hi v[2:3], v0
420
403
; GFX-942-NEXT: s_endpgm
421
404
;
422
405
; GFX-950-LABEL: fptrunc_f64_to_bf16_neg:
423
406
; GFX-950: ; %bb.0: ; %entry
424
- ; GFX-950-NEXT: v_cvt_f32_f64_e64 v7, | v[0:1]|
425
- ; GFX-950-NEXT: v_cvt_f64_f32_e32 v[4:5], v7
426
- ; GFX-950-NEXT: v_and_b32_e32 v8 , 1, v7
427
- ; GFX-950-NEXT: v_cmp_gt_f64_e64 s[2:3], |v[0:1]|, v[4:5]
428
- ; GFX-950-NEXT: v_cmp_nlg_f64_e64 s[0:1], | v[0:1]| , v[4:5]
429
- ; GFX-950-NEXT: v_cmp_eq_u32_e32 vcc, 1, v8
407
+ ; GFX-950-NEXT: v_cvt_f32_f64_e64 v6, - v[0:1]
408
+ ; GFX-950-NEXT: v_cvt_f64_f32_e32 v[4:5], v6
409
+ ; GFX-950-NEXT: v_and_b32_e32 v7 , 1, v6
410
+ ; GFX-950-NEXT: v_cmp_gt_f64_e64 s[2:3], |v[0:1]|, | v[4:5]|
411
+ ; GFX-950-NEXT: v_cmp_nlg_f64_e64 s[0:1], - v[0:1], v[4:5]
412
+ ; GFX-950-NEXT: v_cmp_eq_u32_e32 vcc, 1, v7
430
413
; GFX-950-NEXT: v_cndmask_b32_e64 v0, -1, 1, s[2:3]
431
- ; GFX-950-NEXT: v_add_u32_e32 v0, v7 , v0
414
+ ; GFX-950-NEXT: v_add_u32_e32 v0, v6 , v0
432
415
; GFX-950-NEXT: s_or_b64 vcc, s[0:1], vcc
433
- ; GFX-950-NEXT: s_brev_b32 s4, 1
434
- ; GFX-950-NEXT: v_xor_b32_e32 v6, 0x80000000, v1
435
- ; GFX-950-NEXT: v_cndmask_b32_e32 v0, v0, v7, vcc
436
- ; GFX-950-NEXT: v_and_or_b32 v0, v6, s4, v0
416
+ ; GFX-950-NEXT: v_cndmask_b32_e32 v0, v0, v6, vcc
437
417
; GFX-950-NEXT: v_cvt_pk_bf16_f32 v0, v0, s0
438
418
; GFX-950-NEXT: flat_store_short v[2:3], v0
439
419
; GFX-950-NEXT: s_endpgm
@@ -447,44 +427,38 @@ entry:
447
427
define amdgpu_ps void @fptrunc_f64_to_bf16_abs (double %a , ptr %out ) {
448
428
; GFX-942-LABEL: fptrunc_f64_to_bf16_abs:
449
429
; GFX-942: ; %bb.0: ; %entry
450
- ; GFX-942-NEXT: v_cvt_f32_f64_e64 v7 , |v[0:1]|
451
- ; GFX-942-NEXT: v_cvt_f64_f32_e32 v[4:5], v7
452
- ; GFX-942-NEXT: v_and_b32_e32 v8 , 1, v7
453
- ; GFX-942-NEXT: v_cmp_gt_f64_e64 s[2:3], |v[0:1]|, v[4:5]
430
+ ; GFX-942-NEXT: v_cvt_f32_f64_e64 v6 , |v[0:1]|
431
+ ; GFX-942-NEXT: v_cvt_f64_f32_e32 v[4:5], v6
432
+ ; GFX-942-NEXT: v_and_b32_e32 v7 , 1, v6
433
+ ; GFX-942-NEXT: v_cmp_gt_f64_e64 s[2:3], |v[0:1]|, | v[4:5]|
454
434
; GFX-942-NEXT: v_cmp_nlg_f64_e64 s[0:1], |v[0:1]|, v[4:5]
455
- ; GFX-942-NEXT: v_cmp_eq_u32_e32 vcc, 1, v8
435
+ ; GFX-942-NEXT: v_cmp_eq_u32_e32 vcc, 1, v7
456
436
; GFX-942-NEXT: v_cndmask_b32_e64 v4, -1, 1, s[2:3]
457
- ; GFX-942-NEXT: v_add_u32_e32 v4, v7 , v4
437
+ ; GFX-942-NEXT: v_add_u32_e32 v4, v6 , v4
458
438
; GFX-942-NEXT: s_or_b64 vcc, s[0:1], vcc
459
- ; GFX-942-NEXT: v_and_b32_e32 v6, 0x7fffffff, v1
460
- ; GFX-942-NEXT: v_cndmask_b32_e32 v4, v4, v7, vcc
461
- ; GFX-942-NEXT: s_brev_b32 s0, 1
462
- ; GFX-942-NEXT: v_and_or_b32 v5, v6, s0, v4
463
- ; GFX-942-NEXT: v_bfe_u32 v4, v4, 16, 1
439
+ ; GFX-942-NEXT: v_cndmask_b32_e32 v4, v4, v6, vcc
440
+ ; GFX-942-NEXT: v_bfe_u32 v5, v4, 16, 1
464
441
; GFX-942-NEXT: s_movk_i32 s0, 0x7fff
465
- ; GFX-942-NEXT: v_add3_u32 v4, v4, v5 , s0
466
- ; GFX-942-NEXT: v_or_b32_e32 v5 , 0x400000, v5
442
+ ; GFX-942-NEXT: v_add3_u32 v5, v5, v4 , s0
443
+ ; GFX-942-NEXT: v_or_b32_e32 v4 , 0x400000, v4
467
444
; GFX-942-NEXT: v_cmp_u_f64_e64 vcc, |v[0:1]|, |v[0:1]|
468
445
; GFX-942-NEXT: s_nop 1
469
- ; GFX-942-NEXT: v_cndmask_b32_e32 v0, v4, v5 , vcc
446
+ ; GFX-942-NEXT: v_cndmask_b32_e32 v0, v5, v4 , vcc
470
447
; GFX-942-NEXT: flat_store_short_d16_hi v[2:3], v0
471
448
; GFX-942-NEXT: s_endpgm
472
449
;
473
450
; GFX-950-LABEL: fptrunc_f64_to_bf16_abs:
474
451
; GFX-950: ; %bb.0: ; %entry
475
- ; GFX-950-NEXT: v_cvt_f32_f64_e64 v7 , |v[0:1]|
476
- ; GFX-950-NEXT: v_cvt_f64_f32_e32 v[4:5], v7
477
- ; GFX-950-NEXT: v_and_b32_e32 v8 , 1, v7
478
- ; GFX-950-NEXT: v_cmp_gt_f64_e64 s[2:3], |v[0:1]|, v[4:5]
452
+ ; GFX-950-NEXT: v_cvt_f32_f64_e64 v6 , |v[0:1]|
453
+ ; GFX-950-NEXT: v_cvt_f64_f32_e32 v[4:5], v6
454
+ ; GFX-950-NEXT: v_and_b32_e32 v7 , 1, v6
455
+ ; GFX-950-NEXT: v_cmp_gt_f64_e64 s[2:3], |v[0:1]|, | v[4:5]|
479
456
; GFX-950-NEXT: v_cmp_nlg_f64_e64 s[0:1], |v[0:1]|, v[4:5]
480
- ; GFX-950-NEXT: v_cmp_eq_u32_e32 vcc, 1, v8
457
+ ; GFX-950-NEXT: v_cmp_eq_u32_e32 vcc, 1, v7
481
458
; GFX-950-NEXT: v_cndmask_b32_e64 v0, -1, 1, s[2:3]
482
- ; GFX-950-NEXT: v_add_u32_e32 v0, v7 , v0
459
+ ; GFX-950-NEXT: v_add_u32_e32 v0, v6 , v0
483
460
; GFX-950-NEXT: s_or_b64 vcc, s[0:1], vcc
484
- ; GFX-950-NEXT: v_and_b32_e32 v6, 0x7fffffff, v1
485
- ; GFX-950-NEXT: v_cndmask_b32_e32 v0, v0, v7, vcc
486
- ; GFX-950-NEXT: s_brev_b32 s0, 1
487
- ; GFX-950-NEXT: v_and_or_b32 v0, v6, s0, v0
461
+ ; GFX-950-NEXT: v_cndmask_b32_e32 v0, v0, v6, vcc
488
462
; GFX-950-NEXT: v_cvt_pk_bf16_f32 v0, v0, s0
489
463
; GFX-950-NEXT: flat_store_short v[2:3], v0
490
464
; GFX-950-NEXT: s_endpgm
0 commit comments