Skip to content

Commit 344a491

Browse files
authored
[CodeGen] Simplify expandRoundInexactToOdd (#134988)
FP_ROUND and FP_EXTEND the input value before FABSing it. This avoids some bit twiddling to copy the sign bit from the input to the result. It does introduce one extra FABS, but that is folded into another instruction for free on AMDGPU, which is the only target currently affected by this change.
1 parent f030f6f commit 344a491

File tree

4 files changed

+481
-571
lines changed

4 files changed

+481
-571
lines changed

llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp

Lines changed: 7 additions & 25 deletions
Original file line number | Diff line number | Diff line change
@@ -11608,28 +11608,13 @@ SDValue TargetLowering::expandRoundInexactToOdd(EVT ResultVT, SDValue Op,
1160811608
// correct for this using a trick explained in: Boldo, Sylvie, and
1160911609
// Guillaume Melquiond. "When double rounding is odd." 17th IMACS
1161011610
// World Congress. 2005.
11611-
unsigned BitSize = OperandVT.getScalarSizeInBits();
11612-
EVT WideIntVT = OperandVT.changeTypeToInteger();
11613-
SDValue OpAsInt = DAG.getBitcast(WideIntVT, Op);
11614-
SDValue SignBit =
11615-
DAG.getNode(ISD::AND, dl, WideIntVT, OpAsInt,
11616-
DAG.getConstant(APInt::getSignMask(BitSize), dl, WideIntVT));
11617-
SDValue AbsWide;
11618-
if (isOperationLegalOrCustom(ISD::FABS, OperandVT)) {
11619-
AbsWide = DAG.getNode(ISD::FABS, dl, OperandVT, Op);
11620-
} else {
11621-
SDValue ClearedSign = DAG.getNode(
11622-
ISD::AND, dl, WideIntVT, OpAsInt,
11623-
DAG.getConstant(APInt::getSignedMaxValue(BitSize), dl, WideIntVT));
11624-
AbsWide = DAG.getBitcast(OperandVT, ClearedSign);
11625-
}
11626-
SDValue AbsNarrow = DAG.getFPExtendOrRound(AbsWide, dl, ResultVT);
11627-
SDValue AbsNarrowAsWide = DAG.getFPExtendOrRound(AbsNarrow, dl, OperandVT);
11611+
SDValue Narrow = DAG.getFPExtendOrRound(Op, dl, ResultVT);
11612+
SDValue NarrowAsWide = DAG.getFPExtendOrRound(Narrow, dl, OperandVT);
1162811613

1162911614
// We can keep the narrow value as-is if narrowing was exact (no
1163011615
// rounding error), the wide value was NaN (the narrow value is also
1163111616
// NaN and should be preserved) or if we rounded to the odd value.
11632-
SDValue NarrowBits = DAG.getNode(ISD::BITCAST, dl, ResultIntVT, AbsNarrow);
11617+
SDValue NarrowBits = DAG.getNode(ISD::BITCAST, dl, ResultIntVT, Narrow);
1163311618
SDValue One = DAG.getConstant(1, dl, ResultIntVT);
1163411619
SDValue NegativeOne = DAG.getAllOnesConstant(dl, ResultIntVT);
1163511620
SDValue And = DAG.getNode(ISD::AND, dl, ResultIntVT, NarrowBits, One);
@@ -11640,13 +11625,15 @@ SDValue TargetLowering::expandRoundInexactToOdd(EVT ResultVT, SDValue Op,
1164011625
SDValue AlreadyOdd = DAG.getSetCC(dl, ResultIntVTCCVT, And, Zero, ISD::SETNE);
1164111626

1164211627
EVT WideSetCCVT = getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(),
11643-
AbsWide.getValueType());
11628+
Op.getValueType());
1164411629
// We keep results which are exact, odd or NaN.
1164511630
SDValue KeepNarrow =
11646-
DAG.getSetCC(dl, WideSetCCVT, AbsWide, AbsNarrowAsWide, ISD::SETUEQ);
11631+
DAG.getSetCC(dl, WideSetCCVT, Op, NarrowAsWide, ISD::SETUEQ);
1164711632
KeepNarrow = DAG.getNode(ISD::OR, dl, WideSetCCVT, KeepNarrow, AlreadyOdd);
1164811633
// We morally performed a round-down if the magnitude of the narrowed
1164911634
// value (AbsNarrowAsWide) is smaller than that of the input (AbsWide).
11635+
SDValue AbsWide = DAG.getNode(ISD::FABS, dl, OperandVT, Op);
11636+
SDValue AbsNarrowAsWide = DAG.getNode(ISD::FABS, dl, OperandVT, NarrowAsWide);
1165011637
SDValue NarrowIsRd =
1165111638
DAG.getSetCC(dl, WideSetCCVT, AbsWide, AbsNarrowAsWide, ISD::SETOGT);
1165211639
// If the narrow value is odd or exact, pick it.
@@ -11656,11 +11643,6 @@ SDValue TargetLowering::expandRoundInexactToOdd(EVT ResultVT, SDValue Op,
1165611643
SDValue Adjust = DAG.getSelect(dl, ResultIntVT, NarrowIsRd, One, NegativeOne);
1165711644
SDValue Adjusted = DAG.getNode(ISD::ADD, dl, ResultIntVT, NarrowBits, Adjust);
1165811645
Op = DAG.getSelect(dl, ResultIntVT, KeepNarrow, NarrowBits, Adjusted);
11659-
int ShiftAmount = BitSize - ResultVT.getScalarSizeInBits();
11660-
SDValue ShiftCnst = DAG.getShiftAmountConstant(ShiftAmount, WideIntVT, dl);
11661-
SignBit = DAG.getNode(ISD::SRL, dl, WideIntVT, SignBit, ShiftCnst);
11662-
SignBit = DAG.getNode(ISD::TRUNCATE, dl, ResultIntVT, SignBit);
11663-
Op = DAG.getNode(ISD::OR, dl, ResultIntVT, Op, SignBit);
1166411646
return DAG.getNode(ISD::BITCAST, dl, ResultVT, Op);
1166511647
}
1166611648

llvm/test/CodeGen/AMDGPU/bf16-conversions.ll

Lines changed: 86 additions & 112 deletions
Original file line number | Diff line number | Diff line change
@@ -111,75 +111,65 @@ define amdgpu_ps float @v_test_cvt_f32_bf16_v(float %src) {
111111
define amdgpu_ps float @v_test_cvt_v2f64_v2bf16_v(<2 x double> %src) {
112112
; GFX-942-LABEL: v_test_cvt_v2f64_v2bf16_v:
113113
; GFX-942: ; %bb.0:
114-
; GFX-942-NEXT: v_cvt_f32_f64_e64 v6, |v[0:1]|
114+
; GFX-942-NEXT: v_cvt_f32_f64_e32 v6, v[0:1]
115115
; GFX-942-NEXT: v_cvt_f64_f32_e32 v[4:5], v6
116116
; GFX-942-NEXT: v_and_b32_e32 v7, 1, v6
117-
; GFX-942-NEXT: v_cmp_gt_f64_e64 s[2:3], |v[0:1]|, v[4:5]
118-
; GFX-942-NEXT: v_cmp_nlg_f64_e64 s[0:1], |v[0:1]|, v[4:5]
119-
; GFX-942-NEXT: v_cmp_eq_u32_e32 vcc, 1, v7
117+
; GFX-942-NEXT: v_cmp_gt_f64_e64 s[2:3], |v[0:1]|, |v[4:5]|
118+
; GFX-942-NEXT: v_cmp_nlg_f64_e32 vcc, v[0:1], v[4:5]
119+
; GFX-942-NEXT: v_cmp_eq_u32_e64 s[0:1], 1, v7
120120
; GFX-942-NEXT: v_cndmask_b32_e64 v4, -1, 1, s[2:3]
121121
; GFX-942-NEXT: v_add_u32_e32 v4, v6, v4
122-
; GFX-942-NEXT: s_or_b64 vcc, s[0:1], vcc
122+
; GFX-942-NEXT: s_or_b64 vcc, vcc, s[0:1]
123123
; GFX-942-NEXT: v_cndmask_b32_e32 v4, v4, v6, vcc
124-
; GFX-942-NEXT: s_brev_b32 s4, 1
125-
; GFX-942-NEXT: v_and_or_b32 v5, v1, s4, v4
126-
; GFX-942-NEXT: v_bfe_u32 v4, v4, 16, 1
127-
; GFX-942-NEXT: s_movk_i32 s5, 0x7fff
128-
; GFX-942-NEXT: v_add3_u32 v4, v4, v5, s5
129-
; GFX-942-NEXT: v_or_b32_e32 v5, 0x400000, v5
124+
; GFX-942-NEXT: v_bfe_u32 v5, v4, 16, 1
125+
; GFX-942-NEXT: s_movk_i32 s4, 0x7fff
126+
; GFX-942-NEXT: v_add3_u32 v5, v5, v4, s4
127+
; GFX-942-NEXT: v_or_b32_e32 v4, 0x400000, v4
130128
; GFX-942-NEXT: v_cmp_u_f64_e32 vcc, v[0:1], v[0:1]
131129
; GFX-942-NEXT: s_nop 1
132-
; GFX-942-NEXT: v_cndmask_b32_e32 v4, v4, v5, vcc
133-
; GFX-942-NEXT: v_cvt_f32_f64_e64 v5, |v[2:3]|
130+
; GFX-942-NEXT: v_cndmask_b32_e32 v4, v5, v4, vcc
131+
; GFX-942-NEXT: v_cvt_f32_f64_e32 v5, v[2:3]
134132
; GFX-942-NEXT: v_cvt_f64_f32_e32 v[0:1], v5
135133
; GFX-942-NEXT: v_and_b32_e32 v6, 1, v5
136-
; GFX-942-NEXT: v_cmp_gt_f64_e64 s[2:3], |v[2:3]|, v[0:1]
137-
; GFX-942-NEXT: v_cmp_nlg_f64_e64 s[0:1], |v[2:3]|, v[0:1]
138-
; GFX-942-NEXT: v_cmp_eq_u32_e32 vcc, 1, v6
134+
; GFX-942-NEXT: v_cmp_gt_f64_e64 s[2:3], |v[2:3]|, |v[0:1]|
135+
; GFX-942-NEXT: v_cmp_nlg_f64_e32 vcc, v[2:3], v[0:1]
136+
; GFX-942-NEXT: v_cmp_eq_u32_e64 s[0:1], 1, v6
139137
; GFX-942-NEXT: v_cndmask_b32_e64 v0, -1, 1, s[2:3]
140138
; GFX-942-NEXT: v_add_u32_e32 v0, v5, v0
141-
; GFX-942-NEXT: s_or_b64 vcc, s[0:1], vcc
139+
; GFX-942-NEXT: s_or_b64 vcc, vcc, s[0:1]
142140
; GFX-942-NEXT: v_cndmask_b32_e32 v0, v0, v5, vcc
143-
; GFX-942-NEXT: v_and_or_b32 v1, v3, s4, v0
144-
; GFX-942-NEXT: v_bfe_u32 v0, v0, 16, 1
145-
; GFX-942-NEXT: v_add3_u32 v0, v0, v1, s5
146-
; GFX-942-NEXT: v_or_b32_e32 v1, 0x400000, v1
141+
; GFX-942-NEXT: v_bfe_u32 v1, v0, 16, 1
142+
; GFX-942-NEXT: v_add3_u32 v1, v1, v0, s4
143+
; GFX-942-NEXT: v_or_b32_e32 v0, 0x400000, v0
147144
; GFX-942-NEXT: v_cmp_u_f64_e32 vcc, v[2:3], v[2:3]
148145
; GFX-942-NEXT: s_mov_b32 s0, 0x7060302
149146
; GFX-942-NEXT: s_nop 0
150-
; GFX-942-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc
147+
; GFX-942-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc
151148
; GFX-942-NEXT: v_perm_b32 v0, v0, v4, s0
152149
; GFX-942-NEXT: ; return to shader part epilog
153150
;
154151
; GFX-950-LABEL: v_test_cvt_v2f64_v2bf16_v:
155152
; GFX-950: ; %bb.0:
156-
; GFX-950-NEXT: v_mov_b32_e32 v4, v3
157-
; GFX-950-NEXT: v_and_b32_e32 v3, 0x7fffffff, v4
158-
; GFX-950-NEXT: v_mov_b32_e32 v5, v1
159-
; GFX-950-NEXT: v_cvt_f32_f64_e32 v1, v[2:3]
160-
; GFX-950-NEXT: v_cvt_f64_f32_e32 v[6:7], v1
161-
; GFX-950-NEXT: v_and_b32_e32 v8, 1, v1
162-
; GFX-950-NEXT: v_cmp_gt_f64_e64 s[2:3], v[2:3], v[6:7]
163-
; GFX-950-NEXT: v_cmp_nlg_f64_e32 vcc, v[2:3], v[6:7]
164-
; GFX-950-NEXT: v_cmp_eq_u32_e64 s[0:1], 1, v8
153+
; GFX-950-NEXT: v_cvt_f32_f64_e32 v6, v[2:3]
154+
; GFX-950-NEXT: v_cvt_f64_f32_e32 v[4:5], v6
155+
; GFX-950-NEXT: v_and_b32_e32 v7, 1, v6
156+
; GFX-950-NEXT: v_cmp_gt_f64_e64 s[2:3], |v[2:3]|, |v[4:5]|
157+
; GFX-950-NEXT: v_cmp_nlg_f64_e32 vcc, v[2:3], v[4:5]
158+
; GFX-950-NEXT: v_cmp_eq_u32_e64 s[0:1], 1, v7
165159
; GFX-950-NEXT: v_cndmask_b32_e64 v2, -1, 1, s[2:3]
166-
; GFX-950-NEXT: v_add_u32_e32 v2, v1, v2
160+
; GFX-950-NEXT: v_add_u32_e32 v2, v6, v2
167161
; GFX-950-NEXT: s_or_b64 vcc, vcc, s[0:1]
168-
; GFX-950-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc
169-
; GFX-950-NEXT: s_brev_b32 s4, 1
170-
; GFX-950-NEXT: v_and_or_b32 v4, v4, s4, v1
171-
; GFX-950-NEXT: v_and_b32_e32 v1, 0x7fffffff, v5
172-
; GFX-950-NEXT: v_cvt_f32_f64_e32 v6, v[0:1]
173-
; GFX-950-NEXT: v_cvt_f64_f32_e32 v[2:3], v6
174-
; GFX-950-NEXT: v_and_b32_e32 v7, 1, v6
175-
; GFX-950-NEXT: v_cmp_gt_f64_e64 s[2:3], v[0:1], v[2:3]
162+
; GFX-950-NEXT: v_cvt_f32_f64_e32 v5, v[0:1]
163+
; GFX-950-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc
164+
; GFX-950-NEXT: v_cvt_f64_f32_e32 v[2:3], v5
165+
; GFX-950-NEXT: v_and_b32_e32 v6, 1, v5
166+
; GFX-950-NEXT: v_cmp_gt_f64_e64 s[2:3], |v[0:1]|, |v[2:3]|
176167
; GFX-950-NEXT: v_cmp_nlg_f64_e32 vcc, v[0:1], v[2:3]
177-
; GFX-950-NEXT: v_cmp_eq_u32_e64 s[0:1], 1, v7
168+
; GFX-950-NEXT: v_cmp_eq_u32_e64 s[0:1], 1, v6
178169
; GFX-950-NEXT: v_cndmask_b32_e64 v0, -1, 1, s[2:3]
179-
; GFX-950-NEXT: v_add_u32_e32 v0, v6, v0
170+
; GFX-950-NEXT: v_add_u32_e32 v0, v5, v0
180171
; GFX-950-NEXT: s_or_b64 vcc, vcc, s[0:1]
181-
; GFX-950-NEXT: v_cndmask_b32_e32 v0, v0, v6, vcc
182-
; GFX-950-NEXT: v_and_or_b32 v0, v5, s4, v0
172+
; GFX-950-NEXT: v_cndmask_b32_e32 v0, v0, v5, vcc
183173
; GFX-950-NEXT: v_cvt_pk_bf16_f32 v0, v0, v4
184174
; GFX-950-NEXT: ; return to shader part epilog
185175
%res = fptrunc <2 x double> %src to <2 x bfloat>
@@ -348,42 +338,38 @@ entry:
348338
define amdgpu_ps void @fptrunc_f64_to_bf16(double %a, ptr %out) {
349339
; GFX-942-LABEL: fptrunc_f64_to_bf16:
350340
; GFX-942: ; %bb.0: ; %entry
351-
; GFX-942-NEXT: v_cvt_f32_f64_e64 v6, |v[0:1]|
341+
; GFX-942-NEXT: v_cvt_f32_f64_e32 v6, v[0:1]
352342
; GFX-942-NEXT: v_cvt_f64_f32_e32 v[4:5], v6
353343
; GFX-942-NEXT: v_and_b32_e32 v7, 1, v6
354-
; GFX-942-NEXT: v_cmp_gt_f64_e64 s[2:3], |v[0:1]|, v[4:5]
355-
; GFX-942-NEXT: v_cmp_nlg_f64_e64 s[0:1], |v[0:1]|, v[4:5]
356-
; GFX-942-NEXT: v_cmp_eq_u32_e32 vcc, 1, v7
344+
; GFX-942-NEXT: v_cmp_gt_f64_e64 s[2:3], |v[0:1]|, |v[4:5]|
345+
; GFX-942-NEXT: v_cmp_nlg_f64_e32 vcc, v[0:1], v[4:5]
346+
; GFX-942-NEXT: v_cmp_eq_u32_e64 s[0:1], 1, v7
357347
; GFX-942-NEXT: v_cndmask_b32_e64 v4, -1, 1, s[2:3]
358348
; GFX-942-NEXT: v_add_u32_e32 v4, v6, v4
359-
; GFX-942-NEXT: s_or_b64 vcc, s[0:1], vcc
349+
; GFX-942-NEXT: s_or_b64 vcc, vcc, s[0:1]
360350
; GFX-942-NEXT: v_cndmask_b32_e32 v4, v4, v6, vcc
361-
; GFX-942-NEXT: s_brev_b32 s0, 1
362-
; GFX-942-NEXT: v_and_or_b32 v5, v1, s0, v4
363-
; GFX-942-NEXT: v_bfe_u32 v4, v4, 16, 1
351+
; GFX-942-NEXT: v_bfe_u32 v5, v4, 16, 1
364352
; GFX-942-NEXT: s_movk_i32 s0, 0x7fff
365-
; GFX-942-NEXT: v_add3_u32 v4, v4, v5, s0
366-
; GFX-942-NEXT: v_or_b32_e32 v5, 0x400000, v5
353+
; GFX-942-NEXT: v_add3_u32 v5, v5, v4, s0
354+
; GFX-942-NEXT: v_or_b32_e32 v4, 0x400000, v4
367355
; GFX-942-NEXT: v_cmp_u_f64_e32 vcc, v[0:1], v[0:1]
368356
; GFX-942-NEXT: s_nop 1
369-
; GFX-942-NEXT: v_cndmask_b32_e32 v0, v4, v5, vcc
357+
; GFX-942-NEXT: v_cndmask_b32_e32 v0, v5, v4, vcc
370358
; GFX-942-NEXT: flat_store_short_d16_hi v[2:3], v0
371359
; GFX-942-NEXT: s_endpgm
372360
;
373361
; GFX-950-LABEL: fptrunc_f64_to_bf16:
374362
; GFX-950: ; %bb.0: ; %entry
375-
; GFX-950-NEXT: v_cvt_f32_f64_e64 v6, |v[0:1]|
363+
; GFX-950-NEXT: v_cvt_f32_f64_e32 v6, v[0:1]
376364
; GFX-950-NEXT: v_cvt_f64_f32_e32 v[4:5], v6
377365
; GFX-950-NEXT: v_and_b32_e32 v7, 1, v6
378-
; GFX-950-NEXT: v_cmp_gt_f64_e64 s[2:3], |v[0:1]|, v[4:5]
379-
; GFX-950-NEXT: v_cmp_nlg_f64_e64 s[0:1], |v[0:1]|, v[4:5]
380-
; GFX-950-NEXT: v_cmp_eq_u32_e32 vcc, 1, v7
366+
; GFX-950-NEXT: v_cmp_gt_f64_e64 s[2:3], |v[0:1]|, |v[4:5]|
367+
; GFX-950-NEXT: v_cmp_nlg_f64_e32 vcc, v[0:1], v[4:5]
368+
; GFX-950-NEXT: v_cmp_eq_u32_e64 s[0:1], 1, v7
381369
; GFX-950-NEXT: v_cndmask_b32_e64 v0, -1, 1, s[2:3]
382370
; GFX-950-NEXT: v_add_u32_e32 v0, v6, v0
383-
; GFX-950-NEXT: s_or_b64 vcc, s[0:1], vcc
371+
; GFX-950-NEXT: s_or_b64 vcc, vcc, s[0:1]
384372
; GFX-950-NEXT: v_cndmask_b32_e32 v0, v0, v6, vcc
385-
; GFX-950-NEXT: s_brev_b32 s0, 1
386-
; GFX-950-NEXT: v_and_or_b32 v0, v1, s0, v0
387373
; GFX-950-NEXT: v_cvt_pk_bf16_f32 v0, v0, s0
388374
; GFX-950-NEXT: flat_store_short v[2:3], v0
389375
; GFX-950-NEXT: s_endpgm
@@ -396,44 +382,38 @@ entry:
396382
define amdgpu_ps void @fptrunc_f64_to_bf16_neg(double %a, ptr %out) {
397383
; GFX-942-LABEL: fptrunc_f64_to_bf16_neg:
398384
; GFX-942: ; %bb.0: ; %entry
399-
; GFX-942-NEXT: v_cvt_f32_f64_e64 v7, |v[0:1]|
400-
; GFX-942-NEXT: v_cvt_f64_f32_e32 v[4:5], v7
401-
; GFX-942-NEXT: v_and_b32_e32 v8, 1, v7
402-
; GFX-942-NEXT: v_cmp_gt_f64_e64 s[2:3], |v[0:1]|, v[4:5]
403-
; GFX-942-NEXT: v_cmp_nlg_f64_e64 s[0:1], |v[0:1]|, v[4:5]
404-
; GFX-942-NEXT: v_cmp_eq_u32_e32 vcc, 1, v8
385+
; GFX-942-NEXT: v_cvt_f32_f64_e64 v6, -v[0:1]
386+
; GFX-942-NEXT: v_cvt_f64_f32_e32 v[4:5], v6
387+
; GFX-942-NEXT: v_and_b32_e32 v7, 1, v6
388+
; GFX-942-NEXT: v_cmp_gt_f64_e64 s[2:3], |v[0:1]|, |v[4:5]|
389+
; GFX-942-NEXT: v_cmp_nlg_f64_e64 s[0:1], -v[0:1], v[4:5]
390+
; GFX-942-NEXT: v_cmp_eq_u32_e32 vcc, 1, v7
405391
; GFX-942-NEXT: v_cndmask_b32_e64 v4, -1, 1, s[2:3]
406-
; GFX-942-NEXT: v_add_u32_e32 v4, v7, v4
392+
; GFX-942-NEXT: v_add_u32_e32 v4, v6, v4
407393
; GFX-942-NEXT: s_or_b64 vcc, s[0:1], vcc
408-
; GFX-942-NEXT: s_brev_b32 s4, 1
409-
; GFX-942-NEXT: v_xor_b32_e32 v6, 0x80000000, v1
410-
; GFX-942-NEXT: v_cndmask_b32_e32 v4, v4, v7, vcc
411-
; GFX-942-NEXT: v_and_or_b32 v5, v6, s4, v4
412-
; GFX-942-NEXT: v_bfe_u32 v4, v4, 16, 1
394+
; GFX-942-NEXT: v_cndmask_b32_e32 v4, v4, v6, vcc
395+
; GFX-942-NEXT: v_bfe_u32 v5, v4, 16, 1
413396
; GFX-942-NEXT: s_movk_i32 s0, 0x7fff
414-
; GFX-942-NEXT: v_add3_u32 v4, v4, v5, s0
415-
; GFX-942-NEXT: v_or_b32_e32 v5, 0x400000, v5
397+
; GFX-942-NEXT: v_add3_u32 v5, v5, v4, s0
398+
; GFX-942-NEXT: v_or_b32_e32 v4, 0x400000, v4
416399
; GFX-942-NEXT: v_cmp_u_f64_e64 vcc, -v[0:1], -v[0:1]
417400
; GFX-942-NEXT: s_nop 1
418-
; GFX-942-NEXT: v_cndmask_b32_e32 v0, v4, v5, vcc
401+
; GFX-942-NEXT: v_cndmask_b32_e32 v0, v5, v4, vcc
419402
; GFX-942-NEXT: flat_store_short_d16_hi v[2:3], v0
420403
; GFX-942-NEXT: s_endpgm
421404
;
422405
; GFX-950-LABEL: fptrunc_f64_to_bf16_neg:
423406
; GFX-950: ; %bb.0: ; %entry
424-
; GFX-950-NEXT: v_cvt_f32_f64_e64 v7, |v[0:1]|
425-
; GFX-950-NEXT: v_cvt_f64_f32_e32 v[4:5], v7
426-
; GFX-950-NEXT: v_and_b32_e32 v8, 1, v7
427-
; GFX-950-NEXT: v_cmp_gt_f64_e64 s[2:3], |v[0:1]|, v[4:5]
428-
; GFX-950-NEXT: v_cmp_nlg_f64_e64 s[0:1], |v[0:1]|, v[4:5]
429-
; GFX-950-NEXT: v_cmp_eq_u32_e32 vcc, 1, v8
407+
; GFX-950-NEXT: v_cvt_f32_f64_e64 v6, -v[0:1]
408+
; GFX-950-NEXT: v_cvt_f64_f32_e32 v[4:5], v6
409+
; GFX-950-NEXT: v_and_b32_e32 v7, 1, v6
410+
; GFX-950-NEXT: v_cmp_gt_f64_e64 s[2:3], |v[0:1]|, |v[4:5]|
411+
; GFX-950-NEXT: v_cmp_nlg_f64_e64 s[0:1], -v[0:1], v[4:5]
412+
; GFX-950-NEXT: v_cmp_eq_u32_e32 vcc, 1, v7
430413
; GFX-950-NEXT: v_cndmask_b32_e64 v0, -1, 1, s[2:3]
431-
; GFX-950-NEXT: v_add_u32_e32 v0, v7, v0
414+
; GFX-950-NEXT: v_add_u32_e32 v0, v6, v0
432415
; GFX-950-NEXT: s_or_b64 vcc, s[0:1], vcc
433-
; GFX-950-NEXT: s_brev_b32 s4, 1
434-
; GFX-950-NEXT: v_xor_b32_e32 v6, 0x80000000, v1
435-
; GFX-950-NEXT: v_cndmask_b32_e32 v0, v0, v7, vcc
436-
; GFX-950-NEXT: v_and_or_b32 v0, v6, s4, v0
416+
; GFX-950-NEXT: v_cndmask_b32_e32 v0, v0, v6, vcc
437417
; GFX-950-NEXT: v_cvt_pk_bf16_f32 v0, v0, s0
438418
; GFX-950-NEXT: flat_store_short v[2:3], v0
439419
; GFX-950-NEXT: s_endpgm
@@ -447,44 +427,38 @@ entry:
447427
define amdgpu_ps void @fptrunc_f64_to_bf16_abs(double %a, ptr %out) {
448428
; GFX-942-LABEL: fptrunc_f64_to_bf16_abs:
449429
; GFX-942: ; %bb.0: ; %entry
450-
; GFX-942-NEXT: v_cvt_f32_f64_e64 v7, |v[0:1]|
451-
; GFX-942-NEXT: v_cvt_f64_f32_e32 v[4:5], v7
452-
; GFX-942-NEXT: v_and_b32_e32 v8, 1, v7
453-
; GFX-942-NEXT: v_cmp_gt_f64_e64 s[2:3], |v[0:1]|, v[4:5]
430+
; GFX-942-NEXT: v_cvt_f32_f64_e64 v6, |v[0:1]|
431+
; GFX-942-NEXT: v_cvt_f64_f32_e32 v[4:5], v6
432+
; GFX-942-NEXT: v_and_b32_e32 v7, 1, v6
433+
; GFX-942-NEXT: v_cmp_gt_f64_e64 s[2:3], |v[0:1]|, |v[4:5]|
454434
; GFX-942-NEXT: v_cmp_nlg_f64_e64 s[0:1], |v[0:1]|, v[4:5]
455-
; GFX-942-NEXT: v_cmp_eq_u32_e32 vcc, 1, v8
435+
; GFX-942-NEXT: v_cmp_eq_u32_e32 vcc, 1, v7
456436
; GFX-942-NEXT: v_cndmask_b32_e64 v4, -1, 1, s[2:3]
457-
; GFX-942-NEXT: v_add_u32_e32 v4, v7, v4
437+
; GFX-942-NEXT: v_add_u32_e32 v4, v6, v4
458438
; GFX-942-NEXT: s_or_b64 vcc, s[0:1], vcc
459-
; GFX-942-NEXT: v_and_b32_e32 v6, 0x7fffffff, v1
460-
; GFX-942-NEXT: v_cndmask_b32_e32 v4, v4, v7, vcc
461-
; GFX-942-NEXT: s_brev_b32 s0, 1
462-
; GFX-942-NEXT: v_and_or_b32 v5, v6, s0, v4
463-
; GFX-942-NEXT: v_bfe_u32 v4, v4, 16, 1
439+
; GFX-942-NEXT: v_cndmask_b32_e32 v4, v4, v6, vcc
440+
; GFX-942-NEXT: v_bfe_u32 v5, v4, 16, 1
464441
; GFX-942-NEXT: s_movk_i32 s0, 0x7fff
465-
; GFX-942-NEXT: v_add3_u32 v4, v4, v5, s0
466-
; GFX-942-NEXT: v_or_b32_e32 v5, 0x400000, v5
442+
; GFX-942-NEXT: v_add3_u32 v5, v5, v4, s0
443+
; GFX-942-NEXT: v_or_b32_e32 v4, 0x400000, v4
467444
; GFX-942-NEXT: v_cmp_u_f64_e64 vcc, |v[0:1]|, |v[0:1]|
468445
; GFX-942-NEXT: s_nop 1
469-
; GFX-942-NEXT: v_cndmask_b32_e32 v0, v4, v5, vcc
446+
; GFX-942-NEXT: v_cndmask_b32_e32 v0, v5, v4, vcc
470447
; GFX-942-NEXT: flat_store_short_d16_hi v[2:3], v0
471448
; GFX-942-NEXT: s_endpgm
472449
;
473450
; GFX-950-LABEL: fptrunc_f64_to_bf16_abs:
474451
; GFX-950: ; %bb.0: ; %entry
475-
; GFX-950-NEXT: v_cvt_f32_f64_e64 v7, |v[0:1]|
476-
; GFX-950-NEXT: v_cvt_f64_f32_e32 v[4:5], v7
477-
; GFX-950-NEXT: v_and_b32_e32 v8, 1, v7
478-
; GFX-950-NEXT: v_cmp_gt_f64_e64 s[2:3], |v[0:1]|, v[4:5]
452+
; GFX-950-NEXT: v_cvt_f32_f64_e64 v6, |v[0:1]|
453+
; GFX-950-NEXT: v_cvt_f64_f32_e32 v[4:5], v6
454+
; GFX-950-NEXT: v_and_b32_e32 v7, 1, v6
455+
; GFX-950-NEXT: v_cmp_gt_f64_e64 s[2:3], |v[0:1]|, |v[4:5]|
479456
; GFX-950-NEXT: v_cmp_nlg_f64_e64 s[0:1], |v[0:1]|, v[4:5]
480-
; GFX-950-NEXT: v_cmp_eq_u32_e32 vcc, 1, v8
457+
; GFX-950-NEXT: v_cmp_eq_u32_e32 vcc, 1, v7
481458
; GFX-950-NEXT: v_cndmask_b32_e64 v0, -1, 1, s[2:3]
482-
; GFX-950-NEXT: v_add_u32_e32 v0, v7, v0
459+
; GFX-950-NEXT: v_add_u32_e32 v0, v6, v0
483460
; GFX-950-NEXT: s_or_b64 vcc, s[0:1], vcc
484-
; GFX-950-NEXT: v_and_b32_e32 v6, 0x7fffffff, v1
485-
; GFX-950-NEXT: v_cndmask_b32_e32 v0, v0, v7, vcc
486-
; GFX-950-NEXT: s_brev_b32 s0, 1
487-
; GFX-950-NEXT: v_and_or_b32 v0, v6, s0, v0
461+
; GFX-950-NEXT: v_cndmask_b32_e32 v0, v0, v6, vcc
488462
; GFX-950-NEXT: v_cvt_pk_bf16_f32 v0, v0, s0
489463
; GFX-950-NEXT: flat_store_short v[2:3], v0
490464
; GFX-950-NEXT: s_endpgm

0 commit comments

Comments
 (0)