Skip to content

Commit 0605e98

Browse files
authored
[SDISel][Builder] Fix the instantiation of <1 x bfloat|half> (#94591)
Prior to this change, `SelectionDAGBuilder` was producing `SDNode`s of the form: `f32 = extract_vector_elt <1 x bfloat|half>, i32 0` when lowering phis of `<1 x bfloat|half>` and running on a target that promotes this type to `f32` (like some x86 or AMDGPU targets.) This construct is invalid since this type of node only allows type extensions for integer types. It went unotice because the `extract_vector_elt` node is later broken down in `bitcast` followed by `bf16_to_fp|fp_extend`. However, when the argument of the phi is a constant we were crashing because the existing code would try to constant fold this `extract_vector_elt` into a any_ext. This patch fixes this by using a proper decomposition for `<1 x bfloat|half>`: ``` bfloat|half = bitcast <1 x blfoat|half> float = fp_extend bfloat|half ``` This change should be NFC for the non-constant-folding cases and fix the SDISel crashes (reported in #94449) for the folding cases. Note: The change on the arm test is a missing fp16 to f32 constant folding exposed by this patch. I'll push a separate improvement for that.
1 parent b653357 commit 0605e98

File tree

4 files changed

+266
-3
lines changed

4 files changed

+266
-3
lines changed

llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp

Lines changed: 11 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -729,8 +729,17 @@ static void getCopyToPartsVector(SelectionDAG &DAG, const SDLoc &DL,
729729
// prevents it from being picked up by the earlier bitcast case.
730730
if (ValueVT.getVectorElementCount().isScalar() &&
731731
(!ValueVT.isFloatingPoint() || !PartVT.isInteger())) {
732-
Val = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, PartVT, Val,
733-
DAG.getVectorIdxConstant(0, DL));
732+
// If we reach this condition and PartVT is FP, this means that
733+
// ValueVT is also FP and both have a different size, otherwise we
734+
// would have bitcasted them. Producing an EXTRACT_VECTOR_ELT here
735+
// would be invalid since that would mean the smaller FP type has to
736+
// be extended to the larger one.
737+
if (PartVT.isFloatingPoint()) {
738+
Val = DAG.getBitcast(ValueVT.getScalarType(), Val);
739+
Val = DAG.getNode(ISD::FP_EXTEND, DL, PartVT, Val);
740+
} else
741+
Val = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, PartVT, Val,
742+
DAG.getVectorIdxConstant(0, DL));
734743
} else {
735744
uint64_t ValueSize = ValueVT.getFixedSizeInBits();
736745
assert(PartVT.getFixedSizeInBits() > ValueSize &&
Lines changed: 138 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,138 @@
1+
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
2+
; RUN: llc -global-isel=0 -mtriple=amdgcn-amd-amdpal -mcpu=hawaii %s -o - | FileCheck %s
3+
4+
; For all these tests we disable optimizations through function attributes
5+
; because the code we are exercising here needs phis and we want to keep the
6+
; IR small.
7+
8+
; This code used to crash in SDISel because f16 was promoted to f32 through
9+
; a `f32 = vector_extract_elt <1 x f16>, i32 0`, which is illegal.
10+
; The invalid SDNode and thus, the crash was only exposed by the constant
11+
; folding.
12+
define void @phi_vec1half_to_f32_with_const_folding(ptr addrspace(1) %dst) #0 {
13+
; CHECK-LABEL: phi_vec1half_to_f32_with_const_folding:
14+
; CHECK: ; %bb.0: ; %entry
15+
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
16+
; CHECK-NEXT: s_mov_b32 s4, 0
17+
; CHECK-NEXT: v_cvt_f32_f16_e64 v2, s4
18+
; CHECK-NEXT: ; %bb.1: ; %bb
19+
; CHECK-NEXT: v_cvt_f16_f32_e64 v2, v2
20+
; CHECK-NEXT: s_mov_b32 s7, 0xf000
21+
; CHECK-NEXT: s_mov_b32 s6, 0
22+
; CHECK-NEXT: s_mov_b32 s4, s6
23+
; CHECK-NEXT: s_mov_b32 s5, s6
24+
; CHECK-NEXT: buffer_store_short v2, v[0:1], s[4:7], 0 addr64 offset:2
25+
; CHECK-NEXT: v_cvt_f16_f32_e64 v2, s4
26+
; CHECK-NEXT: buffer_store_short v2, v[0:1], s[4:7], 0 addr64
27+
; CHECK-NEXT: s_waitcnt vmcnt(0)
28+
; CHECK-NEXT: s_setpc_b64 s[30:31]
29+
entry:
30+
br label %bb
31+
32+
bb:
33+
%phi = phi <1 x half> [ zeroinitializer, %entry ]
34+
%res = shufflevector <1 x half> poison, <1 x half> %phi, <2 x i32> <i32 0, i32 1>
35+
store <2 x half> %res, ptr addrspace(1) %dst
36+
ret void
37+
}
38+
39+
; Same as phi_vec1half_to_f32_with_const_folding but without the folding.
40+
; This test exercises the same invalid SDNode, but it happened to work by
41+
; accident before. Here we make sure the fix also work as expected in the
42+
; non-constant folding case.
43+
define void @phi_vec1half_to_f32(ptr addrspace(1) %src, ptr addrspace(1) %dst) #0 {
44+
; CHECK-LABEL: phi_vec1half_to_f32:
45+
; CHECK: ; %bb.0: ; %entry
46+
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
47+
; CHECK-NEXT: s_mov_b32 s7, 0xf000
48+
; CHECK-NEXT: s_mov_b32 s6, 0
49+
; CHECK-NEXT: s_mov_b32 s4, s6
50+
; CHECK-NEXT: s_mov_b32 s5, s6
51+
; CHECK-NEXT: buffer_load_ushort v0, v[0:1], s[4:7], 0 addr64
52+
; CHECK-NEXT: s_waitcnt vmcnt(0)
53+
; CHECK-NEXT: v_cvt_f32_f16_e64 v0, v0
54+
; CHECK-NEXT: ; %bb.1: ; %bb
55+
; CHECK-NEXT: v_cvt_f16_f32_e64 v0, v0
56+
; CHECK-NEXT: s_mov_b32 s7, 0xf000
57+
; CHECK-NEXT: s_mov_b32 s6, 0
58+
; CHECK-NEXT: s_mov_b32 s4, s6
59+
; CHECK-NEXT: s_mov_b32 s5, s6
60+
; CHECK-NEXT: buffer_store_short v0, v[2:3], s[4:7], 0 addr64 offset:2
61+
; CHECK-NEXT: v_cvt_f16_f32_e64 v0, s4
62+
; CHECK-NEXT: buffer_store_short v0, v[2:3], s[4:7], 0 addr64
63+
; CHECK-NEXT: s_waitcnt vmcnt(0)
64+
; CHECK-NEXT: s_setpc_b64 s[30:31]
65+
entry:
66+
%input = load <1 x half>, ptr addrspace(1) %src
67+
br label %bb
68+
69+
bb:
70+
%phi = phi <1 x half> [ %input, %entry ]
71+
%res = shufflevector <1 x half> poison, <1 x half> %phi, <2 x i32> <i32 0, i32 1>
72+
store <2 x half> %res, ptr addrspace(1) %dst
73+
ret void
74+
}
75+
76+
; Same as phi_vec1bf16_to_f32 but with bfloat instead of half.
77+
define void @phi_vec1bf16_to_f32(ptr addrspace(1) %src, ptr addrspace(1) %dst) #0 {
78+
; CHECK-LABEL: phi_vec1bf16_to_f32:
79+
; CHECK: ; %bb.0: ; %entry
80+
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
81+
; CHECK-NEXT: s_mov_b32 s7, 0xf000
82+
; CHECK-NEXT: s_mov_b32 s6, 0
83+
; CHECK-NEXT: s_mov_b32 s4, s6
84+
; CHECK-NEXT: s_mov_b32 s5, s6
85+
; CHECK-NEXT: buffer_load_ushort v0, v[0:1], s[4:7], 0 addr64
86+
; CHECK-NEXT: s_mov_b32 s4, 16
87+
; CHECK-NEXT: s_waitcnt vmcnt(0)
88+
; CHECK-NEXT: v_lshlrev_b32_e64 v0, s4, v0
89+
; CHECK-NEXT: ; %bb.1: ; %bb
90+
; CHECK-NEXT: v_mul_f32_e64 v0, 1.0, v0
91+
; CHECK-NEXT: s_mov_b32 s4, 16
92+
; CHECK-NEXT: v_lshrrev_b32_e64 v0, s4, v0
93+
; CHECK-NEXT: s_mov_b32 s7, 0xf000
94+
; CHECK-NEXT: s_mov_b32 s6, 0
95+
; CHECK-NEXT: s_mov_b32 s4, s6
96+
; CHECK-NEXT: s_mov_b32 s5, s6
97+
; CHECK-NEXT: buffer_store_short v0, v[2:3], s[4:7], 0 addr64 offset:2
98+
; CHECK-NEXT: s_waitcnt vmcnt(0)
99+
; CHECK-NEXT: s_setpc_b64 s[30:31]
100+
entry:
101+
%input = load <1 x bfloat>, ptr addrspace(1) %src
102+
br label %bb
103+
104+
bb:
105+
%phi = phi <1 x bfloat> [ %input, %entry ]
106+
%res = shufflevector <1 x bfloat> poison, <1 x bfloat> %phi, <2 x i32> <i32 0, i32 1>
107+
store <2 x bfloat> %res, ptr addrspace(1) %dst
108+
ret void
109+
}
110+
111+
; Same as phi_vec1half_to_f32_with_const_folding but with bfloat instead of half.
112+
define void @phi_vec1bf16_to_f32_with_const_folding(ptr addrspace(1) %dst) #0 {
113+
; CHECK-LABEL: phi_vec1bf16_to_f32_with_const_folding:
114+
; CHECK: ; %bb.0: ; %entry
115+
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
116+
; CHECK-NEXT: s_mov_b32 s4, 0
117+
; CHECK-NEXT: ; %bb.1: ; %bb
118+
; CHECK-NEXT: v_mul_f32_e64 v2, 1.0, s4
119+
; CHECK-NEXT: s_mov_b32 s4, 16
120+
; CHECK-NEXT: v_lshrrev_b32_e32 v2, s4, v2
121+
; CHECK-NEXT: s_mov_b32 s7, 0xf000
122+
; CHECK-NEXT: s_mov_b32 s6, 0
123+
; CHECK-NEXT: s_mov_b32 s4, s6
124+
; CHECK-NEXT: s_mov_b32 s5, s6
125+
; CHECK-NEXT: buffer_store_short v2, v[0:1], s[4:7], 0 addr64 offset:2
126+
; CHECK-NEXT: s_waitcnt vmcnt(0)
127+
; CHECK-NEXT: s_setpc_b64 s[30:31]
128+
entry:
129+
br label %bb
130+
131+
bb:
132+
%phi = phi <1 x bfloat> [ zeroinitializer, %entry ]
133+
%res = shufflevector <1 x bfloat> poison, <1 x bfloat> %phi, <2 x i32> <i32 0, i32 1>
134+
store <2 x bfloat> %res, ptr addrspace(1) %dst
135+
ret void
136+
}
137+
138+
attributes #0 = { noinline optnone }

llvm/test/CodeGen/ARM/arm-half-promote.ll

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -116,7 +116,9 @@ define fastcc { <8 x half>, <8 x half> } @f3() {
116116

117117
define void @extract_insert(ptr %dst) optnone noinline {
118118
; CHECK-LABEL: extract_insert:
119-
; CHECK: vmov.i32 d0, #0x0
119+
; CHECK: movs r1, #0
120+
; CHECK: vmov s0, r1
121+
; CHECK: vcvtb.f32.f16 s0, s0
120122
; CHECK: vcvtb.f16.f32 s0, s0
121123
; CHECK: vmov r1, s0
122124
; CHECK: strh r1, [r0]
Lines changed: 114 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,114 @@
1+
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
2+
; RUN: llc -global-isel=0 -mcpu=generic -mtriple=x86_64-apple-darwin %s -o - | FileCheck %s
3+
4+
; For all these tests we disable optimizations through function attributes
5+
; because the code we are exercising here needs phis and we want to keep the
6+
; IR small.
7+
8+
; This code used to crash in SDISel because bf16 was promoted to f32 through
9+
; a `f32 = vector_extract_elt <1 x bf16>, i32 0`, which is illegal.
10+
; The invalid SDNode and thus, the crash was only exposed by the constant
11+
; folding.
12+
define void @phi_vec1bf16_to_f32_with_const_folding(ptr %dst) #0 {
13+
; CHECK-LABEL: phi_vec1bf16_to_f32_with_const_folding:
14+
; CHECK: ## %bb.0: ## %entry
15+
; CHECK-NEXT: pushq %rbx
16+
; CHECK-NEXT: .cfi_def_cfa_offset 16
17+
; CHECK-NEXT: .cfi_offset %rbx, -16
18+
; CHECK-NEXT: movq %rdi, %rbx
19+
; CHECK-NEXT: jmp LBB0_1
20+
; CHECK-NEXT: LBB0_1: ## %bb
21+
; CHECK-NEXT: xorps %xmm0, %xmm0
22+
; CHECK-NEXT: callq ___truncsfbf2
23+
; CHECK-NEXT: pextrw $0, %xmm0, %eax
24+
; CHECK-NEXT: movw %ax, 2(%rbx)
25+
; CHECK-NEXT: popq %rbx
26+
; CHECK-NEXT: retq
27+
entry:
28+
br label %bb
29+
30+
bb:
31+
%phi = phi <1 x bfloat> [ zeroinitializer, %entry ]
32+
%res = shufflevector <1 x bfloat> poison, <1 x bfloat> %phi, <2 x i32> <i32 0, i32 1>
33+
store <2 x bfloat> %res, ptr %dst
34+
ret void
35+
}
36+
; Same as phi_vec1bf16_to_f32_with_const_folding but without the constant
37+
; folding.
38+
; This test exercises the same invalid SDNode, but it happened to work by
39+
; accident before. Here we make sure the fix also work as expected in the
40+
; non-constant folding case.
41+
define void @phi_vec1bf16_to_f32(ptr %src, ptr %dst) #0 {
42+
; CHECK-LABEL: phi_vec1bf16_to_f32:
43+
; CHECK: ## %bb.0: ## %entry
44+
; CHECK-NEXT: pushq %rbx
45+
; CHECK-NEXT: .cfi_def_cfa_offset 16
46+
; CHECK-NEXT: .cfi_offset %rbx, -16
47+
; CHECK-NEXT: movq %rsi, %rbx
48+
; CHECK-NEXT: movzwl (%rdi), %eax
49+
; CHECK-NEXT: shll $16, %eax
50+
; CHECK-NEXT: movd %eax, %xmm0
51+
; CHECK-NEXT: jmp LBB1_1
52+
; CHECK-NEXT: LBB1_1: ## %bb
53+
; CHECK-NEXT: callq ___truncsfbf2
54+
; CHECK-NEXT: pextrw $0, %xmm0, %eax
55+
; CHECK-NEXT: movw %ax, 2(%rbx)
56+
; CHECK-NEXT: popq %rbx
57+
; CHECK-NEXT: retq
58+
entry:
59+
%input = load <1 x bfloat>, ptr %src
60+
br label %bb
61+
62+
bb:
63+
%phi = phi <1 x bfloat> [ %input, %entry ]
64+
%res = shufflevector <1 x bfloat> poison, <1 x bfloat> %phi, <2 x i32> <i32 0, i32 1>
65+
store <2 x bfloat> %res, ptr %dst
66+
ret void
67+
}
68+
69+
70+
; Half type is legal on x86 so nothing special here, but it
71+
; doesn't hurt to be thorough.
72+
define void @phi_vec1half_with_const_folding(ptr %dst) #0 {
73+
; CHECK-LABEL: phi_vec1half_with_const_folding:
74+
; CHECK: ## %bb.0: ## %entry
75+
; CHECK-NEXT: xorps %xmm0, %xmm0
76+
; CHECK-NEXT: jmp LBB2_1
77+
; CHECK-NEXT: LBB2_1: ## %bb
78+
; CHECK-NEXT: pextrw $0, %xmm0, %eax
79+
; CHECK-NEXT: movw %ax, 2(%rdi)
80+
; CHECK-NEXT: retq
81+
entry:
82+
br label %bb
83+
84+
bb:
85+
%phi = phi <1 x half> [ zeroinitializer, %entry ]
86+
%res = shufflevector <1 x half> poison, <1 x half> %phi, <2 x i32> <i32 0, i32 1>
87+
store <2 x half> %res, ptr %dst
88+
ret void
89+
}
90+
91+
; Half type is legal on x86 so nothing special here, but it
92+
; doesn't hurt to be thorough.
93+
; Same as phi_vec1half_with_constant_folding but without the constant folding.
94+
define void @phi_vec1half(ptr %src, ptr %dst) #0 {
95+
; CHECK-LABEL: phi_vec1half:
96+
; CHECK: ## %bb.0: ## %entry
97+
; CHECK-NEXT: pinsrw $0, (%rdi), %xmm0
98+
; CHECK-NEXT: jmp LBB3_1
99+
; CHECK-NEXT: LBB3_1: ## %bb
100+
; CHECK-NEXT: pextrw $0, %xmm0, %eax
101+
; CHECK-NEXT: movw %ax, 2(%rsi)
102+
; CHECK-NEXT: retq
103+
entry:
104+
%input = load <1 x half>, ptr %src
105+
br label %bb
106+
107+
bb:
108+
%phi = phi <1 x half> [ %input, %entry ]
109+
%res = shufflevector <1 x half> poison, <1 x half> %phi, <2 x i32> <i32 0, i32 1>
110+
store <2 x half> %res, ptr %dst
111+
ret void
112+
}
113+
114+
attributes #0 = { noinline optnone }

0 commit comments

Comments
 (0)