Skip to content

Commit eb48e11

Browse files
authored
[ARM] Fix undefined behaviour in bf16->float conversion (#116985)
The bf16->float conversion function was implemented using a left-shift of a signed integer, so for negative floating-point values a 1 was being shifted into the sign bit of the signed integer intermediate value. This is undefined behaviour, and was caught by UBSan. The vector versions are code-generated via Neon builtin functions, so they probably don't have the same UB problem, but I've updated them anyway for consistency. Fixes #61983.
1 parent 00d383e commit eb48e11

File tree

2 files changed

+76
-76
lines changed

2 files changed

+76
-76
lines changed

clang/include/clang/Basic/arm_neon.td

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -252,7 +252,7 @@ def OP_BFMLALT_LN
252252

253253
def OP_VCVT_F32_BF16
254254
: Op<(bitcast "R",
255-
(call "vshll_n", (bitcast "int16x4_t", $p0),
255+
(call "vshll_n", (bitcast "uint16x4_t", $p0),
256256
(literal "int32_t", "16")))>;
257257
def OP_VCVT_F32_BF16_LO
258258
: Op<(call "vcvt_f32_bf16", (call "vget_low", $p0))>;
@@ -275,8 +275,8 @@ def OP_VCVT_BF16_F32_HI_A32
275275
(call "vget_low", $p0))>;
276276

277277
def OP_CVT_F32_BF16
278-
: Op<(bitcast "R", (op "<<", (cast "int32_t", (bitcast "int16_t", $p0)),
279-
(literal "int32_t", "16")))>;
278+
: Op<(bitcast "R", (op "<<", (cast "uint32_t", (bitcast "uint16_t", $p0)),
279+
(literal "uint32_t", "16")))>;
280280

281281
//===----------------------------------------------------------------------===//
282282
// Auxiliary Instructions

clang/test/CodeGen/arm-bf16-convert-intrinsics.c

Lines changed: 73 additions & 73 deletions
Original file line numberDiff line numberDiff line change
@@ -24,50 +24,50 @@
2424

2525
// CHECK-A64-LABEL: @test_vcvt_f32_bf16(
2626
// CHECK-A64-NEXT: entry:
27-
// CHECK-A64-NEXT: [[__REINT_836_I:%.*]] = alloca <4 x bfloat>, align 8
28-
// CHECK-A64-NEXT: [[__REINT1_836_I:%.*]] = alloca <4 x i32>, align 16
29-
// CHECK-A64-NEXT: store <4 x bfloat> [[A:%.*]], ptr [[__REINT_836_I]], align 8
30-
// CHECK-A64-NEXT: [[TMP0:%.*]] = load <4 x i16>, ptr [[__REINT_836_I]], align 8
27+
// CHECK-A64-NEXT: [[__REINT_808_I:%.*]] = alloca <4 x bfloat>, align 8
28+
// CHECK-A64-NEXT: [[__REINT1_808_I:%.*]] = alloca <4 x i32>, align 16
29+
// CHECK-A64-NEXT: store <4 x bfloat> [[A:%.*]], ptr [[__REINT_808_I]], align 8
30+
// CHECK-A64-NEXT: [[TMP0:%.*]] = load <4 x i16>, ptr [[__REINT_808_I]], align 8
3131
// CHECK-A64-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[TMP0]] to <8 x i8>
32-
// CHECK-A64-NEXT: [[TMP2:%.*]] = sext <4 x i16> [[TMP0]] to <4 x i32>
32+
// CHECK-A64-NEXT: [[TMP2:%.*]] = zext <4 x i16> [[TMP0]] to <4 x i32>
3333
// CHECK-A64-NEXT: [[VSHLL_N_I:%.*]] = shl <4 x i32> [[TMP2]], splat (i32 16)
34-
// CHECK-A64-NEXT: store <4 x i32> [[VSHLL_N_I]], ptr [[__REINT1_836_I]], align 16
35-
// CHECK-A64-NEXT: [[TMP3:%.*]] = load <4 x float>, ptr [[__REINT1_836_I]], align 16
34+
// CHECK-A64-NEXT: store <4 x i32> [[VSHLL_N_I]], ptr [[__REINT1_808_I]], align 16
35+
// CHECK-A64-NEXT: [[TMP3:%.*]] = load <4 x float>, ptr [[__REINT1_808_I]], align 16
3636
// CHECK-A64-NEXT: ret <4 x float> [[TMP3]]
3737
//
3838
// CHECK-A32-HARDFP-LABEL: @test_vcvt_f32_bf16(
3939
// CHECK-A32-HARDFP-NEXT: entry:
40-
// CHECK-A32-HARDFP-NEXT: [[__REINT_836_I:%.*]] = alloca <4 x bfloat>, align 8
41-
// CHECK-A32-HARDFP-NEXT: [[__REINT1_836_I:%.*]] = alloca <4 x i32>, align 8
42-
// CHECK-A32-HARDFP-NEXT: store <4 x bfloat> [[A:%.*]], ptr [[__REINT_836_I]], align 8
43-
// CHECK-A32-HARDFP-NEXT: [[TMP0:%.*]] = load <4 x i16>, ptr [[__REINT_836_I]], align 8
40+
// CHECK-A32-HARDFP-NEXT: [[__REINT_808_I:%.*]] = alloca <4 x bfloat>, align 8
41+
// CHECK-A32-HARDFP-NEXT: [[__REINT1_808_I:%.*]] = alloca <4 x i32>, align 8
42+
// CHECK-A32-HARDFP-NEXT: store <4 x bfloat> [[A:%.*]], ptr [[__REINT_808_I]], align 8
43+
// CHECK-A32-HARDFP-NEXT: [[TMP0:%.*]] = load <4 x i16>, ptr [[__REINT_808_I]], align 8
4444
// CHECK-A32-HARDFP-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[TMP0]] to <8 x i8>
45-
// CHECK-A32-HARDFP-NEXT: [[TMP2:%.*]] = sext <4 x i16> [[TMP0]] to <4 x i32>
45+
// CHECK-A32-HARDFP-NEXT: [[TMP2:%.*]] = zext <4 x i16> [[TMP0]] to <4 x i32>
4646
// CHECK-A32-HARDFP-NEXT: [[VSHLL_N_I:%.*]] = shl <4 x i32> [[TMP2]], splat (i32 16)
47-
// CHECK-A32-HARDFP-NEXT: store <4 x i32> [[VSHLL_N_I]], ptr [[__REINT1_836_I]], align 8
48-
// CHECK-A32-HARDFP-NEXT: [[TMP3:%.*]] = load <4 x float>, ptr [[__REINT1_836_I]], align 8
47+
// CHECK-A32-HARDFP-NEXT: store <4 x i32> [[VSHLL_N_I]], ptr [[__REINT1_808_I]], align 8
48+
// CHECK-A32-HARDFP-NEXT: [[TMP3:%.*]] = load <4 x float>, ptr [[__REINT1_808_I]], align 8
4949
// CHECK-A32-HARDFP-NEXT: ret <4 x float> [[TMP3]]
5050
//
5151
// CHECK-A32-SOFTFP-LABEL: @test_vcvt_f32_bf16(
5252
// CHECK-A32-SOFTFP-NEXT: entry:
53-
// CHECK-A32-SOFTFP-NEXT: [[__P0_836_I:%.*]] = alloca <4 x bfloat>, align 8
54-
// CHECK-A32-SOFTFP-NEXT: [[__REINT_836_I:%.*]] = alloca <4 x bfloat>, align 8
55-
// CHECK-A32-SOFTFP-NEXT: [[__REINT1_836_I:%.*]] = alloca <4 x i32>, align 8
53+
// CHECK-A32-SOFTFP-NEXT: [[__P0_808_I:%.*]] = alloca <4 x bfloat>, align 8
54+
// CHECK-A32-SOFTFP-NEXT: [[__REINT_808_I:%.*]] = alloca <4 x bfloat>, align 8
55+
// CHECK-A32-SOFTFP-NEXT: [[__REINT1_808_I:%.*]] = alloca <4 x i32>, align 8
5656
// CHECK-A32-SOFTFP-NEXT: [[A:%.*]] = alloca <4 x bfloat>, align 8
5757
// CHECK-A32-SOFTFP-NEXT: [[COERCE:%.*]] = alloca <4 x bfloat>, align 8
5858
// CHECK-A32-SOFTFP-NEXT: store <2 x i32> [[A_COERCE:%.*]], ptr [[A]], align 8
5959
// CHECK-A32-SOFTFP-NEXT: [[A1:%.*]] = load <4 x bfloat>, ptr [[A]], align 8
6060
// CHECK-A32-SOFTFP-NEXT: store <4 x bfloat> [[A1]], ptr [[COERCE]], align 8
6161
// CHECK-A32-SOFTFP-NEXT: [[TMP0:%.*]] = load <2 x i32>, ptr [[COERCE]], align 8
62-
// CHECK-A32-SOFTFP-NEXT: store <2 x i32> [[TMP0]], ptr [[__P0_836_I]], align 8
63-
// CHECK-A32-SOFTFP-NEXT: [[__P0_8361_I:%.*]] = load <4 x bfloat>, ptr [[__P0_836_I]], align 8
64-
// CHECK-A32-SOFTFP-NEXT: store <4 x bfloat> [[__P0_8361_I]], ptr [[__REINT_836_I]], align 8
65-
// CHECK-A32-SOFTFP-NEXT: [[TMP1:%.*]] = load <4 x i16>, ptr [[__REINT_836_I]], align 8
62+
// CHECK-A32-SOFTFP-NEXT: store <2 x i32> [[TMP0]], ptr [[__P0_808_I]], align 8
63+
// CHECK-A32-SOFTFP-NEXT: [[__P0_8081_I:%.*]] = load <4 x bfloat>, ptr [[__P0_808_I]], align 8
64+
// CHECK-A32-SOFTFP-NEXT: store <4 x bfloat> [[__P0_8081_I]], ptr [[__REINT_808_I]], align 8
65+
// CHECK-A32-SOFTFP-NEXT: [[TMP1:%.*]] = load <4 x i16>, ptr [[__REINT_808_I]], align 8
6666
// CHECK-A32-SOFTFP-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[TMP1]] to <8 x i8>
67-
// CHECK-A32-SOFTFP-NEXT: [[TMP3:%.*]] = sext <4 x i16> [[TMP1]] to <4 x i32>
67+
// CHECK-A32-SOFTFP-NEXT: [[TMP3:%.*]] = zext <4 x i16> [[TMP1]] to <4 x i32>
6868
// CHECK-A32-SOFTFP-NEXT: [[VSHLL_N_I:%.*]] = shl <4 x i32> [[TMP3]], splat (i32 16)
69-
// CHECK-A32-SOFTFP-NEXT: store <4 x i32> [[VSHLL_N_I]], ptr [[__REINT1_836_I]], align 8
70-
// CHECK-A32-SOFTFP-NEXT: [[TMP4:%.*]] = load <4 x float>, ptr [[__REINT1_836_I]], align 8
69+
// CHECK-A32-SOFTFP-NEXT: store <4 x i32> [[VSHLL_N_I]], ptr [[__REINT1_808_I]], align 8
70+
// CHECK-A32-SOFTFP-NEXT: [[TMP4:%.*]] = load <4 x float>, ptr [[__REINT1_808_I]], align 8
7171
// CHECK-A32-SOFTFP-NEXT: ret <4 x float> [[TMP4]]
7272
//
7373
float32x4_t test_vcvt_f32_bf16(bfloat16x4_t a) {
@@ -76,39 +76,39 @@ float32x4_t test_vcvt_f32_bf16(bfloat16x4_t a) {
7676

7777
// CHECK-A64-LABEL: @test_vcvtq_low_f32_bf16(
7878
// CHECK-A64-NEXT: entry:
79-
// CHECK-A64-NEXT: [[__REINT_836_I_I:%.*]] = alloca <4 x bfloat>, align 8
80-
// CHECK-A64-NEXT: [[__REINT1_836_I_I:%.*]] = alloca <4 x i32>, align 16
79+
// CHECK-A64-NEXT: [[__REINT_808_I_I:%.*]] = alloca <4 x bfloat>, align 8
80+
// CHECK-A64-NEXT: [[__REINT1_808_I_I:%.*]] = alloca <4 x i32>, align 16
8181
// CHECK-A64-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x bfloat> [[A:%.*]], <8 x bfloat> [[A]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
82-
// CHECK-A64-NEXT: store <4 x bfloat> [[SHUFFLE_I]], ptr [[__REINT_836_I_I]], align 8
83-
// CHECK-A64-NEXT: [[TMP0:%.*]] = load <4 x i16>, ptr [[__REINT_836_I_I]], align 8
82+
// CHECK-A64-NEXT: store <4 x bfloat> [[SHUFFLE_I]], ptr [[__REINT_808_I_I]], align 8
83+
// CHECK-A64-NEXT: [[TMP0:%.*]] = load <4 x i16>, ptr [[__REINT_808_I_I]], align 8
8484
// CHECK-A64-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[TMP0]] to <8 x i8>
85-
// CHECK-A64-NEXT: [[TMP2:%.*]] = sext <4 x i16> [[TMP0]] to <4 x i32>
85+
// CHECK-A64-NEXT: [[TMP2:%.*]] = zext <4 x i16> [[TMP0]] to <4 x i32>
8686
// CHECK-A64-NEXT: [[VSHLL_N_I_I:%.*]] = shl <4 x i32> [[TMP2]], splat (i32 16)
87-
// CHECK-A64-NEXT: store <4 x i32> [[VSHLL_N_I_I]], ptr [[__REINT1_836_I_I]], align 16
88-
// CHECK-A64-NEXT: [[TMP3:%.*]] = load <4 x float>, ptr [[__REINT1_836_I_I]], align 16
87+
// CHECK-A64-NEXT: store <4 x i32> [[VSHLL_N_I_I]], ptr [[__REINT1_808_I_I]], align 16
88+
// CHECK-A64-NEXT: [[TMP3:%.*]] = load <4 x float>, ptr [[__REINT1_808_I_I]], align 16
8989
// CHECK-A64-NEXT: ret <4 x float> [[TMP3]]
9090
//
9191
// CHECK-A32-HARDFP-LABEL: @test_vcvtq_low_f32_bf16(
9292
// CHECK-A32-HARDFP-NEXT: entry:
93-
// CHECK-A32-HARDFP-NEXT: [[__REINT_836_I_I:%.*]] = alloca <4 x bfloat>, align 8
94-
// CHECK-A32-HARDFP-NEXT: [[__REINT1_836_I_I:%.*]] = alloca <4 x i32>, align 8
93+
// CHECK-A32-HARDFP-NEXT: [[__REINT_808_I_I:%.*]] = alloca <4 x bfloat>, align 8
94+
// CHECK-A32-HARDFP-NEXT: [[__REINT1_808_I_I:%.*]] = alloca <4 x i32>, align 8
9595
// CHECK-A32-HARDFP-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x bfloat> [[A:%.*]], <8 x bfloat> [[A]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
96-
// CHECK-A32-HARDFP-NEXT: store <4 x bfloat> [[SHUFFLE_I]], ptr [[__REINT_836_I_I]], align 8
97-
// CHECK-A32-HARDFP-NEXT: [[TMP0:%.*]] = load <4 x i16>, ptr [[__REINT_836_I_I]], align 8
96+
// CHECK-A32-HARDFP-NEXT: store <4 x bfloat> [[SHUFFLE_I]], ptr [[__REINT_808_I_I]], align 8
97+
// CHECK-A32-HARDFP-NEXT: [[TMP0:%.*]] = load <4 x i16>, ptr [[__REINT_808_I_I]], align 8
9898
// CHECK-A32-HARDFP-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[TMP0]] to <8 x i8>
99-
// CHECK-A32-HARDFP-NEXT: [[TMP2:%.*]] = sext <4 x i16> [[TMP0]] to <4 x i32>
99+
// CHECK-A32-HARDFP-NEXT: [[TMP2:%.*]] = zext <4 x i16> [[TMP0]] to <4 x i32>
100100
// CHECK-A32-HARDFP-NEXT: [[VSHLL_N_I_I:%.*]] = shl <4 x i32> [[TMP2]], splat (i32 16)
101-
// CHECK-A32-HARDFP-NEXT: store <4 x i32> [[VSHLL_N_I_I]], ptr [[__REINT1_836_I_I]], align 8
102-
// CHECK-A32-HARDFP-NEXT: [[TMP3:%.*]] = load <4 x float>, ptr [[__REINT1_836_I_I]], align 8
101+
// CHECK-A32-HARDFP-NEXT: store <4 x i32> [[VSHLL_N_I_I]], ptr [[__REINT1_808_I_I]], align 8
102+
// CHECK-A32-HARDFP-NEXT: [[TMP3:%.*]] = load <4 x float>, ptr [[__REINT1_808_I_I]], align 8
103103
// CHECK-A32-HARDFP-NEXT: ret <4 x float> [[TMP3]]
104104
//
105105
// CHECK-A32-SOFTFP-LABEL: @test_vcvtq_low_f32_bf16(
106106
// CHECK-A32-SOFTFP-NEXT: entry:
107107
// CHECK-A32-SOFTFP-NEXT: [[RETVAL_I:%.*]] = alloca <4 x bfloat>, align 8
108108
// CHECK-A32-SOFTFP-NEXT: [[__P0_I2:%.*]] = alloca <8 x bfloat>, align 8
109-
// CHECK-A32-SOFTFP-NEXT: [[__P0_836_I_I:%.*]] = alloca <4 x bfloat>, align 8
110-
// CHECK-A32-SOFTFP-NEXT: [[__REINT_836_I_I:%.*]] = alloca <4 x bfloat>, align 8
111-
// CHECK-A32-SOFTFP-NEXT: [[__REINT1_836_I_I:%.*]] = alloca <4 x i32>, align 8
109+
// CHECK-A32-SOFTFP-NEXT: [[__P0_808_I_I:%.*]] = alloca <4 x bfloat>, align 8
110+
// CHECK-A32-SOFTFP-NEXT: [[__REINT_808_I_I:%.*]] = alloca <4 x bfloat>, align 8
111+
// CHECK-A32-SOFTFP-NEXT: [[__REINT1_808_I_I:%.*]] = alloca <4 x i32>, align 8
112112
// CHECK-A32-SOFTFP-NEXT: [[__P0_I:%.*]] = alloca <8 x bfloat>, align 8
113113
// CHECK-A32-SOFTFP-NEXT: [[COERCE_I:%.*]] = alloca <8 x bfloat>, align 8
114114
// CHECK-A32-SOFTFP-NEXT: [[COERCE2_I:%.*]] = alloca <4 x bfloat>, align 8
@@ -132,15 +132,15 @@ float32x4_t test_vcvt_f32_bf16(bfloat16x4_t a) {
132132
// CHECK-A32-SOFTFP-NEXT: [[TMP3:%.*]] = load <4 x bfloat>, ptr [[COERCE2_I]], align 8
133133
// CHECK-A32-SOFTFP-NEXT: store <4 x bfloat> [[TMP3]], ptr [[COERCE3_I]], align 8
134134
// CHECK-A32-SOFTFP-NEXT: [[TMP4:%.*]] = load <2 x i32>, ptr [[COERCE3_I]], align 8
135-
// CHECK-A32-SOFTFP-NEXT: store <2 x i32> [[TMP4]], ptr [[__P0_836_I_I]], align 8
136-
// CHECK-A32-SOFTFP-NEXT: [[__P0_8361_I_I:%.*]] = load <4 x bfloat>, ptr [[__P0_836_I_I]], align 8
137-
// CHECK-A32-SOFTFP-NEXT: store <4 x bfloat> [[__P0_8361_I_I]], ptr [[__REINT_836_I_I]], align 8
138-
// CHECK-A32-SOFTFP-NEXT: [[TMP5:%.*]] = load <4 x i16>, ptr [[__REINT_836_I_I]], align 8
135+
// CHECK-A32-SOFTFP-NEXT: store <2 x i32> [[TMP4]], ptr [[__P0_808_I_I]], align 8
136+
// CHECK-A32-SOFTFP-NEXT: [[__P0_8081_I_I:%.*]] = load <4 x bfloat>, ptr [[__P0_808_I_I]], align 8
137+
// CHECK-A32-SOFTFP-NEXT: store <4 x bfloat> [[__P0_8081_I_I]], ptr [[__REINT_808_I_I]], align 8
138+
// CHECK-A32-SOFTFP-NEXT: [[TMP5:%.*]] = load <4 x i16>, ptr [[__REINT_808_I_I]], align 8
139139
// CHECK-A32-SOFTFP-NEXT: [[TMP6:%.*]] = bitcast <4 x i16> [[TMP5]] to <8 x i8>
140-
// CHECK-A32-SOFTFP-NEXT: [[TMP7:%.*]] = sext <4 x i16> [[TMP5]] to <4 x i32>
140+
// CHECK-A32-SOFTFP-NEXT: [[TMP7:%.*]] = zext <4 x i16> [[TMP5]] to <4 x i32>
141141
// CHECK-A32-SOFTFP-NEXT: [[VSHLL_N_I_I:%.*]] = shl <4 x i32> [[TMP7]], splat (i32 16)
142-
// CHECK-A32-SOFTFP-NEXT: store <4 x i32> [[VSHLL_N_I_I]], ptr [[__REINT1_836_I_I]], align 8
143-
// CHECK-A32-SOFTFP-NEXT: [[TMP8:%.*]] = load <4 x float>, ptr [[__REINT1_836_I_I]], align 8
142+
// CHECK-A32-SOFTFP-NEXT: store <4 x i32> [[VSHLL_N_I_I]], ptr [[__REINT1_808_I_I]], align 8
143+
// CHECK-A32-SOFTFP-NEXT: [[TMP8:%.*]] = load <4 x float>, ptr [[__REINT1_808_I_I]], align 8
144144
// CHECK-A32-SOFTFP-NEXT: ret <4 x float> [[TMP8]]
145145
//
146146
float32x4_t test_vcvtq_low_f32_bf16(bfloat16x8_t a) {
@@ -149,39 +149,39 @@ float32x4_t test_vcvtq_low_f32_bf16(bfloat16x8_t a) {
149149

150150
// CHECK-A64-LABEL: @test_vcvtq_high_f32_bf16(
151151
// CHECK-A64-NEXT: entry:
152-
// CHECK-A64-NEXT: [[__REINT_836_I_I:%.*]] = alloca <4 x bfloat>, align 8
153-
// CHECK-A64-NEXT: [[__REINT1_836_I_I:%.*]] = alloca <4 x i32>, align 16
152+
// CHECK-A64-NEXT: [[__REINT_808_I_I:%.*]] = alloca <4 x bfloat>, align 8
153+
// CHECK-A64-NEXT: [[__REINT1_808_I_I:%.*]] = alloca <4 x i32>, align 16
154154
// CHECK-A64-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x bfloat> [[A:%.*]], <8 x bfloat> [[A]], <4 x i32> <i32 4, i32 5, i32 6, i32 7>
155-
// CHECK-A64-NEXT: store <4 x bfloat> [[SHUFFLE_I]], ptr [[__REINT_836_I_I]], align 8
156-
// CHECK-A64-NEXT: [[TMP0:%.*]] = load <4 x i16>, ptr [[__REINT_836_I_I]], align 8
155+
// CHECK-A64-NEXT: store <4 x bfloat> [[SHUFFLE_I]], ptr [[__REINT_808_I_I]], align 8
156+
// CHECK-A64-NEXT: [[TMP0:%.*]] = load <4 x i16>, ptr [[__REINT_808_I_I]], align 8
157157
// CHECK-A64-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[TMP0]] to <8 x i8>
158-
// CHECK-A64-NEXT: [[TMP2:%.*]] = sext <4 x i16> [[TMP0]] to <4 x i32>
158+
// CHECK-A64-NEXT: [[TMP2:%.*]] = zext <4 x i16> [[TMP0]] to <4 x i32>
159159
// CHECK-A64-NEXT: [[VSHLL_N_I_I:%.*]] = shl <4 x i32> [[TMP2]], splat (i32 16)
160-
// CHECK-A64-NEXT: store <4 x i32> [[VSHLL_N_I_I]], ptr [[__REINT1_836_I_I]], align 16
161-
// CHECK-A64-NEXT: [[TMP3:%.*]] = load <4 x float>, ptr [[__REINT1_836_I_I]], align 16
160+
// CHECK-A64-NEXT: store <4 x i32> [[VSHLL_N_I_I]], ptr [[__REINT1_808_I_I]], align 16
161+
// CHECK-A64-NEXT: [[TMP3:%.*]] = load <4 x float>, ptr [[__REINT1_808_I_I]], align 16
162162
// CHECK-A64-NEXT: ret <4 x float> [[TMP3]]
163163
//
164164
// CHECK-A32-HARDFP-LABEL: @test_vcvtq_high_f32_bf16(
165165
// CHECK-A32-HARDFP-NEXT: entry:
166-
// CHECK-A32-HARDFP-NEXT: [[__REINT_836_I_I:%.*]] = alloca <4 x bfloat>, align 8
167-
// CHECK-A32-HARDFP-NEXT: [[__REINT1_836_I_I:%.*]] = alloca <4 x i32>, align 8
166+
// CHECK-A32-HARDFP-NEXT: [[__REINT_808_I_I:%.*]] = alloca <4 x bfloat>, align 8
167+
// CHECK-A32-HARDFP-NEXT: [[__REINT1_808_I_I:%.*]] = alloca <4 x i32>, align 8
168168
// CHECK-A32-HARDFP-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x bfloat> [[A:%.*]], <8 x bfloat> [[A]], <4 x i32> <i32 4, i32 5, i32 6, i32 7>
169-
// CHECK-A32-HARDFP-NEXT: store <4 x bfloat> [[SHUFFLE_I]], ptr [[__REINT_836_I_I]], align 8
170-
// CHECK-A32-HARDFP-NEXT: [[TMP0:%.*]] = load <4 x i16>, ptr [[__REINT_836_I_I]], align 8
169+
// CHECK-A32-HARDFP-NEXT: store <4 x bfloat> [[SHUFFLE_I]], ptr [[__REINT_808_I_I]], align 8
170+
// CHECK-A32-HARDFP-NEXT: [[TMP0:%.*]] = load <4 x i16>, ptr [[__REINT_808_I_I]], align 8
171171
// CHECK-A32-HARDFP-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[TMP0]] to <8 x i8>
172-
// CHECK-A32-HARDFP-NEXT: [[TMP2:%.*]] = sext <4 x i16> [[TMP0]] to <4 x i32>
172+
// CHECK-A32-HARDFP-NEXT: [[TMP2:%.*]] = zext <4 x i16> [[TMP0]] to <4 x i32>
173173
// CHECK-A32-HARDFP-NEXT: [[VSHLL_N_I_I:%.*]] = shl <4 x i32> [[TMP2]], splat (i32 16)
174-
// CHECK-A32-HARDFP-NEXT: store <4 x i32> [[VSHLL_N_I_I]], ptr [[__REINT1_836_I_I]], align 8
175-
// CHECK-A32-HARDFP-NEXT: [[TMP3:%.*]] = load <4 x float>, ptr [[__REINT1_836_I_I]], align 8
174+
// CHECK-A32-HARDFP-NEXT: store <4 x i32> [[VSHLL_N_I_I]], ptr [[__REINT1_808_I_I]], align 8
175+
// CHECK-A32-HARDFP-NEXT: [[TMP3:%.*]] = load <4 x float>, ptr [[__REINT1_808_I_I]], align 8
176176
// CHECK-A32-HARDFP-NEXT: ret <4 x float> [[TMP3]]
177177
//
178178
// CHECK-A32-SOFTFP-LABEL: @test_vcvtq_high_f32_bf16(
179179
// CHECK-A32-SOFTFP-NEXT: entry:
180180
// CHECK-A32-SOFTFP-NEXT: [[RETVAL_I:%.*]] = alloca <4 x bfloat>, align 8
181181
// CHECK-A32-SOFTFP-NEXT: [[__P0_I2:%.*]] = alloca <8 x bfloat>, align 8
182-
// CHECK-A32-SOFTFP-NEXT: [[__P0_836_I_I:%.*]] = alloca <4 x bfloat>, align 8
183-
// CHECK-A32-SOFTFP-NEXT: [[__REINT_836_I_I:%.*]] = alloca <4 x bfloat>, align 8
184-
// CHECK-A32-SOFTFP-NEXT: [[__REINT1_836_I_I:%.*]] = alloca <4 x i32>, align 8
182+
// CHECK-A32-SOFTFP-NEXT: [[__P0_808_I_I:%.*]] = alloca <4 x bfloat>, align 8
183+
// CHECK-A32-SOFTFP-NEXT: [[__REINT_808_I_I:%.*]] = alloca <4 x bfloat>, align 8
184+
// CHECK-A32-SOFTFP-NEXT: [[__REINT1_808_I_I:%.*]] = alloca <4 x i32>, align 8
185185
// CHECK-A32-SOFTFP-NEXT: [[__P0_I:%.*]] = alloca <8 x bfloat>, align 8
186186
// CHECK-A32-SOFTFP-NEXT: [[COERCE_I:%.*]] = alloca <8 x bfloat>, align 8
187187
// CHECK-A32-SOFTFP-NEXT: [[COERCE2_I:%.*]] = alloca <4 x bfloat>, align 8
@@ -205,15 +205,15 @@ float32x4_t test_vcvtq_low_f32_bf16(bfloat16x8_t a) {
205205
// CHECK-A32-SOFTFP-NEXT: [[TMP3:%.*]] = load <4 x bfloat>, ptr [[COERCE2_I]], align 8
206206
// CHECK-A32-SOFTFP-NEXT: store <4 x bfloat> [[TMP3]], ptr [[COERCE3_I]], align 8
207207
// CHECK-A32-SOFTFP-NEXT: [[TMP4:%.*]] = load <2 x i32>, ptr [[COERCE3_I]], align 8
208-
// CHECK-A32-SOFTFP-NEXT: store <2 x i32> [[TMP4]], ptr [[__P0_836_I_I]], align 8
209-
// CHECK-A32-SOFTFP-NEXT: [[__P0_8361_I_I:%.*]] = load <4 x bfloat>, ptr [[__P0_836_I_I]], align 8
210-
// CHECK-A32-SOFTFP-NEXT: store <4 x bfloat> [[__P0_8361_I_I]], ptr [[__REINT_836_I_I]], align 8
211-
// CHECK-A32-SOFTFP-NEXT: [[TMP5:%.*]] = load <4 x i16>, ptr [[__REINT_836_I_I]], align 8
208+
// CHECK-A32-SOFTFP-NEXT: store <2 x i32> [[TMP4]], ptr [[__P0_808_I_I]], align 8
209+
// CHECK-A32-SOFTFP-NEXT: [[__P0_8081_I_I:%.*]] = load <4 x bfloat>, ptr [[__P0_808_I_I]], align 8
210+
// CHECK-A32-SOFTFP-NEXT: store <4 x bfloat> [[__P0_8081_I_I]], ptr [[__REINT_808_I_I]], align 8
211+
// CHECK-A32-SOFTFP-NEXT: [[TMP5:%.*]] = load <4 x i16>, ptr [[__REINT_808_I_I]], align 8
212212
// CHECK-A32-SOFTFP-NEXT: [[TMP6:%.*]] = bitcast <4 x i16> [[TMP5]] to <8 x i8>
213-
// CHECK-A32-SOFTFP-NEXT: [[TMP7:%.*]] = sext <4 x i16> [[TMP5]] to <4 x i32>
213+
// CHECK-A32-SOFTFP-NEXT: [[TMP7:%.*]] = zext <4 x i16> [[TMP5]] to <4 x i32>
214214
// CHECK-A32-SOFTFP-NEXT: [[VSHLL_N_I_I:%.*]] = shl <4 x i32> [[TMP7]], splat (i32 16)
215-
// CHECK-A32-SOFTFP-NEXT: store <4 x i32> [[VSHLL_N_I_I]], ptr [[__REINT1_836_I_I]], align 8
216-
// CHECK-A32-SOFTFP-NEXT: [[TMP8:%.*]] = load <4 x float>, ptr [[__REINT1_836_I_I]], align 8
215+
// CHECK-A32-SOFTFP-NEXT: store <4 x i32> [[VSHLL_N_I_I]], ptr [[__REINT1_808_I_I]], align 8
216+
// CHECK-A32-SOFTFP-NEXT: [[TMP8:%.*]] = load <4 x float>, ptr [[__REINT1_808_I_I]], align 8
217217
// CHECK-A32-SOFTFP-NEXT: ret <4 x float> [[TMP8]]
218218
//
219219
float32x4_t test_vcvtq_high_f32_bf16(bfloat16x8_t a) {
@@ -427,7 +427,7 @@ bfloat16_t test_vcvth_bf16_f32(float32_t a) {
427427
// CHECK-NEXT: [[__REINT1_I:%.*]] = alloca i32, align 4
428428
// CHECK-NEXT: store bfloat [[A:%.*]], ptr [[__REINT_I]], align 2
429429
// CHECK-NEXT: [[TMP0:%.*]] = load i16, ptr [[__REINT_I]], align 2
430-
// CHECK-NEXT: [[CONV_I:%.*]] = sext i16 [[TMP0]] to i32
430+
// CHECK-NEXT: [[CONV_I:%.*]] = zext i16 [[TMP0]] to i32
431431
// CHECK-NEXT: [[SHL_I:%.*]] = shl i32 [[CONV_I]], 16
432432
// CHECK-NEXT: store i32 [[SHL_I]], ptr [[__REINT1_I]], align 4
433433
// CHECK-NEXT: [[TMP1:%.*]] = load float, ptr [[__REINT1_I]], align 4

0 commit comments

Comments
 (0)