Skip to content

Commit 59ef2b8

Browse files
alexey-bataev authored and github-actions[bot] committed
Automerge: Revert "[SLP]Improved reduction cost/codegen"
This reverts commit 2ad8166 to fix the bugs/miscompiles reported in llvm/llvm-project#118293 (comment) and llvm/llvm-project#118293 (comment).
2 parents: 1226125 + afa3c10; commit: 59ef2b8

23 files changed

+209
-471
lines changed

llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp

Lines changed: 32 additions & 258 deletions
Large diffs are not rendered by default.

llvm/test/Transforms/SLPVectorizer/AArch64/InstructionsState-is-invalid-0.ll

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -19,8 +19,9 @@ define void @foo(ptr %0) {
1919
; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <4 x ptr> [[TMP7]], <4 x ptr> poison, <4 x i32> <i32 0, i32 0, i32 0, i32 1>
2020
; CHECK-NEXT: [[TMP9:%.*]] = icmp ult <4 x ptr> [[TMP8]], zeroinitializer
2121
; CHECK-NEXT: [[TMP10:%.*]] = and <4 x i1> [[TMP9]], zeroinitializer
22-
; CHECK-NEXT: [[RDX_OP:%.*]] = or <4 x i1> [[TMP5]], [[TMP10]]
23-
; CHECK-NEXT: [[OP_RDX:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[RDX_OP]])
22+
; CHECK-NEXT: [[TMP11:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP5]])
23+
; CHECK-NEXT: [[TMP12:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP10]])
24+
; CHECK-NEXT: [[OP_RDX:%.*]] = or i1 [[TMP11]], [[TMP12]]
2425
; CHECK-NEXT: br i1 [[OP_RDX]], label [[DOTLR_PH:%.*]], label [[VECTOR_PH:%.*]]
2526
; CHECK: vector.ph:
2627
; CHECK-NEXT: ret void

llvm/test/Transforms/SLPVectorizer/AArch64/reduce-fadd.ll

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -81,9 +81,10 @@ define half @reduce_fast_half8(<8 x half> %vec8) {
8181
; NOFP16-SAME: <8 x half> [[VEC8:%.*]]) #[[ATTR0]] {
8282
; NOFP16-NEXT: [[ENTRY:.*:]]
8383
; NOFP16-NEXT: [[TMP0:%.*]] = shufflevector <8 x half> [[VEC8]], <8 x half> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
84+
; NOFP16-NEXT: [[TMP1:%.*]] = call fast half @llvm.vector.reduce.fadd.v4f16(half 0xH0000, <4 x half> [[TMP0]])
8485
; NOFP16-NEXT: [[TMP2:%.*]] = shufflevector <8 x half> [[VEC8]], <8 x half> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
85-
; NOFP16-NEXT: [[RDX_OP:%.*]] = fadd fast <4 x half> [[TMP0]], [[TMP2]]
86-
; NOFP16-NEXT: [[OP_RDX3:%.*]] = call fast half @llvm.vector.reduce.fadd.v4f16(half 0xH0000, <4 x half> [[RDX_OP]])
86+
; NOFP16-NEXT: [[TMP3:%.*]] = call fast half @llvm.vector.reduce.fadd.v4f16(half 0xH0000, <4 x half> [[TMP2]])
87+
; NOFP16-NEXT: [[OP_RDX3:%.*]] = fadd fast half [[TMP1]], [[TMP3]]
8788
; NOFP16-NEXT: ret half [[OP_RDX3]]
8889
;
8990
; FULLFP16-LABEL: define half @reduce_fast_half8(

llvm/test/Transforms/SLPVectorizer/AMDGPU/reduction.ll

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -57,9 +57,10 @@ define half @reduction_half16(<16 x half> %vec16) {
5757
; VI-LABEL: @reduction_half16(
5858
; VI-NEXT: entry:
5959
; VI-NEXT: [[TMP0:%.*]] = shufflevector <16 x half> [[VEC16:%.*]], <16 x half> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
60+
; VI-NEXT: [[TMP1:%.*]] = call fast half @llvm.vector.reduce.fadd.v8f16(half 0xH0000, <8 x half> [[TMP0]])
6061
; VI-NEXT: [[TMP2:%.*]] = shufflevector <16 x half> [[VEC16]], <16 x half> poison, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
61-
; VI-NEXT: [[RDX_OP:%.*]] = fadd fast <8 x half> [[TMP0]], [[TMP2]]
62-
; VI-NEXT: [[OP_RDX:%.*]] = call fast half @llvm.vector.reduce.fadd.v8f16(half 0xH0000, <8 x half> [[RDX_OP]])
62+
; VI-NEXT: [[TMP3:%.*]] = call fast half @llvm.vector.reduce.fadd.v8f16(half 0xH0000, <8 x half> [[TMP2]])
63+
; VI-NEXT: [[OP_RDX:%.*]] = fadd fast half [[TMP1]], [[TMP3]]
6364
; VI-NEXT: ret half [[OP_RDX]]
6465
;
6566
entry:

llvm/test/Transforms/SLPVectorizer/RISCV/horizontal-list.ll

Lines changed: 7 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -18,7 +18,7 @@
1818
; YAML-NEXT: Function: test
1919
; YAML-NEXT: Args:
2020
; YAML-NEXT: - String: 'Vectorized horizontal reduction with cost '
21-
; YAML-NEXT: - Cost: '-15'
21+
; YAML-NEXT: - Cost: '-14'
2222
; YAML-NEXT: - String: ' and with tree size '
2323
; YAML-NEXT: - TreeSize: '1'
2424
; YAML-NEXT: ...
@@ -28,7 +28,7 @@
2828
; YAML-NEXT: Function: test
2929
; YAML-NEXT: Args:
3030
; YAML-NEXT: - String: 'Vectorized horizontal reduction with cost '
31-
; YAML-NEXT: - Cost: '-6'
31+
; YAML-NEXT: - Cost: '-4'
3232
; YAML-NEXT: - String: ' and with tree size '
3333
; YAML-NEXT: - TreeSize: '1'
3434
; YAML-NEXT:...
@@ -45,13 +45,11 @@ define float @test(ptr %x) {
4545
; CHECK-NEXT: [[TMP3:%.*]] = load float, ptr [[ARRAYIDX_28]], align 4
4646
; CHECK-NEXT: [[ARRAYIDX_29:%.*]] = getelementptr inbounds float, ptr [[X]], i64 30
4747
; CHECK-NEXT: [[TMP4:%.*]] = load float, ptr [[ARRAYIDX_29]], align 4
48-
; CHECK-NEXT: [[TMP5:%.*]] = call fast <8 x float> @llvm.vector.extract.v8f32.v16f32(<16 x float> [[TMP0]], i64 0)
49-
; CHECK-NEXT: [[RDX_OP:%.*]] = fadd fast <8 x float> [[TMP5]], [[TMP1]]
50-
; CHECK-NEXT: [[TMP6:%.*]] = call fast <16 x float> @llvm.vector.insert.v16f32.v8f32(<16 x float> [[TMP0]], <8 x float> [[RDX_OP]], i64 0)
51-
; CHECK-NEXT: [[RDX_OP4:%.*]] = call fast <4 x float> @llvm.vector.extract.v4f32.v16f32(<16 x float> [[TMP6]], i64 0)
52-
; CHECK-NEXT: [[RDX_OP5:%.*]] = fadd fast <4 x float> [[RDX_OP4]], [[TMP2]]
53-
; CHECK-NEXT: [[TMP8:%.*]] = call fast <16 x float> @llvm.vector.insert.v16f32.v4f32(<16 x float> [[TMP6]], <4 x float> [[RDX_OP5]], i64 0)
54-
; CHECK-NEXT: [[OP_RDX1:%.*]] = call fast float @llvm.vector.reduce.fadd.v16f32(float 0.000000e+00, <16 x float> [[TMP8]])
48+
; CHECK-NEXT: [[TMP5:%.*]] = call fast float @llvm.vector.reduce.fadd.v16f32(float 0.000000e+00, <16 x float> [[TMP0]])
49+
; CHECK-NEXT: [[TMP6:%.*]] = call fast float @llvm.vector.reduce.fadd.v8f32(float 0.000000e+00, <8 x float> [[TMP1]])
50+
; CHECK-NEXT: [[OP_RDX:%.*]] = fadd fast float [[TMP5]], [[TMP6]]
51+
; CHECK-NEXT: [[TMP7:%.*]] = call fast float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> [[TMP2]])
52+
; CHECK-NEXT: [[OP_RDX1:%.*]] = fadd fast float [[OP_RDX]], [[TMP7]]
5553
; CHECK-NEXT: [[OP_RDX2:%.*]] = fadd fast float [[OP_RDX1]], [[TMP3]]
5654
; CHECK-NEXT: [[OP_RDX3:%.*]] = fadd fast float [[OP_RDX2]], [[TMP4]]
5755
; CHECK-NEXT: ret float [[OP_RDX3]]

llvm/test/Transforms/SLPVectorizer/RISCV/reductions.ll

Lines changed: 15 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -341,41 +341,44 @@ define void @reduce_or_2() {
341341
; ZVFHMIN-NEXT: [[TMP3:%.*]] = icmp ult <16 x i64> [[TMP2]], zeroinitializer
342342
; ZVFHMIN-NEXT: [[TMP4:%.*]] = insertelement <16 x i64> <i64 0, i64 0, i64 0, i64 0, i64 0, i64 0, i64 poison, i64 0, i64 0, i64 0, i64 0, i64 0, i64 0, i64 0, i64 0, i64 0>, i64 [[TMP1]], i32 6
343343
; ZVFHMIN-NEXT: [[TMP5:%.*]] = icmp ult <16 x i64> [[TMP4]], zeroinitializer
344-
; ZVFHMIN-NEXT: [[RDX_OP:%.*]] = or <16 x i1> [[TMP3]], [[TMP5]]
345-
; ZVFHMIN-NEXT: [[OP_RDX:%.*]] = call i1 @llvm.vector.reduce.or.v16i1(<16 x i1> [[RDX_OP]])
344+
; ZVFHMIN-NEXT: [[TMP6:%.*]] = call i1 @llvm.vector.reduce.or.v16i1(<16 x i1> [[TMP3]])
345+
; ZVFHMIN-NEXT: [[TMP7:%.*]] = call i1 @llvm.vector.reduce.or.v16i1(<16 x i1> [[TMP5]])
346+
; ZVFHMIN-NEXT: [[OP_RDX:%.*]] = or i1 [[TMP6]], [[TMP7]]
346347
; ZVFHMIN-NEXT: br i1 [[OP_RDX]], label [[TMP9:%.*]], label [[TMP8:%.*]]
347-
; ZVFHMIN: 7:
348-
; ZVFHMIN-NEXT: ret void
349348
; ZVFHMIN: 8:
350349
; ZVFHMIN-NEXT: ret void
350+
; ZVFHMIN: 9:
351+
; ZVFHMIN-NEXT: ret void
351352
;
352353
; ZVL128-LABEL: @reduce_or_2(
353354
; ZVL128-NEXT: [[TMP1:%.*]] = shl i64 0, 0
354355
; ZVL128-NEXT: [[TMP2:%.*]] = insertelement <16 x i64> <i64 0, i64 0, i64 0, i64 0, i64 0, i64 0, i64 0, i64 0, i64 0, i64 0, i64 0, i64 0, i64 0, i64 0, i64 0, i64 poison>, i64 [[TMP1]], i32 15
355356
; ZVL128-NEXT: [[TMP3:%.*]] = icmp ult <16 x i64> [[TMP2]], zeroinitializer
356357
; ZVL128-NEXT: [[TMP4:%.*]] = insertelement <16 x i64> <i64 0, i64 0, i64 0, i64 0, i64 0, i64 0, i64 poison, i64 0, i64 0, i64 0, i64 0, i64 0, i64 0, i64 0, i64 0, i64 0>, i64 [[TMP1]], i32 6
357358
; ZVL128-NEXT: [[TMP5:%.*]] = icmp ult <16 x i64> [[TMP4]], zeroinitializer
358-
; ZVL128-NEXT: [[RDX_OP:%.*]] = or <16 x i1> [[TMP3]], [[TMP5]]
359-
; ZVL128-NEXT: [[OP_RDX:%.*]] = call i1 @llvm.vector.reduce.or.v16i1(<16 x i1> [[RDX_OP]])
359+
; ZVL128-NEXT: [[TMP6:%.*]] = call i1 @llvm.vector.reduce.or.v16i1(<16 x i1> [[TMP3]])
360+
; ZVL128-NEXT: [[TMP7:%.*]] = call i1 @llvm.vector.reduce.or.v16i1(<16 x i1> [[TMP5]])
361+
; ZVL128-NEXT: [[OP_RDX:%.*]] = or i1 [[TMP6]], [[TMP7]]
360362
; ZVL128-NEXT: br i1 [[OP_RDX]], label [[TMP9:%.*]], label [[TMP8:%.*]]
361-
; ZVL128: 7:
362-
; ZVL128-NEXT: ret void
363363
; ZVL128: 8:
364364
; ZVL128-NEXT: ret void
365+
; ZVL128: 9:
366+
; ZVL128-NEXT: ret void
365367
;
366368
; ZVL256-LABEL: @reduce_or_2(
367369
; ZVL256-NEXT: [[TMP1:%.*]] = shl i64 0, 0
368370
; ZVL256-NEXT: [[TMP2:%.*]] = insertelement <16 x i64> <i64 0, i64 0, i64 0, i64 0, i64 0, i64 0, i64 0, i64 0, i64 0, i64 0, i64 0, i64 0, i64 0, i64 0, i64 0, i64 poison>, i64 [[TMP1]], i32 15
369371
; ZVL256-NEXT: [[TMP3:%.*]] = icmp ult <16 x i64> [[TMP2]], zeroinitializer
370372
; ZVL256-NEXT: [[TMP4:%.*]] = insertelement <16 x i64> <i64 0, i64 0, i64 0, i64 0, i64 0, i64 0, i64 poison, i64 0, i64 0, i64 0, i64 0, i64 0, i64 0, i64 0, i64 0, i64 0>, i64 [[TMP1]], i32 6
371373
; ZVL256-NEXT: [[TMP5:%.*]] = icmp ult <16 x i64> [[TMP4]], zeroinitializer
372-
; ZVL256-NEXT: [[RDX_OP:%.*]] = or <16 x i1> [[TMP3]], [[TMP5]]
373-
; ZVL256-NEXT: [[OP_RDX:%.*]] = call i1 @llvm.vector.reduce.or.v16i1(<16 x i1> [[RDX_OP]])
374+
; ZVL256-NEXT: [[TMP6:%.*]] = call i1 @llvm.vector.reduce.or.v16i1(<16 x i1> [[TMP3]])
375+
; ZVL256-NEXT: [[TMP7:%.*]] = call i1 @llvm.vector.reduce.or.v16i1(<16 x i1> [[TMP5]])
376+
; ZVL256-NEXT: [[OP_RDX:%.*]] = or i1 [[TMP6]], [[TMP7]]
374377
; ZVL256-NEXT: br i1 [[OP_RDX]], label [[TMP9:%.*]], label [[TMP8:%.*]]
375-
; ZVL256: 7:
376-
; ZVL256-NEXT: ret void
377378
; ZVL256: 8:
378379
; ZVL256-NEXT: ret void
380+
; ZVL256: 9:
381+
; ZVL256-NEXT: ret void
379382
;
380383
; ZVL512-LABEL: @reduce_or_2(
381384
; ZVL512-NEXT: [[TMP1:%.*]] = shl i64 0, 0

0 commit comments

Comments
 (0)