
Commit 7d2fb8d

PeddleSpam (Leon Clark) authored and committed
[X86][AVX] Match v4f64 blend from shuffle of scalar values. (llvm#135753)
Convert a BUILD_VECTOR of scalar values to a shuffle of shuffles that will lower to an AVX blend. This addresses a regression in llvm#128938.

Co-authored-by: Leon Clark <[email protected]>
1 parent e696365 commit 7d2fb8d
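For readers skimming the patch, the new lowering takes a v4f64 BUILD_VECTOR whose lanes repeat exactly two scalar values, splats each scalar across a vector, and then selects lanes between the two splats with a shuffle mask that later lowers to a single blend. The standalone C++ sketch below only illustrates how that lane-selection mask is formed for the <a0, a1, a1, a0> layout exercised by the new tests; it is not LLVM code, and its names are illustrative.

#include <cstdio>

int main() {
  // Illustrative only (not LLVM code): lanes holding the first unique scalar
  // (Op0) select from the splat of Op0 (indices 0..N-1); the remaining lanes
  // select from the splat of Op1 (indices N..2N-1), mirroring the mask built
  // in lowerBuildVectorAsBlend for v4f64.
  const unsigned NumElems = 4;
  const bool LaneIsOp0[NumElems] = {true, false, false, true}; // <a0,a1,a1,a0>
  int Mask[NumElems];
  for (unsigned I = 0; I < NumElems; ++I)
    Mask[I] = LaneIsOp0[I] ? int(I) : int(I + NumElems);
  for (unsigned I = 0; I < NumElems; ++I)
    std::printf("%d ", Mask[I]); // prints: 0 5 6 3
  std::printf("\n");
  return 0;
}

Shuffling splat(a0) and splat(a1) with the mask <0,5,6,3> reproduces <a0, a1, a1, a0>, which is what the updated AVX2 checks express as the vblendps pattern ymm0[0,1],ymm1[2,3,4,5],ymm0[6,7] (each f64 lane spans two f32 slots).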

4 files changed: 270 additions & 69 deletions

llvm/lib/Target/X86/X86ISelLowering.cpp

Lines changed: 50 additions & 0 deletions
@@ -21,6 +21,7 @@
 #include "X86TargetMachine.h"
 #include "llvm/ADT/SmallBitVector.h"
 #include "llvm/ADT/SmallSet.h"
+#include "llvm/ADT/SmallVector.h"
 #include "llvm/ADT/Statistic.h"
 #include "llvm/ADT/StringExtras.h"
 #include "llvm/ADT/StringSwitch.h"
@@ -37,6 +38,7 @@
 #include "llvm/CodeGen/MachineModuleInfo.h"
 #include "llvm/CodeGen/MachineRegisterInfo.h"
 #include "llvm/CodeGen/SDPatternMatch.h"
+#include "llvm/CodeGen/SelectionDAGNodes.h"
 #include "llvm/CodeGen/TargetLowering.h"
 #include "llvm/CodeGen/WinEHFuncInfo.h"
 #include "llvm/IR/CallingConv.h"
@@ -8783,6 +8785,52 @@ static SDValue lowerBuildVectorToBitOp(BuildVectorSDNode *Op, const SDLoc &DL,
   return LowerShift(Res, Subtarget, DAG);
 }
 
+static bool isShuffleFoldableLoad(SDValue);
+
+/// Attempt to lower a BUILD_VECTOR of scalar values to a shuffle of splats
+/// representing a blend.
+static SDValue lowerBuildVectorAsBlend(BuildVectorSDNode *BVOp, SDLoc const &DL,
+                                       X86Subtarget const &Subtarget,
+                                       SelectionDAG &DAG) {
+  MVT VT = BVOp->getSimpleValueType(0u);
+
+  if (VT != MVT::v4f64)
+    return SDValue();
+
+  // Collect unique operands.
+  auto UniqueOps = SmallSet<SDValue, 16u>();
+  for (SDValue Op : BVOp->ops()) {
+    if (isIntOrFPConstant(Op) || Op.isUndef())
+      return SDValue();
+    UniqueOps.insert(Op);
+  }
+
+  // Candidate BUILD_VECTOR must have 2 unique operands.
+  if (UniqueOps.size() != 2u)
+    return SDValue();
+
+  SDValue Op0 = BVOp->getOperand(0u);
+  UniqueOps.erase(Op0);
+  SDValue Op1 = *UniqueOps.begin();
+
+  if (Subtarget.hasAVX2() || isShuffleFoldableLoad(Op0) ||
+      isShuffleFoldableLoad(Op1)) {
+    // Create shuffle mask.
+    auto const NumElems = VT.getVectorNumElements();
+    SmallVector<int, 16u> Mask(NumElems);
+    for (auto I = 0u; I < NumElems; ++I) {
+      SDValue Op = BVOp->getOperand(I);
+      Mask[I] = Op == Op0 ? I : I + NumElems;
+    }
+    // Create shuffle of splats.
+    SDValue NewOp0 = DAG.getSplatBuildVector(VT, DL, Op0);
+    SDValue NewOp1 = DAG.getSplatBuildVector(VT, DL, Op1);
+    return DAG.getVectorShuffle(VT, DL, NewOp0, NewOp1, Mask);
+  }
+
+  return SDValue();
+}
+
 /// Create a vector constant without a load. SSE/AVX provide the bare minimum
 /// functionality to do this, so it's all zeros, all ones, or some derivation
 /// that is cheap to calculate.
@@ -9245,6 +9293,8 @@ X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const {
     return Broadcast;
   if (SDValue BitOp = lowerBuildVectorToBitOp(BV, dl, Subtarget, DAG))
     return BitOp;
+  if (SDValue Blend = lowerBuildVectorAsBlend(BV, dl, Subtarget, DAG))
+    return Blend;
 
   unsigned NumZero = ZeroMask.popcount();
   unsigned NumNonZero = NonZeroMask.popcount();

llvm/test/CodeGen/X86/build-vector-256.ll

Lines changed: 61 additions & 31 deletions
@@ -415,20 +415,34 @@ define <32 x i8> @test_buildvector_v32i8(i8 %a0, i8 %a1, i8 %a2, i8 %a3, i8 %a4,
 ; build vectors of repeated elements
 
 define <4 x double> @test_buildvector_4f64_2_var(double %a0, double %a1) {
-; AVX-32-LABEL: test_buildvector_4f64_2_var:
-; AVX-32: # %bb.0:
-; AVX-32-NEXT: vmovups {{[0-9]+}}(%esp), %xmm0
-; AVX-32-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero
-; AVX-32-NEXT: vmovhps {{.*#+}} xmm1 = xmm1[0,1],mem[0,1]
-; AVX-32-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
-; AVX-32-NEXT: retl
+; AVX1-32-LABEL: test_buildvector_4f64_2_var:
+; AVX1-32: # %bb.0:
+; AVX1-32-NEXT: vmovups {{[0-9]+}}(%esp), %xmm0
+; AVX1-32-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero
+; AVX1-32-NEXT: vmovhps {{.*#+}} xmm1 = xmm1[0,1],mem[0,1]
+; AVX1-32-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX1-32-NEXT: retl
 ;
-; AVX-64-LABEL: test_buildvector_4f64_2_var:
-; AVX-64: # %bb.0:
-; AVX-64-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0]
-; AVX-64-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
-; AVX-64-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
-; AVX-64-NEXT: retq
+; AVX1-64-LABEL: test_buildvector_4f64_2_var:
+; AVX1-64: # %bb.0:
+; AVX1-64-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0]
+; AVX1-64-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; AVX1-64-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; AVX1-64-NEXT: retq
+;
+; AVX2-32-LABEL: test_buildvector_4f64_2_var:
+; AVX2-32: # %bb.0:
+; AVX2-32-NEXT: vbroadcastsd {{[0-9]+}}(%esp), %ymm0
+; AVX2-32-NEXT: vbroadcastsd {{[0-9]+}}(%esp), %ymm1
+; AVX2-32-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5],ymm1[6,7]
+; AVX2-32-NEXT: retl
+;
+; AVX2-64-LABEL: test_buildvector_4f64_2_var:
+; AVX2-64: # %bb.0:
+; AVX2-64-NEXT: vbroadcastsd %xmm1, %ymm1
+; AVX2-64-NEXT: vbroadcastsd %xmm0, %ymm0
+; AVX2-64-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4,5],ymm0[6,7]
+; AVX2-64-NEXT: retq
   %v0 = insertelement <4 x double> poison, double %a0, i32 0
   %v1 = insertelement <4 x double> %v0, double %a1, i32 1
   %v2 = insertelement <4 x double> %v1, double %a1, i32 2
@@ -437,25 +451,41 @@ define <4 x double> @test_buildvector_4f64_2_var(double %a0, double %a1) {
 }
 
 define <4 x double> @test_buildvector_4f64_2_load(ptr %p0, ptr %p1) {
-; AVX-32-LABEL: test_buildvector_4f64_2_load:
-; AVX-32: # %bb.0:
-; AVX-32-NEXT: movl {{[0-9]+}}(%esp), %eax
-; AVX-32-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; AVX-32-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
-; AVX-32-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero
-; AVX-32-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0]
-; AVX-32-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
-; AVX-32-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
-; AVX-32-NEXT: retl
+; AVX1-32-LABEL: test_buildvector_4f64_2_load:
+; AVX1-32: # %bb.0:
+; AVX1-32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; AVX1-32-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; AVX1-32-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
+; AVX1-32-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero
+; AVX1-32-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0]
+; AVX1-32-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; AVX1-32-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; AVX1-32-NEXT: retl
 ;
-; AVX-64-LABEL: test_buildvector_4f64_2_load:
-; AVX-64: # %bb.0:
-; AVX-64-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
-; AVX-64-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero
-; AVX-64-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0]
-; AVX-64-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
-; AVX-64-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
-; AVX-64-NEXT: retq
+; AVX1-64-LABEL: test_buildvector_4f64_2_load:
+; AVX1-64: # %bb.0:
+; AVX1-64-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
+; AVX1-64-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero
+; AVX1-64-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0]
+; AVX1-64-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; AVX1-64-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; AVX1-64-NEXT: retq
+;
+; AVX2-32-LABEL: test_buildvector_4f64_2_load:
+; AVX2-32: # %bb.0:
+; AVX2-32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; AVX2-32-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; AVX2-32-NEXT: vbroadcastsd (%ecx), %ymm0
+; AVX2-32-NEXT: vbroadcastsd (%eax), %ymm1
+; AVX2-32-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5],ymm1[6,7]
+; AVX2-32-NEXT: retl
+;
+; AVX2-64-LABEL: test_buildvector_4f64_2_load:
+; AVX2-64: # %bb.0:
+; AVX2-64-NEXT: vbroadcastsd (%rsi), %ymm0
+; AVX2-64-NEXT: vbroadcastsd (%rdi), %ymm1
+; AVX2-64-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5],ymm1[6,7]
+; AVX2-64-NEXT: retq
   %a0 = load double, ptr %p0
   %a1 = load double, ptr %p1
   %v0 = insertelement <4 x double> poison, double %a0, i32 0

llvm/test/CodeGen/X86/build-vector-512.ll

Lines changed: 68 additions & 38 deletions
@@ -1,8 +1,8 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefix=AVX-32
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefix=AVX-64
-; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+avx512f,+avx512bw | FileCheck %s --check-prefix=AVX-32
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512bw | FileCheck %s --check-prefix=AVX-64
+; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefixes=AVX-32,AVX512F-32
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefixes=AVX-64,AVX512F-64
+; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+avx512f,+avx512bw | FileCheck %s --check-prefixes=AVX-32,AVX512BW-32
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512bw | FileCheck %s --check-prefixes=AVX-64,AVX512BW-64
 
 define <8 x double> @test_buildvector_v8f64(double %a0, double %a1, double %a2, double %a3, double %a4, double %a5, double %a6, double %a7) {
 ; AVX-32-LABEL: test_buildvector_v8f64:
@@ -480,23 +480,37 @@ define <64 x i8> @test_buildvector_v64i8(i8 %a0, i8 %a1, i8 %a2, i8 %a3, i8 %a4,
 ; build vectors of repeated elements
 
 define <8 x double> @test_buildvector_8f64_2_var(double %a0, double %a1) {
-; AVX-32-LABEL: test_buildvector_8f64_2_var:
-; AVX-32: # %bb.0:
-; AVX-32-NEXT: vmovddup {{.*#+}} xmm0 = mem[0,0]
-; AVX-32-NEXT: vmovups {{[0-9]+}}(%esp), %xmm1
-; AVX-32-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm2
-; AVX-32-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
-; AVX-32-NEXT: vinsertf64x4 $1, %ymm2, %zmm0, %zmm0
-; AVX-32-NEXT: retl
+; AVX512F-32-LABEL: test_buildvector_8f64_2_var:
+; AVX512F-32: # %bb.0:
+; AVX512F-32-NEXT: vbroadcastsd {{[0-9]+}}(%esp), %zmm0
+; AVX512F-32-NEXT: movb $-126, %al
+; AVX512F-32-NEXT: kmovw %eax, %k1
+; AVX512F-32-NEXT: vbroadcastsd {{[0-9]+}}(%esp), %zmm0 {%k1}
+; AVX512F-32-NEXT: retl
 ;
-; AVX-64-LABEL: test_buildvector_8f64_2_var:
-; AVX-64: # %bb.0:
-; AVX-64-NEXT: vmovddup {{.*#+}} xmm0 = xmm0[0,0]
-; AVX-64-NEXT: vmovlhps {{.*#+}} xmm1 = xmm0[0],xmm1[0]
-; AVX-64-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm2
-; AVX-64-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
-; AVX-64-NEXT: vinsertf64x4 $1, %ymm2, %zmm0, %zmm0
-; AVX-64-NEXT: retq
+; AVX512F-64-LABEL: test_buildvector_8f64_2_var:
+; AVX512F-64: # %bb.0:
+; AVX512F-64-NEXT: vbroadcastsd %xmm0, %zmm0
+; AVX512F-64-NEXT: movb $-126, %al
+; AVX512F-64-NEXT: kmovw %eax, %k1
+; AVX512F-64-NEXT: vbroadcastsd %xmm1, %zmm0 {%k1}
+; AVX512F-64-NEXT: retq
+;
+; AVX512BW-32-LABEL: test_buildvector_8f64_2_var:
+; AVX512BW-32: # %bb.0:
+; AVX512BW-32-NEXT: vbroadcastsd {{[0-9]+}}(%esp), %zmm0
+; AVX512BW-32-NEXT: movb $-126, %al
+; AVX512BW-32-NEXT: kmovd %eax, %k1
+; AVX512BW-32-NEXT: vbroadcastsd {{[0-9]+}}(%esp), %zmm0 {%k1}
+; AVX512BW-32-NEXT: retl
+;
+; AVX512BW-64-LABEL: test_buildvector_8f64_2_var:
+; AVX512BW-64: # %bb.0:
+; AVX512BW-64-NEXT: vbroadcastsd %xmm0, %zmm0
+; AVX512BW-64-NEXT: movb $-126, %al
+; AVX512BW-64-NEXT: kmovd %eax, %k1
+; AVX512BW-64-NEXT: vbroadcastsd %xmm1, %zmm0 {%k1}
+; AVX512BW-64-NEXT: retq
   %v0 = insertelement <8 x double> poison, double %a0, i32 0
   %v1 = insertelement <8 x double> %v0, double %a1, i32 1
   %v2 = insertelement <8 x double> %v1, double %a0, i32 2
@@ -509,25 +523,41 @@ define <8 x double> @test_buildvector_8f64_2_var(double %a0, double %a1) {
 }
 
 define <8 x double> @test_buildvector_8f64_2_load(ptr %p0, ptr %p1) {
-; AVX-32-LABEL: test_buildvector_8f64_2_load:
-; AVX-32: # %bb.0:
-; AVX-32-NEXT: movl {{[0-9]+}}(%esp), %eax
-; AVX-32-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; AVX-32-NEXT: vmovddup {{.*#+}} xmm0 = mem[0,0]
-; AVX-32-NEXT: vmovhps {{.*#+}} xmm1 = xmm0[0,1],mem[0,1]
-; AVX-32-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm2
-; AVX-32-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
-; AVX-32-NEXT: vinsertf64x4 $1, %ymm2, %zmm0, %zmm0
-; AVX-32-NEXT: retl
+; AVX512F-32-LABEL: test_buildvector_8f64_2_load:
+; AVX512F-32: # %bb.0:
+; AVX512F-32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; AVX512F-32-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; AVX512F-32-NEXT: vbroadcastsd (%ecx), %zmm0
+; AVX512F-32-NEXT: movb $-126, %cl
+; AVX512F-32-NEXT: kmovw %ecx, %k1
+; AVX512F-32-NEXT: vbroadcastsd (%eax), %zmm0 {%k1}
+; AVX512F-32-NEXT: retl
 ;
-; AVX-64-LABEL: test_buildvector_8f64_2_load:
-; AVX-64: # %bb.0:
-; AVX-64-NEXT: vmovddup {{.*#+}} xmm0 = mem[0,0]
-; AVX-64-NEXT: vmovhps {{.*#+}} xmm1 = xmm0[0,1],mem[0,1]
-; AVX-64-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm2
-; AVX-64-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
-; AVX-64-NEXT: vinsertf64x4 $1, %ymm2, %zmm0, %zmm0
-; AVX-64-NEXT: retq
+; AVX512F-64-LABEL: test_buildvector_8f64_2_load:
+; AVX512F-64: # %bb.0:
+; AVX512F-64-NEXT: vbroadcastsd (%rdi), %zmm0
+; AVX512F-64-NEXT: movb $-126, %al
+; AVX512F-64-NEXT: kmovw %eax, %k1
+; AVX512F-64-NEXT: vbroadcastsd (%rsi), %zmm0 {%k1}
+; AVX512F-64-NEXT: retq
+;
+; AVX512BW-32-LABEL: test_buildvector_8f64_2_load:
+; AVX512BW-32: # %bb.0:
+; AVX512BW-32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; AVX512BW-32-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; AVX512BW-32-NEXT: vbroadcastsd (%ecx), %zmm0
+; AVX512BW-32-NEXT: movb $-126, %cl
+; AVX512BW-32-NEXT: kmovd %ecx, %k1
+; AVX512BW-32-NEXT: vbroadcastsd (%eax), %zmm0 {%k1}
+; AVX512BW-32-NEXT: retl
+;
+; AVX512BW-64-LABEL: test_buildvector_8f64_2_load:
+; AVX512BW-64: # %bb.0:
+; AVX512BW-64-NEXT: vbroadcastsd (%rdi), %zmm0
+; AVX512BW-64-NEXT: movb $-126, %al
+; AVX512BW-64-NEXT: kmovd %eax, %k1
+; AVX512BW-64-NEXT: vbroadcastsd (%rsi), %zmm0 {%k1}
+; AVX512BW-64-NEXT: retq
   %a0 = load double, ptr %p0
   %a1 = load double, ptr %p1
   %v0 = insertelement <8 x double> poison, double %a0, i32 0
