
Commit 7e6606f

[X86][SSE] Add general memory folding for (V)INSERTPS instruction
This patch improves the memory folding of the inserted float element for the (V)INSERTPS instruction.

The existing implementation occurs in the DAGCombiner and relies on narrowing a whole vector load into a scalar load (which is then converted back into a vector) to (hopefully) allow folding to occur later on. Not only has this proven problematic for debug builds, it also prevents other memory folds (notably stack reloads) from happening.

This patch removes the old implementation and moves the folding code to the X86 foldMemoryOperand handler. A new private 'special case' function - foldMemoryOperandCustom - has been added to deal with memory folding of instructions that can't just use the lookup tables; (V)INSERTPS is the first of several that could be handled this way. The patch also tweaks the memory operand folding code with an additional pointer offset that allows existing memory addresses to be modified - in this case, to convert the vector address into the explicit address of the scalar element that will be inserted.

Unlike the previous implementation, we now set the insertion source index to zero. Although that index is ignored by the (V)INSERTPSrm version itself, anything that relied on shuffle decodes (such as unfolding of insertps loads) was previously calculating the source address incorrectly - I've added a test for this at insertps-unfold-load-bug.ll.

Differential Revision: http://reviews.llvm.org/D13988

llvm-svn: 252074
1 parent b11b440 · 8 files changed (+141, −71 lines)
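As background for the immediate arithmetic described above, here is a minimal standalone sketch (not code from this commit; the struct and function names are hypothetical). The INSERTPS immediate is laid out as [SrcIdx:2][DstIdx:2][ZMask:4], and the memory form always reads element 0 at the given address, so the fold bakes SrcIdx into the address as a byte offset and clears it from the immediate - exactly what foldMemoryOperandCustom in the diff below does.

// Standalone sketch of the (V)INSERTPS immediate rewrite (hypothetical names).
#include <cassert>
#include <cstdio>

struct InsertPSFold {
  int PtrOffset;   // byte offset added to the folded memory operand
  unsigned NewImm; // immediate used by the (V)INSERTPSrm form
};

static InsertPSFold foldInsertPSImm(unsigned Imm) {
  unsigned ZMask = Imm & 15;        // result lanes forced to zero
  unsigned DstIdx = (Imm >> 4) & 3; // destination lane in the result
  unsigned SrcIdx = (Imm >> 6) & 3; // source lane of the second operand
  // Fold the source lane into the address; clear SrcIdx in the immediate.
  return {int(SrcIdx * 4), (DstIdx << 4) | ZMask};
}

int main() {
  // imm 96 (SrcIdx=1, DstIdx=2, ZMask=0) -> offset +4, new imm 32, which is
  // why the avx.ll checks change from 'insertps $96' to 'vinsertps $32, 4(...)'.
  InsertPSFold F = foldInsertPSImm(96);
  assert(F.PtrOffset == 4 && F.NewImm == 32);
  // imm 209 (SrcIdx=3, DstIdx=1, ZMask=1) -> offset +12, new imm 17, matching
  // the 'vinsertps $17 ... Folded Reload' checks in the stack-folding tests.
  F = foldInsertPSImm(209);
  assert(F.PtrOffset == 12 && F.NewImm == 17);
  std::printf("insertps fold arithmetic checks out\n");
  return 0;
}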

llvm/lib/Target/X86/X86ISelLowering.cpp

Lines changed: 0 additions & 51 deletions
@@ -26174,52 +26174,6 @@ static SDValue PerformISDSETCCCombine(SDNode *N, SelectionDAG &DAG,
   return SDValue();
 }
 
-static SDValue NarrowVectorLoadToElement(LoadSDNode *Load, unsigned Index,
-                                         SelectionDAG &DAG) {
-  SDLoc dl(Load);
-  MVT VT = Load->getSimpleValueType(0);
-  MVT EVT = VT.getVectorElementType();
-  SDValue Addr = Load->getOperand(1);
-  SDValue NewAddr = DAG.getNode(
-      ISD::ADD, dl, Addr.getSimpleValueType(), Addr,
-      DAG.getConstant(Index * EVT.getStoreSize(), dl,
-                      Addr.getSimpleValueType()));
-
-  SDValue NewLoad =
-      DAG.getLoad(EVT, dl, Load->getChain(), NewAddr,
-                  DAG.getMachineFunction().getMachineMemOperand(
-                      Load->getMemOperand(), 0, EVT.getStoreSize()));
-  return NewLoad;
-}
-
-static SDValue PerformINSERTPSCombine(SDNode *N, SelectionDAG &DAG,
-                                      const X86Subtarget *Subtarget) {
-  SDLoc dl(N);
-  MVT VT = N->getOperand(1)->getSimpleValueType(0);
-  assert((VT == MVT::v4f32 || VT == MVT::v4i32) &&
-         "X86insertps is only defined for v4x32");
-
-  SDValue Ld = N->getOperand(1);
-  if (MayFoldLoad(Ld)) {
-    // Extract the countS bits from the immediate so we can get the proper
-    // address when narrowing the vector load to a specific element.
-    // When the second source op is a memory address, insertps doesn't use
-    // countS and just gets an f32 from that address.
-    unsigned DestIndex =
-        cast<ConstantSDNode>(N->getOperand(2))->getZExtValue() >> 6;
-
-    Ld = NarrowVectorLoadToElement(cast<LoadSDNode>(Ld), DestIndex, DAG);
-
-    // Create this as a scalar to vector to match the instruction pattern.
-    SDValue LoadScalarToVector = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Ld);
-    // countS bits are ignored when loading from memory on insertps, which
-    // means we don't need to explicitly set them to 0.
-    return DAG.getNode(X86ISD::INSERTPS, dl, VT, N->getOperand(0),
-                       LoadScalarToVector, N->getOperand(2));
-  }
-  return SDValue();
-}
-
 static SDValue PerformBLENDICombine(SDNode *N, SelectionDAG &DAG) {
   SDValue V0 = N->getOperand(0);
   SDValue V1 = N->getOperand(1);
@@ -26685,11 +26639,6 @@ SDValue X86TargetLowering::PerformDAGCombine(SDNode *N,
   case X86ISD::VPERM2X128:
   case ISD::VECTOR_SHUFFLE: return PerformShuffleCombine(N, DAG, DCI,Subtarget);
   case ISD::FMA:            return PerformFMACombine(N, DAG, Subtarget);
-  case X86ISD::INSERTPS: {
-    if (getTargetMachine().getOptLevel() > CodeGenOpt::None)
-      return PerformINSERTPSCombine(N, DAG, Subtarget);
-    break;
-  }
   case X86ISD::BLENDI:      return PerformBLENDICombine(N, DAG);
   }
 

llvm/lib/Target/X86/X86InstrInfo.cpp

Lines changed: 71 additions & 7 deletions
@@ -4876,12 +4876,35 @@ bool X86InstrInfo::expandPostRAPseudo(MachineBasicBlock::iterator MI) const {
   return false;
 }
 
-static void addOperands(MachineInstrBuilder &MIB, ArrayRef<MachineOperand> MOs) {
+static void addOperands(MachineInstrBuilder &MIB, ArrayRef<MachineOperand> MOs,
+                        int PtrOffset = 0) {
   unsigned NumAddrOps = MOs.size();
-  for (unsigned i = 0; i != NumAddrOps; ++i)
-    MIB.addOperand(MOs[i]);
-  if (NumAddrOps < 4) // FrameIndex only
-    addOffset(MIB, 0);
+
+  if (NumAddrOps < 4) {
+    // FrameIndex only - add an immediate offset (whether its zero or not).
+    for (unsigned i = 0; i != NumAddrOps; ++i)
+      MIB.addOperand(MOs[i]);
+    addOffset(MIB, PtrOffset);
+  } else {
+    // General Memory Addressing - we need to add any offset to an existing
+    // offset.
+    assert(MOs.size() == 5 && "Unexpected memory operand list length");
+    for (unsigned i = 0; i != NumAddrOps; ++i) {
+      const MachineOperand &MO = MOs[i];
+      if (i == 3 && PtrOffset != 0) {
+        assert((MO.isImm() || MO.isGlobal()) &&
+               "Unexpected memory operand type");
+        if (MO.isImm()) {
+          MIB.addImm(MO.getImm() + PtrOffset);
+        } else {
+          MIB.addGlobalAddress(MO.getGlobal(), MO.getOffset() + PtrOffset,
+                               MO.getTargetFlags());
+        }
+      } else {
+        MIB.addOperand(MO);
+      }
+    }
+  }
 }
 
 static MachineInstr *FuseTwoAddrInst(MachineFunction &MF, unsigned Opcode,
@@ -4916,7 +4939,8 @@ static MachineInstr *FuseTwoAddrInst(MachineFunction &MF, unsigned Opcode,
 static MachineInstr *FuseInst(MachineFunction &MF, unsigned Opcode,
                               unsigned OpNo, ArrayRef<MachineOperand> MOs,
                               MachineBasicBlock::iterator InsertPt,
-                              MachineInstr *MI, const TargetInstrInfo &TII) {
+                              MachineInstr *MI, const TargetInstrInfo &TII,
+                              int PtrOffset = 0) {
   // Omit the implicit operands, something BuildMI can't do.
   MachineInstr *NewMI = MF.CreateMachineInstr(TII.get(Opcode),
                                               MI->getDebugLoc(), true);
@@ -4926,7 +4950,7 @@ static MachineInstr *FuseInst(MachineFunction &MF, unsigned Opcode,
     MachineOperand &MO = MI->getOperand(i);
     if (i == OpNo) {
       assert(MO.isReg() && "Expected to fold into reg operand!");
-      addOperands(MIB, MOs);
+      addOperands(MIB, MOs, PtrOffset);
     } else {
       MIB.addOperand(MO);
     }
@@ -4948,6 +4972,40 @@ static MachineInstr *MakeM0Inst(const TargetInstrInfo &TII, unsigned Opcode,
   return MIB.addImm(0);
 }
 
+MachineInstr *X86InstrInfo::foldMemoryOperandCustom(
+    MachineFunction &MF, MachineInstr *MI, unsigned OpNum,
+    ArrayRef<MachineOperand> MOs, MachineBasicBlock::iterator InsertPt,
+    unsigned Size, unsigned Align) const {
+  switch (MI->getOpcode()) {
+  case X86::INSERTPSrr:
+  case X86::VINSERTPSrr:
+    // Attempt to convert the load of inserted vector into a fold load
+    // of a single float.
+    if (OpNum == 2) {
+      unsigned Imm = MI->getOperand(MI->getNumOperands() - 1).getImm();
+      unsigned ZMask = Imm & 15;
+      unsigned DstIdx = (Imm >> 4) & 3;
+      unsigned SrcIdx = (Imm >> 6) & 3;
+
+      unsigned RCSize = getRegClass(MI->getDesc(), OpNum, &RI, MF)->getSize();
+      if (Size <= RCSize && 4 <= Align) {
+        int PtrOffset = SrcIdx * 4;
+        unsigned NewImm = (DstIdx << 4) | ZMask;
+        unsigned NewOpCode =
+            (MI->getOpcode() == X86::VINSERTPSrr ? X86::VINSERTPSrm
+                                                 : X86::INSERTPSrm);
+        MachineInstr *NewMI =
+            FuseInst(MF, NewOpCode, OpNum, MOs, InsertPt, MI, *this, PtrOffset);
+        NewMI->getOperand(NewMI->getNumOperands() - 1).setImm(NewImm);
+        return NewMI;
+      }
+    }
+    break;
+  };
+
+  return nullptr;
+}
+
 MachineInstr *X86InstrInfo::foldMemoryOperandImpl(
     MachineFunction &MF, MachineInstr *MI, unsigned OpNum,
     ArrayRef<MachineOperand> MOs, MachineBasicBlock::iterator InsertPt,
@@ -4977,6 +5035,12 @@ MachineInstr *X86InstrInfo::foldMemoryOperandImpl(
     return nullptr;
 
   MachineInstr *NewMI = nullptr;
+
+  // Attempt to fold any custom cases we have.
+  if (NewMI =
+          foldMemoryOperandCustom(MF, MI, OpNum, MOs, InsertPt, Size, Align))
+    return NewMI;
+
   // Folding a memory location into the two-address part of a two-address
   // instruction is different than folding it other places. It requires
   // replacing the *two* registers with the memory location.
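As background for the "i == 3" special case in the new addOperands: the X86 backend describes a memory reference with five machine operands, and slot 3 holds the displacement, which is why that is the operand PtrOffset is added to. A short illustrative sketch follows (the enumerator names here are hypothetical; the backend's real constants are the X86::Addr* values in X86BaseInfo.h):

// Illustrative only: the five machine operands of an x86 memory reference,
// in the order the backend lists them. addOperands' "i == 3" case targets
// the displacement slot, which may be an immediate or a global address.
enum X86AddrOperandSlot {
  BaseReg = 0,    // base register
  ScaleAmt = 1,   // scale factor (1, 2, 4 or 8)
  IndexReg = 2,   // index register
  Disp = 3,       // displacement - this slot absorbs PtrOffset
  SegmentReg = 4, // segment register
  NumOperands = 5 // hence the MOs.size() == 5 assertion above
};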

llvm/lib/Target/X86/X86InstrInfo.h

Lines changed: 8 additions & 0 deletions
@@ -512,6 +512,14 @@ class X86InstrInfo final : public X86GenInstrInfo {
                                       MachineBasicBlock::iterator &MBBI,
                                       LiveVariables *LV) const;
 
+  /// Handles memory folding for special case instructions, for instance those
+  /// requiring custom manipulation of the address.
+  MachineInstr *foldMemoryOperandCustom(MachineFunction &MF, MachineInstr *MI,
+                                        unsigned OpNum,
+                                        ArrayRef<MachineOperand> MOs,
+                                        MachineBasicBlock::iterator InsertPt,
+                                        unsigned Size, unsigned Align) const;
+
   /// isFrameOperand - Return true and the FrameIndex if the specified
   /// operand and follow operands form a reference to the stack frame.
   bool isFrameOperand(const MachineInstr *MI, unsigned int Op,

llvm/test/CodeGen/X86/avx.ll

Lines changed: 3 additions & 3 deletions
@@ -32,7 +32,7 @@ define <4 x float> @insertps_from_vector_load(<4 x float> %a, <4 x float>* nocap
 ; On X32, account for the argument's move to registers
 ; X32: movl 4(%esp), %eax
 ; CHECK-NOT: mov
-; CHECK: insertps $48
+; CHECK: vinsertps $48, (%{{...}}), {{.*#+}} xmm0 = xmm0[0,1,2],mem[0]
 ; CHECK-NEXT: ret
   %1 = load <4 x float>, <4 x float>* %pb, align 16
   %2 = tail call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %a, <4 x float> %1, i32 48)
@@ -46,7 +46,7 @@ define <4 x float> @insertps_from_vector_load_offset(<4 x float> %a, <4 x float>
 ; X32: movl 4(%esp), %eax
 ; CHECK-NOT: mov
 ;; Try to match a bit more of the instr, since we need the load's offset.
-; CHECK: insertps $96, 4(%{{...}}), %
+; CHECK: vinsertps $32, 4(%{{...}}), {{.*#+}} xmm0 = xmm0[0,1],mem[0],xmm0[3]
 ; CHECK-NEXT: ret
   %1 = load <4 x float>, <4 x float>* %pb, align 16
   %2 = tail call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %a, <4 x float> %1, i32 96)
@@ -60,7 +60,7 @@ define <4 x float> @insertps_from_vector_load_offset_2(<4 x float> %a, <4 x floa
 ; X32: movl 8(%esp), %ecx
 ; CHECK-NOT: mov
 ;; Try to match a bit more of the instr, since we need the load's offset.
-; CHECK: vinsertps $192, 12(%{{...}},%{{...}}), %
+; CHECK: vinsertps $0, 12(%{{...}},%{{...}}), {{.*#+}} xmm0 = mem[0],xmm0[1,2,3]
 ; CHECK-NEXT: ret
   %1 = getelementptr inbounds <4 x float>, <4 x float>* %pb, i64 %index
   %2 = load <4 x float>, <4 x float>* %1, align 16

llvm/test/CodeGen/X86/insertps-unfold-load-bug.ll

Lines changed: 33 additions & 0 deletions

@@ -0,0 +1,33 @@
+; RUN: llc -mtriple=i686-unknown-unknown -mattr=+sse4.1 < %s | FileCheck %s -check-prefix=X32
+; RUN: llc -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 < %s | FileCheck %s -check-prefix=X64
+
+; Test for case where insertps was folding the load of the insertion element, but a later optimization
+; was then manipulating the load.
+
+define <4 x float> @insertps_unfold(<4 x float>* %v0, <4 x float>* %v1) {
+; X32-LABEL: insertps_unfold:
+; X32:       # BB#0:
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X32-NEXT:    movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; X32-NEXT:    movaps (%eax), %xmm0
+; X32-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0,1,2],mem[0]
+; X32-NEXT:    addps %xmm1, %xmm0
+; X32-NEXT:    retl
+;
+; X64-LABEL: insertps_unfold:
+; X64:       # BB#0:
+; X64-NEXT:    movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; X64-NEXT:    movaps (%rdi), %xmm0
+; X64-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0,1,2],mem[0]
+; X64-NEXT:    addps %xmm1, %xmm0
+; X64-NEXT:    retq
+  %a = getelementptr inbounds <4 x float>, <4 x float>* %v1, i64 0, i64 1
+  %b = load float, float* %a, align 4
+  %c = insertelement <4 x float> undef, float %b, i32 0
+  %d = load <4 x float>, <4 x float>* %v1, align 16
+  %e = load <4 x float>, <4 x float>* %v0, align 16
+  %f = shufflevector <4 x float> %e, <4 x float> %d, <4 x i32> <i32 0, i32 1, i32 2, i32 5>
+  %g = fadd <4 x float> %c, %f
+  ret <4 x float> %g
+}
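The checks above pin down the shuffle-decode side of the fix: with the source index now cleared in the folded immediate, decoding the folded insertps (for example when unfolding its load) reads element 0 from the already-offset address, so both the X32 and X64 blocks expect xmm0 = xmm0[0,1,2],mem[0] rather than a stale lane index applied to the wrong base address.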

llvm/test/CodeGen/X86/sse41.ll

Lines changed: 8 additions & 8 deletions
@@ -794,12 +794,12 @@ define <4 x float> @insertps_from_vector_load(<4 x float> %a, <4 x float>* nocap
 ; X32-LABEL: insertps_from_vector_load:
 ; X32:       ## BB#0:
 ; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X32-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0,1,2],mem[0]
+; X32-NEXT:    insertps $48, (%{{...}}), {{.*#+}} xmm0 = xmm0[0,1,2],mem[0]
 ; X32-NEXT:    retl
 ;
 ; X64-LABEL: insertps_from_vector_load:
 ; X64:       ## BB#0:
-; X64-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0,1,2],mem[0]
+; X64-NEXT:    insertps $48, (%{{...}}), {{.*#+}} xmm0 = xmm0[0,1,2],mem[0]
 ; X64-NEXT:    retq
   %1 = load <4 x float>, <4 x float>* %pb, align 16
   %2 = tail call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %a, <4 x float> %1, i32 48)
@@ -812,12 +812,12 @@ define <4 x float> @insertps_from_vector_load_offset(<4 x float> %a, <4 x float>
 ; X32-LABEL: insertps_from_vector_load_offset:
 ; X32:       ## BB#0:
 ; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X32-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0,1],mem[1],xmm0[3]
+; X32-NEXT:    insertps $32, 4(%{{...}}), {{.*#+}} xmm0 = xmm0[0,1],mem[0],xmm0[3]
 ; X32-NEXT:    retl
 ;
 ; X64-LABEL: insertps_from_vector_load_offset:
 ; X64:       ## BB#0:
-; X64-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0,1],mem[1],xmm0[3]
+; X64-NEXT:    insertps $32, 4(%{{...}}), {{.*#+}} xmm0 = xmm0[0,1],mem[0],xmm0[3]
 ; X64-NEXT:    retq
   %1 = load <4 x float>, <4 x float>* %pb, align 16
   %2 = tail call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %a, <4 x float> %1, i32 96)
@@ -831,13 +831,13 @@ define <4 x float> @insertps_from_vector_load_offset_2(<4 x float> %a, <4 x floa
 ; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X32-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X32-NEXT:    shll $4, %ecx
-; X32-NEXT:    insertps {{.*#+}} xmm0 = mem[3],xmm0[1,2,3]
+; X32-NEXT:    insertps $0, 12(%{{...}},%{{...}}), {{.*#+}} xmm0 = mem[0],xmm0[1,2,3]
 ; X32-NEXT:    retl
 ;
 ; X64-LABEL: insertps_from_vector_load_offset_2:
 ; X64:       ## BB#0:
 ; X64-NEXT:    shlq $4, %rsi
-; X64-NEXT:    insertps {{.*#+}} xmm0 = mem[3],xmm0[1,2,3]
+; X64-NEXT:    insertps $0, 12(%{{...}},%{{...}}), {{.*#+}} xmm0 = mem[0],xmm0[1,2,3]
 ; X64-NEXT:    retq
   %1 = getelementptr inbounds <4 x float>, <4 x float>* %pb, i64 %index
   %2 = load <4 x float>, <4 x float>* %1, align 16
@@ -968,12 +968,12 @@ define <4 x float> @pr20087(<4 x float> %a, <4 x float> *%ptr) {
 ; X32-LABEL: pr20087:
 ; X32:       ## BB#0:
 ; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X32-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0],zero,xmm0[2],mem[2]
+; X32-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0],zero,xmm0[2],mem[0]
 ; X32-NEXT:    retl
 ;
 ; X64-LABEL: pr20087:
 ; X64:       ## BB#0:
-; X64-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0],zero,xmm0[2],mem[2]
+; X64-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0],zero,xmm0[2],mem[0]
 ; X64-NEXT:    retq
   %load = load <4 x float> , <4 x float> *%ptr
   %ret = shufflevector <4 x float> %load, <4 x float> %a, <4 x i32> <i32 4, i32 undef, i32 6, i32 2>

llvm/test/CodeGen/X86/stack-folding-fp-avx1.ll

Lines changed: 9 additions & 1 deletion
@@ -946,7 +946,15 @@ define <8 x float> @stack_fold_insertf128(<4 x float> %a0, <4 x float> %a1) {
   ret <8 x float> %2
 }
 
-; TODO stack_fold_insertps
+define <4 x float> @stack_fold_insertps(<4 x float> %a0, <4 x float> %a1) {
+  ;CHECK-LABEL: stack_fold_insertps
+  ;CHECK: vinsertps $17, {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+  ;CHECK-NEXT: {{.*#+}} xmm0 = zero,mem[0],xmm0[2,3]
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %a0, <4 x float> %a1, i8 209)
+  ret <4 x float> %2
+}
+declare <4 x float> @llvm.x86.sse41.insertps(<4 x float>, <4 x float>, i8) nounwind readnone
 
 define <2 x double> @stack_fold_maxpd(<2 x double> %a0, <2 x double> %a1) {
 ;CHECK-LABEL: stack_fold_maxpd

llvm/test/CodeGen/X86/stack-folding-fp-sse42.ll

Lines changed: 9 additions & 1 deletion
@@ -637,7 +637,15 @@ define <4 x float> @stack_fold_hsubps(<4 x float> %a0, <4 x float> %a1) {
 }
 declare <4 x float> @llvm.x86.sse3.hsub.ps(<4 x float>, <4 x float>) nounwind readnone
 
-; TODO stack_fold_insertps
+define <4 x float> @stack_fold_insertps(<4 x float> %a0, <4 x float> %a1) {
+  ;CHECK-LABEL: stack_fold_insertps
+  ;CHECK: insertps $17, {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+  ;CHECK-NEXT: {{.*#+}} xmm0 = zero,mem[0],xmm0[2,3]
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %a0, <4 x float> %a1, i8 209)
+  ret <4 x float> %2
+}
+declare <4 x float> @llvm.x86.sse41.insertps(<4 x float>, <4 x float>, i8) nounwind readnone
 
 define <2 x double> @stack_fold_maxpd(<2 x double> %a0, <2 x double> %a1) {
 ;CHECK-LABEL: stack_fold_maxpd
