Commit 3cf8535

[X86][BF16] Improve vectorization of BF16 (#88486)

1. Move expansion to combineFP_EXTEND to help with small vectors.
2. Combine FP_ROUND to remove the fptrunc-then-fpext pairs left after promotion.
1 parent 37ebf2a commit 3cf8535

File tree (3 files changed: +66 −229 lines)

llvm/lib/Target/X86/X86ISelLowering.cpp
llvm/test/CodeGen/X86/bfloat.ll
llvm/test/CodeGen/X86/concat-fpext-v2bf16.ll

llvm/lib/Target/X86/X86ISelLowering.cpp

Lines changed: 29 additions & 24 deletions
@@ -21433,25 +21433,9 @@ SDValue X86TargetLowering::LowerFP_EXTEND(SDValue Op, SelectionDAG &DAG) const {
     return Res;
   }
 
-  if (!SVT.isVector())
+  if (!SVT.isVector() || SVT.getVectorElementType() == MVT::bf16)
     return Op;
 
-  if (SVT.getVectorElementType() == MVT::bf16) {
-    // FIXME: Do we need to support strict FP?
-    assert(!IsStrict && "Strict FP doesn't support BF16");
-    if (VT.getVectorElementType() == MVT::f64) {
-      MVT TmpVT = VT.changeVectorElementType(MVT::f32);
-      return DAG.getNode(ISD::FP_EXTEND, DL, VT,
-                         DAG.getNode(ISD::FP_EXTEND, DL, TmpVT, In));
-    }
-    assert(VT.getVectorElementType() == MVT::f32 && "Unexpected fpext");
-    MVT NVT = SVT.changeVectorElementType(MVT::i32);
-    In = DAG.getBitcast(SVT.changeTypeToInteger(), In);
-    In = DAG.getNode(ISD::ZERO_EXTEND, DL, NVT, In);
-    In = DAG.getNode(ISD::SHL, DL, NVT, In, DAG.getConstant(16, DL, NVT));
-    return DAG.getBitcast(VT, In);
-  }
-
   if (SVT.getVectorElementType() == MVT::f16) {
     if (Subtarget.hasFP16() && isTypeLegal(SVT))
       return Op;
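
Editor's note: the block removed here is not deleted for good; it reappears in combineFP_EXTEND below. The trick it implements rests on bf16 sharing the upper 16 bits of an IEEE-754 f32, so widening is pure bit movement. A minimal IR sketch of that equivalence (hypothetical function name, not part of the patch):

; Widen each bf16 element to 32 bits, then shift the payload into the
; high half; the resulting bit pattern is exactly the extended f32 value.
define <4 x float> @bf16_to_f32_bits(<4 x bfloat> %a) {
  %bits = bitcast <4 x bfloat> %a to <4 x i16>
  %wide = zext <4 x i16> %bits to <4 x i32>
  %shl  = shl <4 x i32> %wide, <i32 16, i32 16, i32 16, i32 16>
  %f    = bitcast <4 x i32> %shl to <4 x float>
  ret <4 x float> %f
}

This mirrors the ISD::ZERO_EXTEND, ISD::SHL-by-16, and bitcast nodes built in the C++ above. Per the commit message, doing it in the combine rather than in LowerFP_EXTEND helps small vectors, presumably because the combine can fire before illegal vector types are split or scalarized.
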
@@ -56517,17 +56501,40 @@ static SDValue combineFP16_TO_FP(SDNode *N, SelectionDAG &DAG,
 
 static SDValue combineFP_EXTEND(SDNode *N, SelectionDAG &DAG,
                                 const X86Subtarget &Subtarget) {
+  EVT VT = N->getValueType(0);
+  bool IsStrict = N->isStrictFPOpcode();
+  SDValue Src = N->getOperand(IsStrict ? 1 : 0);
+  EVT SrcVT = Src.getValueType();
+
+  SDLoc dl(N);
+  if (SrcVT.getScalarType() == MVT::bf16) {
+    if (!IsStrict && Src.getOpcode() == ISD::FP_ROUND &&
+        Src.getOperand(0).getValueType() == VT)
+      return Src.getOperand(0);
+
+    if (!SrcVT.isVector())
+      return SDValue();
+
+    assert(!IsStrict && "Strict FP doesn't support BF16");
+    if (VT.getVectorElementType() == MVT::f64) {
+      MVT TmpVT = VT.getSimpleVT().changeVectorElementType(MVT::f32);
+      return DAG.getNode(ISD::FP_EXTEND, dl, VT,
+                         DAG.getNode(ISD::FP_EXTEND, dl, TmpVT, Src));
+    }
+    assert(VT.getVectorElementType() == MVT::f32 && "Unexpected fpext");
+    MVT NVT = SrcVT.getSimpleVT().changeVectorElementType(MVT::i32);
+    Src = DAG.getBitcast(SrcVT.changeTypeToInteger(), Src);
+    Src = DAG.getNode(ISD::ZERO_EXTEND, dl, NVT, Src);
+    Src = DAG.getNode(ISD::SHL, dl, NVT, Src, DAG.getConstant(16, dl, NVT));
+    return DAG.getBitcast(VT, Src);
+  }
+
   if (!Subtarget.hasF16C() || Subtarget.useSoftFloat())
     return SDValue();
 
   if (Subtarget.hasFP16())
     return SDValue();
 
-  bool IsStrict = N->isStrictFPOpcode();
-  EVT VT = N->getValueType(0);
-  SDValue Src = N->getOperand(IsStrict ? 1 : 0);
-  EVT SrcVT = Src.getValueType();
-
   if (!SrcVT.isVector() || SrcVT.getVectorElementType() != MVT::f16)
     return SDValue();
 
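Editor's note: the new bf16 block also implements the second item of the commit message. After bf16 arithmetic is promoted to f32, the DAG contains an FP_ROUND to bf16 immediately followed by an FP_EXTEND back to the wider type, and the combine now returns the pre-rounded operand instead. A hedged IR sketch of a pattern that produces such a pair during legalization (hypothetical function, for illustration only):

; On targets without native bf16 arithmetic, the fadd is promoted to f32,
; leaving an f32->bf16 round feeding a bf16->f32 extend that now folds away.
define <8 x float> @promote_then_extend(<8 x bfloat> %a, <8 x bfloat> %b) {
  %s = fadd <8 x bfloat> %a, %b
  %e = fpext <8 x bfloat> %s to <8 x float>
  ret <8 x float> %e
}
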
@@ -56539,8 +56546,6 @@ static SDValue combineFP_EXTEND(SDNode *N, SelectionDAG &DAG,
   if (NumElts == 1 || !isPowerOf2_32(NumElts))
     return SDValue();
 
-  SDLoc dl(N);
-
   // Convert the input to vXi16.
   EVT IntVT = SrcVT.changeVectorElementTypeToInteger();
   Src = DAG.getBitcast(IntVT, Src);

llvm/test/CodeGen/X86/bfloat.ll

Lines changed: 33 additions & 201 deletions
@@ -1629,22 +1629,8 @@ define <4 x float> @pr64460_1(<4 x bfloat> %a) {
 ;
 ; SSE2-LABEL: pr64460_1:
 ; SSE2:       # %bb.0:
-; SSE2-NEXT:    pextrw $1, %xmm0, %eax
-; SSE2-NEXT:    shll $16, %eax
-; SSE2-NEXT:    movd %eax, %xmm2
-; SSE2-NEXT:    movd %xmm0, %eax
-; SSE2-NEXT:    shll $16, %eax
-; SSE2-NEXT:    movd %eax, %xmm1
-; SSE2-NEXT:    pextrw $3, %xmm0, %eax
-; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,1,1,1]
-; SSE2-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
-; SSE2-NEXT:    shll $16, %eax
-; SSE2-NEXT:    movd %eax, %xmm2
-; SSE2-NEXT:    movd %xmm0, %eax
-; SSE2-NEXT:    shll $16, %eax
-; SSE2-NEXT:    movd %eax, %xmm0
-; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
-; SSE2-NEXT:    punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0]
+; SSE2-NEXT:    pxor %xmm1, %xmm1
+; SSE2-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
 ; SSE2-NEXT:    movdqa %xmm1, %xmm0
 ; SSE2-NEXT:    retq
 ;
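
Editor's note: the pr64460_* test bodies are elided from these hunks, but judging by the signatures and the visible body of pr64460_4 at the end of this file's diff, each is presumably a bare fpext, e.g.:

; Presumed body of pr64460_1 (inferred, not shown in the hunk); the two
; instructions in the new SSE2 output above are the entire conversion.
define <4 x float> @pr64460_1(<4 x bfloat> %a) {
  %b = fpext <4 x bfloat> %a to <4 x float>
  ret <4 x float> %b
}
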
@@ -1666,41 +1652,11 @@ define <8 x float> @pr64460_2(<8 x bfloat> %a) {
 ;
 ; SSE2-LABEL: pr64460_2:
 ; SSE2:       # %bb.0:
-; SSE2-NEXT:    movq %xmm0, %rdx
-; SSE2-NEXT:    punpckhqdq {{.*#+}} xmm0 = xmm0[1,1]
-; SSE2-NEXT:    movq %xmm0, %rcx
-; SSE2-NEXT:    movq %rcx, %rax
-; SSE2-NEXT:    shrq $32, %rax
-; SSE2-NEXT:    movq %rdx, %rsi
-; SSE2-NEXT:    shrq $32, %rsi
-; SSE2-NEXT:    movl %edx, %edi
-; SSE2-NEXT:    andl $-65536, %edi # imm = 0xFFFF0000
-; SSE2-NEXT:    movd %edi, %xmm1
-; SSE2-NEXT:    movl %edx, %edi
-; SSE2-NEXT:    shll $16, %edi
-; SSE2-NEXT:    movd %edi, %xmm0
-; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; SSE2-NEXT:    shrq $48, %rdx
-; SSE2-NEXT:    shll $16, %edx
-; SSE2-NEXT:    movd %edx, %xmm1
-; SSE2-NEXT:    shll $16, %esi
-; SSE2-NEXT:    movd %esi, %xmm2
-; SSE2-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
-; SSE2-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
-; SSE2-NEXT:    movl %ecx, %edx
-; SSE2-NEXT:    andl $-65536, %edx # imm = 0xFFFF0000
-; SSE2-NEXT:    movd %edx, %xmm2
-; SSE2-NEXT:    movl %ecx, %edx
-; SSE2-NEXT:    shll $16, %edx
-; SSE2-NEXT:    movd %edx, %xmm1
-; SSE2-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
-; SSE2-NEXT:    shrq $48, %rcx
-; SSE2-NEXT:    shll $16, %ecx
-; SSE2-NEXT:    movd %ecx, %xmm2
-; SSE2-NEXT:    shll $16, %eax
-; SSE2-NEXT:    movd %eax, %xmm3
-; SSE2-NEXT:    punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
-; SSE2-NEXT:    punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm3[0]
+; SSE2-NEXT:    pxor %xmm1, %xmm1
+; SSE2-NEXT:    pxor %xmm2, %xmm2
+; SSE2-NEXT:    punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3]
+; SSE2-NEXT:    punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
+; SSE2-NEXT:    movdqa %xmm2, %xmm0
 ; SSE2-NEXT:    retq
 ;
 ; AVX-LABEL: pr64460_2:
@@ -1721,76 +1677,16 @@ define <16 x float> @pr64460_3(<16 x bfloat> %a) {
 ;
 ; SSE2-LABEL: pr64460_3:
 ; SSE2:       # %bb.0:
-; SSE2-NEXT:    movq %xmm1, %rdi
-; SSE2-NEXT:    punpckhqdq {{.*#+}} xmm1 = xmm1[1,1]
-; SSE2-NEXT:    movq %xmm1, %rcx
-; SSE2-NEXT:    movq %rcx, %rax
-; SSE2-NEXT:    shrq $32, %rax
-; SSE2-NEXT:    movq %xmm0, %r9
-; SSE2-NEXT:    punpckhqdq {{.*#+}} xmm0 = xmm0[1,1]
-; SSE2-NEXT:    movq %xmm0, %rsi
-; SSE2-NEXT:    movq %rsi, %rdx
-; SSE2-NEXT:    shrq $32, %rdx
-; SSE2-NEXT:    movq %rdi, %r8
-; SSE2-NEXT:    shrq $32, %r8
-; SSE2-NEXT:    movq %r9, %r10
-; SSE2-NEXT:    shrq $32, %r10
-; SSE2-NEXT:    movl %r9d, %r11d
-; SSE2-NEXT:    andl $-65536, %r11d # imm = 0xFFFF0000
-; SSE2-NEXT:    movd %r11d, %xmm1
-; SSE2-NEXT:    movl %r9d, %r11d
-; SSE2-NEXT:    shll $16, %r11d
-; SSE2-NEXT:    movd %r11d, %xmm0
-; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; SSE2-NEXT:    shrq $48, %r9
-; SSE2-NEXT:    shll $16, %r9d
-; SSE2-NEXT:    movd %r9d, %xmm1
-; SSE2-NEXT:    shll $16, %r10d
-; SSE2-NEXT:    movd %r10d, %xmm2
-; SSE2-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
-; SSE2-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
-; SSE2-NEXT:    movl %edi, %r9d
-; SSE2-NEXT:    andl $-65536, %r9d # imm = 0xFFFF0000
-; SSE2-NEXT:    movd %r9d, %xmm1
-; SSE2-NEXT:    movl %edi, %r9d
-; SSE2-NEXT:    shll $16, %r9d
-; SSE2-NEXT:    movd %r9d, %xmm2
-; SSE2-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
-; SSE2-NEXT:    shrq $48, %rdi
-; SSE2-NEXT:    shll $16, %edi
-; SSE2-NEXT:    movd %edi, %xmm1
-; SSE2-NEXT:    shll $16, %r8d
-; SSE2-NEXT:    movd %r8d, %xmm3
-; SSE2-NEXT:    punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1]
-; SSE2-NEXT:    punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0]
-; SSE2-NEXT:    movl %esi, %edi
-; SSE2-NEXT:    andl $-65536, %edi # imm = 0xFFFF0000
-; SSE2-NEXT:    movd %edi, %xmm3
-; SSE2-NEXT:    movl %esi, %edi
-; SSE2-NEXT:    shll $16, %edi
-; SSE2-NEXT:    movd %edi, %xmm1
-; SSE2-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1]
-; SSE2-NEXT:    shrq $48, %rsi
-; SSE2-NEXT:    shll $16, %esi
-; SSE2-NEXT:    movd %esi, %xmm3
-; SSE2-NEXT:    shll $16, %edx
-; SSE2-NEXT:    movd %edx, %xmm4
-; SSE2-NEXT:    punpckldq {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1]
-; SSE2-NEXT:    punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm4[0]
-; SSE2-NEXT:    movl %ecx, %edx
-; SSE2-NEXT:    andl $-65536, %edx # imm = 0xFFFF0000
-; SSE2-NEXT:    movd %edx, %xmm4
-; SSE2-NEXT:    movl %ecx, %edx
-; SSE2-NEXT:    shll $16, %edx
-; SSE2-NEXT:    movd %edx, %xmm3
-; SSE2-NEXT:    punpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1]
-; SSE2-NEXT:    shrq $48, %rcx
-; SSE2-NEXT:    shll $16, %ecx
-; SSE2-NEXT:    movd %ecx, %xmm4
-; SSE2-NEXT:    shll $16, %eax
-; SSE2-NEXT:    movd %eax, %xmm5
-; SSE2-NEXT:    punpckldq {{.*#+}} xmm5 = xmm5[0],xmm4[0],xmm5[1],xmm4[1]
-; SSE2-NEXT:    punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm5[0]
+; SSE2-NEXT:    pxor %xmm3, %xmm3
+; SSE2-NEXT:    pxor %xmm5, %xmm5
+; SSE2-NEXT:    punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm0[0],xmm5[1],xmm0[1],xmm5[2],xmm0[2],xmm5[3],xmm0[3]
+; SSE2-NEXT:    pxor %xmm4, %xmm4
+; SSE2-NEXT:    punpckhwd {{.*#+}} xmm4 = xmm4[4],xmm0[4],xmm4[5],xmm0[5],xmm4[6],xmm0[6],xmm4[7],xmm0[7]
+; SSE2-NEXT:    pxor %xmm2, %xmm2
+; SSE2-NEXT:    punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
+; SSE2-NEXT:    punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm1[4],xmm3[5],xmm1[5],xmm3[6],xmm1[6],xmm3[7],xmm1[7]
+; SSE2-NEXT:    movdqa %xmm5, %xmm0
+; SSE2-NEXT:    movdqa %xmm4, %xmm1
 ; SSE2-NEXT:    retq
 ;
 ; F16-LABEL: pr64460_3:
@@ -1822,47 +1718,17 @@ define <8 x double> @pr64460_4(<8 x bfloat> %a) {
 ;
 ; SSE2-LABEL: pr64460_4:
 ; SSE2:       # %bb.0:
-; SSE2-NEXT:    movq %xmm0, %rsi
-; SSE2-NEXT:    punpckhqdq {{.*#+}} xmm0 = xmm0[1,1]
-; SSE2-NEXT:    movq %xmm0, %rdx
-; SSE2-NEXT:    movq %rdx, %rax
-; SSE2-NEXT:    shrq $32, %rax
-; SSE2-NEXT:    movq %rdx, %rcx
-; SSE2-NEXT:    shrq $48, %rcx
-; SSE2-NEXT:    movq %rsi, %rdi
-; SSE2-NEXT:    shrq $32, %rdi
-; SSE2-NEXT:    movq %rsi, %r8
-; SSE2-NEXT:    shrq $48, %r8
-; SSE2-NEXT:    movl %esi, %r9d
-; SSE2-NEXT:    andl $-65536, %r9d # imm = 0xFFFF0000
-; SSE2-NEXT:    movd %r9d, %xmm0
-; SSE2-NEXT:    cvtss2sd %xmm0, %xmm1
-; SSE2-NEXT:    shll $16, %esi
-; SSE2-NEXT:    movd %esi, %xmm0
-; SSE2-NEXT:    cvtss2sd %xmm0, %xmm0
-; SSE2-NEXT:    movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
-; SSE2-NEXT:    shll $16, %r8d
-; SSE2-NEXT:    movd %r8d, %xmm1
-; SSE2-NEXT:    cvtss2sd %xmm1, %xmm2
-; SSE2-NEXT:    shll $16, %edi
-; SSE2-NEXT:    movd %edi, %xmm1
-; SSE2-NEXT:    cvtss2sd %xmm1, %xmm1
-; SSE2-NEXT:    movlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0]
-; SSE2-NEXT:    movl %edx, %esi
-; SSE2-NEXT:    andl $-65536, %esi # imm = 0xFFFF0000
-; SSE2-NEXT:    movd %esi, %xmm2
-; SSE2-NEXT:    cvtss2sd %xmm2, %xmm3
-; SSE2-NEXT:    shll $16, %edx
-; SSE2-NEXT:    movd %edx, %xmm2
-; SSE2-NEXT:    cvtss2sd %xmm2, %xmm2
-; SSE2-NEXT:    movlhps {{.*#+}} xmm2 = xmm2[0],xmm3[0]
-; SSE2-NEXT:    shll $16, %ecx
-; SSE2-NEXT:    movd %ecx, %xmm3
-; SSE2-NEXT:    cvtss2sd %xmm3, %xmm4
-; SSE2-NEXT:    shll $16, %eax
-; SSE2-NEXT:    movd %eax, %xmm3
-; SSE2-NEXT:    cvtss2sd %xmm3, %xmm3
-; SSE2-NEXT:    movlhps {{.*#+}} xmm3 = xmm3[0],xmm4[0]
+; SSE2-NEXT:    pxor %xmm3, %xmm3
+; SSE2-NEXT:    pxor %xmm1, %xmm1
+; SSE2-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
+; SSE2-NEXT:    cvtps2pd %xmm1, %xmm4
+; SSE2-NEXT:    punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7]
+; SSE2-NEXT:    cvtps2pd %xmm3, %xmm2
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
+; SSE2-NEXT:    cvtps2pd %xmm0, %xmm1
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm3[2,3,2,3]
+; SSE2-NEXT:    cvtps2pd %xmm0, %xmm3
+; SSE2-NEXT:    movaps %xmm4, %xmm0
 ; SSE2-NEXT:    retq
 ;
 ; F16-LABEL: pr64460_4:
@@ -1874,45 +1740,11 @@ define <8 x double> @pr64460_4(<8 x bfloat> %a) {
 ;
 ; AVXNC-LABEL: pr64460_4:
 ; AVXNC:       # %bb.0:
-; AVXNC-NEXT:    vpextrw $3, %xmm0, %eax
-; AVXNC-NEXT:    shll $16, %eax
-; AVXNC-NEXT:    vmovd %eax, %xmm1
-; AVXNC-NEXT:    vcvtss2sd %xmm1, %xmm1, %xmm1
-; AVXNC-NEXT:    vpextrw $2, %xmm0, %eax
-; AVXNC-NEXT:    shll $16, %eax
-; AVXNC-NEXT:    vmovd %eax, %xmm2
-; AVXNC-NEXT:    vcvtss2sd %xmm2, %xmm2, %xmm2
-; AVXNC-NEXT:    vmovlhps {{.*#+}} xmm1 = xmm2[0],xmm1[0]
-; AVXNC-NEXT:    vpextrw $1, %xmm0, %eax
-; AVXNC-NEXT:    shll $16, %eax
-; AVXNC-NEXT:    vmovd %eax, %xmm2
-; AVXNC-NEXT:    vcvtss2sd %xmm2, %xmm2, %xmm2
-; AVXNC-NEXT:    vmovd %xmm0, %eax
-; AVXNC-NEXT:    shll $16, %eax
-; AVXNC-NEXT:    vmovd %eax, %xmm3
-; AVXNC-NEXT:    vcvtss2sd %xmm3, %xmm3, %xmm3
-; AVXNC-NEXT:    vmovlhps {{.*#+}} xmm2 = xmm3[0],xmm2[0]
-; AVXNC-NEXT:    vinsertf128 $1, %xmm1, %ymm2, %ymm2
-; AVXNC-NEXT:    vpextrw $7, %xmm0, %eax
-; AVXNC-NEXT:    shll $16, %eax
-; AVXNC-NEXT:    vmovd %eax, %xmm1
-; AVXNC-NEXT:    vcvtss2sd %xmm1, %xmm1, %xmm1
-; AVXNC-NEXT:    vpextrw $6, %xmm0, %eax
-; AVXNC-NEXT:    shll $16, %eax
-; AVXNC-NEXT:    vmovd %eax, %xmm3
-; AVXNC-NEXT:    vcvtss2sd %xmm3, %xmm3, %xmm3
-; AVXNC-NEXT:    vmovlhps {{.*#+}} xmm1 = xmm3[0],xmm1[0]
-; AVXNC-NEXT:    vpextrw $5, %xmm0, %eax
-; AVXNC-NEXT:    shll $16, %eax
-; AVXNC-NEXT:    vmovd %eax, %xmm3
-; AVXNC-NEXT:    vcvtss2sd %xmm3, %xmm3, %xmm3
-; AVXNC-NEXT:    vpextrw $4, %xmm0, %eax
-; AVXNC-NEXT:    shll $16, %eax
-; AVXNC-NEXT:    vmovd %eax, %xmm0
-; AVXNC-NEXT:    vcvtss2sd %xmm0, %xmm0, %xmm0
-; AVXNC-NEXT:    vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm3[0]
-; AVXNC-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm1
-; AVXNC-NEXT:    vmovaps %ymm2, %ymm0
+; AVXNC-NEXT:    vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
+; AVXNC-NEXT:    vpslld $16, %ymm0, %ymm1
+; AVXNC-NEXT:    vcvtps2pd %xmm1, %ymm0
+; AVXNC-NEXT:    vextracti128 $1, %ymm1, %xmm1
+; AVXNC-NEXT:    vcvtps2pd %xmm1, %ymm1
 ; AVXNC-NEXT:    retq
   %b = fpext <8 x bfloat> %a to <8 x double>
   ret <8 x double> %b

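Editor's note: for bf16 to f64 the lowering takes two steps, matching the f64 branch in the C++ hunk above: the integer-shift widening to f32 first, then a hardware f32-to-f64 convert (the punpcklwd/cvtps2pd and vpslld/vcvtps2pd pairs in the output above). A sketch of the equivalent IR staging (hypothetical function name):

; Stage 1 lowers to the bit-shift widening trick; stage 2 selects cvtps2pd.
define <8 x double> @bf16_to_f64_two_step(<8 x bfloat> %a) {
  %f32 = fpext <8 x bfloat> %a to <8 x float>
  %f64 = fpext <8 x float> %f32 to <8 x double>
  ret <8 x double> %f64
}
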
llvm/test/CodeGen/X86/concat-fpext-v2bf16.ll

Lines changed: 4 additions & 4 deletions
@@ -10,11 +10,11 @@ define void @test(<2 x ptr> %ptr) {
 ; CHECK-NEXT:  # %bb.2: # %loop.127.preheader
 ; CHECK-NEXT:    retq
 ; CHECK-NEXT:  .LBB0_1: # %ifmerge.89
-; CHECK-NEXT:    movzwl (%rax), %eax
-; CHECK-NEXT:    shll $16, %eax
-; CHECK-NEXT:    vmovd %eax, %xmm0
-; CHECK-NEXT:    vmulss %xmm0, %xmm0, %xmm0
 ; CHECK-NEXT:    vbroadcastss %xmm0, %xmm0
+; CHECK-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; CHECK-NEXT:    vpbroadcastw (%rax), %xmm2
+; CHECK-NEXT:    vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
+; CHECK-NEXT:    vmulps %xmm1, %xmm0, %xmm0
 ; CHECK-NEXT:    vmovlps %xmm0, (%rax)
 entry:
   br label %then.13
