Commit 84a1f07
[X86][AMDGPU][DAGCombiner] Move call to allowsMemoryAccess into isLoadBitCastBeneficial/isStoreBitCastBeneficial to allow X86 to bypass it
Basically the problem is that X86 doesn't set the Fast flag from allowsMemoryAccess on certain CPUs due to slow unaligned memory subtarget features. This prevents bitcasts from being folded into loads and stores. But all vector loads and stores of the same width are the same cost on X86.

This patch merges the allowsMemoryAccess call into isLoadBitCastBeneficial to allow X86 to skip it.

Differential Revision: https://reviews.llvm.org/D64295

llvm-svn: 365549
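In outline, the change moves the alignment/speed check into the hook itself, so a target override can short-circuit it before the Fast flag is ever consulted. A condensed sketch of the pattern, abridged from the TargetLowering.h and X86ISelLowering.cpp hunks below (not a complete listing; the surrounding type checks are elided):

// Default hook (TargetLowering.h): the allowsMemoryAccess/Fast check now
// lives inside isLoadBitCastBeneficial, after the existing type-based
// profitability checks.
virtual bool isLoadBitCastBeneficial(EVT LoadVT, EVT BitcastVT,
                                     const SelectionDAG &DAG,
                                     const MachineMemOperand &MMO) const {
  // ... indexed-load and type-promotion checks elided ...
  bool Fast = false;
  return allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(), BitcastVT,
                            MMO, &Fast) && Fast;
}

// X86 override (X86ISelLowering.cpp): legal same-width vector types return
// true early and bypass the Fast check entirely, since the bitcast is free.
if (LoadVT.isVector() && BitcastVT.isVector() &&
    isTypeLegal(LoadVT) && isTypeLegal(BitcastVT))
  return true;
return TargetLowering::isLoadBitCastBeneficial(LoadVT, BitcastVT, DAG, MMO);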
Parent: c236eea

8 files changed: +59, -51 lines

llvm/include/llvm/CodeGen/TargetLowering.h (+10, -5)

@@ -401,8 +401,9 @@ class TargetLoweringBase {
   /// efficiently, casting the load to a smaller vector of larger types and
   /// loading is more efficient, however, this can be undone by optimizations in
   /// dag combiner.
-  virtual bool isLoadBitCastBeneficial(EVT LoadVT,
-                                       EVT BitcastVT) const {
+  virtual bool isLoadBitCastBeneficial(EVT LoadVT, EVT BitcastVT,
+                                       const SelectionDAG &DAG,
+                                       const MachineMemOperand &MMO) const {
     // Don't do if we could do an indexed load on the original type, but not on
     // the new one.
     if (!LoadVT.isSimple() || !BitcastVT.isSimple())
@@ -416,14 +417,18 @@
         getTypeToPromoteTo(ISD::LOAD, LoadMVT) == BitcastVT.getSimpleVT())
       return false;
 
-    return true;
+    bool Fast = false;
+    return allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(), BitcastVT,
+                              MMO, &Fast) && Fast;
   }
 
   /// Return true if the following transform is beneficial:
   /// (store (y (conv x)), y*)) -> (store x, (x*))
-  virtual bool isStoreBitCastBeneficial(EVT StoreVT, EVT BitcastVT) const {
+  virtual bool isStoreBitCastBeneficial(EVT StoreVT, EVT BitcastVT,
+                                        const SelectionDAG &DAG,
+                                        const MachineMemOperand &MMO) const {
     // Default to the same logic as loads.
-    return isLoadBitCastBeneficial(StoreVT, BitcastVT);
+    return isLoadBitCastBeneficial(StoreVT, BitcastVT, DAG, MMO);
   }
 
   /// Return true if it is expected to be cheaper to do a store of a non-zero

llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp (+8, -15)

@@ -11040,14 +11040,11 @@ SDValue DAGCombiner::visitBITCAST(SDNode *N) {
       // as we assume software couldn't rely on the number of accesses of an
       // illegal type.
       ((!LegalOperations && !cast<LoadSDNode>(N0)->isVolatile()) ||
-       TLI.isOperationLegal(ISD::LOAD, VT)) &&
-      TLI.isLoadBitCastBeneficial(N0.getValueType(), VT)) {
+       TLI.isOperationLegal(ISD::LOAD, VT))) {
     LoadSDNode *LN0 = cast<LoadSDNode>(N0);
 
-    bool Fast = false;
-    if (TLI.allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(), VT,
-                               *LN0->getMemOperand(), &Fast) &&
-        Fast) {
+    if (TLI.isLoadBitCastBeneficial(N0.getValueType(), VT, DAG,
+                                    *LN0->getMemOperand())) {
       SDValue Load =
           DAG.getLoad(VT, SDLoc(N), LN0->getChain(), LN0->getBasePtr(),
                       LN0->getPointerInfo(), LN0->getAlignment(),
@@ -16174,15 +16171,11 @@ SDValue DAGCombiner::visitSTORE(SDNode *N) {
     // illegal type.
     if (((!LegalOperations && !ST->isVolatile()) ||
          TLI.isOperationLegal(ISD::STORE, SVT)) &&
-        TLI.isStoreBitCastBeneficial(Value.getValueType(), SVT)) {
-      bool Fast = false;
-      if (TLI.allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(), SVT,
-                                 *ST->getMemOperand(), &Fast) &&
-          Fast) {
-        return DAG.getStore(Chain, SDLoc(N), Value.getOperand(0), Ptr,
-                            ST->getPointerInfo(), ST->getAlignment(),
-                            ST->getMemOperand()->getFlags(), ST->getAAInfo());
-      }
+        TLI.isStoreBitCastBeneficial(Value.getValueType(), SVT,
+                                     DAG, *ST->getMemOperand())) {
+      return DAG.getStore(Chain, SDLoc(N), Value.getOperand(0), Ptr,
+                          ST->getPointerInfo(), ST->getAlignment(),
+                          ST->getMemOperand()->getFlags(), ST->getAAInfo());
     }
   }
 

llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp (+9, -4)

@@ -719,8 +719,9 @@ bool AMDGPUTargetLowering::shouldReduceLoadWidth(SDNode *N,
   return (OldSize < 32);
 }
 
-bool AMDGPUTargetLowering::isLoadBitCastBeneficial(EVT LoadTy,
-                                                   EVT CastTy) const {
+bool AMDGPUTargetLowering::isLoadBitCastBeneficial(EVT LoadTy, EVT CastTy,
+                                                   const SelectionDAG &DAG,
+                                                   const MachineMemOperand &MMO) const {
 
   assert(LoadTy.getSizeInBits() == CastTy.getSizeInBits());
 
@@ -730,8 +731,12 @@ bool AMDGPUTargetLowering::isLoadBitCastBeneficial(EVT LoadTy,
   unsigned LScalarSize = LoadTy.getScalarSizeInBits();
   unsigned CastScalarSize = CastTy.getScalarSizeInBits();
 
-  return (LScalarSize < CastScalarSize) ||
-         (CastScalarSize >= 32);
+  if ((LScalarSize >= CastScalarSize) && (CastScalarSize < 32))
+    return false;
+
+  bool Fast = false;
+  return allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(), CastTy,
+                            MMO, &Fast) && Fast;
 }
 
 // SI+ has instructions for cttz / ctlz for 32-bit values. This is probably also

llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h (+2, -1)

@@ -182,7 +182,8 @@ class AMDGPUTargetLowering : public TargetLowering {
                              ISD::LoadExtType ExtType,
                              EVT ExtVT) const override;
 
-  bool isLoadBitCastBeneficial(EVT, EVT) const final;
+  bool isLoadBitCastBeneficial(EVT, EVT, const SelectionDAG &DAG,
+                               const MachineMemOperand &MMO) const final;
 
   bool storeOfVectorConstantIsCheap(EVT MemVT,
                                     unsigned NumElem,

llvm/lib/Target/X86/X86ISelLowering.cpp (+9, -3)

@@ -4941,16 +4941,22 @@ bool X86TargetLowering::isCheapToSpeculateCtlz() const {
   return Subtarget.hasLZCNT();
 }
 
-bool X86TargetLowering::isLoadBitCastBeneficial(EVT LoadVT,
-                                                EVT BitcastVT) const {
+bool X86TargetLowering::isLoadBitCastBeneficial(EVT LoadVT, EVT BitcastVT,
+                                                const SelectionDAG &DAG,
+                                                const MachineMemOperand &MMO) const {
   if (!Subtarget.hasAVX512() && !LoadVT.isVector() && BitcastVT.isVector() &&
       BitcastVT.getVectorElementType() == MVT::i1)
     return false;
 
   if (!Subtarget.hasDQI() && BitcastVT == MVT::v8i1 && LoadVT == MVT::i8)
     return false;
 
-  return TargetLowering::isLoadBitCastBeneficial(LoadVT, BitcastVT);
+  // If both types are legal vectors, it's always ok to convert them.
+  if (LoadVT.isVector() && BitcastVT.isVector() &&
+      isTypeLegal(LoadVT) && isTypeLegal(BitcastVT))
+    return true;
+
+  return TargetLowering::isLoadBitCastBeneficial(LoadVT, BitcastVT, DAG, MMO);
 }
 
 bool X86TargetLowering::canMergeStoresTo(unsigned AddressSpace, EVT MemVT,

llvm/lib/Target/X86/X86ISelLowering.h (+3, -1)

@@ -1127,7 +1127,9 @@ namespace llvm {
       return NumElem > 2;
     }
 
-    bool isLoadBitCastBeneficial(EVT LoadVT, EVT BitcastVT) const override;
+    bool isLoadBitCastBeneficial(EVT LoadVT, EVT BitcastVT,
+                                 const SelectionDAG &DAG,
+                                 const MachineMemOperand &MMO) const override;
 
     /// Intel processors have a unified instruction and data cache
     const char * getClearCacheBuiltinName() const override {

llvm/test/CodeGen/X86/merge-consecutive-stores-nt.ll (+10, -14)

@@ -306,27 +306,25 @@ define void @merge_2_v4f32_align1_ntstore(<4 x float>* %a0, <4 x float>* %a1) nounwind {
 ; X86-SSE2-NEXT:    movdqu 16(%ecx), %xmm1
 ; X86-SSE2-NEXT:    movd %xmm0, %ecx
 ; X86-SSE2-NEXT:    movntil %ecx, (%eax)
-; X86-SSE2-NEXT:    movdqa %xmm0, %xmm2
-; X86-SSE2-NEXT:    shufps {{.*#+}} xmm2 = xmm2[3,1],xmm0[2,3]
+; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[3,1,2,3]
 ; X86-SSE2-NEXT:    movd %xmm2, %ecx
 ; X86-SSE2-NEXT:    movntil %ecx, 12(%eax)
 ; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[2,3,0,1]
 ; X86-SSE2-NEXT:    movd %xmm2, %ecx
 ; X86-SSE2-NEXT:    movntil %ecx, 8(%eax)
-; X86-SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,1,2,3]
+; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
 ; X86-SSE2-NEXT:    movd %xmm0, %ecx
 ; X86-SSE2-NEXT:    movntil %ecx, 4(%eax)
 ; X86-SSE2-NEXT:    movd %xmm1, %ecx
 ; X86-SSE2-NEXT:    movntil %ecx, 16(%eax)
-; X86-SSE2-NEXT:    movdqa %xmm1, %xmm0
-; X86-SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[3,1],xmm1[2,3]
+; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[3,1,2,3]
 ; X86-SSE2-NEXT:    movd %xmm0, %ecx
 ; X86-SSE2-NEXT:    movntil %ecx, 28(%eax)
 ; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
 ; X86-SSE2-NEXT:    movd %xmm0, %ecx
 ; X86-SSE2-NEXT:    movntil %ecx, 24(%eax)
-; X86-SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[1,1,2,3]
-; X86-SSE2-NEXT:    movd %xmm1, %ecx
+; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
+; X86-SSE2-NEXT:    movd %xmm0, %ecx
 ; X86-SSE2-NEXT:    movntil %ecx, 20(%eax)
 ; X86-SSE2-NEXT:    retl
 ;
@@ -421,27 +419,25 @@ define void @merge_2_v4f32_align1(<4 x float>* %a0, <4 x float>* %a1) nounwind {
 ; X86-SSE2-NEXT:    movdqu 16(%ecx), %xmm1
 ; X86-SSE2-NEXT:    movd %xmm0, %ecx
 ; X86-SSE2-NEXT:    movntil %ecx, (%eax)
-; X86-SSE2-NEXT:    movdqa %xmm0, %xmm2
-; X86-SSE2-NEXT:    shufps {{.*#+}} xmm2 = xmm2[3,1],xmm0[2,3]
+; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[3,1,2,3]
 ; X86-SSE2-NEXT:    movd %xmm2, %ecx
 ; X86-SSE2-NEXT:    movntil %ecx, 12(%eax)
 ; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[2,3,0,1]
 ; X86-SSE2-NEXT:    movd %xmm2, %ecx
 ; X86-SSE2-NEXT:    movntil %ecx, 8(%eax)
-; X86-SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,1,2,3]
+; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
 ; X86-SSE2-NEXT:    movd %xmm0, %ecx
 ; X86-SSE2-NEXT:    movntil %ecx, 4(%eax)
 ; X86-SSE2-NEXT:    movd %xmm1, %ecx
 ; X86-SSE2-NEXT:    movntil %ecx, 16(%eax)
-; X86-SSE2-NEXT:    movdqa %xmm1, %xmm0
-; X86-SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[3,1],xmm1[2,3]
+; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[3,1,2,3]
 ; X86-SSE2-NEXT:    movd %xmm0, %ecx
 ; X86-SSE2-NEXT:    movntil %ecx, 28(%eax)
 ; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
 ; X86-SSE2-NEXT:    movd %xmm0, %ecx
 ; X86-SSE2-NEXT:    movntil %ecx, 24(%eax)
-; X86-SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[1,1,2,3]
-; X86-SSE2-NEXT:    movd %xmm1, %ecx
+; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
+; X86-SSE2-NEXT:    movd %xmm0, %ecx
 ; X86-SSE2-NEXT:    movntil %ecx, 20(%eax)
 ; X86-SSE2-NEXT:    retl
 ;

llvm/test/CodeGen/X86/vector-shuffle-128-v4.ll (+8, -8)

@@ -1,12 +1,12 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown | FileCheck %s --check-prefix=ALL --check-prefix=SSE --check-prefix=SSE2
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse3 | FileCheck %s --check-prefix=ALL --check-prefix=SSE --check-prefix=SSE3
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+ssse3 | FileCheck %s --check-prefix=ALL --check-prefix=SSE --check-prefix=SSSE3
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefix=ALL --check-prefix=SSE --check-prefix=SSE41
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX1OR2 --check-prefix=AVX1
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=ALL,AVX,AVX1OR2,AVX2OR512VL,AVX2,AVX2-SLOW
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2,+fast-variable-shuffle | FileCheck %s --check-prefixes=ALL,AVX,AVX1OR2,AVX2OR512VL,AVX2,AVX2-FAST
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vl,+avx512dq,+fast-variable-shuffle | FileCheck %s --check-prefixes=ALL,AVX,AVX2OR512VL,AVX512VL
+; RUN: llc < %s -disable-peephole -mtriple=x86_64-unknown-unknown | FileCheck %s --check-prefix=ALL --check-prefix=SSE --check-prefix=SSE2
+; RUN: llc < %s -disable-peephole -mtriple=x86_64-unknown-unknown -mattr=+sse3 | FileCheck %s --check-prefix=ALL --check-prefix=SSE --check-prefix=SSE3
+; RUN: llc < %s -disable-peephole -mtriple=x86_64-unknown-unknown -mattr=+ssse3 | FileCheck %s --check-prefix=ALL --check-prefix=SSE --check-prefix=SSSE3
+; RUN: llc < %s -disable-peephole -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefix=ALL --check-prefix=SSE --check-prefix=SSE41
+; RUN: llc < %s -disable-peephole -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX1OR2 --check-prefix=AVX1
+; RUN: llc < %s -disable-peephole -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=ALL,AVX,AVX1OR2,AVX2OR512VL,AVX2,AVX2-SLOW
+; RUN: llc < %s -disable-peephole -mtriple=x86_64-unknown-unknown -mattr=+avx2,+fast-variable-shuffle | FileCheck %s --check-prefixes=ALL,AVX,AVX1OR2,AVX2OR512VL,AVX2,AVX2-FAST
+; RUN: llc < %s -disable-peephole -mtriple=x86_64-unknown-unknown -mattr=+avx512vl,+avx512dq,+fast-variable-shuffle | FileCheck %s --check-prefixes=ALL,AVX,AVX2OR512VL,AVX512VL
 
 define <4 x i32> @shuffle_v4i32_0001(<4 x i32> %a, <4 x i32> %b) {
 ; SSE-LABEL: shuffle_v4i32_0001: