
Commit ddcb813

Refactor to match memset (NFC)
Refactored the memset and memcpy codegen to share the alignment-determining code.
1 parent 487686b


llvm/lib/Target/X86/X86SelectionDAGInfo.cpp

Lines changed: 119 additions & 98 deletions
@@ -28,6 +28,23 @@ static cl::opt<bool>
     UseFSRMForMemcpy("x86-use-fsrm-for-memcpy", cl::Hidden, cl::init(false),
                      cl::desc("Use fast short rep mov in memcpy lowering"));
 
+/// Returns the best type to use with repmovs/repstos depending on alignment.
+static MVT getOptimalRepType(const X86Subtarget &Subtarget, Align Alignment) {
+  uint64_t Align = Alignment.value();
+  assert((Align != 0) && "Align is normalized");
+  assert(isPowerOf2_64(Align) && "Align is a power of 2");
+  switch (Align) {
+  case 1:
+    return MVT::i8;
+  case 2:
+    return MVT::i16;
+  case 4:
+    return MVT::i32;
+  default:
+    return Subtarget.is64Bit() ? MVT::i64 : MVT::i32;
+  }
+}
+
 bool X86SelectionDAGInfo::isBaseRegConflictPossible(
     SelectionDAG &DAG, ArrayRef<MCPhysReg> ClobberSet) const {
   // We cannot use TRI->hasBasePointer() until *after* we select all basic
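
Note: the shared helper simply picks the widest REP element width that the known alignment guarantees. A standalone sketch of the same mapping (illustrative only, not part of the commit; plain integers stand in for LLVM's Align/MVT types):

#include <cassert>
#include <cstdint>

// Returns the REP element width in bits for a power-of-two alignment.
static unsigned optimalRepBits(uint64_t Alignment, bool Is64Bit) {
  assert(Alignment != 0 && (Alignment & (Alignment - 1)) == 0 && "power of 2");
  switch (Alignment) {
  case 1:  return 8;                  // REP STOSB / MOVSB
  case 2:  return 16;                 // REP STOSW / MOVSW
  case 4:  return 32;                 // REP STOSD / MOVSD
  default: return Is64Bit ? 64 : 32;  // REP STOSQ only on 64-bit targets
  }
}

int main() {
  assert(optimalRepBits(8, /*Is64Bit=*/true) == 64);
  assert(optimalRepBits(16, /*Is64Bit=*/false) == 32);
  assert(optimalRepBits(2, /*Is64Bit=*/true) == 16);
  return 0;
}
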
@@ -44,102 +61,99 @@ bool X86SelectionDAGInfo::isBaseRegConflictPossible(
   return llvm::is_contained(ClobberSet, TRI->getBaseRegister());
 }
 
-SDValue X86SelectionDAGInfo::EmitTargetCodeForMemset(
-    SelectionDAG &DAG, const SDLoc &dl, SDValue Chain, SDValue Dst, SDValue Val,
-    SDValue Size, Align Alignment, bool isVolatile, bool AlwaysInline,
-    MachinePointerInfo DstPtrInfo) const {
-  // If to a segment-relative address space, use the default lowering.
-  if (DstPtrInfo.getAddrSpace() >= 256)
-    return SDValue();
+/// Emit a single REP STOSB instruction for a particular constant size.
+static SDValue emitRepstos(const X86Subtarget &Subtarget, SelectionDAG &DAG,
+                           const SDLoc &dl, SDValue Chain, SDValue Dst,
+                           SDValue Val, SDValue Size, MVT AVT) {
+  const bool Use64BitRegs = Subtarget.isTarget64BitLP64();
+  unsigned AX = X86::AL;
+  switch (AVT.getSizeInBits()) {
+  case 8:
+    AX = X86::AL;
+    break;
+  case 16:
+    AX = X86::AX;
+    break;
+  case 32:
+    AX = X86::EAX;
+    break;
+  default:
+    AX = X86::RAX;
+    break;
+  }
 
-  // If the base register might conflict with our physical registers, bail out.
-  const MCPhysReg ClobberSet[] = {X86::RCX, X86::RAX, X86::RDI,
-                                  X86::ECX, X86::EAX, X86::EDI};
-  if (isBaseRegConflictPossible(DAG, ClobberSet))
-    return SDValue();
+  const unsigned CX = Use64BitRegs ? X86::RCX : X86::ECX;
+  const unsigned DI = Use64BitRegs ? X86::RDI : X86::EDI;
 
-  ConstantSDNode *ConstantSize = dyn_cast<ConstantSDNode>(Size);
-  const X86Subtarget &Subtarget =
-      DAG.getMachineFunction().getSubtarget<X86Subtarget>();
+  SDValue InGlue;
+  Chain = DAG.getCopyToReg(Chain, dl, AX, Val, InGlue);
+  InGlue = Chain.getValue(1);
+  Chain = DAG.getCopyToReg(Chain, dl, CX, Size, InGlue);
+  InGlue = Chain.getValue(1);
+  Chain = DAG.getCopyToReg(Chain, dl, DI, Dst, InGlue);
+  InGlue = Chain.getValue(1);
+
+  SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue);
+  SDValue Ops[] = {Chain, DAG.getValueType(AVT), InGlue};
+  return DAG.getNode(X86ISD::REP_STOS, dl, Tys, Ops);
+}
+
+/// Returns a REP STOS instruction, possibly with a few load/stores to implement
+/// a constant size memory set. In some cases where we know REP MOVS is
+/// inefficient we return an empty SDValue so the calling code can either
+/// generate a store sequence or call the runtime memset function.
+static SDValue emitConstantSizeRepstos(SelectionDAG &DAG,
+                                       const X86Subtarget &Subtarget,
+                                       const SDLoc &dl, SDValue Chain,
+                                       SDValue Dst, SDValue Val, uint64_t Size,
+                                       EVT SizeVT, Align Alignment,
+                                       bool isVolatile, bool AlwaysInline,
+                                       MachinePointerInfo DstPtrInfo) {
+
+  if (Size > Subtarget.getMaxInlineSizeThreshold())
+    return SDValue();
 
   // If not DWORD aligned or size is more than the threshold, call the library.
   // The libc version is likely to be faster for these cases. It can use the
   // address value and run time information about the CPU.
-  if (Alignment < Align(4) || !ConstantSize ||
-      ConstantSize->getZExtValue() > Subtarget.getMaxInlineSizeThreshold())
+  if (Alignment < Align(4))
     return SDValue();
 
-  uint64_t SizeVal = ConstantSize->getZExtValue();
-  SDValue InGlue;
-  EVT AVT;
-  SDValue Count;
-  unsigned BytesLeft = 0;
+  MVT BlockType = MVT::i8;
+  uint64_t BlockCount = Size;
+  uint64_t BytesLeft = 0;
   if (auto *ValC = dyn_cast<ConstantSDNode>(Val)) {
-    unsigned ValReg;
-    uint64_t Val = ValC->getZExtValue() & 255;
-
-    // If the value is a constant, then we can potentially use larger sets.
-    if (Alignment >= Align(4)) {
-      // DWORD aligned
-      AVT = MVT::i32;
-      ValReg = X86::EAX;
-      Val = (Val << 8) | Val;
-      Val = (Val << 16) | Val;
-      if (Subtarget.is64Bit() && Alignment >= Align(8)) { // QWORD aligned
-        AVT = MVT::i64;
-        ValReg = X86::RAX;
-        Val = (Val << 32) | Val;
-      }
-    } else if (Alignment == Align(2)) {
-      // WORD aligned
-      AVT = MVT::i16;
-      ValReg = X86::AX;
-      Val = (Val << 8) | Val;
-    } else {
-      // Byte aligned
-      AVT = MVT::i8;
-      ValReg = X86::AL;
-      Count = DAG.getIntPtrConstant(SizeVal, dl);
-    }
-
-    if (AVT.bitsGT(MVT::i8)) {
-      unsigned UBytes = AVT.getSizeInBits() / 8;
-      Count = DAG.getIntPtrConstant(SizeVal / UBytes, dl);
-      BytesLeft = SizeVal % UBytes;
-    }
-
-    Chain = DAG.getCopyToReg(Chain, dl, ValReg, DAG.getConstant(Val, dl, AVT),
-                             InGlue);
-    InGlue = Chain.getValue(1);
-  } else {
-    AVT = MVT::i8;
-    Count = DAG.getIntPtrConstant(SizeVal, dl);
-    Chain = DAG.getCopyToReg(Chain, dl, X86::AL, Val, InGlue);
-    InGlue = Chain.getValue(1);
-  }
+    BlockType = getOptimalRepType(Subtarget, Alignment);
+    uint64_t Value = ValC->getZExtValue() & 255;
+    const uint64_t BlockBits = BlockType.getSizeInBits();
 
-  bool Use64BitRegs = Subtarget.isTarget64BitLP64();
-  Chain = DAG.getCopyToReg(Chain, dl, Use64BitRegs ? X86::RCX : X86::ECX,
-                           Count, InGlue);
-  InGlue = Chain.getValue(1);
-  Chain = DAG.getCopyToReg(Chain, dl, Use64BitRegs ? X86::RDI : X86::EDI,
-                           Dst, InGlue);
-  InGlue = Chain.getValue(1);
+    if (BlockBits >= 16)
+      Value = (Value << 8) | Value;
 
-  SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue);
-  SDValue Ops[] = {Chain, DAG.getValueType(AVT), InGlue};
-  SDValue RepStos = DAG.getNode(X86ISD::REP_STOS, dl, Tys, Ops);
+    if (BlockBits >= 32)
+      Value = (Value << 16) | Value;
+
+    if (BlockBits >= 64)
+      Value = (Value << 32) | Value;
+
+    const uint64_t BlockBytes = BlockBits / 8;
+    BlockCount = Size / BlockBytes;
+    BytesLeft = Size % BlockBytes;
+    Val = DAG.getConstant(Value, dl, BlockType);
+  }
 
+  SDValue RepStos =
+      emitRepstos(Subtarget, DAG, dl, Chain, Dst, Val,
+                  DAG.getIntPtrConstant(BlockCount, dl), BlockType);
   /// RepStos can process the whole length.
   if (BytesLeft == 0)
     return RepStos;
 
   // Handle the last 1 - 7 bytes.
   SmallVector<SDValue, 4> Results;
   Results.push_back(RepStos);
-  unsigned Offset = SizeVal - BytesLeft;
+  unsigned Offset = Size - BytesLeft;
   EVT AddrVT = Dst.getValueType();
-  EVT SizeVT = Size.getValueType();
 
   Results.push_back(
       DAG.getMemset(Chain, dl,
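
Note: for a constant fill value, the rewritten path splats the byte up to the chosen block width, then splits the length into whole REP blocks plus a 1-7 byte tail handled by a small memset. A minimal sketch of that arithmetic (hypothetical helper mirroring the code above, outside LLVM):

#include <cassert>
#include <cstdint>

struct RepStosPlan {
  uint64_t SplatValue; // fill byte replicated to the block width
  uint64_t BlockCount; // iterations executed by REP STOS
  uint64_t BytesLeft;  // 0-7 trailing bytes for the follow-up memset
};

static RepStosPlan planRepStos(uint8_t Byte, uint64_t Size, uint64_t BlockBits) {
  uint64_t Value = Byte;
  if (BlockBits >= 16)
    Value = (Value << 8) | Value;
  if (BlockBits >= 32)
    Value = (Value << 16) | Value;
  if (BlockBits >= 64)
    Value = (Value << 32) | Value;
  const uint64_t BlockBytes = BlockBits / 8;
  return {Value, Size / BlockBytes, Size % BlockBytes};
}

int main() {
  // memset(dst, 0xAB, 23) with 8-byte alignment -> i64 blocks:
  RepStosPlan P = planRepStos(0xAB, 23, 64);
  assert(P.SplatValue == 0xABABABABABABABABull);
  assert(P.BlockCount == 2); // two QWORDs written by REP STOSQ
  assert(P.BytesLeft == 7);  // leftover covered by DAG.getMemset above
  return 0;
}
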
@@ -152,6 +166,31 @@ SDValue X86SelectionDAGInfo::EmitTargetCodeForMemset(
   return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Results);
 }
 
+SDValue X86SelectionDAGInfo::EmitTargetCodeForMemset(
+    SelectionDAG &DAG, const SDLoc &dl, SDValue Chain, SDValue Dst, SDValue Val,
+    SDValue Size, Align Alignment, bool isVolatile, bool AlwaysInline,
+    MachinePointerInfo DstPtrInfo) const {
+  // If to a segment-relative address space, use the default lowering.
+  if (DstPtrInfo.getAddrSpace() >= 256)
+    return SDValue();
+
+  // If the base register might conflict with our physical registers, bail out.
+  const MCPhysReg ClobberSet[] = {X86::RCX, X86::RAX, X86::RDI,
+                                  X86::ECX, X86::EAX, X86::EDI};
+  if (isBaseRegConflictPossible(DAG, ClobberSet))
+    return SDValue();
+
+  ConstantSDNode *ConstantSize = dyn_cast<ConstantSDNode>(Size);
+  if (!ConstantSize)
+    return SDValue();
+
+  const X86Subtarget &Subtarget =
+      DAG.getMachineFunction().getSubtarget<X86Subtarget>();
+  return emitConstantSizeRepstos(
+      DAG, Subtarget, dl, Chain, Dst, Val, ConstantSize->getZExtValue(),
+      Size.getValueType(), Alignment, isVolatile, AlwaysInline, DstPtrInfo);
+}
+
 /// Emit a single REP MOVS{B,W,D,Q} instruction.
 static SDValue emitRepmovs(const X86Subtarget &Subtarget, SelectionDAG &DAG,
                            const SDLoc &dl, SDValue Chain, SDValue Dst,
@@ -182,24 +221,6 @@ static SDValue emitRepmovsB(const X86Subtarget &Subtarget, SelectionDAG &DAG,
                       DAG.getIntPtrConstant(Size, dl), MVT::i8);
 }
 
-/// Returns the best type to use with repmovs depending on alignment.
-static MVT getOptimalRepmovsType(const X86Subtarget &Subtarget,
-                                 Align Alignment) {
-  uint64_t Align = Alignment.value();
-  assert((Align != 0) && "Align is normalized");
-  assert(isPowerOf2_64(Align) && "Align is a power of 2");
-  switch (Align) {
-  case 1:
-    return MVT::i8;
-  case 2:
-    return MVT::i16;
-  case 4:
-    return MVT::i32;
-  default:
-    return Subtarget.is64Bit() ? MVT::i64 : MVT::i32;
-  }
-}
-
 /// Returns a REP MOVS instruction, possibly with a few load/stores to implement
 /// a constant size memory copy. In some cases where we know REP MOVS is
 /// inefficient we return an empty SDValue so the calling code can either
@@ -222,10 +243,10 @@ static SDValue emitConstantSizeRepmov(
   assert(!Subtarget.hasERMSB() && "No efficient RepMovs");
   /// We assume runtime memcpy will do a better job for unaligned copies when
   /// ERMS is not present.
-  if (!AlwaysInline && (Alignment.value() & 3) != 0)
+  if (!AlwaysInline && (Alignment < Align(4)))
     return SDValue();
 
-  const MVT BlockType = getOptimalRepmovsType(Subtarget, Alignment);
+  const MVT BlockType = getOptimalRepType(Subtarget, Alignment);
   const uint64_t BlockBytes = BlockType.getSizeInBits() / 8;
   const uint64_t BlockCount = Size / BlockBytes;
   const uint64_t BytesLeft = Size % BlockBytes;
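
Note: the alignment test changes form but not meaning; for the power-of-two values an Align can hold, the bit-mask and the comparison agree (quick check, illustrative only):

// For power-of-two A, (A & 3) != 0 holds exactly when A < 4.
static_assert(((1 & 3) != 0) == (1 < 4), "byte aligned");
static_assert(((2 & 3) != 0) == (2 < 4), "word aligned");
static_assert(((4 & 3) != 0) == (4 < 4), "dword aligned");
static_assert(((8 & 3) != 0) == (8 < 4), "qword aligned");
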
@@ -239,7 +260,7 @@ static SDValue emitConstantSizeRepmov(
 
   assert(BytesLeft && "We have leftover at this point");
 
-  /// In case we optimize for size we use repmovsb even if it's less efficient
+  /// In case we optimize for size, we use repmovsb even if it's less efficient
   /// so we can save the loads/stores of the leftover.
   if (DAG.getMachineFunction().getFunction().hasMinSize())
     return emitRepmovsB(Subtarget, DAG, dl, Chain, Dst, Src, Size);
@@ -282,7 +303,7 @@ SDValue X86SelectionDAGInfo::EmitTargetCodeForMemcpy(
   if (UseFSRMForMemcpy && Subtarget.hasFSRM())
     return emitRepmovs(Subtarget, DAG, dl, Chain, Dst, Src, Size, MVT::i8);
 
-  /// Handle constant sizes,
+  /// Handle constant sizes
   if (ConstantSDNode *ConstantSize = dyn_cast<ConstantSDNode>(Size))
     return emitConstantSizeRepmov(DAG, Subtarget, dl, Chain, Dst, Src,
                                   ConstantSize->getZExtValue(),
