@@ -28,23 +28,6 @@ static cl::opt<bool>
     UseFSRMForMemcpy("x86-use-fsrm-for-memcpy", cl::Hidden, cl::init(false),
                      cl::desc("Use fast short rep mov in memcpy lowering"));

-/// Returns the best type to use with repmovs/repstos depending on alignment.
-static MVT getOptimalRepType(const X86Subtarget &Subtarget, Align Alignment) {
-  uint64_t Align = Alignment.value();
-  assert((Align != 0) && "Align is normalized");
-  assert(isPowerOf2_64(Align) && "Align is a power of 2");
-  switch (Align) {
-  case 1:
-    return MVT::i8;
-  case 2:
-    return MVT::i16;
-  case 4:
-    return MVT::i32;
-  default:
-    return Subtarget.is64Bit() ? MVT::i64 : MVT::i32;
-  }
-}
-
 bool X86SelectionDAGInfo::isBaseRegConflictPossible(
     SelectionDAG &DAG, ArrayRef<MCPhysReg> ClobberSet) const {
   // We cannot use TRI->hasBasePointer() until *after* we select all basic
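Note: the getOptimalRepType helper removed above maps the known pointer alignment to the widest element type a REP MOVS/STOS can use without misaligned accesses. A minimal standalone sketch of the same selection logic (plain C++; the names RepElt and optimalRepElt are illustrative, not LLVM's):

#include <cassert>
#include <cstdint>
#include <cstdio>

// Hypothetical stand-in for the MVT element widths, in bytes.
enum class RepElt { I8 = 1, I16 = 2, I32 = 4, I64 = 8 };

// Mirror of the selection above: alignment is a non-zero power of two;
// anything 8-aligned or better gets the native word, capped at 32 bits
// on 32-bit targets.
static RepElt optimalRepElt(uint64_t Align, bool Is64Bit) {
  assert(Align != 0 && (Align & (Align - 1)) == 0 && "power of two");
  switch (Align) {
  case 1:  return RepElt::I8;
  case 2:  return RepElt::I16;
  case 4:  return RepElt::I32;
  default: return Is64Bit ? RepElt::I64 : RepElt::I32;
  }
}

int main() {
  // 16-byte-aligned buffer on x86-64 -> 8-byte element (REP STOSQ).
  printf("%d\n", static_cast<int>(optimalRepElt(16, /*Is64Bit=*/true))); // 8
  printf("%d\n", static_cast<int>(optimalRepElt(2, /*Is64Bit=*/true)));  // 2
}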
@@ -61,127 +44,102 @@ bool X86SelectionDAGInfo::isBaseRegConflictPossible(
   return llvm::is_contained(ClobberSet, TRI->getBaseRegister());
 }

-/// Emit a single REP STOSB instruction for a particular constant size.
-static SDValue emitRepstos(const X86Subtarget &Subtarget, SelectionDAG &DAG,
-                           const SDLoc &dl, SDValue Chain, SDValue Dst,
-                           SDValue Val, SDValue Size, MVT AVT) {
-  const bool Use64BitRegs = Subtarget.isTarget64BitLP64();
-  unsigned AX = X86::AL;
-  switch (AVT.getSizeInBits()) {
-  case 8:
-    AX = X86::AL;
-    break;
-  case 16:
-    AX = X86::AX;
-    break;
-  case 32:
-    AX = X86::EAX;
-    break;
-  default:
-    AX = X86::RAX;
-    break;
-  }
-
-  const unsigned CX = Use64BitRegs ? X86::RCX : X86::ECX;
-  const unsigned DI = Use64BitRegs ? X86::RDI : X86::EDI;
-
-  SDValue InGlue;
-  Chain = DAG.getCopyToReg(Chain, dl, AX, Val, InGlue);
-  InGlue = Chain.getValue(1);
-  Chain = DAG.getCopyToReg(Chain, dl, CX, Size, InGlue);
-  InGlue = Chain.getValue(1);
-  Chain = DAG.getCopyToReg(Chain, dl, DI, Dst, InGlue);
-  InGlue = Chain.getValue(1);
-
-  SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue);
-  SDValue Ops[] = {Chain, DAG.getValueType(AVT), InGlue};
-  return DAG.getNode(X86ISD::REP_STOS, dl, Tys, Ops);
-}
-
-/// Emit a single REP STOSB instruction for a particular constant size.
-static SDValue emitRepstosB(const X86Subtarget &Subtarget, SelectionDAG &DAG,
-                            const SDLoc &dl, SDValue Chain, SDValue Dst,
-                            SDValue Val, uint64_t Size) {
-  return emitRepstos(Subtarget, DAG, dl, Chain, Dst, Val,
-                     DAG.getIntPtrConstant(Size, dl), MVT::i8);
-}
-
-/// Returns a REP STOS instruction, possibly with a few load/stores to implement
-/// a constant size memory set. In some cases where we know REP MOVS is
-/// inefficient we return an empty SDValue so the calling code can either
-/// generate a store sequence or call the runtime memset function.
-static SDValue emitConstantSizeRepstos(SelectionDAG &DAG,
-                                       const X86Subtarget &Subtarget,
-                                       const SDLoc &dl, SDValue Chain,
-                                       SDValue Dst, SDValue Val, uint64_t Size,
-                                       EVT SizeVT, Align Alignment,
-                                       bool isVolatile, bool AlwaysInline,
-                                       MachinePointerInfo DstPtrInfo) {
-  /// In case we optimize for size, we use repstosb even if it's less efficient
-  /// so we can save the loads/stores of the leftover.
-  if (DAG.getMachineFunction().getFunction().hasMinSize()) {
-    if (auto *ValC = dyn_cast<ConstantSDNode>(Val)) {
-      // Special case 0 because otherwise we get large literals,
-      // which causes larger encoding.
-      if ((Size & 31) == 0 && (ValC->getZExtValue() & 255) == 0) {
-        MVT BlockType = MVT::i32;
-        const uint64_t BlockBits = BlockType.getSizeInBits();
-        const uint64_t BlockBytes = BlockBits / 8;
-        const uint64_t BlockCount = Size / BlockBytes;
-
-        Val = DAG.getConstant(0, dl, BlockType);
-        // repstosd is same size as repstosb
-        return emitRepstos(Subtarget, DAG, dl, Chain, Dst, Val,
-                           DAG.getIntPtrConstant(BlockCount, dl), BlockType);
-      }
-    }
-    return emitRepstosB(Subtarget, DAG, dl, Chain, Dst, Val, Size);
-  }
+SDValue X86SelectionDAGInfo::EmitTargetCodeForMemset(
+    SelectionDAG &DAG, const SDLoc &dl, SDValue Chain, SDValue Dst, SDValue Val,
+    SDValue Size, Align Alignment, bool isVolatile, bool AlwaysInline,
+    MachinePointerInfo DstPtrInfo) const {
+  // If to a segment-relative address space, use the default lowering.
+  if (DstPtrInfo.getAddrSpace() >= 256)
+    return SDValue();

-  if (Size > Subtarget.getMaxInlineSizeThreshold())
+  // If the base register might conflict with our physical registers, bail out.
+  const MCPhysReg ClobberSet[] = {X86::RCX, X86::RAX, X86::RDI,
+                                  X86::ECX, X86::EAX, X86::EDI};
+  if (isBaseRegConflictPossible(DAG, ClobberSet))
     return SDValue();

+  ConstantSDNode *ConstantSize = dyn_cast<ConstantSDNode>(Size);
+  const X86Subtarget &Subtarget =
+      DAG.getMachineFunction().getSubtarget<X86Subtarget>();
+
   // If not DWORD aligned or size is more than the threshold, call the library.
   // The libc version is likely to be faster for these cases. It can use the
   // address value and run time information about the CPU.
-  if (Alignment < Align(4))
+  if (Alignment < Align(4) || !ConstantSize ||
+      ConstantSize->getZExtValue() > Subtarget.getMaxInlineSizeThreshold())
     return SDValue();

-  MVT BlockType = MVT::i8;
-  uint64_t BlockCount = Size;
-  uint64_t BytesLeft = 0;
+  uint64_t SizeVal = ConstantSize->getZExtValue();
+  SDValue InGlue;
+  EVT AVT;
+  SDValue Count;
+  unsigned BytesLeft = 0;
   if (auto *ValC = dyn_cast<ConstantSDNode>(Val)) {
-    BlockType = getOptimalRepType(Subtarget, Alignment);
-    uint64_t Value = ValC->getZExtValue() & 255;
-    const uint64_t BlockBits = BlockType.getSizeInBits();
+    unsigned ValReg;
+    uint64_t Val = ValC->getZExtValue() & 255;
+
+    // If the value is a constant, then we can potentially use larger sets.
+    if (Alignment >= Align(4)) {
+      // DWORD aligned
+      AVT = MVT::i32;
+      ValReg = X86::EAX;
+      Val = (Val << 8) | Val;
+      Val = (Val << 16) | Val;
+      if (Subtarget.is64Bit() && Alignment >= Align(8)) { // QWORD aligned
+        AVT = MVT::i64;
+        ValReg = X86::RAX;
+        Val = (Val << 32) | Val;
+      }
+    } else if (Alignment == Align(2)) {
+      // WORD aligned
+      AVT = MVT::i16;
+      ValReg = X86::AX;
+      Val = (Val << 8) | Val;
+    } else {
+      // Byte aligned
+      AVT = MVT::i8;
+      ValReg = X86::AL;
+      Count = DAG.getIntPtrConstant(SizeVal, dl);
+    }

-    if (BlockBits >= 16)
-      Value = (Value << 8) | Value;
+    if (AVT.bitsGT(MVT::i8)) {
+      unsigned UBytes = AVT.getSizeInBits() / 8;
+      Count = DAG.getIntPtrConstant(SizeVal / UBytes, dl);
+      BytesLeft = SizeVal % UBytes;
+    }

-    if (BlockBits >= 32)
-      Value = (Value << 16) | Value;
+    Chain = DAG.getCopyToReg(Chain, dl, ValReg, DAG.getConstant(Val, dl, AVT),
+                             InGlue);
+    InGlue = Chain.getValue(1);
+  } else {
+    AVT = MVT::i8;
+    Count = DAG.getIntPtrConstant(SizeVal, dl);
+    Chain = DAG.getCopyToReg(Chain, dl, X86::AL, Val, InGlue);
+    InGlue = Chain.getValue(1);
+  }

-    if (BlockBits >= 64)
-      Value = (Value << 32) | Value;
+  bool Use64BitRegs = Subtarget.isTarget64BitLP64();
+  Chain = DAG.getCopyToReg(Chain, dl, Use64BitRegs ? X86::RCX : X86::ECX,
+                           Count, InGlue);
+  InGlue = Chain.getValue(1);
+  Chain = DAG.getCopyToReg(Chain, dl, Use64BitRegs ? X86::RDI : X86::EDI,
+                           Dst, InGlue);
+  InGlue = Chain.getValue(1);

-    const uint64_t BlockBytes = BlockBits / 8;
-    BlockCount = Size / BlockBytes;
-    BytesLeft = Size % BlockBytes;
-    Val = DAG.getConstant(Value, dl, BlockType);
-  }
+  SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue);
+  SDValue Ops[] = {Chain, DAG.getValueType(AVT), InGlue};
+  SDValue RepStos = DAG.getNode(X86ISD::REP_STOS, dl, Tys, Ops);

-  SDValue RepStos =
-      emitRepstos(Subtarget, DAG, dl, Chain, Dst, Val,
-                  DAG.getIntPtrConstant(BlockCount, dl), BlockType);
   /// RepStos can process the whole length.
   if (BytesLeft == 0)
     return RepStos;

   // Handle the last 1 - 7 bytes.
   SmallVector<SDValue, 4> Results;
   Results.push_back(RepStos);
-  unsigned Offset = Size - BytesLeft;
+  unsigned Offset = SizeVal - BytesLeft;
   EVT AddrVT = Dst.getValueType();
+  EVT SizeVT = Size.getValueType();

   Results.push_back(
       DAG.getMemset(Chain, dl,
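Note: two pieces of arithmetic in the restored EmitTargetCodeForMemset are easy to gloss over: the constant byte is splatted across the chosen register width by the doubling shifts, and the byte count is split into whole REP iterations plus a 1-7 byte tail. A self-contained sketch of both computations (splatByte is a hypothetical helper name, not LLVM's):

#include <cassert>
#include <cstdint>
#include <cstdio>

// Replicate one byte across `Bytes` bytes, exactly as the doubling shifts
// in the lowering above do for i16/i32/i64 REP STOS elements.
static uint64_t splatByte(uint64_t V, unsigned Bytes) {
  V &= 255;
  if (Bytes >= 2) V = (V << 8) | V;
  if (Bytes >= 4) V = (V << 16) | V;
  if (Bytes >= 8) V = (V << 32) | V;
  return V;
}

int main() {
  assert(splatByte(0xAB, 4) == 0xABABABABull);          // EAX for REP STOSD
  assert(splatByte(0xAB, 8) == 0xABABABABABABABABull);  // RAX for REP STOSQ

  // Count/BytesLeft split for a 23-byte memset with 8-byte elements:
  uint64_t SizeVal = 23, UBytes = 8;
  printf("count=%llu leftover=%llu\n",
         (unsigned long long)(SizeVal / UBytes),   // 2 REP iterations
         (unsigned long long)(SizeVal % UBytes));  // 7 trailing bytes
}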
@@ -194,31 +152,6 @@ static SDValue emitConstantSizeRepstos(SelectionDAG &DAG,
   return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Results);
 }

-SDValue X86SelectionDAGInfo::EmitTargetCodeForMemset(
-    SelectionDAG &DAG, const SDLoc &dl, SDValue Chain, SDValue Dst, SDValue Val,
-    SDValue Size, Align Alignment, bool isVolatile, bool AlwaysInline,
-    MachinePointerInfo DstPtrInfo) const {
-  // If to a segment-relative address space, use the default lowering.
-  if (DstPtrInfo.getAddrSpace() >= 256)
-    return SDValue();
-
-  // If the base register might conflict with our physical registers, bail out.
-  const MCPhysReg ClobberSet[] = {X86::RCX, X86::RAX, X86::RDI,
-                                  X86::ECX, X86::EAX, X86::EDI};
-  if (isBaseRegConflictPossible(DAG, ClobberSet))
-    return SDValue();
-
-  ConstantSDNode *ConstantSize = dyn_cast<ConstantSDNode>(Size);
-  if (!ConstantSize)
-    return SDValue();
-
-  const X86Subtarget &Subtarget =
-      DAG.getMachineFunction().getSubtarget<X86Subtarget>();
-  return emitConstantSizeRepstos(
-      DAG, Subtarget, dl, Chain, Dst, Val, ConstantSize->getZExtValue(),
-      Size.getValueType(), Alignment, isVolatile, AlwaysInline, DstPtrInfo);
-}
-
 /// Emit a single REP MOVS{B,W,D,Q} instruction.
 static SDValue emitRepmovs(const X86Subtarget &Subtarget, SelectionDAG &DAG,
                            const SDLoc &dl, SDValue Chain, SDValue Dst,
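Note: both the removed wrapper above and its replacement bail out for destination address spaces at or above 256. In LLVM's x86 convention those numbers denote segment-relative pointers (256 = GS, 257 = FS, 258 = SS), which this lowering does not handle. A trivial sketch of the guard; the constant names are illustrative, not identifiers from the source:

#include <cstdint>

// LLVM's documented x86 address-space numbering for segment registers.
constexpr unsigned X86AS_GS = 256, X86AS_FS = 257, X86AS_SS = 258;

static bool isSegmentRelative(unsigned AddrSpace) {
  // Everything at or above 256 is target-reserved on x86; the memset
  // lowering falls back to the generic path for all of it.
  return AddrSpace >= X86AS_GS;
}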
@@ -249,6 +182,24 @@ static SDValue emitRepmovsB(const X86Subtarget &Subtarget, SelectionDAG &DAG,
                      DAG.getIntPtrConstant(Size, dl), MVT::i8);
 }

+/// Returns the best type to use with repmovs depending on alignment.
+static MVT getOptimalRepmovsType(const X86Subtarget &Subtarget,
+                                 Align Alignment) {
+  uint64_t Align = Alignment.value();
+  assert((Align != 0) && "Align is normalized");
+  assert(isPowerOf2_64(Align) && "Align is a power of 2");
+  switch (Align) {
+  case 1:
+    return MVT::i8;
+  case 2:
+    return MVT::i16;
+  case 4:
+    return MVT::i32;
+  default:
+    return Subtarget.is64Bit() ? MVT::i64 : MVT::i32;
+  }
+}
+
 /// Returns a REP MOVS instruction, possibly with a few load/stores to implement
 /// a constant size memory copy. In some cases where we know REP MOVS is
 /// inefficient we return an empty SDValue so the calling code can either
@@ -258,10 +209,6 @@ static SDValue emitConstantSizeRepmov(
     SDValue Chain, SDValue Dst, SDValue Src, uint64_t Size, EVT SizeVT,
     Align Alignment, bool isVolatile, bool AlwaysInline,
     MachinePointerInfo DstPtrInfo, MachinePointerInfo SrcPtrInfo) {
-  /// In case we optimize for size, we use repmovsb even if it's less efficient
-  /// so we can save the loads/stores of the leftover.
-  if (DAG.getMachineFunction().getFunction().hasMinSize())
-    return emitRepmovsB(Subtarget, DAG, dl, Chain, Dst, Src, Size);

   /// TODO: Revisit next line: big copy with ERMSB on march >= haswell are very
   /// efficient.
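Note: the TODO retained above concerns ERMSB (Enhanced REP MOVSB, Haswell and later), where a byte-granular REP MOVSB is already fast for large copies, so widening the element type buys little. A rough, purely illustrative sketch of that trade-off; the strategy names and the threshold are hypothetical and not what LLVM uses:

#include <cstdint>

enum class CopyStrategy { RepMovsB, RepMovsWide, CallMemcpy };

static CopyStrategy pickStrategy(bool HasERMSB, uint64_t Size,
                                 uint64_t Threshold = 4096) {
  if (HasERMSB)
    return CopyStrategy::RepMovsB;    // byte-granular REP is competitive
  if (Size > Threshold)
    return CopyStrategy::CallMemcpy;  // let libc use runtime CPU dispatch
  return CopyStrategy::RepMovsWide;   // widest aligned element plus tail
}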
@@ -275,10 +222,10 @@ static SDValue emitConstantSizeRepmov(
   assert(!Subtarget.hasERMSB() && "No efficient RepMovs");
   /// We assume runtime memcpy will do a better job for unaligned copies when
   /// ERMS is not present.
-  if (!AlwaysInline && (Alignment < Align(4)))
+  if (!AlwaysInline && (Alignment.value() & 3) != 0)
     return SDValue();

-  const MVT BlockType = getOptimalRepType(Subtarget, Alignment);
+  const MVT BlockType = getOptimalRepmovsType(Subtarget, Alignment);
   const uint64_t BlockBytes = BlockType.getSizeInBits() / 8;
   const uint64_t BlockCount = Size / BlockBytes;
   const uint64_t BytesLeft = Size % BlockBytes;
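Note: this hunk swaps the comparison Alignment < Align(4) back to the bit test (Alignment.value() & 3) != 0. For the non-zero power-of-two values an Align can hold, the two forms are equivalent: only 1 and 2 have nonzero low two bits, and both are below 4. A quick standalone check:

#include <cassert>
#include <cstdint>

int main() {
  // Align values are always non-zero powers of two.
  for (uint64_t A = 1; A <= (1ull << 20); A <<= 1) {
    bool bitTest = (A & 3) != 0; // form restored by this change
    bool cmpTest = A < 4;        // form removed by this change
    assert(bitTest == cmpTest);
  }
}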
@@ -292,6 +239,11 @@ static SDValue emitConstantSizeRepmov(

   assert(BytesLeft && "We have leftover at this point");

+  /// In case we optimize for size we use repmovsb even if it's less efficient
+  /// so we can save the loads/stores of the leftover.
+  if (DAG.getMachineFunction().getFunction().hasMinSize())
+    return emitRepmovsB(Subtarget, DAG, dl, Chain, Dst, Src, Size);
+
   // Handle the last 1 - 7 bytes.
   SmallVector<SDValue, 4> Results;
   Results.push_back(RepMovs);
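Note: moving the hasMinSize() check below the BytesLeft computation means that under minsize the REP MOVSB fallback is taken only when a wide REP MOVS would leave a 1-7 byte tail; evenly divisible copies keep the wide form. A small sketch of the resulting decision order (the enum and function names are illustrative):

#include <cstdint>

enum class MovsKind { WideOnly, ByteOnly, WidePlusTail };

// Decision order after this change: prefer the wide form when it covers
// the whole length; under minsize, avoid tail loads/stores via REP MOVSB;
// otherwise emit the wide REP MOVS plus explicit tail copies.
static MovsKind classify(uint64_t Size, uint64_t BlockBytes, bool MinSize) {
  if (Size % BlockBytes == 0)
    return MovsKind::WideOnly;
  if (MinSize)
    return MovsKind::ByteOnly;
  return MovsKind::WidePlusTail;
}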
@@ -330,7 +282,7 @@ SDValue X86SelectionDAGInfo::EmitTargetCodeForMemcpy(
   if (UseFSRMForMemcpy && Subtarget.hasFSRM())
     return emitRepmovs(Subtarget, DAG, dl, Chain, Dst, Src, Size, MVT::i8);

-  /// Handle constant sizes
+  /// Handle constant sizes,
   if (ConstantSDNode *ConstantSize = dyn_cast<ConstantSDNode>(Size))
     return emitConstantSizeRepmov(DAG, Subtarget, dl, Chain, Dst, Src,
                                   ConstantSize->getZExtValue(),
0 commit comments