@@ -28,6 +28,23 @@ static cl::opt<bool>
     UseFSRMForMemcpy("x86-use-fsrm-for-memcpy", cl::Hidden, cl::init(false),
                      cl::desc("Use fast short rep mov in memcpy lowering"));
 
+/// Returns the best type to use with repmovs/repstos depending on alignment.
+static MVT getOptimalRepType(const X86Subtarget &Subtarget, Align Alignment) {
+  uint64_t Align = Alignment.value();
+  assert((Align != 0) && "Align is normalized");
+  assert(isPowerOf2_64(Align) && "Align is a power of 2");
+  switch (Align) {
+  case 1:
+    return MVT::i8;
+  case 2:
+    return MVT::i16;
+  case 4:
+    return MVT::i32;
+  default:
+    return Subtarget.is64Bit() ? MVT::i64 : MVT::i32;
+  }
+}
+
 bool X86SelectionDAGInfo::isBaseRegConflictPossible(
     SelectionDAG &DAG, ArrayRef<MCPhysReg> ClobberSet) const {
   // We cannot use TRI->hasBasePointer() until *after* we select all basic
@@ -44,102 +61,99 @@ bool X86SelectionDAGInfo::isBaseRegConflictPossible(
   return llvm::is_contained(ClobberSet, TRI->getBaseRegister());
 }
 
-SDValue X86SelectionDAGInfo::EmitTargetCodeForMemset(
-    SelectionDAG &DAG, const SDLoc &dl, SDValue Chain, SDValue Dst, SDValue Val,
-    SDValue Size, Align Alignment, bool isVolatile, bool AlwaysInline,
-    MachinePointerInfo DstPtrInfo) const {
-  // If to a segment-relative address space, use the default lowering.
-  if (DstPtrInfo.getAddrSpace() >= 256)
-    return SDValue();
+/// Emit a single REP STOSB instruction for a particular constant size.
+static SDValue emitRepstos(const X86Subtarget &Subtarget, SelectionDAG &DAG,
+                           const SDLoc &dl, SDValue Chain, SDValue Dst,
+                           SDValue Val, SDValue Size, MVT AVT) {
+  const bool Use64BitRegs = Subtarget.isTarget64BitLP64();
+  unsigned AX = X86::AL;
+  switch (AVT.getSizeInBits()) {
+  case 8:
+    AX = X86::AL;
+    break;
+  case 16:
+    AX = X86::AX;
+    break;
+  case 32:
+    AX = X86::EAX;
+    break;
+  default:
+    AX = X86::RAX;
+    break;
+  }
 
-  // If the base register might conflict with our physical registers, bail out.
-  const MCPhysReg ClobberSet[] = {X86::RCX, X86::RAX, X86::RDI,
-                                  X86::ECX, X86::EAX, X86::EDI};
-  if (isBaseRegConflictPossible(DAG, ClobberSet))
-    return SDValue();
+  const unsigned CX = Use64BitRegs ? X86::RCX : X86::ECX;
+  const unsigned DI = Use64BitRegs ? X86::RDI : X86::EDI;
 
-  ConstantSDNode *ConstantSize = dyn_cast<ConstantSDNode>(Size);
-  const X86Subtarget &Subtarget =
-      DAG.getMachineFunction().getSubtarget<X86Subtarget>();
+  SDValue InGlue;
+  Chain = DAG.getCopyToReg(Chain, dl, AX, Val, InGlue);
+  InGlue = Chain.getValue(1);
+  Chain = DAG.getCopyToReg(Chain, dl, CX, Size, InGlue);
+  InGlue = Chain.getValue(1);
+  Chain = DAG.getCopyToReg(Chain, dl, DI, Dst, InGlue);
+  InGlue = Chain.getValue(1);
+
+  SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue);
+  SDValue Ops[] = {Chain, DAG.getValueType(AVT), InGlue};
+  return DAG.getNode(X86ISD::REP_STOS, dl, Tys, Ops);
+}
+
+/// Returns a REP STOS instruction, possibly with a few load/stores to implement
+/// a constant size memory set. In some cases where we know REP MOVS is
+/// inefficient we return an empty SDValue so the calling code can either
+/// generate a store sequence or call the runtime memset function.
+static SDValue emitConstantSizeRepstos(SelectionDAG &DAG,
+                                       const X86Subtarget &Subtarget,
+                                       const SDLoc &dl, SDValue Chain,
+                                       SDValue Dst, SDValue Val, uint64_t Size,
+                                       EVT SizeVT, Align Alignment,
+                                       bool isVolatile, bool AlwaysInline,
+                                       MachinePointerInfo DstPtrInfo) {
+
+  if (Size > Subtarget.getMaxInlineSizeThreshold())
+    return SDValue();
 
   // If not DWORD aligned or size is more than the threshold, call the library.
   // The libc version is likely to be faster for these cases. It can use the
   // address value and run time information about the CPU.
-  if (Alignment < Align(4) || !ConstantSize ||
-      ConstantSize->getZExtValue() > Subtarget.getMaxInlineSizeThreshold())
+  if (Alignment < Align(4))
     return SDValue();
 
-  uint64_t SizeVal = ConstantSize->getZExtValue();
-  SDValue InGlue;
-  EVT AVT;
-  SDValue Count;
-  unsigned BytesLeft = 0;
+  MVT BlockType = MVT::i8;
+  uint64_t BlockCount = Size;
+  uint64_t BytesLeft = 0;
   if (auto *ValC = dyn_cast<ConstantSDNode>(Val)) {
-    unsigned ValReg;
-    uint64_t Val = ValC->getZExtValue() & 255;
-
-    // If the value is a constant, then we can potentially use larger sets.
-    if (Alignment >= Align(4)) {
-      // DWORD aligned
-      AVT = MVT::i32;
-      ValReg = X86::EAX;
-      Val = (Val << 8) | Val;
-      Val = (Val << 16) | Val;
-      if (Subtarget.is64Bit() && Alignment >= Align(8)) { // QWORD aligned
-        AVT = MVT::i64;
-        ValReg = X86::RAX;
-        Val = (Val << 32) | Val;
-      }
-    } else if (Alignment == Align(2)) {
-      // WORD aligned
-      AVT = MVT::i16;
-      ValReg = X86::AX;
-      Val = (Val << 8) | Val;
-    } else {
-      // Byte aligned
-      AVT = MVT::i8;
-      ValReg = X86::AL;
-      Count = DAG.getIntPtrConstant(SizeVal, dl);
-    }
-
-    if (AVT.bitsGT(MVT::i8)) {
-      unsigned UBytes = AVT.getSizeInBits() / 8;
-      Count = DAG.getIntPtrConstant(SizeVal / UBytes, dl);
-      BytesLeft = SizeVal % UBytes;
-    }
-
-    Chain = DAG.getCopyToReg(Chain, dl, ValReg, DAG.getConstant(Val, dl, AVT),
-                             InGlue);
-    InGlue = Chain.getValue(1);
-  } else {
-    AVT = MVT::i8;
-    Count = DAG.getIntPtrConstant(SizeVal, dl);
-    Chain = DAG.getCopyToReg(Chain, dl, X86::AL, Val, InGlue);
-    InGlue = Chain.getValue(1);
-  }
+    BlockType = getOptimalRepType(Subtarget, Alignment);
+    uint64_t Value = ValC->getZExtValue() & 255;
+    const uint64_t BlockBits = BlockType.getSizeInBits();
 
-  bool Use64BitRegs = Subtarget.isTarget64BitLP64();
-  Chain = DAG.getCopyToReg(Chain, dl, Use64BitRegs ? X86::RCX : X86::ECX,
-                           Count, InGlue);
-  InGlue = Chain.getValue(1);
-  Chain = DAG.getCopyToReg(Chain, dl, Use64BitRegs ? X86::RDI : X86::EDI,
-                           Dst, InGlue);
-  InGlue = Chain.getValue(1);
+    if (BlockBits >= 16)
+      Value = (Value << 8) | Value;
 
-  SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue);
-  SDValue Ops[] = {Chain, DAG.getValueType(AVT), InGlue};
-  SDValue RepStos = DAG.getNode(X86ISD::REP_STOS, dl, Tys, Ops);
+    if (BlockBits >= 32)
+      Value = (Value << 16) | Value;
+
+    if (BlockBits >= 64)
+      Value = (Value << 32) | Value;
+
+    const uint64_t BlockBytes = BlockBits / 8;
+    BlockCount = Size / BlockBytes;
+    BytesLeft = Size % BlockBytes;
+    Val = DAG.getConstant(Value, dl, BlockType);
+  }
 
+  SDValue RepStos =
+      emitRepstos(Subtarget, DAG, dl, Chain, Dst, Val,
+                  DAG.getIntPtrConstant(BlockCount, dl), BlockType);
   /// RepStos can process the whole length.
   if (BytesLeft == 0)
     return RepStos;
 
   // Handle the last 1 - 7 bytes.
   SmallVector<SDValue, 4> Results;
   Results.push_back(RepStos);
-  unsigned Offset = SizeVal - BytesLeft;
+  unsigned Offset = Size - BytesLeft;
   EVT AddrVT = Dst.getValueType();
-  EVT SizeVT = Size.getValueType();
 
   Results.push_back(
       DAG.getMemset(Chain, dl,
@@ -152,6 +166,31 @@ SDValue X86SelectionDAGInfo::EmitTargetCodeForMemset(
   return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Results);
 }
 
+SDValue X86SelectionDAGInfo::EmitTargetCodeForMemset(
+    SelectionDAG &DAG, const SDLoc &dl, SDValue Chain, SDValue Dst, SDValue Val,
+    SDValue Size, Align Alignment, bool isVolatile, bool AlwaysInline,
+    MachinePointerInfo DstPtrInfo) const {
+  // If to a segment-relative address space, use the default lowering.
+  if (DstPtrInfo.getAddrSpace() >= 256)
+    return SDValue();
+
+  // If the base register might conflict with our physical registers, bail out.
+  const MCPhysReg ClobberSet[] = {X86::RCX, X86::RAX, X86::RDI,
+                                  X86::ECX, X86::EAX, X86::EDI};
+  if (isBaseRegConflictPossible(DAG, ClobberSet))
+    return SDValue();
+
+  ConstantSDNode *ConstantSize = dyn_cast<ConstantSDNode>(Size);
+  if (!ConstantSize)
+    return SDValue();
+
+  const X86Subtarget &Subtarget =
+      DAG.getMachineFunction().getSubtarget<X86Subtarget>();
+  return emitConstantSizeRepstos(
+      DAG, Subtarget, dl, Chain, Dst, Val, ConstantSize->getZExtValue(),
+      Size.getValueType(), Alignment, isVolatile, AlwaysInline, DstPtrInfo);
+}
+
 /// Emit a single REP MOVS{B,W,D,Q} instruction.
 static SDValue emitRepmovs(const X86Subtarget &Subtarget, SelectionDAG &DAG,
                            const SDLoc &dl, SDValue Chain, SDValue Dst,
@@ -182,24 +221,6 @@ static SDValue emitRepmovsB(const X86Subtarget &Subtarget, SelectionDAG &DAG,
                      DAG.getIntPtrConstant(Size, dl), MVT::i8);
 }
 
-/// Returns the best type to use with repmovs depending on alignment.
-static MVT getOptimalRepmovsType(const X86Subtarget &Subtarget,
-                                 Align Alignment) {
-  uint64_t Align = Alignment.value();
-  assert((Align != 0) && "Align is normalized");
-  assert(isPowerOf2_64(Align) && "Align is a power of 2");
-  switch (Align) {
-  case 1:
-    return MVT::i8;
-  case 2:
-    return MVT::i16;
-  case 4:
-    return MVT::i32;
-  default:
-    return Subtarget.is64Bit() ? MVT::i64 : MVT::i32;
-  }
-}
-
 /// Returns a REP MOVS instruction, possibly with a few load/stores to implement
 /// a constant size memory copy. In some cases where we know REP MOVS is
 /// inefficient we return an empty SDValue so the calling code can either
@@ -222,10 +243,10 @@ static SDValue emitConstantSizeRepmov(
   assert(!Subtarget.hasERMSB() && "No efficient RepMovs");
   /// We assume runtime memcpy will do a better job for unaligned copies when
   /// ERMS is not present.
-  if (!AlwaysInline && (Alignment.value() & 3) != 0)
+  if (!AlwaysInline && (Alignment < Align(4)))
     return SDValue();
 
-  const MVT BlockType = getOptimalRepmovsType(Subtarget, Alignment);
+  const MVT BlockType = getOptimalRepType(Subtarget, Alignment);
   const uint64_t BlockBytes = BlockType.getSizeInBits() / 8;
   const uint64_t BlockCount = Size / BlockBytes;
   const uint64_t BytesLeft = Size % BlockBytes;
@@ -239,7 +260,7 @@ static SDValue emitConstantSizeRepmov(
 
   assert(BytesLeft && "We have leftover at this point");
 
-  /// In case we optimize for size we use repmovsb even if it's less efficient
+  /// In case we optimize for size, we use repmovsb even if it's less efficient
   /// so we can save the loads/stores of the leftover.
   if (DAG.getMachineFunction().getFunction().hasMinSize())
     return emitRepmovsB(Subtarget, DAG, dl, Chain, Dst, Src, Size);
@@ -282,7 +303,7 @@ SDValue X86SelectionDAGInfo::EmitTargetCodeForMemcpy(
   if (UseFSRMForMemcpy && Subtarget.hasFSRM())
    return emitRepmovs(Subtarget, DAG, dl, Chain, Dst, Src, Size, MVT::i8);
 
-  /// Handle constant sizes,
+  /// Handle constant sizes
   if (ConstantSDNode *ConstantSize = dyn_cast<ConstantSDNode>(Size))
     return emitConstantSizeRepmov(DAG, Subtarget, dl, Chain, Dst, Src,
                                   ConstantSize->getZExtValue(),
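Note: the reworked memset lowering follows the same recipe the memcpy path already used: pick the widest block type the alignment allows, splat the constant byte across it, issue a single REP STOS for the whole blocks, and finish the remaining bytes with an ordinary memset. The standalone C++ sketch below reproduces only that size-splitting arithmetic outside of SelectionDAG so the split can be checked in isolation; the helper names (blockBytesFor, splatByte) and the sample values are illustrative and are not part of the commit.

#include <cassert>
#include <cstdint>
#include <cstdio>

// Width in bytes of the REP STOS block for a given power-of-two alignment,
// mirroring getOptimalRepType (i8/i16/i32, otherwise i64 on 64-bit targets).
static uint64_t blockBytesFor(uint64_t Alignment, bool Is64Bit) {
  assert(Alignment != 0 && (Alignment & (Alignment - 1)) == 0);
  if (Alignment == 1)
    return 1;
  if (Alignment == 2)
    return 2;
  if (Alignment == 4)
    return 4;
  return Is64Bit ? 8 : 4;
}

// Splat the low byte of Value across BlockBytes bytes, like the patch's
// (Value << 8) | Value chain before the value is copied into AL/AX/EAX/RAX.
static uint64_t splatByte(uint64_t Value, uint64_t BlockBytes) {
  Value &= 255;
  if (BlockBytes >= 2)
    Value = (Value << 8) | Value;
  if (BlockBytes >= 4)
    Value = (Value << 16) | Value;
  if (BlockBytes >= 8)
    Value = (Value << 32) | Value;
  return Value;
}

int main() {
  const uint64_t Size = 37, Alignment = 8;
  const uint64_t BlockBytes = blockBytesFor(Alignment, /*Is64Bit=*/true);
  const uint64_t BlockCount = Size / BlockBytes; // count loaded into RCX
  const uint64_t BytesLeft = Size % BlockBytes;  // finished by a small memset
  std::printf("splat=0x%llx blocks=%llu leftover=%llu\n",
              (unsigned long long)splatByte(0xAB, BlockBytes),
              (unsigned long long)BlockCount, (unsigned long long)BytesLeft);
  return 0;
}

For Size = 37 at 8-byte alignment this prints four qword blocks and five leftover bytes, which is exactly the BlockCount/BytesLeft split emitConstantSizeRepstos feeds to emitRepstos and the trailing DAG.getMemset.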