Commit 267bb33

[X86] For minsize, use size for alignment, rather than actual alignment
If we have minsize, then we don't care about the alignment. On x86, the CPU doesn't care and neither should you. As long as the count is aligned, we can use fewer instructions.
1 parent 2606c87 commit 267bb33
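The rule the patch applies in both the memset and memcpy lowering paths can be summarized by a small standalone sketch (added here for illustration; it is not code from the patch). `Is64Bit` is a hypothetical stand-in for Subtarget.is64Bit(), and the function returns a byte width where the real code assigns an Align value:

// Sketch only: pick the widest REP STOS/MOVS element whose size evenly
// divides the constant length, ignoring the pointer alignment.
#include <cstdint>

unsigned repElementBytes(uint64_t SizeVal, bool Is64Bit) {
  if ((SizeVal & 7) == 0 && Is64Bit)
    return 8; // rep stosq / rep movsq
  if ((SizeVal & 3) == 0)
    return 4; // rep stosd / rep movsd
  if ((SizeVal & 1) == 0)
    return 2; // rep stosw / rep movsw
  return 1;   // rep stosb / rep movsb
}

Because the width is derived from the low bits of the length itself, it always divides the length exactly, so a single REP instruction covers the whole region.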

File tree

2 files changed: +85 -169 lines

llvm/lib/Target/X86/X86SelectionDAGInfo.cpp

Lines changed: 81 additions & 38 deletions
@@ -67,10 +67,27 @@ SDValue X86SelectionDAGInfo::EmitTargetCodeForMemset(
   // The libc version is likely to be faster for these cases. It can use the
   // address value and run time information about the CPU.
   if (Alignment < Align(4) || !ConstantSize ||
-      ConstantSize->getZExtValue() > Subtarget.getMaxInlineSizeThreshold())
+      ConstantSize->getZExtValue() > Subtarget.getMaxInlineSizeThreshold())
     return SDValue();
 
+  // If we have minsize, then don't care about the alignment.
+  // On x86, the CPU doesn't care and neither should you.
+  // As long as the count is aligned, we can use the minimum number of
+  // instructions without always having to resort to strosb.
+  //
+  // Because this is a feature specific to x86, we must handle it here.
   uint64_t SizeVal = ConstantSize->getZExtValue();
+  if (DAG.getMachineFunction().getFunction().hasMinSize()) {
+    if ((SizeVal & 7) == 0 && Subtarget.is64Bit())
+      Alignment = Align(8);
+    else if ((SizeVal & 3) == 0)
+      Alignment = Align(4);
+    else if ((SizeVal & 1) == 0)
+      Alignment = Align(2);
+    else
+      Alignment = Align(1);
+  }
+
   SDValue InGlue;
   EVT AVT;
   SDValue Count;
@@ -103,51 +120,56 @@ SDValue X86SelectionDAGInfo::EmitTargetCodeForMemset(
       Count = DAG.getIntPtrConstant(SizeVal, dl);
     }
 
-    if (AVT.bitsGT(MVT::i8)) {
-      unsigned UBytes = AVT.getSizeInBits() / 8;
-      Count = DAG.getIntPtrConstant(SizeVal / UBytes, dl);
-      BytesLeft = SizeVal % UBytes;
-    }
-
+    const uint64_t BlockBytes = AVT.getSizeInBits() / 8;
+    const uint64_t BlockCount = SizeVal / BlockBytes;
+    Count = DAG.getIntPtrConstant(BlockCount, dl);
+    BytesLeft = SizeVal % BlockBytes;
     Chain = DAG.getCopyToReg(Chain, dl, ValReg, DAG.getConstant(Val, dl, AVT),
                              InGlue);
     InGlue = Chain.getValue(1);
   } else {
     AVT = MVT::i8;
-    Count  = DAG.getIntPtrConstant(SizeVal, dl);
-    Chain  = DAG.getCopyToReg(Chain, dl, X86::AL, Val, InGlue);
+    Count = DAG.getIntPtrConstant(SizeVal, dl);
+    Chain = DAG.getCopyToReg(Chain, dl, X86::AL, Val, InGlue);
     InGlue = Chain.getValue(1);
   }
 
   bool Use64BitRegs = Subtarget.isTarget64BitLP64();
-  Chain = DAG.getCopyToReg(Chain, dl, Use64BitRegs ? X86::RCX : X86::ECX,
-                           Count, InGlue);
+  Chain = DAG.getCopyToReg(Chain, dl, Use64BitRegs ? X86::RCX : X86::ECX, Count,
+                           InGlue);
   InGlue = Chain.getValue(1);
-  Chain = DAG.getCopyToReg(Chain, dl, Use64BitRegs ? X86::RDI : X86::EDI,
-                           Dst, InGlue);
+  Chain = DAG.getCopyToReg(Chain, dl, Use64BitRegs ? X86::RDI : X86::EDI, Dst,
+                           InGlue);
   InGlue = Chain.getValue(1);
 
   SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue);
-  SDValue Ops[] = { Chain, DAG.getValueType(AVT), InGlue };
-  Chain = DAG.getNode(X86ISD::REP_STOS, dl, Tys, Ops);
-
-  if (BytesLeft) {
-    // Handle the last 1 - 7 bytes.
-    unsigned Offset = SizeVal - BytesLeft;
-    EVT AddrVT = Dst.getValueType();
-    EVT SizeVT = Size.getValueType();
-
-    Chain =
-        DAG.getMemset(Chain, dl,
-                      DAG.getNode(ISD::ADD, dl, AddrVT, Dst,
-                                  DAG.getConstant(Offset, dl, AddrVT)),
-                      Val, DAG.getConstant(BytesLeft, dl, SizeVT), Alignment,
-                      isVolatile, AlwaysInline,
-                      /* isTailCall */ false, DstPtrInfo.getWithOffset(Offset));
-  }
+  SDValue Ops[] = {Chain, DAG.getValueType(AVT), InGlue};
+  SDValue RepStos = DAG.getNode(X86ISD::REP_STOS, dl, Tys, Ops);
+
+  /// RepStos can process the whole length.
+  //
+  // Because we changed the alignment earlier in the function to work on size
+  // when we have the minsize attribute, this is guaranteed to be 0 when we get
+  // here.
+  if (BytesLeft == 0)
+    return RepStos;
 
-  // TODO: Use a Tokenfactor, as in memcpy, instead of a single chain.
-  return Chain;
+  // Handle the last 1 - 7 bytes.
+  SmallVector<SDValue, 4> Results;
+  Results.push_back(RepStos);
+  unsigned Offset = SizeVal - BytesLeft;
+  EVT AddrVT = Dst.getValueType();
+  EVT SizeVT = Size.getValueType();
+
+  Results.push_back(
+      DAG.getMemset(Chain, dl,
+                    DAG.getNode(ISD::ADD, dl, AddrVT, Dst,
+                                DAG.getConstant(Offset, dl, AddrVT)),
+                    Val, DAG.getConstant(BytesLeft, dl, SizeVT), Alignment,
+                    isVolatile, /* isAlwaysInline */ true,
+                    /* isTailCall */ false, DstPtrInfo.getWithOffset(Offset)));
+
+  return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Results);
 }
 
 /// Emit a single REP MOVS{B,W,D,Q} instruction.
@@ -220,13 +242,39 @@ static SDValue emitConstantSizeRepmov(
   assert(!Subtarget.hasERMSB() && "No efficient RepMovs");
   /// We assume runtime memcpy will do a better job for unaligned copies when
   /// ERMS is not present.
-  if (!AlwaysInline && (Alignment.value() & 3) != 0)
+  if (!AlwaysInline && (Alignment < Align(4)))
     return SDValue();
 
+  // If we have minsize, then don't care about the alignment.
+  // On x86, the CPU doesn't care and neither should you.
+  // As long as the count is aligned, we can use the minimum number of
+  // instructions without always having to resort to movsb
+  //
+  // Because this is a feature specific to x86, we must handle it here.
+  if (DAG.getMachineFunction().getFunction().hasMinSize()) {
+    if ((Size & 7) == 0 && Subtarget.is64Bit())
+      Alignment = Align(8);
+    else if ((Size & 3) == 0)
+      Alignment = Align(4);
+    else if ((Size & 1) == 0)
+      Alignment = Align(2);
+    else
+      Alignment = Align(1);
+  }
+
   const MVT BlockType = getOptimalRepmovsType(Subtarget, Alignment);
   const uint64_t BlockBytes = BlockType.getSizeInBits() / 8;
   const uint64_t BlockCount = Size / BlockBytes;
   const uint64_t BytesLeft = Size % BlockBytes;
+
+  if (DAG.getMachineFunction().getFunction().hasMinSize()) {
+    // Use the one instruction determined. Because we changed the alignment
+    // earlier in the function to work on size when we have the minsize
+    // attribute, it is guaranteed to process the entire length.
+    return emitRepmovs(Subtarget, DAG, dl, Chain, Dst, Src,
+                       DAG.getIntPtrConstant(BlockCount, dl), BlockType);
+  }
+
   SDValue RepMovs =
       emitRepmovs(Subtarget, DAG, dl, Chain, Dst, Src,
                   DAG.getIntPtrConstant(BlockCount, dl), BlockType);
@@ -237,11 +285,6 @@ static SDValue emitConstantSizeRepmov(
 
   assert(BytesLeft && "We have leftover at this point");
 
-  /// In case we optimize for size we use repmovsb even if it's less efficient
-  /// so we can save the loads/stores of the leftover.
-  if (DAG.getMachineFunction().getFunction().hasMinSize())
-    return emitRepmovsB(Subtarget, DAG, dl, Chain, Dst, Src, Size);
-
   // Handle the last 1 - 7 bytes.
   SmallVector<SDValue, 4> Results;
   Results.push_back(RepMovs);
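The comments in the diff above assert that, once the alignment is derived from the size under minsize, the leftover byte count is guaranteed to be zero. A minimal self-contained check of that arithmetic (illustrative only, not part of the patch; a 64-bit target is assumed):

#include <cassert>
#include <cstdint>
#include <cstdio>

int main() {
  const uint64_t Sizes[] = {1024, 1030, 37}; // example constant lengths
  for (uint64_t Size : Sizes) {
    // Same selection rule as the patch, with Subtarget.is64Bit() assumed true.
    uint64_t BlockBytes = (Size & 7) == 0   ? 8
                          : (Size & 3) == 0 ? 4
                          : (Size & 1) == 0 ? 2
                                            : 1;
    uint64_t BlockCount = Size / BlockBytes;
    uint64_t BytesLeft = Size % BlockBytes;
    // A width chosen from the low bits of Size always divides Size exactly.
    assert(BytesLeft == 0);
    std::printf("size %llu -> %llu x %llu-byte elements, %llu left over\n",
                (unsigned long long)Size, (unsigned long long)BlockCount,
                (unsigned long long)BlockBytes, (unsigned long long)BytesLeft);
  }
  return 0;
}

This is why the minsize paths above can return the bare REP_STOS/REP_MOVS node directly, without the TokenFactor that joins the trailing scalar stores in the general case.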

llvm/test/CodeGen/X86/memset-vs-memset-inline.ll

Lines changed: 4 additions & 131 deletions
@@ -28,137 +28,10 @@ define void @regular_memset_calls_external_function(ptr %a, i8 %value) nounwind
 define void @inlined_set_doesnt_call_external_function(ptr %a, i8 %value) nounwind {
 ; CHECK-LABEL: inlined_set_doesnt_call_external_function:
 ; CHECK: # %bb.0:
-; CHECK-NEXT: movzbl %sil, %ecx
-; CHECK-NEXT: movabsq $72340172838076673, %rax # imm = 0x101010101010101
-; CHECK-NEXT: imulq %rcx, %rax
-; CHECK-NEXT: movq %rax, 1016(%rdi)
-; CHECK-NEXT: movq %rax, 1008(%rdi)
-; CHECK-NEXT: movq %rax, 1000(%rdi)
-; CHECK-NEXT: movq %rax, 992(%rdi)
-; CHECK-NEXT: movq %rax, 984(%rdi)
-; CHECK-NEXT: movq %rax, 976(%rdi)
-; CHECK-NEXT: movq %rax, 968(%rdi)
-; CHECK-NEXT: movq %rax, 960(%rdi)
-; CHECK-NEXT: movq %rax, 952(%rdi)
-; CHECK-NEXT: movq %rax, 944(%rdi)
-; CHECK-NEXT: movq %rax, 936(%rdi)
-; CHECK-NEXT: movq %rax, 928(%rdi)
-; CHECK-NEXT: movq %rax, 920(%rdi)
-; CHECK-NEXT: movq %rax, 912(%rdi)
-; CHECK-NEXT: movq %rax, 904(%rdi)
-; CHECK-NEXT: movq %rax, 896(%rdi)
-; CHECK-NEXT: movq %rax, 888(%rdi)
-; CHECK-NEXT: movq %rax, 880(%rdi)
-; CHECK-NEXT: movq %rax, 872(%rdi)
-; CHECK-NEXT: movq %rax, 864(%rdi)
-; CHECK-NEXT: movq %rax, 856(%rdi)
-; CHECK-NEXT: movq %rax, 848(%rdi)
-; CHECK-NEXT: movq %rax, 840(%rdi)
-; CHECK-NEXT: movq %rax, 832(%rdi)
-; CHECK-NEXT: movq %rax, 824(%rdi)
-; CHECK-NEXT: movq %rax, 816(%rdi)
-; CHECK-NEXT: movq %rax, 808(%rdi)
-; CHECK-NEXT: movq %rax, 800(%rdi)
-; CHECK-NEXT: movq %rax, 792(%rdi)
-; CHECK-NEXT: movq %rax, 784(%rdi)
-; CHECK-NEXT: movq %rax, 776(%rdi)
-; CHECK-NEXT: movq %rax, 768(%rdi)
-; CHECK-NEXT: movq %rax, 760(%rdi)
-; CHECK-NEXT: movq %rax, 752(%rdi)
-; CHECK-NEXT: movq %rax, 744(%rdi)
-; CHECK-NEXT: movq %rax, 736(%rdi)
-; CHECK-NEXT: movq %rax, 728(%rdi)
-; CHECK-NEXT: movq %rax, 720(%rdi)
-; CHECK-NEXT: movq %rax, 712(%rdi)
-; CHECK-NEXT: movq %rax, 704(%rdi)
-; CHECK-NEXT: movq %rax, 696(%rdi)
-; CHECK-NEXT: movq %rax, 688(%rdi)
-; CHECK-NEXT: movq %rax, 680(%rdi)
-; CHECK-NEXT: movq %rax, 672(%rdi)
-; CHECK-NEXT: movq %rax, 664(%rdi)
-; CHECK-NEXT: movq %rax, 656(%rdi)
-; CHECK-NEXT: movq %rax, 648(%rdi)
-; CHECK-NEXT: movq %rax, 640(%rdi)
-; CHECK-NEXT: movq %rax, 632(%rdi)
-; CHECK-NEXT: movq %rax, 624(%rdi)
-; CHECK-NEXT: movq %rax, 616(%rdi)
-; CHECK-NEXT: movq %rax, 608(%rdi)
-; CHECK-NEXT: movq %rax, 600(%rdi)
-; CHECK-NEXT: movq %rax, 592(%rdi)
-; CHECK-NEXT: movq %rax, 584(%rdi)
-; CHECK-NEXT: movq %rax, 576(%rdi)
-; CHECK-NEXT: movq %rax, 568(%rdi)
-; CHECK-NEXT: movq %rax, 560(%rdi)
-; CHECK-NEXT: movq %rax, 552(%rdi)
-; CHECK-NEXT: movq %rax, 544(%rdi)
-; CHECK-NEXT: movq %rax, 536(%rdi)
-; CHECK-NEXT: movq %rax, 528(%rdi)
-; CHECK-NEXT: movq %rax, 520(%rdi)
-; CHECK-NEXT: movq %rax, 512(%rdi)
-; CHECK-NEXT: movq %rax, 504(%rdi)
-; CHECK-NEXT: movq %rax, 496(%rdi)
-; CHECK-NEXT: movq %rax, 488(%rdi)
-; CHECK-NEXT: movq %rax, 480(%rdi)
-; CHECK-NEXT: movq %rax, 472(%rdi)
-; CHECK-NEXT: movq %rax, 464(%rdi)
-; CHECK-NEXT: movq %rax, 456(%rdi)
-; CHECK-NEXT: movq %rax, 448(%rdi)
-; CHECK-NEXT: movq %rax, 440(%rdi)
-; CHECK-NEXT: movq %rax, 432(%rdi)
-; CHECK-NEXT: movq %rax, 424(%rdi)
-; CHECK-NEXT: movq %rax, 416(%rdi)
-; CHECK-NEXT: movq %rax, 408(%rdi)
-; CHECK-NEXT: movq %rax, 400(%rdi)
-; CHECK-NEXT: movq %rax, 392(%rdi)
-; CHECK-NEXT: movq %rax, 384(%rdi)
-; CHECK-NEXT: movq %rax, 376(%rdi)
-; CHECK-NEXT: movq %rax, 368(%rdi)
-; CHECK-NEXT: movq %rax, 360(%rdi)
-; CHECK-NEXT: movq %rax, 352(%rdi)
-; CHECK-NEXT: movq %rax, 344(%rdi)
-; CHECK-NEXT: movq %rax, 336(%rdi)
-; CHECK-NEXT: movq %rax, 328(%rdi)
-; CHECK-NEXT: movq %rax, 320(%rdi)
-; CHECK-NEXT: movq %rax, 312(%rdi)
-; CHECK-NEXT: movq %rax, 304(%rdi)
-; CHECK-NEXT: movq %rax, 296(%rdi)
-; CHECK-NEXT: movq %rax, 288(%rdi)
-; CHECK-NEXT: movq %rax, 280(%rdi)
-; CHECK-NEXT: movq %rax, 272(%rdi)
-; CHECK-NEXT: movq %rax, 264(%rdi)
-; CHECK-NEXT: movq %rax, 256(%rdi)
-; CHECK-NEXT: movq %rax, 248(%rdi)
-; CHECK-NEXT: movq %rax, 240(%rdi)
-; CHECK-NEXT: movq %rax, 232(%rdi)
-; CHECK-NEXT: movq %rax, 224(%rdi)
-; CHECK-NEXT: movq %rax, 216(%rdi)
-; CHECK-NEXT: movq %rax, 208(%rdi)
-; CHECK-NEXT: movq %rax, 200(%rdi)
-; CHECK-NEXT: movq %rax, 192(%rdi)
-; CHECK-NEXT: movq %rax, 184(%rdi)
-; CHECK-NEXT: movq %rax, 176(%rdi)
-; CHECK-NEXT: movq %rax, 168(%rdi)
-; CHECK-NEXT: movq %rax, 160(%rdi)
-; CHECK-NEXT: movq %rax, 152(%rdi)
-; CHECK-NEXT: movq %rax, 144(%rdi)
-; CHECK-NEXT: movq %rax, 136(%rdi)
-; CHECK-NEXT: movq %rax, 128(%rdi)
-; CHECK-NEXT: movq %rax, 120(%rdi)
-; CHECK-NEXT: movq %rax, 112(%rdi)
-; CHECK-NEXT: movq %rax, 104(%rdi)
-; CHECK-NEXT: movq %rax, 96(%rdi)
-; CHECK-NEXT: movq %rax, 88(%rdi)
-; CHECK-NEXT: movq %rax, 80(%rdi)
-; CHECK-NEXT: movq %rax, 72(%rdi)
-; CHECK-NEXT: movq %rax, 64(%rdi)
-; CHECK-NEXT: movq %rax, 56(%rdi)
-; CHECK-NEXT: movq %rax, 48(%rdi)
-; CHECK-NEXT: movq %rax, 40(%rdi)
-; CHECK-NEXT: movq %rax, 32(%rdi)
-; CHECK-NEXT: movq %rax, 24(%rdi)
-; CHECK-NEXT: movq %rax, 16(%rdi)
-; CHECK-NEXT: movq %rax, 8(%rdi)
-; CHECK-NEXT: movq %rax, (%rdi)
+; CHECK-NEXT: movl %esi, %eax
+; CHECK-NEXT: movl $1024, %ecx # imm = 0x400
+; CHECK-NEXT: # kill: def $al killed $al killed $eax
+; CHECK-NEXT: rep;stosb %al, %es:(%rdi)
 ; CHECK-NEXT: retq
   tail call void @llvm.memset.inline.p0.i64(ptr %a, i8 %value, i64 1024, i1 0)
   ret void
