Skip to content

Commit bca3456

Browse files
committed
[X86] For minsize memset/memcpy, use byte or double-word accesses
repstosb and repstosd are the same size, but stosd is only used for 0, because multiplying the constant so that it is copied across the bytes of the 32-bit number adds extra instructions that increase the size. For 0, we do not need to do that at all. The same goes for memcpy, and as a result the minsize check was moved ahead, because an encoded jmp to memcpy takes more bytes than repmovsb.
1 parent 5b2b92e commit bca3456

File tree

5 files changed

+81
-188
lines changed

5 files changed

+81
-188
lines changed

llvm/lib/Target/X86/X86SelectionDAGInfo.cpp

Lines changed: 32 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -98,6 +98,14 @@ static SDValue emitRepstos(const X86Subtarget &Subtarget, SelectionDAG &DAG,
9898
return DAG.getNode(X86ISD::REP_STOS, dl, Tys, Ops);
9999
}
100100

101+
/// Emit a single REP STOSB instruction for a particular constant size.
102+
static SDValue emitRepstosB(const X86Subtarget &Subtarget, SelectionDAG &DAG,
103+
const SDLoc &dl, SDValue Chain, SDValue Dst,
104+
SDValue Val, uint64_t Size) {
105+
return emitRepstos(Subtarget, DAG, dl, Chain, Dst, Val,
106+
DAG.getIntPtrConstant(Size, dl), MVT::i8);
107+
}
108+
101109
/// Returns a REP STOS instruction, possibly with a few load/stores to implement
102110
/// a constant size memory set. In some cases where we know REP MOVS is
103111
/// inefficient we return an empty SDValue so the calling code can either
@@ -109,6 +117,26 @@ static SDValue emitConstantSizeRepstos(SelectionDAG &DAG,
109117
EVT SizeVT, Align Alignment,
110118
bool isVolatile, bool AlwaysInline,
111119
MachinePointerInfo DstPtrInfo) {
120+
/// In case we optimize for size, we use repstosb even if it's less efficient
121+
/// so we can save the loads/stores of the leftover.
122+
if (DAG.getMachineFunction().getFunction().hasMinSize()) {
123+
if (auto *ValC = dyn_cast<ConstantSDNode>(Val)) {
124+
// Special case 0 because otherwise we get large literals,
125+
// which causes larger encoding.
126+
if ((Size & 31) == 0 && (ValC->getZExtValue() & 255) == 0) {
127+
MVT BlockType = MVT::i32;
128+
const uint64_t BlockBits = BlockType.getSizeInBits();
129+
const uint64_t BlockBytes = BlockBits / 8;
130+
const uint64_t BlockCount = Size / BlockBytes;
131+
132+
Val = DAG.getConstant(0, dl, BlockType);
133+
// repstosd is same size as repstosb
134+
return emitRepstos(Subtarget, DAG, dl, Chain, Dst, Val,
135+
DAG.getIntPtrConstant(BlockCount, dl), BlockType);
136+
}
137+
}
138+
return emitRepstosB(Subtarget, DAG, dl, Chain, Dst, Val, Size);
139+
}
112140

113141
if (Size > Subtarget.getMaxInlineSizeThreshold())
114142
return SDValue();
@@ -230,6 +258,10 @@ static SDValue emitConstantSizeRepmov(
230258
SDValue Chain, SDValue Dst, SDValue Src, uint64_t Size, EVT SizeVT,
231259
Align Alignment, bool isVolatile, bool AlwaysInline,
232260
MachinePointerInfo DstPtrInfo, MachinePointerInfo SrcPtrInfo) {
261+
/// In case we optimize for size, we use repmovsb even if it's less efficient
262+
/// so we can save the loads/stores of the leftover.
263+
if (DAG.getMachineFunction().getFunction().hasMinSize())
264+
return emitRepmovsB(Subtarget, DAG, dl, Chain, Dst, Src, Size);
233265

234266
/// TODO: Revisit next line: big copy with ERMSB on march >= haswell are very
235267
/// efficient.
@@ -260,11 +292,6 @@ static SDValue emitConstantSizeRepmov(
260292

261293
assert(BytesLeft && "We have leftover at this point");
262294

263-
/// In case we optimize for size, we use repmovsb even if it's less efficient
264-
/// so we can save the loads/stores of the leftover.
265-
if (DAG.getMachineFunction().getFunction().hasMinSize())
266-
return emitRepmovsB(Subtarget, DAG, dl, Chain, Dst, Src, Size);
267-
268295
// Handle the last 1 - 7 bytes.
269296
SmallVector<SDValue, 4> Results;
270297
Results.push_back(RepMovs);

llvm/test/CodeGen/X86/memcpy-struct-by-value.ll

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -78,9 +78,9 @@ define void @test2(ptr nocapture %x) nounwind minsize {
7878
; NOFAST32-NEXT: pushl %esi
7979
; NOFAST32-NEXT: subl $4100, %esp # imm = 0x1004
8080
; NOFAST32-NEXT: movl {{[0-9]+}}(%esp), %esi
81-
; NOFAST32-NEXT: movl $1024, %ecx # imm = 0x400
81+
; NOFAST32-NEXT: movl $4096, %ecx # imm = 0x1000
8282
; NOFAST32-NEXT: movl %esp, %edi
83-
; NOFAST32-NEXT: rep;movsl (%esi), %es:(%edi)
83+
; NOFAST32-NEXT: rep;movsb (%esi), %es:(%edi)
8484
; NOFAST32-NEXT: calll foo@PLT
8585
; NOFAST32-NEXT: addl $4100, %esp # imm = 0x1004
8686
; NOFAST32-NEXT: popl %esi
@@ -106,9 +106,9 @@ define void @test2(ptr nocapture %x) nounwind minsize {
106106
; NOFAST: # %bb.0:
107107
; NOFAST-NEXT: subq $4104, %rsp # imm = 0x1008
108108
; NOFAST-NEXT: movq %rdi, %rsi
109-
; NOFAST-NEXT: movl $512, %ecx # imm = 0x200
109+
; NOFAST-NEXT: movl $4096, %ecx # imm = 0x1000
110110
; NOFAST-NEXT: movq %rsp, %rdi
111-
; NOFAST-NEXT: rep;movsq (%rsi), %es:(%rdi)
111+
; NOFAST-NEXT: rep;movsb (%rsi), %es:(%rdi)
112112
; NOFAST-NEXT: callq foo@PLT
113113
; NOFAST-NEXT: addq $4104, %rsp # imm = 0x1008
114114
; NOFAST-NEXT: retq

llvm/test/CodeGen/X86/memcpy.ll

Lines changed: 12 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -202,14 +202,16 @@ define void @test3_minsize(ptr nocapture %A, ptr nocapture %B) nounwind minsize
202202
; DARWIN-LABEL: test3_minsize:
203203
; DARWIN: ## %bb.0:
204204
; DARWIN-NEXT: pushq $64
205-
; DARWIN-NEXT: popq %rdx
206-
; DARWIN-NEXT: jmp _memcpy ## TAILCALL
205+
; DARWIN-NEXT: popq %rcx
206+
; DARWIN-NEXT: rep;movsb (%rsi), %es:(%rdi)
207+
; DARWIN-NEXT: retq
207208
;
208209
; LINUX-LABEL: test3_minsize:
209210
; LINUX: # %bb.0:
210211
; LINUX-NEXT: pushq $64
211-
; LINUX-NEXT: popq %rdx
212-
; LINUX-NEXT: jmp memcpy@PLT # TAILCALL
212+
; LINUX-NEXT: popq %rcx
213+
; LINUX-NEXT: rep;movsb (%rsi), %es:(%rdi)
214+
; LINUX-NEXT: retq
213215
;
214216
; LINUX-SKL-LABEL: test3_minsize:
215217
; LINUX-SKL: # %bb.0:
@@ -249,14 +251,16 @@ define void @test3_minsize_optsize(ptr nocapture %A, ptr nocapture %B) nounwind
249251
; DARWIN-LABEL: test3_minsize_optsize:
250252
; DARWIN: ## %bb.0:
251253
; DARWIN-NEXT: pushq $64
252-
; DARWIN-NEXT: popq %rdx
253-
; DARWIN-NEXT: jmp _memcpy ## TAILCALL
254+
; DARWIN-NEXT: popq %rcx
255+
; DARWIN-NEXT: rep;movsb (%rsi), %es:(%rdi)
256+
; DARWIN-NEXT: retq
254257
;
255258
; LINUX-LABEL: test3_minsize_optsize:
256259
; LINUX: # %bb.0:
257260
; LINUX-NEXT: pushq $64
258-
; LINUX-NEXT: popq %rdx
259-
; LINUX-NEXT: jmp memcpy@PLT # TAILCALL
261+
; LINUX-NEXT: popq %rcx
262+
; LINUX-NEXT: rep;movsb (%rsi), %es:(%rdi)
263+
; LINUX-NEXT: retq
260264
;
261265
; LINUX-SKL-LABEL: test3_minsize_optsize:
262266
; LINUX-SKL: # %bb.0:

llvm/test/CodeGen/X86/memset-minsize.ll

Lines changed: 30 additions & 42 deletions
Original file line numberDiff line numberDiff line change
@@ -27,11 +27,9 @@ entry:
2727
define void @medium_memset_to_rep_stos(ptr %ptr) minsize nounwind {
2828
; CHECK-LABEL: medium_memset_to_rep_stos:
2929
; CHECK: # %bb.0: # %entry
30-
; CHECK-NEXT: pushq %rax
31-
; CHECK-NEXT: movl $512, %edx # imm = 0x200
32-
; CHECK-NEXT: xorl %esi, %esi
33-
; CHECK-NEXT: callq memset@PLT
34-
; CHECK-NEXT: popq %rax
30+
; CHECK-NEXT: movl $128, %ecx
31+
; CHECK-NEXT: xorl %eax, %eax
32+
; CHECK-NEXT: rep;stosl %eax, %es:(%rdi)
3533
; CHECK-NEXT: retq
3634
entry:
3735
call void @llvm.memset.p0.i32(ptr align 4 %ptr, i8 0, i32 512, i1 false)
@@ -41,11 +39,9 @@ entry:
4139
define void @large_memset_to_rep_stos(ptr %ptr) minsize nounwind {
4240
; CHECK-LABEL: large_memset_to_rep_stos:
4341
; CHECK: # %bb.0: # %entry
44-
; CHECK-NEXT: pushq %rax
45-
; CHECK-NEXT: movl $4096, %edx # imm = 0x1000
46-
; CHECK-NEXT: xorl %esi, %esi
47-
; CHECK-NEXT: callq memset@PLT
48-
; CHECK-NEXT: popq %rax
42+
; CHECK-NEXT: movl $1024, %ecx # imm = 0x400
43+
; CHECK-NEXT: xorl %eax, %eax
44+
; CHECK-NEXT: rep;stosl %eax, %es:(%rdi)
4945
; CHECK-NEXT: retq
5046
entry:
5147
call void @llvm.memset.p0.i32(ptr align 4 %ptr, i8 0, i32 4096, i1 false)
@@ -55,11 +51,9 @@ entry:
5551
define void @huge_memset_to_rep_stos(ptr %ptr) minsize nounwind {
5652
; CHECK-LABEL: huge_memset_to_rep_stos:
5753
; CHECK: # %bb.0: # %entry
58-
; CHECK-NEXT: pushq %rax
59-
; CHECK-NEXT: movl $8192, %edx # imm = 0x2000
60-
; CHECK-NEXT: xorl %esi, %esi
61-
; CHECK-NEXT: callq memset@PLT
62-
; CHECK-NEXT: popq %rax
54+
; CHECK-NEXT: movl $2048, %ecx # imm = 0x800
55+
; CHECK-NEXT: xorl %eax, %eax
56+
; CHECK-NEXT: rep;stosl %eax, %es:(%rdi)
6357
; CHECK-NEXT: retq
6458
entry:
6559
call void @llvm.memset.p0.i32(ptr align 4 %ptr, i8 0, i32 8192, i1 false)
@@ -69,11 +63,9 @@ entry:
6963
define void @odd_length_memset_to_rep_stos(ptr %ptr) minsize nounwind {
7064
; CHECK-LABEL: odd_length_memset_to_rep_stos:
7165
; CHECK: # %bb.0: # %entry
72-
; CHECK-NEXT: pushq %rax
73-
; CHECK-NEXT: movl $255, %edx
74-
; CHECK-NEXT: xorl %esi, %esi
75-
; CHECK-NEXT: callq memset@PLT
76-
; CHECK-NEXT: popq %rax
66+
; CHECK-NEXT: movl $255, %ecx
67+
; CHECK-NEXT: xorl %eax, %eax
68+
; CHECK-NEXT: rep;stosb %al, %es:(%rdi)
7769
; CHECK-NEXT: retq
7870
entry:
7971
call void @llvm.memset.p0.i32(ptr align 4 %ptr, i8 0, i32 255, i1 false)
@@ -83,11 +75,10 @@ entry:
8375
define void @align_1_memset_to_rep_stos(ptr %ptr) minsize nounwind {
8476
; CHECK-LABEL: align_1_memset_to_rep_stos:
8577
; CHECK: # %bb.0: # %entry
86-
; CHECK-NEXT: pushq %rax
87-
; CHECK-NEXT: movl $256, %edx # imm = 0x100
88-
; CHECK-NEXT: xorl %esi, %esi
89-
; CHECK-NEXT: callq memset@PLT
90-
; CHECK-NEXT: popq %rax
78+
; CHECK-NEXT: pushq $64
79+
; CHECK-NEXT: popq %rcx
80+
; CHECK-NEXT: xorl %eax, %eax
81+
; CHECK-NEXT: rep;stosl %eax, %es:(%rdi)
9182
; CHECK-NEXT: retq
9283
entry:
9384
call void @llvm.memset.p0.i32(ptr align 1 %ptr, i8 0, i32 256, i1 false)
@@ -97,11 +88,10 @@ entry:
9788
define void @align_2_memset_to_rep_stos(ptr %ptr) minsize nounwind {
9889
; CHECK-LABEL: align_2_memset_to_rep_stos:
9990
; CHECK: # %bb.0: # %entry
100-
; CHECK-NEXT: pushq %rax
101-
; CHECK-NEXT: movl $256, %edx # imm = 0x100
102-
; CHECK-NEXT: xorl %esi, %esi
103-
; CHECK-NEXT: callq memset@PLT
104-
; CHECK-NEXT: popq %rax
91+
; CHECK-NEXT: pushq $64
92+
; CHECK-NEXT: popq %rcx
93+
; CHECK-NEXT: xorl %eax, %eax
94+
; CHECK-NEXT: rep;stosl %eax, %es:(%rdi)
10595
; CHECK-NEXT: retq
10696
entry:
10797
call void @llvm.memset.p0.i32(ptr align 2 %ptr, i8 0, i32 256, i1 false)
@@ -111,11 +101,10 @@ entry:
111101
define void @align_4_memset_to_rep_stos(ptr %ptr) minsize nounwind {
112102
; CHECK-LABEL: align_4_memset_to_rep_stos:
113103
; CHECK: # %bb.0: # %entry
114-
; CHECK-NEXT: pushq %rax
115-
; CHECK-NEXT: movl $256, %edx # imm = 0x100
116-
; CHECK-NEXT: xorl %esi, %esi
117-
; CHECK-NEXT: callq memset@PLT
118-
; CHECK-NEXT: popq %rax
104+
; CHECK-NEXT: pushq $64
105+
; CHECK-NEXT: popq %rcx
106+
; CHECK-NEXT: xorl %eax, %eax
107+
; CHECK-NEXT: rep;stosl %eax, %es:(%rdi)
119108
; CHECK-NEXT: retq
120109
entry:
121110
call void @llvm.memset.p0.i32(ptr align 4 %ptr, i8 0, i32 256, i1 false)
@@ -125,11 +114,10 @@ entry:
125114
define void @align_8_memset_to_rep_stos(ptr %ptr) minsize nounwind {
126115
; CHECK-LABEL: align_8_memset_to_rep_stos:
127116
; CHECK: # %bb.0: # %entry
128-
; CHECK-NEXT: pushq %rax
129-
; CHECK-NEXT: movl $256, %edx # imm = 0x100
130-
; CHECK-NEXT: xorl %esi, %esi
131-
; CHECK-NEXT: callq memset@PLT
132-
; CHECK-NEXT: popq %rax
117+
; CHECK-NEXT: pushq $64
118+
; CHECK-NEXT: popq %rcx
119+
; CHECK-NEXT: xorl %eax, %eax
120+
; CHECK-NEXT: rep;stosl %eax, %es:(%rdi)
133121
; CHECK-NEXT: retq
134122
entry:
135123
call void @llvm.memset.p0.i32(ptr align 8 %ptr, i8 0, i32 256, i1 false)
@@ -139,10 +127,10 @@ entry:
139127
define void @small_memset_to_rep_stos_64(ptr %ptr) minsize nounwind {
140128
; CHECK-LABEL: small_memset_to_rep_stos_64:
141129
; CHECK: # %bb.0: # %entry
142-
; CHECK-NEXT: pushq $16
130+
; CHECK-NEXT: pushq $32
143131
; CHECK-NEXT: popq %rcx
144132
; CHECK-NEXT: xorl %eax, %eax
145-
; CHECK-NEXT: rep;stosq %rax, %es:(%rdi)
133+
; CHECK-NEXT: rep;stosl %eax, %es:(%rdi)
146134
; CHECK-NEXT: retq
147135
entry:
148136
call void @llvm.memset.p0.i64(ptr align 8 %ptr, i8 0, i64 128, i1 false)

0 commit comments

Comments
 (0)