Skip to content

Commit 73ef397

Browse files
authored
[libc][x86] Use prefetch for write for memcpy (llvm#90450)
Currently, when `LIBC_COPT_MEMCPY_X86_USE_SOFTWARE_PREFETCHING` is set, we only prefetch memory for read on the source buffer. This patch adds a prefetch for write on the destination buffer.
1 parent 0a48482 commit 73ef397

File tree

1 file changed

+20
-13
lines changed

1 file changed

+20
-13
lines changed

libc/src/string/memory_utils/x86_64/inline_memcpy.h

Lines changed: 20 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -69,14 +69,21 @@ inline_memcpy_x86_avx_ge64(Ptr __restrict dst, CPtr __restrict src,
6969
return builtin::Memcpy<64>::loop_and_tail(dst, src, count);
7070
}
7171

72+
// Prefetch helper shared by the software-prefetching memcpy variants.
// Issues two hints at `distance` bytes past the given pointers: a prefetch
// for read on the source buffer (`prefetch_to_local_cache(src + distance)`)
// and a prefetch for write on the destination buffer
// (`prefetch_for_write(dst + distance)`). Nothing is copied here.
[[maybe_unused]] LIBC_INLINE void inline_memcpy_prefetch(Ptr __restrict dst,
73+
CPtr __restrict src,
74+
size_t distance) {
75+
prefetch_to_local_cache(src + distance);
76+
prefetch_for_write(dst + distance);
77+
}
78+
7279
[[maybe_unused]] LIBC_INLINE void
7380
inline_memcpy_x86_sse2_ge64_sw_prefetching(Ptr __restrict dst,
7481
CPtr __restrict src, size_t count) {
7582
using namespace LIBC_NAMESPACE::x86;
76-
prefetch_to_local_cache(src + K_ONE_CACHELINE);
83+
inline_memcpy_prefetch(dst, src, K_ONE_CACHELINE);
7784
if (count <= 128)
7885
return builtin::Memcpy<64>::head_tail(dst, src, count);
79-
prefetch_to_local_cache(src + K_TWO_CACHELINES);
86+
inline_memcpy_prefetch(dst, src, K_TWO_CACHELINES);
8087
// Aligning 'dst' on a 32B boundary.
8188
builtin::Memcpy<32>::block(dst, src);
8289
align_to_next_boundary<32, Arg::Dst>(dst, src, count);
@@ -90,17 +97,17 @@ inline_memcpy_x86_sse2_ge64_sw_prefetching(Ptr __restrict dst,
9097
if (count < 352) {
9198
// Two cache lines at a time.
9299
while (offset + K_TWO_CACHELINES + 32 <= count) {
93-
prefetch_to_local_cache(src + offset + K_ONE_CACHELINE);
94-
prefetch_to_local_cache(src + offset + K_TWO_CACHELINES);
100+
inline_memcpy_prefetch(dst, src, offset + K_ONE_CACHELINE);
101+
inline_memcpy_prefetch(dst, src, offset + K_TWO_CACHELINES);
95102
builtin::Memcpy<K_TWO_CACHELINES>::block_offset(dst, src, offset);
96103
offset += K_TWO_CACHELINES;
97104
}
98105
} else {
99106
// Three cache lines at a time.
100107
while (offset + K_THREE_CACHELINES + 32 <= count) {
101-
prefetch_to_local_cache(src + offset + K_ONE_CACHELINE);
102-
prefetch_to_local_cache(src + offset + K_TWO_CACHELINES);
103-
prefetch_to_local_cache(src + offset + K_THREE_CACHELINES);
108+
inline_memcpy_prefetch(dst, src, offset + K_ONE_CACHELINE);
109+
inline_memcpy_prefetch(dst, src, offset + K_TWO_CACHELINES);
110+
inline_memcpy_prefetch(dst, src, offset + K_THREE_CACHELINES);
104111
// It is likely that this copy will be turned into a 'rep;movsb' on
105112
// non-AVX machines.
106113
builtin::Memcpy<K_THREE_CACHELINES>::block_offset(dst, src, offset);
@@ -120,11 +127,11 @@ inline_memcpy_x86_sse2_ge64_sw_prefetching(Ptr __restrict dst,
120127
inline_memcpy_x86_avx_ge64_sw_prefetching(Ptr __restrict dst,
121128
CPtr __restrict src, size_t count) {
122129
using namespace LIBC_NAMESPACE::x86;
123-
prefetch_to_local_cache(src + K_ONE_CACHELINE);
130+
inline_memcpy_prefetch(dst, src, K_ONE_CACHELINE);
124131
if (count <= 128)
125132
return builtin::Memcpy<64>::head_tail(dst, src, count);
126-
prefetch_to_local_cache(src + K_TWO_CACHELINES);
127-
prefetch_to_local_cache(src + K_THREE_CACHELINES);
133+
inline_memcpy_prefetch(dst, src, K_TWO_CACHELINES);
134+
inline_memcpy_prefetch(dst, src, K_THREE_CACHELINES);
128135
if (count < 256)
129136
return builtin::Memcpy<128>::head_tail(dst, src, count);
130137
// Aligning 'dst' on a 32B boundary.
@@ -139,9 +146,9 @@ inline_memcpy_x86_avx_ge64_sw_prefetching(Ptr __restrict dst,
139146
// - count >= 128.
140147
while (offset + K_THREE_CACHELINES + 64 <= count) {
141148
// Three cache lines at a time.
142-
prefetch_to_local_cache(src + offset + K_ONE_CACHELINE);
143-
prefetch_to_local_cache(src + offset + K_TWO_CACHELINES);
144-
prefetch_to_local_cache(src + offset + K_THREE_CACHELINES);
149+
inline_memcpy_prefetch(dst, src, offset + K_ONE_CACHELINE);
150+
inline_memcpy_prefetch(dst, src, offset + K_TWO_CACHELINES);
151+
inline_memcpy_prefetch(dst, src, offset + K_THREE_CACHELINES);
145152
builtin::Memcpy<K_THREE_CACHELINES>::block_offset(dst, src, offset);
146153
offset += K_THREE_CACHELINES;
147154
}

0 commit comments

Comments
 (0)