@@ -69,14 +69,21 @@ inline_memcpy_x86_avx_ge64(Ptr __restrict dst, CPtr __restrict src,
   return builtin::Memcpy<64>::loop_and_tail(dst, src, count);
 }
 
+[[maybe_unused]] LIBC_INLINE void inline_memcpy_prefetch(Ptr __restrict dst,
+                                                         CPtr __restrict src,
+                                                         size_t distance) {
+  prefetch_to_local_cache(src + distance);
+  prefetch_for_write(dst + distance);
+}
+
 [[maybe_unused]] LIBC_INLINE void
 inline_memcpy_x86_sse2_ge64_sw_prefetching(Ptr __restrict dst,
                                            CPtr __restrict src, size_t count) {
   using namespace LIBC_NAMESPACE::x86;
-  prefetch_to_local_cache(src + K_ONE_CACHELINE);
+  inline_memcpy_prefetch(dst, src, K_ONE_CACHELINE);
   if (count <= 128)
     return builtin::Memcpy<64>::head_tail(dst, src, count);
-  prefetch_to_local_cache(src + K_TWO_CACHELINES);
+  inline_memcpy_prefetch(dst, src, K_TWO_CACHELINES);
   // Aligning 'dst' on a 32B boundary.
   builtin::Memcpy<32>::block(dst, src);
   align_to_next_boundary<32, Arg::Dst>(dst, src, count);
@@ -90,17 +97,17 @@ inline_memcpy_x86_sse2_ge64_sw_prefetching(Ptr __restrict dst,
   if (count < 352) {
     // Two cache lines at a time.
     while (offset + K_TWO_CACHELINES + 32 <= count) {
-      prefetch_to_local_cache(src + offset + K_ONE_CACHELINE);
-      prefetch_to_local_cache(src + offset + K_TWO_CACHELINES);
+      inline_memcpy_prefetch(dst, src, offset + K_ONE_CACHELINE);
+      inline_memcpy_prefetch(dst, src, offset + K_TWO_CACHELINES);
       builtin::Memcpy<K_TWO_CACHELINES>::block_offset(dst, src, offset);
       offset += K_TWO_CACHELINES;
     }
   } else {
     // Three cache lines at a time.
     while (offset + K_THREE_CACHELINES + 32 <= count) {
-      prefetch_to_local_cache(src + offset + K_ONE_CACHELINE);
-      prefetch_to_local_cache(src + offset + K_TWO_CACHELINES);
-      prefetch_to_local_cache(src + offset + K_THREE_CACHELINES);
+      inline_memcpy_prefetch(dst, src, offset + K_ONE_CACHELINE);
+      inline_memcpy_prefetch(dst, src, offset + K_TWO_CACHELINES);
+      inline_memcpy_prefetch(dst, src, offset + K_THREE_CACHELINES);
       // It is likely that this copy will be turned into a 'rep;movsb' on
       // non-AVX machines.
       builtin::Memcpy<K_THREE_CACHELINES>::block_offset(dst, src, offset);
@@ -120,11 +127,11 @@ inline_memcpy_x86_sse2_ge64_sw_prefetching(Ptr __restrict dst,
 inline_memcpy_x86_avx_ge64_sw_prefetching(Ptr __restrict dst,
                                           CPtr __restrict src, size_t count) {
   using namespace LIBC_NAMESPACE::x86;
-  prefetch_to_local_cache(src + K_ONE_CACHELINE);
+  inline_memcpy_prefetch(dst, src, K_ONE_CACHELINE);
   if (count <= 128)
     return builtin::Memcpy<64>::head_tail(dst, src, count);
-  prefetch_to_local_cache(src + K_TWO_CACHELINES);
-  prefetch_to_local_cache(src + K_THREE_CACHELINES);
+  inline_memcpy_prefetch(dst, src, K_TWO_CACHELINES);
+  inline_memcpy_prefetch(dst, src, K_THREE_CACHELINES);
   if (count < 256)
     return builtin::Memcpy<128>::head_tail(dst, src, count);
   // Aligning 'dst' on a 32B boundary.
@@ -139,9 +146,9 @@ inline_memcpy_x86_avx_ge64_sw_prefetching(Ptr __restrict dst,
   // - count >= 128.
   while (offset + K_THREE_CACHELINES + 64 <= count) {
     // Three cache lines at a time.
-    prefetch_to_local_cache(src + offset + K_ONE_CACHELINE);
-    prefetch_to_local_cache(src + offset + K_TWO_CACHELINES);
-    prefetch_to_local_cache(src + offset + K_THREE_CACHELINES);
+    inline_memcpy_prefetch(dst, src, offset + K_ONE_CACHELINE);
+    inline_memcpy_prefetch(dst, src, offset + K_TWO_CACHELINES);
+    inline_memcpy_prefetch(dst, src, offset + K_THREE_CACHELINES);
     builtin::Memcpy<K_THREE_CACHELINES>::block_offset(dst, src, offset);
     offset += K_THREE_CACHELINES;
   }
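
The net effect of the patch: every software prefetch of an upcoming source cache line is now paired with a write prefetch of the matching destination line, via the new inline_memcpy_prefetch helper. Below is a minimal self-contained sketch of the idea, assuming prefetch_to_local_cache and prefetch_for_write reduce to the GCC/Clang __builtin_prefetch builtin; the wrapper definitions and the copy_with_prefetch driver are illustrative assumptions, not the libc sources.

#include <cstddef>

// Assumed wrappers (hypothetical): rw=0 requests a read prefetch,
// rw=1 a write prefetch; locality 3 keeps the line in all cache levels.
static inline void prefetch_to_local_cache(const char *p) {
  __builtin_prefetch(p, /*rw=*/0, /*locality=*/3);
}
static inline void prefetch_for_write(char *p) {
  __builtin_prefetch(p, /*rw=*/1, /*locality=*/3);
}

// Mirrors the helper added in the diff: prefetch both sides of the copy
// 'distance' bytes ahead of the current position.
static inline void inline_memcpy_prefetch(char *__restrict dst,
                                          const char *__restrict src,
                                          size_t distance) {
  prefetch_to_local_cache(src + distance);
  prefetch_for_write(dst + distance);
}

// Illustrative driver: issue one paired prefetch per 64B cache line,
// one line ahead of the byte currently being copied.
void copy_with_prefetch(char *__restrict dst, const char *__restrict src,
                        size_t count) {
  constexpr size_t K_ONE_CACHELINE = 64;
  for (size_t offset = 0; offset < count; ++offset) {
    if (offset % K_ONE_CACHELINE == 0)
      inline_memcpy_prefetch(dst, src, offset + K_ONE_CACHELINE);
    dst[offset] = src[offset];
  }
}

Since x86 prefetch instructions are hints and do not fault, issuing one slightly past the end of either buffer is safe, which is why neither the diff nor this sketch bounds the prefetch distance against count.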