Skip to content

Commit 6fe4e03

Browse files
martijnvels authored and ldionne committed
[libc++] Optimize vector push_back to avoid continuous load and store of end pointer
Credits: this change is based on analysis and a proof of concept by [email protected].

Before, the compiler loses track of end as 'this' and other references possibly escape beyond the compiler's scope. This can be seen in the generated assembly:

    16.28 │200c80:   mov    %r15d,(%rax)
    60.87 │200c83:   add    $0x4,%rax
          │200c87:   mov    %rax,-0x38(%rbp)
     0.03 │200c8b: → jmpq   200d4e
    ...
    ...
     1.69 │200d4e:   cmp    %r15d,%r12d
          │200d51: → je     200c40
    16.34 │200d57:   inc    %r15d
     0.05 │200d5a:   mov    -0x38(%rbp),%rax
     3.27 │200d5e:   mov    -0x30(%rbp),%r13
     1.47 │200d62:   cmp    %r13,%rax
          │200d65: → jne    200c80

We fix this by always explicitly storing the loaded local and pointer back at the end of push_back. This generates some slight source 'noise', but creates nice and compact fast path code, i.e.:

    32.64 │200760:   mov    %r14d,(%r12)
     9.97 │200764:   add    $0x4,%r12
     6.97 │200768:   mov    %r12,-0x38(%rbp)
    32.17 │20076c:   add    $0x1,%r14d
     2.36 │200770:   cmp    %r14d,%ebx
          │200773: → je     200730
     8.98 │200775:   mov    -0x30(%rbp),%r13
     6.75 │200779:   cmp    %r13,%r12
          │20077c: → jne    200760

Now there is a single store for the push_back value (as before), and a single store for the end without a reload (dependency).

For fully local vectors (i.e., not referenced elsewhere), the capacity load and store inside the loop could also be removed, but this requires more substantial refactoring inside vector.

Differential Revision: https://reviews.llvm.org/D80588
1 parent b52a5c6 commit 6fe4e03

File tree

3 files changed

+40
-17
lines changed

3 files changed

+40
-17
lines changed

libcxx/benchmarks/ContainerBenchmarks.h

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -79,6 +79,19 @@ void BM_ConstructFromRange(benchmark::State& st, Container, GenInputs gen) {
7979
}
8080
}
8181

82+
template <class Container>
83+
void BM_Pushback(benchmark::State& state, Container c) {
84+
int count = state.range(0);
85+
c.reserve(count);
86+
while (state.KeepRunningBatch(count)) {
87+
c.clear();
88+
for (int i = 0; i != count; ++i) {
89+
c.push_back(i);
90+
}
91+
benchmark::DoNotOptimize(c.data());
92+
}
93+
}
94+
8295
template <class Container, class GenInputs>
8396
void BM_InsertValue(benchmark::State& st, Container c, GenInputs gen) {
8497
auto in = gen(st.range(0));

libcxx/benchmarks/vector_operations.bench.cpp

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -39,4 +39,6 @@ BENCHMARK_CAPTURE(BM_ConstructFromRange, vector_size_t, std::vector<size_t>{}, g
3939
BENCHMARK_CAPTURE(BM_ConstructFromRange, vector_string, std::vector<std::string>{}, getRandomStringInputs)
4040
->Arg(TestNumInputs);
4141

42+
BENCHMARK_CAPTURE(BM_Pushback, vector_int, std::vector<int>{})->Arg(TestNumInputs);
43+
4244
BENCHMARK_MAIN();

libcxx/include/vector

Lines changed: 25 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -833,11 +833,11 @@ private:
833833

834834
template <class _Up>
835835
_LIBCPP_CONSTEXPR_SINCE_CXX20 _LIBCPP_HIDE_FROM_ABI
836-
inline void __push_back_slow_path(_Up&& __x);
836+
inline pointer __push_back_slow_path(_Up&& __x);
837837

838838
template <class... _Args>
839839
_LIBCPP_CONSTEXPR_SINCE_CXX20 _LIBCPP_HIDE_FROM_ABI
840-
inline void __emplace_back_slow_path(_Args&&... __args);
840+
inline pointer __emplace_back_slow_path(_Args&&... __args);
841841

842842
// The following functions are no-ops outside of AddressSanitizer mode.
843843
// We call annotations for every allocator, unless explicitly disabled.
@@ -1609,7 +1609,7 @@ vector<_Tp, _Allocator>::shrink_to_fit() _NOEXCEPT
16091609
template <class _Tp, class _Allocator>
16101610
template <class _Up>
16111611
_LIBCPP_CONSTEXPR_SINCE_CXX20
1612-
void
1612+
typename vector<_Tp, _Allocator>::pointer
16131613
vector<_Tp, _Allocator>::__push_back_slow_path(_Up&& __x)
16141614
{
16151615
allocator_type& __a = this->__alloc();
@@ -1618,6 +1618,7 @@ vector<_Tp, _Allocator>::__push_back_slow_path(_Up&& __x)
16181618
__alloc_traits::construct(__a, std::__to_address(__v.__end_), std::forward<_Up>(__x));
16191619
__v.__end_++;
16201620
__swap_out_circular_buffer(__v);
1621+
return this->__end_;
16211622
}
16221623

16231624
template <class _Tp, class _Allocator>
@@ -1626,12 +1627,14 @@ inline _LIBCPP_HIDE_FROM_ABI
16261627
void
16271628
vector<_Tp, _Allocator>::push_back(const_reference __x)
16281629
{
1629-
if (this->__end_ != this->__end_cap())
1630-
{
1630+
pointer __end = this->__end_;
1631+
if (__end < this->__end_cap()) {
16311632
__construct_one_at_end(__x);
1633+
++__end;
1634+
} else {
1635+
__end = __push_back_slow_path(__x);
16321636
}
1633-
else
1634-
__push_back_slow_path(__x);
1637+
this->__end_ = __end;
16351638
}
16361639

16371640
template <class _Tp, class _Allocator>
@@ -1640,18 +1643,20 @@ inline _LIBCPP_HIDE_FROM_ABI
16401643
void
16411644
vector<_Tp, _Allocator>::push_back(value_type&& __x)
16421645
{
1643-
if (this->__end_ < this->__end_cap())
1644-
{
1646+
pointer __end = this->__end_;
1647+
if (__end < this->__end_cap()) {
16451648
__construct_one_at_end(std::move(__x));
1649+
++__end;
1650+
} else {
1651+
__end = __push_back_slow_path(std::move(__x));
16461652
}
1647-
else
1648-
__push_back_slow_path(std::move(__x));
1653+
this->__end_ = __end;
16491654
}
16501655

16511656
template <class _Tp, class _Allocator>
16521657
template <class... _Args>
16531658
_LIBCPP_CONSTEXPR_SINCE_CXX20
1654-
void
1659+
typename vector<_Tp, _Allocator>::pointer
16551660
vector<_Tp, _Allocator>::__emplace_back_slow_path(_Args&&... __args)
16561661
{
16571662
allocator_type& __a = this->__alloc();
@@ -1660,6 +1665,7 @@ vector<_Tp, _Allocator>::__emplace_back_slow_path(_Args&&... __args)
16601665
__alloc_traits::construct(__a, std::__to_address(__v.__end_), std::forward<_Args>(__args)...);
16611666
__v.__end_++;
16621667
__swap_out_circular_buffer(__v);
1668+
return this->__end_;
16631669
}
16641670

16651671
template <class _Tp, class _Allocator>
@@ -1673,14 +1679,16 @@ void
16731679
#endif
16741680
vector<_Tp, _Allocator>::emplace_back(_Args&&... __args)
16751681
{
1676-
if (this->__end_ < this->__end_cap())
1677-
{
1682+
pointer __end = this->__end_;
1683+
if (__end < this->__end_cap()) {
16781684
__construct_one_at_end(std::forward<_Args>(__args)...);
1685+
++__end;
1686+
} else {
1687+
__end = __emplace_back_slow_path(std::forward<_Args>(__args)...);
16791688
}
1680-
else
1681-
__emplace_back_slow_path(std::forward<_Args>(__args)...);
1689+
this->__end_ = __end;
16821690
#if _LIBCPP_STD_VER >= 17
1683-
return this->back();
1691+
return *(__end - 1);
16841692
#endif
16851693
}
16861694

0 commit comments

Comments
 (0)