
Commit d447427

Auto merge of #124780 - Mark-Simulacrum:lockless-cache, r=<try>

Improve VecCache under parallel frontend

This replaces the single Vec allocation with a series of progressively larger buckets. With the cfg for the parallel frontend enabled but -Zthreads=1, this looks like a slight regression in instruction and cycle counts (~1%). With the parallel frontend at -Zthreads=4, this is an improvement over our current Lock-based approach (-5% wall-time on libcore, 5.788 s -> 5.4688 s), likely because it reduces bouncing of the cache line holding the lock. At -Zthreads=32 it is a huge improvement (-46%: 8.829 s -> 4.7319 s).

2 parents: 503dfcf + a28ef6f
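
For orientation, here is a small standalone sketch of the index-to-bucket mapping that the new SlotIndex::from_index (in the diff below) computes: indices 0..4096 share bucket 0, and each later bucket b (1..=20) covers the index range [2^(b+11), 2^(b+12)). The helper name bucket_and_offset is made up for this sketch and is not part of the patch.

// Sketch of the bucket layout used by the patched VecCache: indices 0..4096
// share bucket 0; every later bucket b holds 2^(b+11) entries.
fn bucket_and_offset(idx: u32) -> (usize, usize) {
    let log = idx.checked_ilog2().unwrap_or(0) as usize;
    if log <= 11 {
        (0, idx as usize)                     // first 4096 indices live in bucket 0
    } else {
        (log - 11, idx as usize - (1 << log)) // bucket log-11, offset past 2^log
    }
}

fn main() {
    assert_eq!(bucket_and_offset(0), (0, 0));
    assert_eq!(bucket_and_offset(4095), (0, 4095));
    assert_eq!(bucket_and_offset(4096), (1, 0)); // bucket 1 also holds 4096 entries
    assert_eq!(bucket_and_offset(8192), (2, 0)); // bucket 2 holds 8192 entries
    assert_eq!(bucket_and_offset(u32::MAX), (20, (u32::MAX - (1 << 31)) as usize));
}

Because buckets are fixed-size and never reallocated or moved, a reader only needs an atomic pointer load plus an offset to reach its slot, which is what makes the lock-free lookups in the diff possible.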

File tree

3 files changed: +277 -33 lines

compiler/rustc_query_system/src/lib.rs (+1)

@@ -3,6 +3,7 @@
 #![feature(hash_raw_entry)]
 #![feature(min_specialization)]
 #![feature(let_chains)]
+#![feature(dropck_eyepatch)]
 #![allow(rustc::potential_query_instability, internal_features)]
 
 pub mod cache;

compiler/rustc_query_system/src/query/caches.rs (+233 -33)

@@ -2,13 +2,16 @@ use crate::dep_graph::DepNodeIndex;
 
 use rustc_data_structures::fx::FxHashMap;
 use rustc_data_structures::sharded::{self, Sharded};
-use rustc_data_structures::sync::{Lock, OnceLock};
+use rustc_data_structures::sync::{AtomicUsize, OnceLock};
 use rustc_hir::def_id::LOCAL_CRATE;
-use rustc_index::{Idx, IndexVec};
+use rustc_index::Idx;
 use rustc_span::def_id::DefId;
 use rustc_span::def_id::DefIndex;
 use std::fmt::Debug;
 use std::hash::Hash;
+use std::marker::PhantomData;
+use std::mem::offset_of;
+use std::sync::atomic::{AtomicPtr, AtomicU32, Ordering};
 
 pub trait QueryCache: Sized {
     type Key: Hash + Eq + Copy + Debug;
@@ -100,13 +103,201 @@ where
     }
 }
 
-pub struct VecCache<K: Idx, V> {
-    cache: Lock<IndexVec<K, Option<(V, DepNodeIndex)>>>,
+struct Slot<V> {
+    // We never construct &Slot<V> so it's fine for this to not be in an UnsafeCell.
+    value: V,
+    // This is both an index and a once-lock.
+    //
+    // 0: not yet initialized.
+    // 1: lock held, initializing.
+    // 2..u32::MAX - 2: initialized.
+    index_and_lock: AtomicU32,
 }
 
 impl<K: Idx, V> Default for VecCache<K, V> {
     fn default() -> Self {
-        VecCache { cache: Default::default() }
+        VecCache {
+            buckets: Default::default(),
+            key: PhantomData,
+            len: Default::default(),
+            present: Default::default(),
+        }
+    }
+}
+
+#[derive(Copy, Clone, Debug)]
+struct SlotIndex {
+    bucket_idx: usize,
+    entries: usize,
+    index_in_bucket: usize,
+}
+
+impl SlotIndex {
+    #[inline]
+    fn from_index(idx: u32) -> Self {
+        let mut bucket = idx.checked_ilog2().unwrap_or(0) as usize;
+        let entries;
+        let running_sum;
+        if bucket <= 11 {
+            entries = 1 << 12;
+            running_sum = 0;
+            bucket = 0;
+        } else {
+            entries = 1 << bucket;
+            running_sum = entries;
+            bucket = bucket - 11;
+        }
+        SlotIndex { bucket_idx: bucket, entries, index_in_bucket: idx as usize - running_sum }
+    }
+
+    #[inline]
+    unsafe fn get<V: Copy>(&self, buckets: &[AtomicPtr<Slot<V>>; 21]) -> Option<(V, u32)> {
+        // SAFETY: `bucket_idx` is ilog2(u32).saturating_sub(11), which is at most 21, i.e.,
+        // in-bounds of buckets. See `from_index` for computation.
+        let bucket = unsafe { buckets.get_unchecked(self.bucket_idx) };
+        let ptr = bucket.load(Ordering::Acquire);
+        // Bucket is not yet initialized: then we obviously won't find this entry in that bucket.
+        if ptr.is_null() {
+            return None;
+        }
+        // SAFETY: Follows from preconditions on `buckets` and `self`.
+        let slot = unsafe { ptr.add(self.index_in_bucket) };
+
+        // SAFETY:
+        let index_and_lock =
+            unsafe { &*slot.byte_add(offset_of!(Slot<V>, index_and_lock)).cast::<AtomicU32>() };
+        let current = index_and_lock.load(Ordering::Acquire);
+        let index = match current {
+            0 => return None,
+            // Treat "initializing" as actually just not initialized at all.
+            // The only reason this is a separate state is that `complete` calls could race and
+            // we can't allow that, but from load perspective there's no difference.
+            1 => return None,
+            _ => current - 2,
+        };
+
+        // SAFETY:
+        // * slot is a valid pointer (buckets are always valid for the index we get).
+        // * value is initialized since we saw a >= 2 index above.
+        // * `V: Copy`, so safe to read.
+        let value = unsafe { slot.byte_add(offset_of!(Slot<V>, value)).cast::<V>().read() };
+        Some((value, index))
+    }
+
+    /// Returns true if this successfully put into the map.
+    #[inline]
+    fn put<V>(&self, buckets: &[AtomicPtr<Slot<V>>; 21], value: V, extra: u32) -> bool {
+        static LOCK: std::sync::Mutex<()> = std::sync::Mutex::new(());
+
+        // SAFETY: `bucket_idx` is ilog2(u32).saturating_sub(11), which is at most 21, i.e.,
+        // in-bounds of buckets.
+        let bucket = unsafe { buckets.get_unchecked(self.bucket_idx) };
+        let mut ptr = bucket.load(Ordering::Acquire);
+        let _allocator_guard;
+        if ptr.is_null() {
+            // If we load null, then acquire the global lock; this path is quite cold, so it's cheap
+            // to use a global lock.
+            _allocator_guard = LOCK.lock();
+            // And re-load the value.
+            ptr = bucket.load(Ordering::Acquire);
+        }
+
+        // OK, now under the allocator lock, if we're still null then it's definitely us that will
+        // initialize this bucket.
+        if ptr.is_null() {
+            let bucket_layout =
+                std::alloc::Layout::array::<Slot<V>>(self.entries as usize).unwrap();
+            // SAFETY: Always >0 entries in each bucket.
+            let allocated = unsafe { std::alloc::alloc_zeroed(bucket_layout).cast::<Slot<V>>() };
+            if allocated.is_null() {
+                std::alloc::handle_alloc_error(bucket_layout);
+            }
+            bucket.store(allocated, Ordering::Release);
+            ptr = allocated;
+        }
+        assert!(!ptr.is_null());
+
+        // SAFETY: `index_in_bucket` is always in-bounds of the allocated array.
+        let slot = unsafe { ptr.add(self.index_in_bucket) };
+
+        let index_and_lock =
+            unsafe { &*slot.byte_add(offset_of!(Slot<V>, index_and_lock)).cast::<AtomicU32>() };
+        match index_and_lock.compare_exchange(0, 1, Ordering::Relaxed, Ordering::Relaxed) {
+            Ok(_) => {
+                // We have acquired the initialization lock. It is our job to write `value` and
+                // then set the lock to the real index.
+
+                unsafe {
+                    slot.byte_add(offset_of!(Slot<V>, value)).cast::<V>().write(value);
+                }
+
+                index_and_lock.store(extra.checked_add(2).unwrap(), Ordering::Release);
+
+                true
+            }
+
+            // Treat "initializing" as actually initialized: we lost the race and should skip
+            // any updates to this slot. In practice this should be unreachable since we're guarded
+            // by an external lock that only allows one initialization for any given query result.
+            Err(1) => unreachable!(),
+
+            // This slot was already populated. Also ignore, currently this is the same as
+            // "initializing".
+            Err(_) => false,
+        }
+    }
+}
+
+pub struct VecCache<K: Idx, V> {
+    // Entries per bucket:
+    // Bucket 0: 4096 2^12
+    // Bucket 1: 4096 2^12
+    // Bucket 2: 8192
+    // Bucket 3: 16384
+    // ...
+    // Bucket 19: 1073741824
+    // Bucket 20: 2147483648
+    // The total number of entries if all buckets are initialized is u32::MAX-1.
+    buckets: [AtomicPtr<Slot<V>>; 21],
+
+    // Present and len are only used during incremental and self-profiling.
+    // They are an optimization over iterating the full buckets array.
+    present: [AtomicPtr<Slot<()>>; 21],
+    len: AtomicUsize,
+
+    key: PhantomData<K>,
+}
+
+// SAFETY: No access to `V` is made.
+unsafe impl<K: Idx, #[may_dangle] V> Drop for VecCache<K, V> {
+    fn drop(&mut self) {
+        // We have unique ownership, so no locks etc. are needed. Since `K` and `V` are both `Copy`,
+        // we are also guaranteed to just need to deallocate any large arrays (not iterate over
+        // contents).
+
+        let mut entries = 1 << 12;
+        for bucket in self.buckets.iter() {
+            let bucket = bucket.load(Ordering::Acquire);
+            if !bucket.is_null() {
+                let layout = std::alloc::Layout::array::<Slot<V>>(entries).unwrap();
+                unsafe {
+                    std::alloc::dealloc(bucket.cast(), layout);
+                }
+            }
+            entries *= 2;
+        }
+
+        let mut entries = 1 << 12;
+        for bucket in self.present.iter() {
+            let bucket = bucket.load(Ordering::Acquire);
+            if !bucket.is_null() {
+                let layout = std::alloc::Layout::array::<Slot<V>>(entries).unwrap();
+                unsafe {
+                    std::alloc::dealloc(bucket.cast(), layout);
+                }
+            }
+            entries *= 2;
+        }
     }
 }
 
@@ -120,20 +311,41 @@ where
 
     #[inline(always)]
     fn lookup(&self, key: &K) -> Option<(V, DepNodeIndex)> {
-        let lock = self.cache.lock();
-        if let Some(Some(value)) = lock.get(*key) { Some(*value) } else { None }
+        let key = u32::try_from(key.index()).unwrap();
+        let slot_idx = SlotIndex::from_index(key);
+        match unsafe { slot_idx.get(&self.buckets) } {
+            Some((value, idx)) => Some((value, DepNodeIndex::from_u32(idx))),
+            None => None,
+        }
     }
 
     #[inline]
     fn complete(&self, key: K, value: V, index: DepNodeIndex) {
-        let mut lock = self.cache.lock();
-        lock.insert(key, (value, index));
+        let key = u32::try_from(key.index()).unwrap();
+        let slot_idx = SlotIndex::from_index(key);
+        if slot_idx.put(&self.buckets, value, index.index() as u32) {
+            let present_idx = self.len.fetch_add(1, Ordering::Relaxed);
+            let slot = SlotIndex::from_index(present_idx as u32);
+            // We should always be uniquely putting due to `len` fetch_add returning unique values.
+            assert!(slot.put(&self.present, (), key));
+        }
     }
 
     fn iter(&self, f: &mut dyn FnMut(&Self::Key, &Self::Value, DepNodeIndex)) {
-        for (k, v) in self.cache.lock().iter_enumerated() {
-            if let Some(v) = v {
-                f(&k, &v.0, v.1);
+        for idx in 0..self.len.load(Ordering::Relaxed) {
+            let key = SlotIndex::from_index(idx as u32);
+            match unsafe { key.get(&self.present) } {
+                // This shouldn't happen in our current usage (iter is really only
+                // used long after queries are done running), but if we hit this in practice it's
+                // probably fine to just break early.
+                None => unreachable!(),
+                Some(((), key)) => {
+                    let key = K::new(key as usize);
+                    // unwrap() is OK: present entries are always written only after we put the real
+                    // entry.
+                    let value = self.lookup(&key).unwrap();
+                    f(&key, &value.0, value.1);
+                }
             }
         }
     }
@@ -142,10 +354,7 @@ where
 pub struct DefIdCache<V> {
     /// Stores the local DefIds in a dense map. Local queries are much more often dense, so this is
     /// a win over hashing query keys at marginal memory cost (~5% at most) compared to FxHashMap.
-    ///
-    /// The second element of the tuple is the set of keys actually present in the IndexVec, used
-    /// for faster iteration in `iter()`.
-    local: Lock<(IndexVec<DefIndex, Option<(V, DepNodeIndex)>>, Vec<DefIndex>)>,
+    local: VecCache<DefIndex, V>,
     foreign: DefaultCache<DefId, V>,
 }
 
@@ -165,8 +374,7 @@ where
     #[inline(always)]
     fn lookup(&self, key: &DefId) -> Option<(V, DepNodeIndex)> {
         if key.krate == LOCAL_CRATE {
-            let cache = self.local.lock();
-            cache.0.get(key.index).and_then(|v| *v)
+            self.local.lookup(&key.index)
         } else {
            self.foreign.lookup(key)
        }
@@ -175,27 +383,19 @@ where
     #[inline]
     fn complete(&self, key: DefId, value: V, index: DepNodeIndex) {
         if key.krate == LOCAL_CRATE {
-            let mut cache = self.local.lock();
-            let (cache, present) = &mut *cache;
-            let slot = cache.ensure_contains_elem(key.index, Default::default);
-            if slot.is_none() {
-                // FIXME: Only store the present set when running in incremental mode. `iter` is not
-                // used outside of saving caches to disk and self-profile.
-                present.push(key.index);
-            }
-            *slot = Some((value, index));
+            self.local.complete(key.index, value, index)
         } else {
             self.foreign.complete(key, value, index)
         }
     }
 
     fn iter(&self, f: &mut dyn FnMut(&Self::Key, &Self::Value, DepNodeIndex)) {
-        let guard = self.local.lock();
-        let (cache, present) = &*guard;
-        for &idx in present.iter() {
-            let value = cache[idx].unwrap();
-            f(&DefId { krate: LOCAL_CRATE, index: idx }, &value.0, value.1);
-        }
+        self.local.iter(&mut |key, value, index| {
+            f(&DefId { krate: LOCAL_CRATE, index: *key }, value, index);
+        });
         self.foreign.iter(f);
     }
 }
+
+#[cfg(test)]
+mod tests;
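
Stepping back from the raw diff: each Slot's index_and_lock field implements a tiny once-lock, as the comments above describe (0 = empty, 1 = write in progress, n + 2 = initialized with index n). The following self-contained model illustrates only that state machine; the SlotModel type and its methods are illustrative inventions, not code from the patch, which instead operates on raw Slot<V> pointers inside the buckets.

use std::cell::UnsafeCell;
use std::sync::atomic::{AtomicU32, Ordering};

// Simplified model of one cache slot. `state` doubles as a once-lock and as the
// stored index (shifted by 2), following the scheme described in the diff:
//   0     -> not yet initialized
//   1     -> a writer currently holds the initialization lock
//   n + 2 -> initialized; the stored index is n
struct SlotModel<V> {
    value: UnsafeCell<Option<V>>,
    state: AtomicU32,
}

// Safe to share for this model: `value` is written exactly once, before `state`
// is released to `n + 2`, and only read after that state has been observed.
unsafe impl<V: Copy + Send + Sync> Sync for SlotModel<V> {}

impl<V: Copy> SlotModel<V> {
    const fn new() -> Self {
        SlotModel { value: UnsafeCell::new(None), state: AtomicU32::new(0) }
    }

    /// Returns true if this call performed the (single) initialization.
    fn put(&self, value: V, index: u32) -> bool {
        match self.state.compare_exchange(0, 1, Ordering::Relaxed, Ordering::Relaxed) {
            Ok(_) => {
                // We won the race: write the value, then publish `index + 2` with
                // Release so readers that see it also see the value.
                unsafe { *self.value.get() = Some(value) };
                self.state.store(index.checked_add(2).unwrap(), Ordering::Release);
                true
            }
            // Already initialized, or another writer is mid-initialization: skip.
            Err(_) => false,
        }
    }

    fn get(&self) -> Option<(V, u32)> {
        match self.state.load(Ordering::Acquire) {
            0 | 1 => None, // empty, or still being written: treat as absent
            n => Some((unsafe { (*self.value.get()).unwrap() }, n - 2)),
        }
    }
}

fn main() {
    let slot: SlotModel<u64> = SlotModel::new();
    assert!(slot.get().is_none());
    assert!(slot.put(42, 7)); // first writer wins
    assert!(!slot.put(99, 8)); // later writers are ignored
    assert_eq!(slot.get(), Some((42, 7)));
}

The Acquire load in get pairs with the Release store in put, so any reader that observes a state of n + 2 also observes the value written before it, which is the same reasoning the patch's SAFETY comments rely on.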
compiler/rustc_query_system/src/query/caches/tests.rs (new file, +43)

@@ -0,0 +1,43 @@
+use super::*;
+
+#[test]
+fn vec_cache_empty() {
+    let cache: VecCache<u32, u32> = VecCache::default();
+    for key in 0..=u32::MAX {
+        assert!(cache.lookup(&key).is_none());
+    }
+}
+
+#[test]
+fn vec_cache_insert_and_check() {
+    let cache: VecCache<u32, u32> = VecCache::default();
+    cache.complete(0, 1, DepNodeIndex::ZERO);
+    assert_eq!(cache.lookup(&0), Some((1, DepNodeIndex::ZERO)));
+}
+
+#[test]
+fn slot_index_exhaustive() {
+    let mut buckets = [0u32; 21];
+    for idx in 0..=u32::MAX {
+        buckets[SlotIndex::from_index(idx).bucket_idx] += 1;
+    }
+    let mut prev = None::<SlotIndex>;
+    for idx in 0..u32::MAX {
+        let slot_idx = SlotIndex::from_index(idx);
+        if let Some(p) = prev {
+            if p.bucket_idx == slot_idx.bucket_idx {
+                assert_eq!(p.index_in_bucket + 1, slot_idx.index_in_bucket);
+            } else {
+                assert_eq!(slot_idx.index_in_bucket, 0);
+            }
+        } else {
+            assert_eq!(idx, 0);
+            assert_eq!(slot_idx.index_in_bucket, 0);
+            assert_eq!(slot_idx.bucket_idx, 0);
+        }
+
+        assert_eq!(buckets[slot_idx.bucket_idx], slot_idx.entries as u32);
+
+        prev = Some(slot_idx);
+    }
+}
