Commit 1961749

Improve VecCache under parallel frontend
This replaces the single Vec allocation with a series of progressively larger buckets. With the cfg for parallel enabled but with -Zthreads=1, this looks like a slight regression in instruction and cycle counts (<0.1%). With the parallel frontend at -Zthreads=4, this is an improvement over our current Lock-based approach (-5% wall-time, from 5.788 to 5.4688 seconds on libcore), likely due to reduced bouncing of the cache line holding the lock. At -Zthreads=32 it is a huge improvement (-46%: 8.829 -> 4.7319 seconds).
1 parent b73478b commit 1961749
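
To make the bucket layout concrete, here is a minimal standalone sketch (not part of the commit) of the index-to-bucket mapping used by the new VecCache below: the first two buckets hold 2^12 entries each and every later bucket doubles, so 21 buckets cover the whole u32 key space. The helper name bucket_and_offset is illustrative only, and the sketch ignores the 32-bit address-space clamping that the real SlotIndex::from_index performs.

// Illustrative only: mirrors the shape of SlotIndex::from_index in the diff below.
fn bucket_and_offset(idx: u32) -> (usize, usize) {
    let log = idx.checked_ilog2().unwrap_or(0) as usize;
    if log <= 11 {
        // Keys 0..4096 land in bucket 0.
        (0, idx as usize)
    } else {
        // Bucket `log - 11` starts at key 2^log and holds 2^log entries.
        (log - 11, idx as usize - (1 << log))
    }
}

fn main() {
    assert_eq!(bucket_and_offset(0), (0, 0));
    assert_eq!(bucket_and_offset(4095), (0, 4095)); // last slot of bucket 0
    assert_eq!(bucket_and_offset(4096), (1, 0)); // second 2^12-entry bucket
    assert_eq!(bucket_and_offset(8192), (2, 0)); // sizes double from here on
    assert_eq!(bucket_and_offset(u32::MAX), (20, (1 << 31) - 1)); // last slot of bucket 20
}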

File tree

6 files changed: +484, -65 lines changed

compiler/rustc_data_structures/src/lib.rs (+2)

@@ -22,6 +22,7 @@
 #![feature(auto_traits)]
 #![feature(cfg_match)]
 #![feature(core_intrinsics)]
+#![feature(dropck_eyepatch)]
 #![feature(extend_one)]
 #![feature(file_buffered)]
 #![feature(hash_raw_entry)]
@@ -79,6 +80,7 @@ pub mod thinvec;
 pub mod transitive_relation;
 pub mod unhash;
 pub mod unord;
+pub mod vec_cache;
 pub mod work_queue;
 
 mod atomic_ref;

compiler/rustc_data_structures/src/vec_cache.rs (new file, +349)

@@ -0,0 +1,349 @@
//! VecCache maintains a mapping from K -> (V, I) pairing. K and I must be roughly u32-sized, and V
//! must be Copy.
//!
//! VecCache supports efficient concurrent put/get across the key space, with write-once semantics
//! (i.e., a given key can only be put once). Subsequent puts will panic.
//!
//! This is currently used for query caching.

use std::fmt::Debug;
use std::marker::PhantomData;
use std::sync::atomic::{AtomicPtr, AtomicU32, AtomicUsize, Ordering};

use rustc_index::Idx;

struct Slot<V> {
    // We never construct &Slot<V> so it's fine for this to not be in an UnsafeCell.
    value: V,
    // This is both an index and a once-lock.
    //
    // 0: not yet initialized.
    // 1: lock held, initializing.
    // 2..u32::MAX - 2: initialized.
    index_and_lock: AtomicU32,
}

/// This uniquely identifies a single `Slot<V>` entry in the buckets map, and provides accessors for
/// either getting the value or putting a value.
#[derive(Copy, Clone, Debug)]
struct SlotIndex<V> {
    // the index of the bucket in VecCache (0 to 20)
    bucket_idx: usize,
    // number of entries in that bucket
    entries: usize,
    // the index of the slot within the bucket
    index_in_bucket: usize,

    bucket_ty: PhantomData<V>,
}

impl<V> SlotIndex<V> {
    // This makes sure the counts are consistent with what we allocate, precomputing each bucket at
    // compile time. Visiting all powers of two is enough to hit all the buckets.
    //
    // We confirm counts are accurate in the slot_index_exhaustive test.
    const ENTRIES_BY_BUCKET: [usize; 21] = {
        let mut entries = [0; 21];
        let mut key = 0;
        loop {
            let si = SlotIndex::<V>::from_index(key);
            entries[si.bucket_idx] = si.entries;
            if key == 0 {
                key = 1;
            } else if key == (1 << 31) {
                break;
            } else {
                key <<= 1;
            }
        }
        entries
    };

    // This unpacks a flat u32 index into identifying which bucket it belongs to and the offset
    // within that bucket. As noted in the VecCache docs, buckets double in size with each index.
    // Typically that would mean 31 buckets (2^0 + 2^1 ... + 2^31 = u32::MAX - 1), but to reduce
    // the size of the VecCache struct and avoid uselessly small allocations, we instead have the
    // first bucket have 2**12 entries. To simplify the math, the second bucket also has 2**12
    // entries, and buckets double from there.
    //
    // We assert that [0, 2**32 - 1] uniquely map through this function to individual, consecutive
    // slots (see `slot_index_exhaustive` in tests). Note that in practice this mapping truncates
    // upper indices on 32-bit systems, since each entry is at least 4 bytes and so we don't have
    // enough address space to store the whole range.
    #[inline]
    const fn from_index(idx: u32) -> Self {
        let mut bucket = match idx.checked_ilog2() {
            Some(x) => x as usize,
            None => 0,
        };
        let entries;
        let running_sum;
        if bucket <= 11 {
            entries = 1 << 12;
            running_sum = 0;
            bucket = 0;
        } else {
            entries = 1 << bucket;
            running_sum = entries;
            bucket = bucket - 11;
        }
        let max = match (isize::MAX as usize).checked_div(std::mem::size_of::<Slot<V>>()) {
            Some(v) => v,
            None => isize::MAX as usize,
        };
        SlotIndex {
            bucket_idx: bucket,
            entries: if running_sum > max {
                // no entries if we already exceeded (in total) allocating the full address space
                // note that technically this means indexing into the end of the range already
                // fails on 32-bit systems, even if technically you could allocate just the last
                // bucket.
                0
            } else if entries > max {
                max
            } else {
                entries
            },
            index_in_bucket: idx as usize - running_sum,
            bucket_ty: PhantomData,
        }
    }

    // SAFETY: Buckets must be managed solely by functions here (i.e., get/put on SlotIndex) and
    // `self` comes from SlotIndex::from_index
    #[inline]
    unsafe fn get(&self, buckets: &[AtomicPtr<Slot<V>>; 21]) -> Option<(V, u32)>
    where
        V: Copy,
    {
        // SAFETY: `bucket_idx` is ilog2(u32).saturating_sub(11), which is at most 20, i.e.,
        // in-bounds of buckets. See `from_index` for computation.
        let bucket = unsafe { buckets.get_unchecked(self.bucket_idx) };
        let ptr = bucket.load(Ordering::Acquire);
        // Bucket is not yet initialized: then we obviously won't find this entry in that bucket.
        if ptr.is_null() {
            return None;
        }
        assert!(self.index_in_bucket < self.entries);
        // SAFETY: `bucket` was allocated to hold `entries`, so this must be inbounds.
        let slot = unsafe { ptr.add(self.index_in_bucket) };

        // SAFETY: initialized bucket has zeroed all memory within the bucket, so we are valid for
        // AtomicU32 access.
        let index_and_lock = unsafe { &(*slot).index_and_lock };
        let current = index_and_lock.load(Ordering::Acquire);
        let index = match current {
            0 => return None,
            // Treat "initializing" as actually just not initialized at all.
            // The only reason this is a separate state is that `complete` calls could race and
            // we can't allow that, but from load perspective there's no difference.
            1 => return None,
            _ => current - 2,
        };

        // SAFETY:
        // * slot is a valid pointer (buckets are always valid for the index we get).
        // * value is initialized since we saw a >= 2 index above.
        // * `V: Copy`, so safe to read.
        let value = unsafe { (*slot).value };
        Some((value, index))
    }

    fn bucket_ptr(&self, bucket: &AtomicPtr<Slot<V>>) -> *mut Slot<V> {
        let ptr = bucket.load(Ordering::Acquire);
        if ptr.is_null() { self.initialize_bucket(bucket) } else { ptr }
    }

    #[cold]
    fn initialize_bucket(&self, bucket: &AtomicPtr<Slot<V>>) -> *mut Slot<V> {
        static LOCK: std::sync::Mutex<()> = std::sync::Mutex::new(());

        // If we are initializing the bucket, then acquire a global lock.
        //
        // This path is quite cold, so it's cheap to use a global lock. This ensures that we never
        // have multiple allocations for the same bucket.
        let _allocator_guard = LOCK.lock().unwrap_or_else(|e| e.into_inner());

        let ptr = bucket.load(Ordering::Acquire);

        // OK, now under the allocator lock, if we're still null then it's definitely us that will
        // initialize this bucket.
        if ptr.is_null() {
            let bucket_layout =
                std::alloc::Layout::array::<Slot<V>>(self.entries as usize).unwrap();
            // SAFETY: Always >0 entries in each bucket.
            let allocated = unsafe { std::alloc::alloc_zeroed(bucket_layout).cast::<Slot<V>>() };
            if allocated.is_null() {
                std::alloc::handle_alloc_error(bucket_layout);
            }
            bucket.store(allocated, Ordering::Release);
            allocated
        } else {
            // Otherwise some other thread initialized this bucket after we took the lock. In that
            // case, just return early.
            ptr
        }
    }

    /// Returns true if this successfully put into the map.
    #[inline]
    fn put(&self, buckets: &[AtomicPtr<Slot<V>>; 21], value: V, extra: u32) -> bool {
        // SAFETY: `bucket_idx` is ilog2(u32).saturating_sub(11), which is at most 20, i.e.,
        // in-bounds of buckets.
        let bucket = unsafe { buckets.get_unchecked(self.bucket_idx) };
        let ptr = self.bucket_ptr(bucket);

        assert!(self.index_in_bucket < self.entries);
        // SAFETY: `bucket` was allocated to hold `entries`, so this must be inbounds.
        let slot = unsafe { ptr.add(self.index_in_bucket) };

        // SAFETY: initialized bucket has zeroed all memory within the bucket, so we are valid for
        // AtomicU32 access.
        let index_and_lock = unsafe { &(*slot).index_and_lock };
        match index_and_lock.compare_exchange(0, 1, Ordering::AcqRel, Ordering::Acquire) {
            Ok(_) => {
                // We have acquired the initialization lock. It is our job to write `value` and
                // then set the lock to the real index.

                unsafe {
                    (&raw mut (*slot).value).write(value);
                }

                index_and_lock.store(extra.checked_add(2).unwrap(), Ordering::Release);

                true
            }

            // Treat "initializing" as the caller's fault. Callers are responsible for ensuring that
            // there are no races on initialization. In the compiler's current usage for query
            // caches, that's the "active query map" which ensures each query actually runs once
            // (even if concurrently started).
            Err(1) => panic!("caller raced calls to put()"),

            // This slot was already populated. Also ignore, currently this is the same as
            // "initializing".
            Err(_) => false,
        }
    }
}

pub struct VecCache<K: Idx, V, I> {
    // Entries per bucket:
    // Bucket 0: 4096 (2^12)
    // Bucket 1: 4096 (2^12)
    // Bucket 2: 8192
    // Bucket 3: 16384
    // ...
    // Bucket 19: 1073741824
    // Bucket 20: 2147483648
    // The total number of entries if all buckets are initialized is 2^32 (one slot per u32 key).
    buckets: [AtomicPtr<Slot<V>>; 21],

    // In the compiler's current usage these are only *read* during incremental and self-profiling.
    // They are an optimization over iterating the full buckets array.
    present: [AtomicPtr<Slot<()>>; 21],
    len: AtomicUsize,

    key: PhantomData<(K, I)>,
}

impl<K: Idx, V, I> Default for VecCache<K, V, I> {
    fn default() -> Self {
        VecCache {
            buckets: Default::default(),
            key: PhantomData,
            len: Default::default(),
            present: Default::default(),
        }
    }
}

// SAFETY: No access to `V` is made.
unsafe impl<K: Idx, #[may_dangle] V, I> Drop for VecCache<K, V, I> {
    fn drop(&mut self) {
        // We have unique ownership, so no locks etc. are needed. Since `K` and `V` are both `Copy`,
        // we are also guaranteed to just need to deallocate any large arrays (not iterate over
        // contents).
        //
        // Confirm no need to deallocate individual entries. Note that `V: Copy` is asserted on
        // insert/lookup but not necessarily construction, primarily to avoid annoyingly propagating
        // the bounds into struct definitions everywhere.
        assert!(!std::mem::needs_drop::<K>());
        assert!(!std::mem::needs_drop::<V>());

        for (idx, bucket) in self.buckets.iter().enumerate() {
            let bucket = bucket.load(Ordering::Acquire);
            if !bucket.is_null() {
                let layout =
                    std::alloc::Layout::array::<Slot<V>>(SlotIndex::<V>::ENTRIES_BY_BUCKET[idx])
                        .unwrap();
                unsafe {
                    std::alloc::dealloc(bucket.cast(), layout);
                }
            }
        }

        for (idx, bucket) in self.present.iter().enumerate() {
            let bucket = bucket.load(Ordering::Acquire);
            if !bucket.is_null() {
                let layout =
                    std::alloc::Layout::array::<Slot<()>>(SlotIndex::<()>::ENTRIES_BY_BUCKET[idx])
                        .unwrap();
                unsafe {
                    std::alloc::dealloc(bucket.cast(), layout);
                }
            }
        }
    }
}

impl<K, V, I> VecCache<K, V, I>
where
    K: Eq + Idx + Copy + Debug,
    V: Copy,
    I: Idx + Copy,
{
    #[inline(always)]
    pub fn lookup(&self, key: &K) -> Option<(V, I)> {
        let key = u32::try_from(key.index()).unwrap();
        let slot_idx = SlotIndex::from_index(key);
        match unsafe { slot_idx.get(&self.buckets) } {
            Some((value, idx)) => Some((value, I::new(idx as usize))),
            None => None,
        }
    }

    #[inline]
    pub fn complete(&self, key: K, value: V, index: I) {
        let key = u32::try_from(key.index()).unwrap();
        let slot_idx = SlotIndex::from_index(key);
        if slot_idx.put(&self.buckets, value, index.index() as u32) {
            let present_idx = self.len.fetch_add(1, Ordering::Relaxed);
            let slot = SlotIndex::from_index(present_idx as u32);
            // We should always be uniquely putting due to `len` fetch_add returning unique values.
            assert!(slot.put(&self.present, (), key));
        }
    }

    pub fn iter(&self, f: &mut dyn FnMut(&K, &V, I)) {
        for idx in 0..self.len.load(Ordering::Acquire) {
            let key = SlotIndex::from_index(idx as u32);
            match unsafe { key.get(&self.present) } {
                // This shouldn't happen in our current usage (iter is really only
                // used long after queries are done running), but if we hit this in practice it's
                // probably fine to just break early.
                None => unreachable!(),
                Some(((), key)) => {
                    let key = K::new(key as usize);
                    // unwrap() is OK: present entries are always written only after we put the real
                    // entry.
                    let value = self.lookup(&key).unwrap();
                    f(&key, &value.0, value.1);
                }
            }
        }
    }
}

#[cfg(test)]
mod tests;
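
For context, here is a minimal usage sketch of the public API defined above: lookup, write-once complete, and iter over completed entries. It is not part of the commit; it is written in the style of a unit test inside rustc_data_structures (so the rustc_index::Idx impls for u32 are available), and the test name is hypothetical.

// Illustrative sketch only; assumes it lives alongside the crate's own tests.
#[test]
fn vec_cache_usage_sketch() {
    use crate::vec_cache::VecCache;

    // K (key), V (value), and I (index) are all u32 here for simplicity.
    let cache: VecCache<u32, u32, u32> = VecCache::default();

    // Nothing is present until `complete` is called for a key.
    assert_eq!(cache.lookup(&0), None);

    // `complete` stores (value, index) for the key; each key may be put only once.
    cache.complete(0, 100, 7);
    assert_eq!(cache.lookup(&0), Some((100, 7)));

    // `iter` visits only the keys that were completed, via the `present` side table.
    let mut seen = Vec::new();
    cache.iter(&mut |key, value, index| seen.push((*key, *value, index)));
    assert_eq!(seen, vec![(0, 100, 7)]);
}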
