Skip to content

Commit 8ac43e2

Browse files
committed
Implement rough symbol interning infra
1 parent 7fa20cb commit 8ac43e2

File tree

6 files changed

+541
-1
lines changed

6 files changed

+541
-1
lines changed

.typos.toml

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,8 @@ extend-ignore-re = [
1414
"\\w*\\.{3,4}\\w*",
1515
'"flate2"',
1616
"raison d'être",
17+
"inout",
18+
"optin"
1719
]
1820

1921
[default.extend-words]

Cargo.lock

Lines changed: 7 additions & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

crates/intern/Cargo.toml

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,7 @@ dashmap.workspace = true
1818
hashbrown.workspace = true
1919
rustc-hash.workspace = true
2020
triomphe.workspace = true
21+
sptr = "0.3.2"
2122

2223
[lints]
23-
workspace = true
24+
workspace = true

crates/intern/src/lib.rs

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,9 @@ type Guard<T> = dashmap::RwLockWriteGuard<
2020
HashMap<Arc<T>, SharedValue<()>, BuildHasherDefault<FxHasher>>,
2121
>;
2222

23+
mod symbol;
24+
pub use self::symbol::{symbols, Symbol};
25+
2326
pub struct Interned<T: Internable + ?Sized> {
2427
arc: Arc<T>,
2528
}

crates/intern/src/symbol.rs

Lines changed: 291 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,291 @@
1+
//! Attempt at flexible symbol interning, allowing to intern and free strings at runtime while also
2+
//! supporting symbols that are declared at compile time and never freed.
3+
4+
use std::{
5+
borrow::Borrow,
6+
fmt,
7+
hash::{BuildHasherDefault, Hash, Hasher},
8+
mem,
9+
ptr::NonNull,
10+
sync::OnceLock,
11+
};
12+
13+
use dashmap::{DashMap, SharedValue};
14+
use hashbrown::{hash_map::RawEntryMut, HashMap};
15+
use rustc_hash::FxHasher;
16+
use sptr::Strict;
17+
use triomphe::Arc;
18+
19+
pub mod symbols;
20+
21+
const _: () = assert!(std::mem::size_of::<Box<str>>() == std::mem::size_of::<&str>());
22+
const _: () = assert!(std::mem::align_of::<Box<str>>() == std::mem::align_of::<&str>());
23+
24+
const _: () = assert!(std::mem::size_of::<Arc<Box<str>>>() == std::mem::size_of::<&&str>());
25+
const _: () = assert!(std::mem::align_of::<Arc<Box<str>>>() == std::mem::align_of::<&&str>());
26+
27+
/// Pointer-sized handle to a string, with a one-bit tag kept in the lowest address bit.
///
/// - Tag clear: `packed` is the address of a plain `&str` (built by [`TaggedArcPtr::non_arc`]).
/// - Tag set: clearing the bit yields the data pointer of an `Arc<Box<str>>` that was released
///   with `Arc::into_raw` (built by [`TaggedArcPtr::arc`]).
///
/// The type is `Copy`, so duplicating it never touches a reference count; whoever holds a copy is
/// responsible for keeping the count in step.
#[derive(Clone, Copy, Debug, PartialEq, Eq, Hash)]
struct TaggedArcPtr {
    packed: NonNull<*const str>,
}

// SAFETY: NOTE(review) -- the raw pointer suppresses the auto traits. Presumably this is sound
// because the pointee is only ever read as an immutable `str` and any shared ownership goes
// through `Arc`; confirm that every `non_arc` pointee is valid from any thread.
unsafe impl Send for TaggedArcPtr {}
unsafe impl Sync for TaggedArcPtr {}
34+
35+
impl TaggedArcPtr {
36+
const BOOL_BITS: usize = true as usize;
37+
38+
const fn non_arc(r: &&str) -> Self {
39+
Self {
40+
// SAFETY: The pointer is non-null as it is derived from a reference
41+
// Additionally, the tag is empty as we aren't encoding an arc, so we do not need to
42+
// handle tagging here. (we couldn't do it if we wanted to as we are in a const context)
43+
packed: unsafe {
44+
NonNull::new_unchecked((r as *const &str).cast::<*const str>().cast_mut())
45+
},
46+
}
47+
}
48+
49+
fn arc(arc: Arc<Box<str>>) -> Self {
50+
Self {
51+
packed: Self::pack_arc(
52+
// Safety: `Arc::into_raw`` always returns a non null pointer
53+
unsafe { NonNull::new_unchecked(Arc::into_raw(arc).cast_mut().cast()) },
54+
),
55+
}
56+
}
57+
58+
/// Retrieves the tag.
59+
#[inline]
60+
pub(crate) fn try_as_arc_owned(self) -> Option<Arc<Box<str>>> {
61+
// Unpack the tag, according to the `self.packed` encoding scheme
62+
let tag = Strict::addr(self.packed.as_ptr()) & Self::BOOL_BITS;
63+
if tag != 0 {
64+
// Safety: We checked that the tag is true, so we are pointing to the data offset of an `Arc`
65+
Some(unsafe { Arc::from_raw(self.pointer().as_ptr().cast::<Box<str>>()) })
66+
} else {
67+
None
68+
}
69+
}
70+
71+
#[inline]
72+
const fn pack_arc(ptr: NonNull<*const str>) -> NonNull<*const str> {
73+
let packed_tag = true as usize;
74+
75+
// can't use this due to trait methods not being const
76+
// unsafe {
77+
// // Safety: The pointer is derived from a non-null
78+
// NonNull::new_unchecked(Strict::map_addr(ptr.as_ptr(), |addr| {
79+
// // Safety:
80+
// // - The pointer is `NonNull` => it's address is `NonZero<usize>`
81+
// // - `P::BITS` least significant bits are always zero (`Pointer` contract)
82+
// // - `T::BITS <= P::BITS` (from `Self::ASSERTION`)
83+
// //
84+
// // Thus `addr >> T::BITS` is guaranteed to be non-zero.
85+
// //
86+
// // `{non_zero} | packed_tag` can't make the value zero.
87+
88+
// (addr >> Self::BOOL_BITS) | packed_tag
89+
// }))
90+
// }
91+
92+
let self_addr = unsafe { core::mem::transmute::<*const _, usize>(ptr.as_ptr()) };
93+
let addr = self_addr | packed_tag;
94+
95+
// FIXME(strict_provenance_magic): I am magic and should be a compiler intrinsic.
96+
//
97+
// In the mean-time, this operation is defined to be "as if" it was
98+
// a wrapping_offset, so we can emulate it as such. This should properly
99+
// restore pointer provenance even under today's compiler.
100+
let dest_addr = addr as isize;
101+
let offset = dest_addr.wrapping_sub(self_addr as isize);
102+
103+
unsafe { NonNull::new_unchecked(ptr.as_ptr().cast::<u8>().wrapping_offset(offset).cast()) }
104+
}
105+
106+
#[inline]
107+
pub(crate) fn pointer(self) -> NonNull<*const str> {
108+
// SAFETY: The resulting pointer is guaranteed to be NonNull as we only modify the niche bytes
109+
unsafe {
110+
NonNull::new_unchecked(Strict::map_addr(self.packed.as_ptr(), |addr| {
111+
addr & !Self::BOOL_BITS
112+
}))
113+
}
114+
}
115+
116+
#[inline]
117+
pub(crate) fn as_str(&self) -> &str {
118+
// SAFETY: We always point to a pointer to a str
119+
unsafe { *self.pointer().as_ptr().cast::<&str>() }
120+
}
121+
}
122+
123+
#[derive(PartialEq, Eq, Hash, Clone, Debug)]
124+
pub struct Symbol {
125+
repr: TaggedArcPtr,
126+
}
127+
const _: () = assert!(std::mem::size_of::<Symbol>() == std::mem::size_of::<NonNull<()>>());
128+
const _: () = assert!(std::mem::align_of::<Symbol>() == std::mem::align_of::<NonNull<()>>());
129+
130+
static MAP: OnceLock<DashMap<SymbolProxy, (), BuildHasherDefault<FxHasher>>> = OnceLock::new();
131+
132+
impl Symbol {
133+
pub fn intern(s: &str) -> Self {
134+
let (mut shard, hash) = Self::select_shard(s);
135+
// Atomically,
136+
// - check if `obj` is already in the map
137+
// - if so, clone its `Arc` and return it
138+
// - if not, box it up, insert it, and return a clone
139+
// This needs to be atomic (locking the shard) to avoid races with other thread, which could
140+
// insert the same object between us looking it up and inserting it.
141+
match shard.raw_entry_mut().from_key_hashed_nocheck(hash, s) {
142+
RawEntryMut::Occupied(occ) => Self { repr: increase_arc_refcount(occ.key().0) },
143+
RawEntryMut::Vacant(vac) => Self {
144+
repr: increase_arc_refcount(
145+
vac.insert_hashed_nocheck(
146+
hash,
147+
SymbolProxy(TaggedArcPtr::arc(Arc::new(Box::<str>::from(s)))),
148+
SharedValue::new(()),
149+
)
150+
.0
151+
.0,
152+
),
153+
},
154+
}
155+
}
156+
157+
pub fn as_str(&self) -> &str {
158+
self.repr.as_str()
159+
}
160+
161+
#[inline]
162+
fn select_shard(
163+
s: &str,
164+
) -> (
165+
dashmap::RwLockWriteGuard<
166+
'static,
167+
HashMap<SymbolProxy, SharedValue<()>, BuildHasherDefault<FxHasher>>,
168+
>,
169+
u64,
170+
) {
171+
let storage = MAP.get_or_init(symbols::prefill);
172+
let hash = {
173+
let mut hasher = std::hash::BuildHasher::build_hasher(storage.hasher());
174+
s.hash(&mut hasher);
175+
hasher.finish()
176+
};
177+
let shard_idx = storage.determine_shard(hash as usize);
178+
let shard = &storage.shards()[shard_idx];
179+
(shard.write(), hash)
180+
}
181+
182+
#[cold]
183+
fn drop_slow(arc: &Arc<Box<str>>) {
184+
let (mut shard, hash) = Self::select_shard(arc);
185+
186+
if Arc::count(arc) != 2 {
187+
// Another thread has interned another copy
188+
return;
189+
}
190+
191+
match shard.raw_entry_mut().from_key_hashed_nocheck::<str>(hash, arc.as_ref()) {
192+
RawEntryMut::Occupied(occ) => occ.remove_entry(),
193+
RawEntryMut::Vacant(_) => unreachable!(),
194+
}
195+
.0
196+
.0
197+
.try_as_arc_owned()
198+
.unwrap();
199+
200+
// Shrink the backing storage if the shard is less than 50% occupied.
201+
if shard.len() * 2 < shard.capacity() {
202+
shard.shrink_to_fit();
203+
}
204+
}
205+
}
206+
207+
impl Drop for Symbol {
208+
#[inline]
209+
fn drop(&mut self) {
210+
let Some(arc) = self.repr.try_as_arc_owned() else {
211+
return;
212+
};
213+
// When the last `Ref` is dropped, remove the object from the global map.
214+
if Arc::count(&arc) == 2 {
215+
// Only `self` and the global map point to the object.
216+
217+
Self::drop_slow(&arc);
218+
}
219+
// decrement the ref count
220+
drop(arc);
221+
}
222+
}
223+
224+
fn increase_arc_refcount(repr: TaggedArcPtr) -> TaggedArcPtr {
225+
let Some(arc) = repr.try_as_arc_owned() else {
226+
return repr;
227+
};
228+
// increase the ref count
229+
mem::forget(arc.clone());
230+
mem::forget(arc);
231+
repr
232+
}
233+
234+
impl fmt::Display for Symbol {
235+
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
236+
self.as_str().fmt(f)
237+
}
238+
}
239+
240+
// only exists so we can use `from_key_hashed_nocheck` with a &str
241+
#[derive(Debug, PartialEq, Eq)]
242+
struct SymbolProxy(TaggedArcPtr);
243+
244+
impl Hash for SymbolProxy {
245+
fn hash<H: Hasher>(&self, state: &mut H) {
246+
self.0.as_str().hash(state);
247+
}
248+
}
249+
250+
impl Borrow<str> for SymbolProxy {
251+
fn borrow(&self) -> &str {
252+
self.0.as_str()
253+
}
254+
}
255+
256+
#[cfg(test)]
mod tests {
    use super::*;

    /// Number of entries currently in the global symbol table.
    fn table_len() -> usize {
        MAP.get().unwrap().len()
    }

    /// Interns and drops a handful of strings and checks the table size after every step.
    #[test]
    fn smoke_test() {
        // Interning anything initialises the table, so `MAP.get()` is `Some` from here on.
        Symbol::intern("isize");
        let base_len = table_len();

        let hello = Symbol::intern("hello");
        let world = Symbol::intern("world");
        let bang = Symbol::intern("!");
        let q = Symbol::intern("?");
        assert_eq!(table_len(), base_len + 4);

        // Interning an existing string adds no entry, and dropping the duplicate removes none.
        let bang2 = Symbol::intern("!");
        assert_eq!(table_len(), base_len + 4);
        drop(bang2);
        assert_eq!(table_len(), base_len + 4);

        // Dropping the only handle to "?" evicts it.
        drop(q);
        assert_eq!(table_len(), base_len + 3);

        // "default" leaves the size unchanged, so it is presumably already in the prefilled table.
        let default = Symbol::intern("default");
        assert_eq!(table_len(), base_len + 3);
        assert_eq!(
            "hello default world!",
            format!("{} {} {}{}", hello.as_str(), default.as_str(), world.as_str(), bang.as_str())
        );
        drop(default);
        assert_eq!(
            "hello world!",
            format!("{} {}{}", hello.as_str(), world.as_str(), bang.as_str())
        );

        // Once every runtime-interned symbol is gone the table is back at its baseline.
        drop(hello);
        drop(world);
        drop(bang);
        assert_eq!(table_len(), base_len);
    }
}

0 commit comments

Comments
 (0)