Skip to content

Commit 2a663a0

Browse files
committed
Merge branch 'safety-invariants'
2 parents 17a81c7 + c85a981 commit 2a663a0

File tree

5 files changed

+319
-212
lines changed

5 files changed

+319
-212
lines changed

gix-pack/src/cache/delta/mod.rs

+4-174
Original file line numberDiff line numberDiff line change
@@ -17,184 +17,14 @@ pub mod traverse;
1717
///
1818
pub mod from_offsets;
1919

20-
/// An item stored within the [`Tree`] whose data is stored in a pack file, identified by
21-
/// the offset of its first (`offset`) and last (`next_offset`) bytes.
22-
///
23-
/// It represents either a root entry, or one that relies on a base to be resolvable,
24-
/// alongside associated `data` `T`.
25-
pub struct Item<T> {
26-
/// The offset into the pack file at which the pack entry's data is located.
27-
pub offset: crate::data::Offset,
28-
/// The offset of the next item in the pack file.
29-
pub next_offset: crate::data::Offset,
30-
/// Data to store with each Item, effectively data associated with each entry in a pack.
31-
pub data: T,
32-
/// Indices into our Tree's `items`, one for each pack entry that depends on us.
33-
///
34-
/// Limited to u32 as that's the maximum amount of objects in a pack.
35-
children: Vec<u32>,
36-
}
20+
/// Tree datastructure
21+
// kept in separate module to encapsulate unsafety (it has field invariants)
22+
mod tree;
3723

38-
/// Identify what kind of node we have last seen
39-
enum NodeKind {
40-
Root,
41-
Child,
42-
}
43-
44-
/// A tree that allows one-time iteration over all nodes and their children, consuming it in the process,
45-
/// while being shareable among threads without a lock.
46-
/// It does this by making the guarantee that iteration only happens once.
47-
pub struct Tree<T> {
48-
/// The root nodes, i.e. base objects
49-
root_items: Vec<Item<T>>,
50-
/// The child nodes, i.e. those that rely a base object, like ref and ofs delta objects
51-
child_items: Vec<Item<T>>,
52-
/// The last encountered node was either a root or a child.
53-
last_seen: Option<NodeKind>,
54-
/// Future child offsets, associating their offset into the pack with their index in the items array.
55-
/// (parent_offset, child_index)
56-
future_child_offsets: Vec<(crate::data::Offset, usize)>,
57-
}
58-
59-
impl<T> Tree<T> {
60-
/// Instantiate a empty tree capable of storing `num_objects` amounts of items.
61-
pub fn with_capacity(num_objects: usize) -> Result<Self, Error> {
62-
Ok(Tree {
63-
root_items: Vec::with_capacity(num_objects / 2),
64-
child_items: Vec::with_capacity(num_objects / 2),
65-
last_seen: None,
66-
future_child_offsets: Vec::new(),
67-
})
68-
}
69-
70-
fn num_items(&self) -> usize {
71-
self.root_items.len() + self.child_items.len()
72-
}
73-
74-
fn assert_is_incrementing_and_update_next_offset(&mut self, offset: crate::data::Offset) -> Result<(), Error> {
75-
let items = match &self.last_seen {
76-
Some(NodeKind::Root) => &mut self.root_items,
77-
Some(NodeKind::Child) => &mut self.child_items,
78-
None => return Ok(()),
79-
};
80-
let item = &mut items.last_mut().expect("last seen won't lie");
81-
if offset <= item.offset {
82-
return Err(Error::InvariantIncreasingPackOffset {
83-
last_pack_offset: item.offset,
84-
pack_offset: offset,
85-
});
86-
}
87-
item.next_offset = offset;
88-
Ok(())
89-
}
90-
91-
fn set_pack_entries_end_and_resolve_ref_offsets(
92-
&mut self,
93-
pack_entries_end: crate::data::Offset,
94-
) -> Result<(), traverse::Error> {
95-
if !self.future_child_offsets.is_empty() {
96-
for (parent_offset, child_index) in self.future_child_offsets.drain(..) {
97-
if let Ok(i) = self.child_items.binary_search_by_key(&parent_offset, |i| i.offset) {
98-
self.child_items[i].children.push(child_index as u32);
99-
} else if let Ok(i) = self.root_items.binary_search_by_key(&parent_offset, |i| i.offset) {
100-
self.root_items[i].children.push(child_index as u32);
101-
} else {
102-
return Err(traverse::Error::OutOfPackRefDelta {
103-
base_pack_offset: parent_offset,
104-
});
105-
}
106-
}
107-
}
108-
109-
self.assert_is_incrementing_and_update_next_offset(pack_entries_end)
110-
.expect("BUG: pack now is smaller than all previously seen entries");
111-
Ok(())
112-
}
113-
114-
/// Add a new root node, one that only has children but is not a child itself, at the given pack `offset` and associate
115-
/// custom `data` with it.
116-
pub fn add_root(&mut self, offset: crate::data::Offset, data: T) -> Result<(), Error> {
117-
self.assert_is_incrementing_and_update_next_offset(offset)?;
118-
self.last_seen = NodeKind::Root.into();
119-
self.root_items.push(Item {
120-
offset,
121-
next_offset: 0,
122-
data,
123-
children: Default::default(),
124-
});
125-
Ok(())
126-
}
127-
128-
/// Add a child of the item at `base_offset` which itself resides at pack `offset` and associate custom `data` with it.
129-
pub fn add_child(
130-
&mut self,
131-
base_offset: crate::data::Offset,
132-
offset: crate::data::Offset,
133-
data: T,
134-
) -> Result<(), Error> {
135-
self.assert_is_incrementing_and_update_next_offset(offset)?;
136-
137-
let next_child_index = self.child_items.len();
138-
if let Ok(i) = self.child_items.binary_search_by_key(&base_offset, |i| i.offset) {
139-
self.child_items[i].children.push(next_child_index as u32);
140-
} else if let Ok(i) = self.root_items.binary_search_by_key(&base_offset, |i| i.offset) {
141-
self.root_items[i].children.push(next_child_index as u32);
142-
} else {
143-
self.future_child_offsets.push((base_offset, next_child_index));
144-
}
145-
146-
self.last_seen = NodeKind::Child.into();
147-
self.child_items.push(Item {
148-
offset,
149-
next_offset: 0,
150-
data,
151-
children: Default::default(),
152-
});
153-
Ok(())
154-
}
155-
}
24+
pub use tree::{Item, Tree};
15625

15726
#[cfg(test)]
15827
mod tests {
159-
mod tree {
160-
mod from_offsets_in_pack {
161-
use std::sync::atomic::AtomicBool;
162-
163-
use crate as pack;
164-
165-
const SMALL_PACK_INDEX: &str = "objects/pack/pack-a2bf8e71d8c18879e499335762dd95119d93d9f1.idx";
166-
const SMALL_PACK: &str = "objects/pack/pack-a2bf8e71d8c18879e499335762dd95119d93d9f1.pack";
167-
168-
const INDEX_V1: &str = "objects/pack/pack-c0438c19fb16422b6bbcce24387b3264416d485b.idx";
169-
const PACK_FOR_INDEX_V1: &str = "objects/pack/pack-c0438c19fb16422b6bbcce24387b3264416d485b.pack";
170-
171-
use gix_testtools::fixture_path;
172-
173-
#[test]
174-
fn v1() -> Result<(), Box<dyn std::error::Error>> {
175-
tree(INDEX_V1, PACK_FOR_INDEX_V1)
176-
}
177-
178-
#[test]
179-
fn v2() -> Result<(), Box<dyn std::error::Error>> {
180-
tree(SMALL_PACK_INDEX, SMALL_PACK)
181-
}
182-
183-
fn tree(index_path: &str, pack_path: &str) -> Result<(), Box<dyn std::error::Error>> {
184-
let idx = pack::index::File::at(fixture_path(index_path), gix_hash::Kind::Sha1)?;
185-
crate::cache::delta::Tree::from_offsets_in_pack(
186-
&fixture_path(pack_path),
187-
idx.sorted_offsets().into_iter(),
188-
&|ofs| *ofs,
189-
&|id| idx.lookup(id).map(|index| idx.pack_offset_at_index(index)),
190-
&mut gix_features::progress::Discard,
191-
&AtomicBool::new(false),
192-
gix_hash::Kind::Sha1,
193-
)?;
194-
Ok(())
195-
}
196-
}
197-
}
19828

19929
#[test]
20030
fn size_of_pack_tree_item() {

gix-pack/src/cache/delta/traverse/mod.rs

+21-14
Original file line numberDiff line numberDiff line change
@@ -132,10 +132,11 @@ where
132132
let object_progress = OwnShared::new(Mutable::new(object_progress));
133133

134134
let start = std::time::Instant::now();
135-
let child_items = ItemSliceSync::new(&mut self.child_items);
135+
let (mut root_items, mut child_items_vec) = self.take_root_and_child();
136+
let child_items = ItemSliceSync::new(&mut child_items_vec);
136137
let child_items = &child_items;
137138
in_parallel_with_slice(
138-
&mut self.root_items,
139+
&mut root_items,
139140
thread_limit,
140141
{
141142
{
@@ -154,16 +155,22 @@ where
154155
},
155156
{
156157
move |node, state, threads_left, should_interrupt| {
157-
resolve::deltas(
158-
object_counter.clone(),
159-
size_counter.clone(),
160-
node,
161-
state,
162-
resolve_data,
163-
object_hash.len_in_bytes(),
164-
threads_left,
165-
should_interrupt,
166-
)
158+
// SAFETY: This invariant is upheld since `child_items` and `node` come from the same Tree.
159+
// This means we can rely on Tree's invariant that node.children will be the only `children` array in
160+
// for nodes in this tree that will contain any of those children.
161+
#[allow(unsafe_code)]
162+
unsafe {
163+
resolve::deltas(
164+
object_counter.clone(),
165+
size_counter.clone(),
166+
node,
167+
state,
168+
resolve_data,
169+
object_hash.len_in_bytes(),
170+
threads_left,
171+
should_interrupt,
172+
)
173+
}
167174
}
168175
},
169176
|| (!should_interrupt.load(Ordering::Relaxed)).then(|| std::time::Duration::from_millis(50)),
@@ -174,8 +181,8 @@ where
174181
size_progress.show_throughput(start);
175182

176183
Ok(Outcome {
177-
roots: self.root_items,
178-
children: self.child_items,
184+
roots: root_items,
185+
children: child_items_vec,
179186
})
180187
}
181188
}

gix-pack/src/cache/delta/traverse/resolve.rs

+32-15
Original file line numberDiff line numberDiff line change
@@ -19,17 +19,25 @@ mod root {
1919

2020
/// An item returned by `iter_root_chunks`, allowing access to the `data` stored alongside nodes in a [`Tree`].
2121
pub(crate) struct Node<'a, T: Send> {
22+
// SAFETY INVARIANT: see Node::new(). That function is the only one used
23+
// to create or modify these fields.
2224
item: &'a mut Item<T>,
2325
child_items: &'a ItemSliceSync<'a, Item<T>>,
2426
}
2527

2628
impl<'a, T: Send> Node<'a, T> {
27-
/// SAFETY: The child_items must be unique among between users of the `ItemSliceSync`.
29+
/// SAFETY: `item.children` must uniquely reference elements in child_items that no other currently alive
30+
/// item does. All child_items must also have unique children, unless the child_item is itself `item`,
31+
/// in which case no other live item should reference it in its `item.children`.
32+
///
33+
/// This safety invariant can be reliably upheld by making sure `item` comes from a Tree and `child_items`
34+
/// was constructed using that Tree's child_items. This works since Tree has this invariant as well: all
35+
/// child_items are referenced at most once (really, exactly once) by a node in the tree.
36+
///
37+
/// Note that this invariant is a bit more relaxed than that on `deltas()`, because this function can be called
38+
/// for traversal within a child item, which happens in into_child_iter()
2839
#[allow(unsafe_code)]
29-
pub(in crate::cache::delta::traverse) unsafe fn new(
30-
item: &'a mut Item<T>,
31-
child_items: &'a ItemSliceSync<'a, Item<T>>,
32-
) -> Self {
40+
pub(super) unsafe fn new(item: &'a mut Item<T>, child_items: &'a ItemSliceSync<'a, Item<T>>) -> Self {
3341
Node { item, child_items }
3442
}
3543
}
@@ -52,26 +60,28 @@ mod root {
5260

5361
/// Returns true if this node has children, e.g. is not a leaf in the tree.
5462
pub fn has_children(&self) -> bool {
55-
!self.item.children.is_empty()
63+
!self.item.children().is_empty()
5664
}
5765

5866
/// Transform this `Node` into an iterator over its children.
5967
///
6068
/// Children are `Node`s referring to pack entries whose base object is this pack entry.
6169
pub fn into_child_iter(self) -> impl Iterator<Item = Node<'a, T>> + 'a {
6270
let children = self.child_items;
63-
// SAFETY: The index is a valid index into the children array.
64-
// SAFETY: The resulting mutable pointer cannot be yielded by any other node.
6571
#[allow(unsafe_code)]
66-
self.item.children.iter().map(move |&index| Node {
67-
item: unsafe { children.get_mut(index as usize) },
68-
child_items: children,
72+
self.item.children().iter().map(move |&index| {
73+
// SAFETY: Due to the invariant on new(), we can rely on these indices
74+
// being unique.
75+
let item = unsafe { children.get_mut(index as usize) };
76+
// SAFETY: Since every child_item is also required to uphold the uniqueness guarantee,
77+
// creating a Node with one of the child_items that we are allowed access to is still fine.
78+
unsafe { Node::new(item, children) }
6979
})
7080
}
7181
}
7282
}
7383

74-
pub(in crate::cache::delta::traverse) struct State<'items, F, MBFN, T: Send> {
84+
pub(super) struct State<'items, F, MBFN, T: Send> {
7585
pub delta_bytes: Vec<u8>,
7686
pub fully_resolved_delta_bytes: Vec<u8>,
7787
pub progress: Box<dyn Progress>,
@@ -80,8 +90,15 @@ pub(in crate::cache::delta::traverse) struct State<'items, F, MBFN, T: Send> {
8090
pub child_items: &'items ItemSliceSync<'items, Item<T>>,
8191
}
8292

83-
#[allow(clippy::too_many_arguments)]
84-
pub(in crate::cache::delta::traverse) fn deltas<T, F, MBFN, E, R>(
93+
/// SAFETY: `item.children` must uniquely reference elements in child_items that no other currently alive
94+
/// item does. All child_items must also have unique children.
95+
///
96+
/// This safety invariant can be reliably upheld by making sure `item` comes from a Tree and `child_items`
97+
/// was constructed using that Tree's child_items. This works since Tree has this invariant as well: all
98+
/// child_items are referenced at most once (really, exactly once) by a node in the tree.
99+
#[allow(clippy::too_many_arguments, unsafe_code)]
100+
#[deny(unsafe_op_in_unsafe_fn)] // this is a big function, require unsafe for the one small unsafe op we have
101+
pub(super) unsafe fn deltas<T, F, MBFN, E, R>(
85102
objects: gix_features::progress::StepShared,
86103
size: gix_features::progress::StepShared,
87104
item: &mut Item<T>,
@@ -121,7 +138,7 @@ where
121138
// each node is a base, and its children always start out as deltas which become a base after applying them.
122139
// These will be pushed onto our stack until all are processed
123140
let root_level = 0;
124-
// SAFETY: The child items are unique, as `item` is the root of a tree of dependent child items.
141+
// SAFETY: This invariant is required from the caller
125142
#[allow(unsafe_code)]
126143
let root_node = unsafe { root::Node::new(item, child_items) };
127144
let mut nodes: Vec<_> = vec![(root_level, root_node)];

gix-pack/src/cache/delta/traverse/util.rs

+12-9
Original file line numberDiff line numberDiff line change
@@ -16,21 +16,21 @@ use std::marker::PhantomData;
1616
/// more than one base. And that's what one would really have to do for two threads to encounter the same child.
1717
///
1818
/// Thus I believe it's impossible for this data structure to end up in a place where it violates its assumption.
19-
pub(in crate::cache::delta::traverse) struct ItemSliceSync<'a, T>
19+
pub(super) struct ItemSliceSync<'a, T>
2020
where
2121
T: Send,
2222
{
2323
items: *mut T,
2424
#[cfg(debug_assertions)]
2525
len: usize,
26-
phantom: PhantomData<&'a T>,
26+
phantom: PhantomData<&'a mut T>,
2727
}
2828

2929
impl<'a, T> ItemSliceSync<'a, T>
3030
where
3131
T: Send,
3232
{
33-
pub(in crate::cache::delta::traverse) fn new(items: &'a mut [T]) -> Self {
33+
pub(super) fn new(items: &'a mut [T]) -> Self {
3434
ItemSliceSync {
3535
items: items.as_mut_ptr(),
3636
#[cfg(debug_assertions)]
@@ -41,21 +41,24 @@ where
4141

4242
// SAFETY: The index must point into the slice and must not be reused concurrently.
4343
#[allow(unsafe_code)]
44-
pub(in crate::cache::delta::traverse) unsafe fn get_mut(&self, index: usize) -> &'a mut T {
44+
pub(super) unsafe fn get_mut(&self, index: usize) -> &'a mut T {
4545
#[cfg(debug_assertions)]
4646
if index >= self.len {
4747
panic!("index out of bounds: the len is {} but the index is {index}", self.len);
4848
}
49-
// SAFETY: The index is within the slice
50-
// SAFETY: The children array is alive by the 'a lifetime.
49+
// SAFETY:
50+
// - The index is within the slice (required by documentation)
51+
// - We have mutable access to `items` as ensured by Self::new()
52+
// - This is the only method on this type giving access to items
53+
// - The documentation requires that this access is unique
5154
unsafe { &mut *self.items.add(index) }
5255
}
5356
}
5457

55-
// SAFETY: T is `Send`, and we only use the pointer for creating new pointers.
58+
// SAFETY: This is logically an &mut T, which is Send if T is Send
59+
// (note: this is different from &T, which also needs T: Sync)
5660
#[allow(unsafe_code)]
5761
unsafe impl<T> Send for ItemSliceSync<'_, T> where T: Send {}
58-
// SAFETY: T is `Send`, and as long as the user follows the contract of
59-
// `get_mut()`, we only ever access one T at a time.
62+
// SAFETY: This is logically an &mut T, which is Sync if T is Sync
6063
#[allow(unsafe_code)]
6164
unsafe impl<T> Sync for ItemSliceSync<'_, T> where T: Send {}

0 commit comments

Comments
 (0)