Skip to content

Commit 8dad3ff

Browse files
committed
[commitgraph] Implement basic commit-graph file verification.
Missing features: 1. It operates on commit-graph files only, so it doesn't verify that commit-graph data matches `git-odb` data. 2. No progress reporting or parallelization. This shouldn't be needed until until we need to check against `git-odb` data. Example output for Linux repo: ``` $ time ./target/release/gixp commit-graph-verify -s ~/src/linux/.git/objects/info number of commits with the given number of parents 0: 4 1: 878988 2: 67800 3: 652 4: 408 5: 382 6: 454 7: 95 8: 65 9: 47 10: 25 11: 26 12: 14 13: 4 14: 3 18: 1 19: 1 20: 1 21: 1 24: 1 27: 1 30: 1 32: 1 66: 1 ->: 948976 longest path length between two commits: 160521 real 0m0.196s user 0m0.180s sys 0m0.016s ```
1 parent 701f33c commit 8dad3ff

File tree

19 files changed

+457
-21
lines changed

19 files changed

+457
-21
lines changed

Cargo.lock

Lines changed: 2 additions & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

git-commitgraph/Cargo.toml

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -12,14 +12,17 @@ include = ["src/**/*"]
1212
[lib]
1313
doctest = false
1414

15-
# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
15+
[features]
16+
serde1 = ["serde", "git-object/serde1"]
1617

1718
[dependencies]
19+
git-features = { version = "^0.6.0", path = "../git-features" }
1820
git-object = { version = "^0.4.0", path = "../git-object" }
1921

2022
bstr = { version = "0.2.13", default-features = false, features = ["std"] }
2123
byteorder = "1.2.3"
2224
filebuffer = "0.4.0"
25+
serde = { version = "1.0.114", optional = true, default-features = false, features = ["derive"] }
2326
thiserror = "1.0.20"
2427

2528
[dev-dependencies]

git-commitgraph/src/file/access.rs

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,10 @@ use std::{
88

99
/// Access
1010
impl File {
11+
pub fn base_graph_count(&self) -> u8 {
12+
self.base_graph_count
13+
}
14+
1115
/// Returns the commit data for the commit located at the given lexigraphical position.
1216
///
1317
/// `pos` must range from 0 to self.num_commits().

git-commitgraph/src/file/commit.rs

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -90,6 +90,10 @@ impl<'a> Commit<'a> {
9090
self.iter_parents().next().transpose()
9191
}
9292

93+
pub fn position(&self) -> file::Position {
94+
self.pos
95+
}
96+
9397
// Allow the return value to outlive this Commit object, as it only needs to be bound by the
9498
// lifetime of the parent file.
9599
pub fn root_tree_id<'b>(&'b self) -> borrowed::Id<'a>

git-commitgraph/src/file/mod.rs

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,8 @@
22
mod access;
33
pub mod commit;
44
mod init;
5+
pub mod verify;
6+
57
pub use init::Error;
68

79
pub use commit::Commit;

git-commitgraph/src/file/verify.rs

Lines changed: 167 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,167 @@
1+
use crate::{
2+
file::{self, File},
3+
GENERATION_NUMBER_INFINITY, GENERATION_NUMBER_MAX,
4+
};
5+
use bstr::ByteSlice;
6+
use git_object::{borrowed, owned, SHA1_SIZE};
7+
use std::cmp::{max, min};
8+
use std::collections::HashMap;
9+
use std::convert::TryFrom;
10+
use std::path::Path;
11+
12+
#[derive(thiserror::Error, Debug)]
13+
pub enum Error {
14+
#[error(transparent)]
15+
Commit(#[from] file::commit::Error),
16+
#[error("commit at file position {pos} has invalid ID {id}")]
17+
CommitId { id: owned::Id, pos: file::Position },
18+
#[error("commit at file position {pos} with ID {id} is out of order relative to its predecessor with ID {predecessor_id}")]
19+
CommitsOutOfOrder {
20+
id: owned::Id,
21+
pos: file::Position,
22+
predecessor_id: owned::Id,
23+
},
24+
#[error("commit-graph filename should be {0}")]
25+
Filename(String),
26+
#[error("commit {id} has invalid generation {generation}")]
27+
Generation { generation: u32, id: owned::Id },
28+
#[error("checksum mismatch: expected {expected}, got {actual}")]
29+
Mismatch { expected: owned::Id, actual: owned::Id },
30+
#[error("commit {id} has invalid root tree ID {root_tree_id}")]
31+
RootTreeId { id: owned::Id, root_tree_id: owned::Id },
32+
}
33+
34+
// This is a separate type to let `traverse`'s caller use the same error type for its result and its
35+
// processor error type while also letting that error type contain file::verify::Error values.
36+
// Is there a better way? Should the caller's error type just use boxes to avoid recursive type
37+
// errors?
38+
#[derive(thiserror::Error, Debug)]
39+
pub enum EitherError<E1: std::error::Error + 'static, E2: std::error::Error + 'static> {
40+
#[error(transparent)]
41+
Internal(#[from] E1),
42+
// Why can't I use #[from] here? Boo!
43+
#[error("{0}")]
44+
Processor(#[source] E2),
45+
}
46+
47+
#[derive(Clone, Debug, Eq, PartialEq)]
48+
#[cfg_attr(feature = "serde1", derive(serde::Deserialize, serde::Serialize))]
49+
pub struct Outcome {
50+
pub max_generation: u32,
51+
pub max_parents: u32,
52+
pub min_generation: u32,
53+
pub num_commits: u32,
54+
pub parent_counts: HashMap<u32, u32>,
55+
}
56+
57+
impl File {
58+
pub fn checksum(&self) -> borrowed::Id<'_> {
59+
borrowed::Id::try_from(&self.data[self.data.len() - SHA1_SIZE..]).expect("file to be large enough for a hash")
60+
}
61+
62+
pub fn traverse<'a, E, Processor>(&'a self, mut processor: Processor) -> Result<Outcome, EitherError<Error, E>>
63+
where
64+
E: std::error::Error + 'static,
65+
Processor: FnMut(&file::Commit<'a>) -> Result<(), E>,
66+
{
67+
self.verify_checksum()?;
68+
verify_split_chain_filename_hash(&self.path, self.checksum())?;
69+
70+
// This probably belongs in borrowed::Id itself?
71+
let null_id = borrowed::Id::from(&[0u8; SHA1_SIZE]);
72+
73+
let mut stats = Outcome {
74+
max_generation: 0,
75+
max_parents: 0,
76+
min_generation: GENERATION_NUMBER_INFINITY,
77+
num_commits: self.num_commits(),
78+
parent_counts: HashMap::new(),
79+
};
80+
81+
// TODO: Verify self.fan values as we go.
82+
let mut prev_id: borrowed::Id<'a> = null_id;
83+
for commit in self.iter_commits() {
84+
if commit.id() <= prev_id {
85+
if commit.id() == null_id {
86+
return Err(Error::CommitId {
87+
pos: commit.position(),
88+
id: commit.id().into(),
89+
}
90+
.into());
91+
}
92+
return Err(Error::CommitsOutOfOrder {
93+
pos: commit.position(),
94+
id: commit.id().into(),
95+
predecessor_id: prev_id.into(),
96+
}
97+
.into());
98+
}
99+
if commit.root_tree_id() == null_id {
100+
return Err(Error::RootTreeId {
101+
id: commit.id().into(),
102+
root_tree_id: commit.root_tree_id().into(),
103+
}
104+
.into());
105+
}
106+
if commit.generation() > GENERATION_NUMBER_MAX {
107+
return Err(Error::Generation {
108+
generation: commit.generation(),
109+
id: commit.id().into(),
110+
}
111+
.into());
112+
}
113+
114+
processor(&commit).map_err(EitherError::Processor)?;
115+
116+
stats.max_generation = max(stats.max_generation, commit.generation());
117+
stats.min_generation = min(stats.min_generation, commit.generation());
118+
let parent_count = commit
119+
.iter_parents()
120+
.try_fold(0u32, |acc, pos| pos.map(|_| acc + 1))
121+
.map_err(Error::Commit)?;
122+
*stats.parent_counts.entry(parent_count).or_insert(0) += 1;
123+
prev_id = commit.id();
124+
}
125+
126+
if stats.min_generation == GENERATION_NUMBER_INFINITY {
127+
stats.min_generation = 0;
128+
}
129+
130+
Ok(stats)
131+
}
132+
133+
pub fn verify_checksum(&self) -> Result<owned::Id, Error> {
134+
// TODO: Use/copy git_odb::hash::bytes_of_file.
135+
let data_len_without_trailer = self.data.len() - SHA1_SIZE;
136+
let mut hasher = git_features::hash::Sha1::default();
137+
hasher.update(&self.data[..data_len_without_trailer]);
138+
let actual = owned::Id::new_sha1(hasher.digest());
139+
140+
let expected = self.checksum();
141+
if actual.to_borrowed() == expected {
142+
Ok(actual)
143+
} else {
144+
Err(Error::Mismatch {
145+
actual,
146+
expected: expected.into(),
147+
})
148+
}
149+
}
150+
}
151+
152+
/// If the given path's filename matches "graph-{hash}.graph", check that `hash` matches the
153+
/// expected hash.
154+
fn verify_split_chain_filename_hash(path: impl AsRef<Path>, expected: borrowed::Id<'_>) -> Result<(), Error> {
155+
let path = path.as_ref();
156+
path.file_name()
157+
.and_then(|filename| filename.to_str())
158+
.and_then(|filename| filename.strip_suffix(".graph"))
159+
.and_then(|stem| stem.strip_prefix("graph-"))
160+
.map_or(Ok(()), |hex| match owned::Id::from_40_bytes_in_hex(hex.as_bytes()) {
161+
Ok(actual) if actual.to_borrowed() == expected => Ok(()),
162+
_ => Err(Error::Filename(format!(
163+
"graph-{}.graph",
164+
expected.to_sha1_hex().as_bstr()
165+
))),
166+
})
167+
}

git-commitgraph/src/graph/access.rs

Lines changed: 7 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -42,7 +42,7 @@ impl Graph {
4242

4343
/// Access fundamentals
4444
impl Graph {
45-
fn lookup_by_id(&self, id: borrowed::Id<'_>) -> Option<LookupByIdResult<'_>> {
45+
pub(crate) fn lookup_by_id(&self, id: borrowed::Id<'_>) -> Option<LookupByIdResult<'_>> {
4646
let mut current_file_start = 0;
4747
for file in &self.files {
4848
if let Some(lex_pos) = file.lookup(id) {
@@ -57,14 +57,15 @@ impl Graph {
5757
None
5858
}
5959

60-
fn lookup_by_pos(&self, pos: graph::Position) -> LookupByPositionResult<'_> {
60+
pub(crate) fn lookup_by_pos(&self, pos: graph::Position) -> LookupByPositionResult<'_> {
6161
let mut remaining = pos.0;
62-
for file in &self.files {
62+
for (file_index, file) in self.files.iter().enumerate() {
6363
match remaining.checked_sub(file.num_commits()) {
6464
Some(v) => remaining = v,
6565
None => {
6666
return LookupByPositionResult {
6767
file,
68+
file_index,
6869
pos: file::Position(remaining),
6970
}
7071
}
@@ -75,14 +76,15 @@ impl Graph {
7576
}
7677

7778
#[derive(Clone)]
78-
struct LookupByIdResult<'a> {
79+
pub(crate) struct LookupByIdResult<'a> {
7980
pub file: &'a File,
8081
pub graph_pos: graph::Position,
8182
pub file_pos: file::Position,
8283
}
8384

8485
#[derive(Clone)]
85-
struct LookupByPositionResult<'a> {
86+
pub(crate) struct LookupByPositionResult<'a> {
8687
pub file: &'a File,
88+
pub file_index: usize,
8789
pub pos: file::Position,
8890
}

git-commitgraph/src/graph/mod.rs

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
//! Operations on a complete commit graph.
22
mod access;
33
mod init;
4+
pub mod verify;
45

56
use crate::file::File;
67
use std::fmt;

0 commit comments

Comments
 (0)