Skip to content

Commit c8634eb

Browse files
committed
feat!: skip uninteresting commits for blame
feat!: take commitgraph cache as argument feat!: replace argument `traverse` by `suspect` Switch to date order for traversing the commit history, as opposed to topo order. This is also what `git blame` does. Skip suspects that have no associated unblamed hunks Pass blame to parent in `process_change`. `git`’s algorithm only seems to keep the current suspect for unblamed hunks that were the direct result of splitting an existing unblamed hunk because it matched with a change. All other hunks appear to be blamed on the parent without further checks. Add assertion that lines always match
1 parent 6c50434 commit c8634eb

File tree

6 files changed

+226
-90
lines changed

6 files changed

+226
-90
lines changed

Cargo.lock

Lines changed: 3 additions & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

gix-blame/Cargo.toml

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -14,13 +14,16 @@ rust-version = "1.70"
1414
doctest = false
1515

1616
[dependencies]
17+
gix-commitgraph = { version = "^0.26.0", path = "../gix-commitgraph" }
18+
gix-revwalk = { version = "^0.18.0", path = "../gix-revwalk" }
1719
gix-trace = { version = "^0.1.12", path = "../gix-trace" }
1820
gix-diff = { version = "^0.50.0", path = "../gix-diff", default-features = false, features = ["blob"] }
1921
gix-object = { version = "^0.47.0", path = "../gix-object" }
2022
gix-hash = { version = "^0.16.0", path = "../gix-hash" }
2123
gix-worktree = { version = "^0.39.0", path = "../gix-worktree", default-features = false, features = ["attributes"] }
2224
gix-traverse = { version = "^0.44.0", path = "../gix-traverse" }
2325

26+
smallvec = "1.10.0"
2427
thiserror = "2.0.0"
2528

2629
[dev-dependencies]

gix-blame/src/file/function.rs

Lines changed: 154 additions & 31 deletions
Original file line numberDiff line numberDiff line change
@@ -7,22 +7,23 @@ use gix_object::{
77
bstr::{BStr, BString},
88
FindExt,
99
};
10+
use gix_traverse::commit::find;
11+
use smallvec::SmallVec;
1012
use std::num::NonZeroU32;
1113
use std::ops::Range;
1214

1315
/// Produce a list of consecutive [`BlameEntry`] instances to indicate in which commits the ranges of the file
14-
/// at `traverse[0]:<file_path>` originated in.
16+
/// at `suspect:<file_path>` originated in.
1517
///
1618
/// ## Paramters
1719
///
1820
/// * `odb`
1921
/// - Access to database objects, also for used for diffing.
2022
/// - Should have an object cache for good diff performance.
21-
/// * `traverse`
22-
/// - The list of commits from the most recent to prior ones, following all parents sorted
23-
/// by time.
24-
/// - It's paramount that older commits are returned after newer ones.
25-
/// - The first commit returned here is the first eligible commit to be responsible for parts of `file_path`.
23+
/// * `suspect`
24+
/// - The first commit to be responsible for parts of `file_path`.
25+
/// * `cache`
26+
/// - Optionally, the commitgraph cache.
2627
/// * `file_path`
2728
/// - A *slash-separated* worktree-relative path to the file to blame.
2829
/// * `range`
@@ -60,20 +61,14 @@ use std::ops::Range;
6061
// <---><----------><-------><-----><------->
6162
// <---><---><-----><-------><-----><------->
6263
// <---><---><-----><-------><-----><-><-><->
63-
pub fn file<E>(
64+
pub fn file(
6465
odb: impl gix_object::Find + gix_object::FindHeader,
65-
traverse: impl IntoIterator<Item = Result<gix_traverse::commit::Info, E>>,
66+
suspect: ObjectId,
67+
cache: Option<gix_commitgraph::Graph>,
6668
resource_cache: &mut gix_diff::blob::Platform,
6769
file_path: &BStr,
6870
range: Option<Range<u32>>,
69-
) -> Result<Outcome, Error>
70-
where
71-
E: Into<Box<dyn std::error::Error + Send + Sync + 'static>>,
72-
{
73-
let mut traverse = traverse.into_iter().peekable();
74-
let Some(Ok(suspect)) = traverse.peek().map(|res| res.as_ref().map(|item| item.id)) else {
75-
return Err(Error::EmptyTraversal);
76-
};
71+
) -> Result<Outcome, Error> {
7772
let _span = gix_trace::coarse!("gix_blame::file()", ?file_path, ?suspect);
7873

7974
let mut stats = Statistics::default();
@@ -103,25 +98,43 @@ where
10398
suspects: [(suspect, range_in_blamed_file)].into(),
10499
}];
105100

101+
let mut buf = Vec::new();
102+
let commit = find(cache.as_ref(), &odb, &suspect, &mut buf)?;
103+
104+
let mut queue: gix_revwalk::PriorityQueue<CommitTime, ObjectId> = gix_revwalk::PriorityQueue::new();
105+
106+
let commit_time = commit_time(commit);
107+
queue.insert(commit_time, suspect);
108+
106109
let mut out = Vec::new();
107110
let mut diff_state = gix_diff::tree::State::default();
108111
let mut previous_entry: Option<(ObjectId, ObjectId)> = None;
109-
'outer: while let Some(item) = traverse.next() {
112+
'outer: while let Some(suspect) = queue.pop_value() {
110113
if hunks_to_blame.is_empty() {
111114
break;
112115
}
113-
let commit = item.map_err(|err| Error::Traverse(err.into()))?;
114-
let suspect = commit.id;
116+
117+
let is_still_suspect = hunks_to_blame.iter().any(|hunk| hunk.suspects.contains_key(&suspect));
118+
119+
if !is_still_suspect {
120+
// There are no `UnblamedHunk`s associated with this `suspect`, so we can continue with
121+
// the next one.
122+
continue 'outer;
123+
}
124+
115125
stats.commits_traversed += 1;
116126

117-
let parent_ids = commit.parent_ids;
127+
let commit = find(cache.as_ref(), &odb, &suspect, &mut buf)?;
128+
129+
let parent_ids: ParentIds = collect_parents(commit, &odb, cache.as_ref());
130+
118131
if parent_ids.is_empty() {
119-
if traverse.peek().is_none() {
120-
// I’m not entirely sure if this is correct yet. `suspect`, at this point, is the `id` of
121-
// the last `item` that was yielded by `traverse`, so it makes sense to assign the
122-
// remaining lines to it, even though we don’t explicitly check whether that is true
123-
// here. We could perhaps use diff-tree-to-tree to compare `suspect`
124-
// against an empty tree to validate this assumption.
132+
if queue.is_empty() {
133+
// I’m not entirely sure if this is correct yet. `suspect`, at this point, is the
134+
// `id` of the last `item` that was yielded by `queue`, so it makes sense to assign
135+
// the remaining lines to it, even though we don’t explicitly check whether that is
136+
// true here. We could perhaps use diff-tree-to-tree to compare `suspect` against
137+
// an empty tree to validate this assumption.
125138
if unblamed_to_out_is_done(&mut hunks_to_blame, &mut out, suspect) {
126139
break 'outer;
127140
}
@@ -143,7 +156,41 @@ where
143156
continue;
144157
};
145158

146-
for (pid, parent_id) in parent_ids.iter().enumerate() {
159+
// This block asserts that, for every `UnblamedHunk`, all lines in the *Blamed File* are
160+
// identical to the corresponding lines in the *Source File*.
161+
#[cfg(debug_assertions)]
162+
{
163+
let source_blob = odb.find_blob(&entry_id, &mut buf)?.data.to_vec();
164+
let mut source_interner = gix_diff::blob::intern::Interner::new(source_blob.len() / 100);
165+
let source_lines_as_tokens: Vec<_> = tokens_for_diffing(&source_blob)
166+
.tokenize()
167+
.map(|token| source_interner.intern(token))
168+
.collect();
169+
170+
let mut blamed_interner = gix_diff::blob::intern::Interner::new(blamed_file_blob.len() / 100);
171+
let blamed_lines_as_tokens: Vec<_> = tokens_for_diffing(&blamed_file_blob)
172+
.tokenize()
173+
.map(|token| blamed_interner.intern(token))
174+
.collect();
175+
176+
for hunk in hunks_to_blame.iter() {
177+
if let Some(range_in_suspect) = hunk.suspects.get(&suspect) {
178+
let range_in_blamed_file = hunk.range_in_blamed_file.clone();
179+
180+
for (blamed_line_number, source_line_number) in range_in_blamed_file.zip(range_in_suspect.clone()) {
181+
let source_token = source_lines_as_tokens[source_line_number as usize];
182+
let blame_token = blamed_lines_as_tokens[blamed_line_number as usize];
183+
184+
let source_line = BString::new(source_interner[source_token].into());
185+
let blamed_line = BString::new(blamed_interner[blame_token].into());
186+
187+
assert_eq!(source_line, blamed_line);
188+
}
189+
}
190+
}
191+
}
192+
193+
for (pid, (parent_id, parent_commit_time)) in parent_ids.iter().enumerate() {
147194
if let Some(parent_entry_id) =
148195
find_path_entry_in_commit(&odb, parent_id, file_path, &mut buf, &mut buf2, &mut stats)?
149196
{
@@ -153,17 +200,19 @@ where
153200
}
154201
if no_change_in_entry {
155202
pass_blame_from_to(suspect, *parent_id, &mut hunks_to_blame);
203+
queue.insert(*parent_commit_time, *parent_id);
156204
continue 'outer;
157205
}
158206
}
159207
}
160208

161209
let more_than_one_parent = parent_ids.len() > 1;
162-
for parent_id in parent_ids {
210+
for (parent_id, parent_commit_time) in parent_ids {
211+
queue.insert(parent_commit_time, parent_id);
163212
let changes_for_file_path = tree_diff_at_file_path(
164213
&odb,
165214
file_path,
166-
commit.id,
215+
suspect,
167216
parent_id,
168217
&mut stats,
169218
&mut diff_state,
@@ -588,8 +637,82 @@ fn find_path_entry_in_commit(
588637
Ok(res.map(|e| e.oid))
589638
}
590639

591-
/// Return an iterator over tokens for use in diffing. These usually lines, but iit's important to unify them
592-
/// so the later access shows the right thing.
640+
type CommitTime = i64;
641+
642+
fn commit_time(commit: gix_traverse::commit::Either<'_, '_>) -> CommitTime {
643+
match commit {
644+
gix_traverse::commit::Either::CommitRefIter(commit_ref_iter) => {
645+
let mut commit_time = 0;
646+
for token in commit_ref_iter {
647+
use gix_object::commit::ref_iter::Token as T;
648+
match token {
649+
Ok(T::Tree { .. }) => continue,
650+
Ok(T::Parent { .. }) => continue,
651+
Ok(T::Author { .. }) => continue,
652+
Ok(T::Committer { signature }) => {
653+
commit_time = signature.time.seconds;
654+
break;
655+
}
656+
Ok(_unused_token) => break,
657+
Err(_err) => todo!(),
658+
}
659+
}
660+
commit_time
661+
}
662+
gix_traverse::commit::Either::CachedCommit(commit) => commit.committer_timestamp() as i64,
663+
}
664+
}
665+
666+
type ParentIds = SmallVec<[(gix_hash::ObjectId, i64); 2]>;
667+
668+
fn collect_parents(
669+
commit: gix_traverse::commit::Either<'_, '_>,
670+
odb: &impl gix_object::Find,
671+
cache: Option<&gix_commitgraph::Graph>,
672+
) -> ParentIds {
673+
let mut parent_ids: ParentIds = Default::default();
674+
675+
match commit {
676+
gix_traverse::commit::Either::CachedCommit(commit) => {
677+
let cache = cache
678+
.as_ref()
679+
.expect("find returned a cached commit, so we expect cache to be present");
680+
for parent_id in commit.iter_parents() {
681+
match parent_id {
682+
Ok(pos) => {
683+
let parent = cache.commit_at(pos);
684+
685+
parent_ids.push((parent.id().to_owned(), parent.committer_timestamp() as i64));
686+
}
687+
Err(_) => todo!(),
688+
}
689+
}
690+
}
691+
gix_traverse::commit::Either::CommitRefIter(commit_ref_iter) => {
692+
for token in commit_ref_iter {
693+
match token {
694+
Ok(gix_object::commit::ref_iter::Token::Tree { .. }) => continue,
695+
Ok(gix_object::commit::ref_iter::Token::Parent { id }) => {
696+
let mut buf = Vec::new();
697+
let parent = odb.find_commit_iter(id.as_ref(), &mut buf).ok();
698+
let parent_commit_time = parent
699+
.and_then(|parent| parent.committer().ok().map(|committer| committer.time.seconds))
700+
.unwrap_or_default();
701+
702+
parent_ids.push((id, parent_commit_time));
703+
}
704+
Ok(_unused_token) => break,
705+
Err(_err) => todo!(),
706+
}
707+
}
708+
}
709+
};
710+
711+
parent_ids
712+
}
713+
714+
/// Return an iterator over tokens for use in diffing. These are usually lines, but it's important
715+
/// to unify them so the later access shows the right thing.
593716
pub(crate) fn tokens_for_diffing(data: &[u8]) -> impl TokenSource<Token = &[u8]> {
594717
gix_diff::blob::sources::byte_lines_with_terminator(data)
595718
}

gix-blame/src/file/mod.rs

Lines changed: 17 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -66,7 +66,7 @@ fn process_change(
6666

6767
// Nothing to do with `hunk` except shifting it,
6868
// but `unchanged` needs to be checked against the next hunk to catch up.
69-
new_hunks_to_blame.push(hunk.cloned_blame(suspect, parent).shift_by(parent, *offset));
69+
new_hunks_to_blame.push(hunk.passed_blame(suspect, parent).shift_by(parent, *offset));
7070
(None, Some(Change::Unchanged(unchanged)))
7171
}
7272
(false, false) => {
@@ -95,7 +95,7 @@ fn process_change(
9595

9696
// Nothing to do with `hunk` except shifting it,
9797
// but `unchanged` needs to be checked against the next hunk to catch up.
98-
new_hunks_to_blame.push(hunk.cloned_blame(suspect, parent).shift_by(parent, *offset));
98+
new_hunks_to_blame.push(hunk.passed_blame(suspect, parent).shift_by(parent, *offset));
9999
(None, Some(Change::Unchanged(unchanged)))
100100
}
101101
}
@@ -125,7 +125,7 @@ fn process_change(
125125
}
126126
Either::Right((before, after)) => {
127127
// requeue the left side `before` after offsetting it…
128-
new_hunks_to_blame.push(before.cloned_blame(suspect, parent).shift_by(parent, *offset));
128+
new_hunks_to_blame.push(before.passed_blame(suspect, parent).shift_by(parent, *offset));
129129
// …and treat `after` as `new_hunk`, which contains the `added` range.
130130
after
131131
}
@@ -162,7 +162,7 @@ fn process_change(
162162
Either::Left(hunk) => hunk,
163163
Either::Right((before, after)) => {
164164
// Keep looking for the left side of the unblamed portion.
165-
new_hunks_to_blame.push(before.cloned_blame(suspect, parent).shift_by(parent, *offset));
165+
new_hunks_to_blame.push(before.passed_blame(suspect, parent).shift_by(parent, *offset));
166166
after
167167
}
168168
};
@@ -220,7 +220,7 @@ fn process_change(
220220
// <----> (added)
221221

222222
// Retry `hunk` once there is overlapping changes to process.
223-
new_hunks_to_blame.push(hunk.cloned_blame(suspect, parent).shift_by(parent, *offset));
223+
new_hunks_to_blame.push(hunk.passed_blame(suspect, parent).shift_by(parent, *offset));
224224

225225
// Let hunks catchup with this change.
226226
(
@@ -273,7 +273,7 @@ fn process_change(
273273
}
274274
Either::Right((before, after)) => {
275275
// `before` isn't affected by deletion, so keep it for later.
276-
new_hunks_to_blame.push(before.cloned_blame(suspect, parent).shift_by(parent, *offset));
276+
new_hunks_to_blame.push(before.passed_blame(suspect, parent).shift_by(parent, *offset));
277277
// after will be affected by offset, and we will see if there are more changes affecting it.
278278
after
279279
}
@@ -285,7 +285,7 @@ fn process_change(
285285
// | (line_number_in_destination)
286286

287287
// Catchup with changes.
288-
new_hunks_to_blame.push(hunk.cloned_blame(suspect, parent).shift_by(parent, *offset));
288+
new_hunks_to_blame.push(hunk.passed_blame(suspect, parent).shift_by(parent, *offset));
289289

290290
(
291291
None,
@@ -295,7 +295,7 @@ fn process_change(
295295
}
296296
(Some(hunk), None) => {
297297
// nothing to do - changes are exhausted, re-evaluate `hunk`.
298-
new_hunks_to_blame.push(hunk.cloned_blame(suspect, parent).shift_by(parent, *offset));
298+
new_hunks_to_blame.push(hunk.passed_blame(suspect, parent).shift_by(parent, *offset));
299299
(None, None)
300300
}
301301
(None, Some(Change::Unchanged(_))) => {
@@ -402,18 +402,20 @@ impl UnblamedHunk {
402402
}
403403
}
404404

405-
/// Transfer all ranges from the commit at `from` to the commit at `to`.
406-
fn pass_blame(&mut self, from: ObjectId, to: ObjectId) {
405+
/// This is like [`Self::pass_blame()`], but easier to use in places where the 'passing' is
406+
/// done 'inline'.
407+
fn passed_blame(mut self, from: ObjectId, to: ObjectId) -> Self {
407408
if let Some(range_in_suspect) = self.suspects.remove(&from) {
408409
self.suspects.insert(to, range_in_suspect);
409410
}
411+
self
410412
}
411413

412-
/// This is like [`Self::clone_blame()`], but easier to use in places
413-
/// where the cloning is done 'inline'.
414-
fn cloned_blame(mut self, from: ObjectId, to: ObjectId) -> Self {
415-
self.clone_blame(from, to);
416-
self
414+
/// Transfer all ranges from the commit at `from` to the commit at `to`.
415+
fn pass_blame(&mut self, from: ObjectId, to: ObjectId) {
416+
if let Some(range_in_suspect) = self.suspects.remove(&from) {
417+
self.suspects.insert(to, range_in_suspect);
418+
}
417419
}
418420

419421
fn clone_blame(&mut self, from: ObjectId, to: ObjectId) {

0 commit comments

Comments
 (0)