Skip to content

Commit 95ad56c

Browse files
committed
Merge branch 'optimize_hashtables'
2 parents 689752e + 2761466 commit 95ad56c

File tree

19 files changed

+135
-44
lines changed

19 files changed

+135
-44
lines changed

Cargo.lock

Lines changed: 20 additions & 11 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

Cargo.toml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -174,6 +174,7 @@ members = [
174174
"git-path",
175175
"git-repository",
176176
"gitoxide-core",
177+
"git-hashtable",
177178
"git-tui",
178179
"git-tix",
179180

README.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -156,6 +156,7 @@ is usable to some extend.
156156
* [git-worktree](https://github.com/Byron/gitoxide/blob/main/crate-status.md#git-worktree)
157157
* [git-bitmap](https://github.com/Byron/gitoxide/blob/main/crate-status.md#git-bitmap)
158158
* [git-date](https://github.com/Byron/gitoxide/blob/main/crate-status.md#git-date)
159+
* [git-hashtable](https://github.com/Byron/gitoxide/blob/main/crate-status.md#git-hashtable)
159160
* **idea** _(just a name placeholder)_
160161
* [git-note](https://github.com/Byron/gitoxide/blob/main/crate-status.md#git-note)
161162
* [git-fetchhead](https://github.com/Byron/gitoxide/blob/main/crate-status.md#git-fetchhead)

crate-status.md

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,12 @@
1111
* [x] decode the chunk file table of contents and provide convenient API
1212
* [x] write the table of contents
1313

14+
### git-hashtable
15+
16+
* [x] hashmap
17+
* [x] hashset
18+
19+
1420
### git-object
1521
* *decode (zero-copy)* borrowed objects
1622
* [x] commit

etc/check-package-size.sh

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -32,7 +32,7 @@ echo "in root: gitoxide CLI"
3232
(enter git-config && indent cargo diet -n --package-size-limit 120KB)
3333
(enter git-config-value && indent cargo diet -n --package-size-limit 20KB)
3434
(enter git-command && indent cargo diet -n --package-size-limit 5KB)
35-
(enter git-hash && indent cargo diet -n --package-size-limit 20KB)
35+
(enter git-hash && indent cargo diet -n --package-size-limit 30KB)
3636
(enter git-chunk && indent cargo diet -n --package-size-limit 10KB)
3737
(enter git-rebase && indent cargo diet -n --package-size-limit 5KB)
3838
(enter git-sequencer && indent cargo diet -n --package-size-limit 5KB)
@@ -43,6 +43,7 @@ echo "in root: gitoxide CLI"
4343
(enter git-url && indent cargo diet -n --package-size-limit 20KB)
4444
(enter git-validate && indent cargo diet -n --package-size-limit 5KB)
4545
(enter git-date && indent cargo diet -n --package-size-limit 15KB)
46+
(enter git-hashtable && indent cargo diet -n --package-size-limit 5KB)
4647
(enter git-filter && indent cargo diet -n --package-size-limit 5KB)
4748
(enter git-lfs && indent cargo diet -n --package-size-limit 5KB)
4849
(enter git-note && indent cargo diet -n --package-size-limit 5KB)

git-hash/src/object_id.rs

Lines changed: 15 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,15 +1,29 @@
1+
use std::hash::{Hash, Hasher};
12
use std::{borrow::Borrow, convert::TryInto, fmt, ops::Deref};
23

34
use crate::{borrowed::oid, Kind, SIZE_OF_SHA1_DIGEST};
45

56
/// An owned hash identifying objects, most commonly Sha1
6-
#[derive(PartialEq, Eq, Hash, Ord, PartialOrd, Clone, Copy)]
7+
#[derive(PartialEq, Eq, Ord, PartialOrd, Clone, Copy)]
78
#[cfg_attr(feature = "serde1", derive(serde::Serialize, serde::Deserialize))]
89
pub enum ObjectId {
910
/// A SHA 1 hash digest
1011
Sha1([u8; SIZE_OF_SHA1_DIGEST]),
1112
}
1213

14+
// False positive: https://github.com/rust-lang/rust-clippy/issues/2627
15+
// ignoring some fields while hashing is perfectly valid and just leads to
16+
// increased HashCollisions. One Sha1 being a prefix of another Sha256 is
17+
// extremely unlikely to begin with so it doesn't matter.
18+
// This implementation matches the `Hash` implementation for `oid`
19+
// and allows the usage of custom Hashers that only copy a truncated ShaHash
20+
#[allow(clippy::derive_hash_xor_eq)]
21+
impl Hash for ObjectId {
22+
fn hash<H: Hasher>(&self, state: &mut H) {
23+
state.write(self.as_slice())
24+
}
25+
}
26+
1327
#[allow(missing_docs)]
1428
pub mod decode {
1529
use std::str::FromStr;

git-hashtable/Cargo.toml

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,20 @@
1+
[package]
2+
name = "git-hashtable"
3+
version = "0.1.0"
4+
repository = "https://github.com/Byron/gitoxide"
5+
license = "MIT/Apache-2.0"
6+
description = "A crate that provides hashtable based data structures optimized to utilize ObjectId keys"
7+
authors = ["Pascal Kuthe <[email protected]>"]
8+
edition = "2021"
9+
include = ["src/**/*"]
10+
11+
[lib]
12+
doctest = false
13+
14+
[dependencies]
15+
hashbrown = { version = "0.13.1", default-features = false, features = [
16+
"inline-more",
17+
"raw"
18+
] }
19+
git-hash = { version = "^0.10.0", path = "../git-hash" }
20+

git-hashtable/src/lib.rs

Lines changed: 47 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,47 @@
1+
//! Customized HashMap and Hasher implementation optimized for using `ObjectId`s as keys.
2+
//!
3+
//! The crate mirrors `std::collections` in layout for familiarity.
4+
#![deny(missing_docs, rust_2018_idioms)]
5+
#![forbid(unsafe_code)]
6+
7+
use git_hash::ObjectId;
8+
9+
pub use hashbrown::{hash_map, hash_set, raw, Equivalent};
10+
11+
///
12+
pub mod hash {
13+
/// A Hasher for usage with HashMap keys that are already robust hashes (like an `ObjectId`).
14+
/// The first `8` bytes of the hash are used as the `HashMap` hash
15+
#[derive(Default, Clone, Copy)]
16+
pub struct Hasher(u64);
17+
18+
impl std::hash::Hasher for Hasher {
19+
fn finish(&self) -> u64 {
20+
self.0
21+
}
22+
23+
#[inline(always)]
24+
fn write(&mut self, bytes: &[u8]) {
25+
self.0 = u64::from_ne_bytes(bytes[..8].try_into().unwrap());
26+
}
27+
}
28+
29+
/// A `BuildHasher` creating [`Hasher`] instances for usage with HashMap keys that are already robust hashes (like an `ObjectId`).
30+
/// The first `8` bytes of the hash are used as the `HashMap` hash
31+
#[derive(Default, Clone, Copy)]
32+
pub struct Builder;
33+
impl std::hash::BuildHasher for Builder {
34+
type Hasher = Hasher;
35+
36+
fn build_hasher(&self) -> Self::Hasher {
37+
Hasher::default()
38+
}
39+
}
40+
}
41+
42+
/// A HashMap for usage with keys that are already robust hashes (like an `ObjectId`).
43+
/// The first `8` bytes of the hash are used as the `HashMap` hash
44+
pub type HashMap<K, V> = hashbrown::HashMap<K, V, hash::Builder>;
45+
/// A HashSet for usage with keys that are already robust hashes (like an `ObjectId`).
46+
/// The first `8` bytes of the hash are used as the `HashSet` hash
47+
pub type HashSet<T = ObjectId> = hashbrown::HashSet<T, hash::Builder>;

git-pack/Cargo.toml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -45,6 +45,7 @@ git-object = { version = "^0.23.0", path = "../git-object" }
4545
git-traverse = { version = "^0.19.0", path = "../git-traverse" }
4646
git-diff = { version = "^0.23.0", path = "../git-diff" }
4747
git-tempfile = { version = "^3.0.0", path = "../git-tempfile" }
48+
git-hashtable = { version = "^0.1.0", path = "../git-hashtable" }
4849

4950
smallvec = "1.3.0"
5051
memmap2 = "0.5.0"
@@ -55,7 +56,6 @@ thiserror = "1.0.26"
5556
uluru = { version = "3.0.0", optional = true }
5657
clru = { version = "0.5.0", optional = true }
5758
dashmap = "5.1.0"
58-
hash_hasher = "2.0.3"
5959

6060
document-features = { version = "0.2.0", optional = true }
6161

git-pack/src/cache/object.rs

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -29,7 +29,7 @@ mod memory {
2929

3030
/// An LRU cache with hash map backing and an eviction rule based on the memory usage for object data in bytes.
3131
pub struct MemoryCappedHashmap {
32-
inner: clru::CLruCache<Key, Entry, hash_hasher::HashBuildHasher, CustomScale>,
32+
inner: clru::CLruCache<Key, Entry, git_hashtable::hash::Builder, CustomScale>,
3333
free_list: Vec<Vec<u8>>,
3434
debug: git_features::cache::Debug,
3535
}
@@ -45,7 +45,7 @@ mod memory {
4545
MemoryCappedHashmap {
4646
inner: clru::CLruCache::with_config(
4747
clru::CLruCacheConfig::new(NonZeroUsize::new(memory_cap_in_bytes).expect("non zero"))
48-
.with_hasher(hash_hasher::HashBuildHasher::default())
48+
.with_hasher(git_hashtable::hash::Builder::default())
4949
.with_scale(CustomScale),
5050
),
5151
free_list: Vec::new(),

git-pack/src/data/output/count/objects/mod.rs

Lines changed: 2 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,6 @@ use std::{
55

66
use git_features::{parallel, progress::Progress};
77
use git_hash::ObjectId;
8-
use hash_hasher::{HashBuildHasher, HashedSet};
98

109
use crate::{data::output, find};
1110

@@ -65,7 +64,7 @@ where
6564
inner: objects_ids,
6665
size: chunk_size,
6766
};
68-
let seen_objs = dashmap::DashSet::<ObjectId, HashBuildHasher>::default();
67+
let seen_objs = dashmap::DashSet::<ObjectId, git_hashtable::hash::Builder>::default();
6968
let progress = Arc::new(parking_lot::Mutex::new(progress));
7069

7170
parallel::in_parallel(
@@ -120,7 +119,7 @@ where
120119
Oid: Into<ObjectId>,
121120
IterErr: std::error::Error,
122121
{
123-
let seen_objs = RefCell::new(HashedSet::<ObjectId>::default());
122+
let seen_objs = RefCell::new(git_hashtable::HashSet::default());
124123

125124
let (mut buf1, mut buf2) = (Vec::new(), Vec::new());
126125
expand::this(

git-pack/src/data/output/count/objects/util.rs

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -6,17 +6,17 @@ mod trait_impls {
66
use std::{cell::RefCell, hash::Hash};
77

88
use dashmap::DashSet;
9-
use hash_hasher::{HashBuildHasher, HashedSet};
9+
use git_hashtable::HashSet;
1010

1111
use super::InsertImmutable;
1212

13-
impl<T: Eq + Hash> InsertImmutable<T> for DashSet<T, HashBuildHasher> {
13+
impl<T: Eq + Hash> InsertImmutable<T> for DashSet<T, git_hashtable::hash::Builder> {
1414
fn insert(&self, item: T) -> bool {
1515
self.insert(item)
1616
}
1717
}
1818

19-
impl<T: Eq + Hash> InsertImmutable<T> for RefCell<HashedSet<T>> {
19+
impl<T: Eq + Hash> InsertImmutable<T> for RefCell<HashSet<T>> {
2020
fn insert(&self, item: T) -> bool {
2121
self.borrow_mut().insert(item)
2222
}

git-repository/Cargo.toml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -115,6 +115,7 @@ git-credentials = { version = "^0.7.0", path = "../git-credentials" }
115115
git-prompt = { version = "^0.2.0", path = "../git-prompt" }
116116
git-index = { version = "^0.9.1", path = "../git-index" }
117117
git-worktree = { version = "^0.9.0", path = "../git-worktree" }
118+
git-hashtable = { version = "^0.1.0", path = "../git-hashtable" }
118119

119120
once_cell = "1.14.0"
120121
signal-hook = { version = "0.3.9", default-features = false }

git-repository/src/commit.rs

Lines changed: 2 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,7 @@ pub mod describe {
2020
use std::borrow::Cow;
2121

2222
use git_hash::ObjectId;
23+
use git_hashtable::HashMap;
2324
use git_odb::Find;
2425

2526
use crate::{bstr::BStr, ext::ObjectIdExt, Repository};
@@ -66,10 +67,7 @@ pub mod describe {
6667
}
6768

6869
impl SelectRef {
69-
fn names(
70-
&self,
71-
repo: &Repository,
72-
) -> Result<git_revision::hash_hasher::HashedMap<ObjectId, Cow<'static, BStr>>, Error> {
70+
fn names(&self, repo: &Repository) -> Result<HashMap<ObjectId, Cow<'static, BStr>>, Error> {
7371
let platform = repo.references()?;
7472

7573
Ok(match self {

git-revision/Cargo.toml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -19,9 +19,9 @@ serde1 = [ "serde", "git-hash/serde1", "git-object/serde1" ]
1919
git-hash = { version = "^0.10.0", path = "../git-hash" }
2020
git-object = { version = "^0.23.0", path = "../git-object" }
2121
git-date = { version = "^0.3.0", path = "../git-date" }
22+
git-hashtable = { version = "^0.1.0", path = "../git-hashtable" }
2223

2324
bstr = { version = "1.0.1", default-features = false, features = ["std"]}
24-
hash_hasher = "2.0.3"
2525
thiserror = "1.0.26"
2626
serde = { version = "1.0.114", optional = true, default-features = false, features = ["derive"] }
2727
document-features = { version = "0.2.1", optional = true }

0 commit comments

Comments
 (0)