Skip to content

Commit 4c799d3

Browse files
committed
obtain a repository-list with classifiers
If the obtained list is empty, we update against that empty list which fills the repositories tables for the first time. Additionally we provide a new sub-command to refresh the list (additions only).
1 parent fcbda1d commit 4c799d3

File tree

4 files changed

+203
-72
lines changed

4 files changed

+203
-72
lines changed

gitoxide-core/Cargo.toml

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -22,7 +22,8 @@ estimate-hours = ["dep:itertools", "dep:fs-err", "dep:crossbeam-channel", "dep:s
2222
## Gather information about repositories and store it in a database for easy querying.
2323
query = ["dep:rusqlite"]
2424
## Run algorithms on a corpus of repositories and store their results for later comparison and intelligence gathering.
25-
corpus = ["dep:rusqlite", "dep:sysinfo"]
25+
## *Note that* `organize` we need for finding git repositories fast.
26+
corpus = ["dep:rusqlite", "dep:sysinfo", "organize", "dep:crossbeam-channel"]
2627

2728
#! ### Mutually Exclusive Networking
2829
#! If both are set, _blocking-client_ will take precedence, allowing `--all-features` to be used.

gitoxide-core/src/corpus/mod.rs

Lines changed: 198 additions & 68 deletions
Original file line numberDiff line numberDiff line change
@@ -1,13 +1,25 @@
1+
pub const PROGRESS_RANGE: std::ops::RangeInclusive<u8> = 0..=3;
2+
13
pub struct Engine<P> {
24
progress: P,
35
con: rusqlite::Connection,
46
gitoxide_version: String,
57
}
68

9+
pub struct RunOutcome {
10+
/// the relative path to the repositories that could not be found on disk
11+
pub missing_repos_rela_paths: usize,
12+
}
13+
714
pub mod engine {
15+
use super::db;
816
use crate::corpus::Engine;
17+
use crate::organize::find_git_repository_workdirs;
918
use anyhow::Context;
10-
use std::path::PathBuf;
19+
use bytesize::ByteSize;
20+
use rusqlite::params;
21+
use std::path::{Path, PathBuf};
22+
use std::time::Instant;
1123

1224
pub(crate) type Id = u32;
1325

@@ -26,11 +38,124 @@ pub mod engine {
2638
}
2739

2840
/// Run on the existing set of repositories we have already seen or obtain them from `path` if there is none yet.
29-
pub fn run(&self, path: PathBuf) -> anyhow::Result<()> {
30-
let _corpus_id = self.corpus_id_or_insert(&path)?;
41+
pub fn run(&mut self, corpus_path: PathBuf) -> anyhow::Result<()> {
42+
let corpus_path = gix::path::realpath(corpus_path)?;
43+
let corpus_id = self.corpus_id_or_insert(&corpus_path)?;
3144
let _gitoxide_id = self.gitoxide_version_id_or_insert()?;
3245
let _runner_id = self.runner_id_or_insert()?;
33-
todo!()
46+
let _repos = self.find_repos_or_insert(&corpus_path, corpus_id)?;
47+
todo!("do run on repos")
48+
}
49+
50+
fn find_repos(&mut self, corpus_id: Id) -> anyhow::Result<Vec<db::Repo>> {
51+
self.progress.set_name("query db-repos");
52+
self.progress.init(None, gix::progress::count("repos"));
53+
54+
Ok(self
55+
.con
56+
.prepare(
57+
"SELECT id, rela_path, odb_size, num_objects, num_references FROM repository WHERE corpus = ?1",
58+
)?
59+
.query_map([corpus_id], |r| {
60+
Ok(db::Repo {
61+
id: r.get(0)?,
62+
path: r.get::<_, String>(1)?.into(),
63+
odb_size: ByteSize(r.get(2)?),
64+
num_objects: r.get(3)?,
65+
num_references: r.get(4)?,
66+
})
67+
})?
68+
.inspect(|_| self.progress.inc())
69+
.collect::<Result<_, _>>()?)
70+
}
71+
72+
fn refresh_repos(&mut self, corpus_path: &Path, corpus_id: Id) -> anyhow::Result<Vec<db::Repo>> {
73+
let start = Instant::now();
74+
self.progress.set_name("refresh");
75+
self.progress.init(None, gix::progress::count("repos"));
76+
77+
let repos = std::thread::scope({
78+
let progress = &mut self.progress;
79+
let con = &mut self.con;
80+
|scope| -> anyhow::Result<_> {
81+
let threads = std::thread::available_parallelism()
82+
.map(std::num::NonZeroUsize::get)
83+
.ok()
84+
.unwrap_or(1);
85+
let (path_tx, repo_rx) = {
86+
let (path_tx, path_rx) = crossbeam_channel::bounded(threads * 2);
87+
let (repo_tx, repo_rx) = std::sync::mpsc::channel::<(PathBuf, anyhow::Result<db::Repo>)>();
88+
(0..threads).for_each(|_| {
89+
scope.spawn({
90+
let path_rx = path_rx.clone();
91+
let repo_tx = repo_tx.clone();
92+
move || -> anyhow::Result<_> {
93+
for repo_path in path_rx {
94+
let res = (|| {
95+
let repo = gix::open_opts(&repo_path, gix::open::Options::isolated())?;
96+
db::Repo::try_from(&repo)
97+
})();
98+
repo_tx.send((repo_path, res))?;
99+
}
100+
Ok(())
101+
}
102+
});
103+
});
104+
(path_tx, repo_rx)
105+
};
106+
107+
let find_progress = progress.add_child("find");
108+
let write_db = scope.spawn(move || -> anyhow::Result<Vec<db::Repo>> {
109+
progress.set_name("write to DB");
110+
progress.init(None, gix::progress::count("repos"));
111+
let start = Instant::now();
112+
113+
let mut out = Vec::new();
114+
let mut statement = con.prepare("INSERT INTO repository (rela_path, corpus, odb_size, num_objects, num_references) VALUES (?1, ?2, ?3, ?4, ?5)\
115+
ON CONFLICT DO UPDATE SET rela_path = rela_path, corpus = corpus, odb_size = ?3, num_objects = ?4, num_references = ?5\
116+
RETURNING id")?;
117+
for (repo_path, repo_res) in repo_rx {
118+
match repo_res {
119+
Ok(mut repo) => {
120+
let rela_path = repo.path.strip_prefix(corpus_path)?;
121+
repo.id = statement.query_row(params![rela_path.to_str().context("only valid UTF8 is allowed for repository paths")?, corpus_id, repo.odb_size.as_u64(), repo.num_objects, repo.num_references], |r| r.get(0))?;
122+
out.push(repo);
123+
progress.inc();
124+
}
125+
Err(err) => progress.fail(format!("{repo_path:?}: {err:#?}")),
126+
}
127+
}
128+
statement.finalize()?;
129+
progress.show_throughput(start);
130+
Ok(out)
131+
});
132+
133+
let repos = gix::interrupt::Iter::new(
134+
find_git_repository_workdirs(corpus_path, find_progress, false, Some(threads)),
135+
|| anyhow::anyhow!("interrupted by user"),
136+
);
137+
for res in repos {
138+
let (repo_path, _kind) = res?;
139+
path_tx.send(repo_path)?;
140+
}
141+
drop(path_tx);
142+
write_db.join().expect("no panic")
143+
}
144+
})?;
145+
146+
self.progress.show_throughput(start);
147+
Ok(repos)
148+
}
149+
150+
fn find_repos_or_insert(&mut self, corpus_path: &Path, corpus_id: Id) -> anyhow::Result<Vec<db::Repo>> {
151+
let start = Instant::now();
152+
let repos = self.find_repos(corpus_id)?;
153+
if repos.is_empty() {
154+
self.refresh_repos(corpus_path, corpus_id)
155+
} else {
156+
self.progress.show_throughput(start);
157+
Ok(repos)
158+
}
34159
}
35160
}
36161
}
@@ -39,10 +164,49 @@ pub mod db {
39164
use crate::corpus::engine::Id;
40165
use crate::corpus::Engine;
41166
use anyhow::{bail, Context};
167+
use bytesize::ByteSize;
42168
use rusqlite::{params, OptionalExtension};
43-
use std::path::Path;
169+
use std::path::{Path, PathBuf};
44170
use sysinfo::{CpuExt, CpuRefreshKind, RefreshKind, SystemExt};
45171

172+
/// a husk of a repository
173+
pub(crate) struct Repo {
174+
pub(crate) id: Id,
175+
/// The full path to the repository on disk, not yet validated to exist.
176+
pub(crate) path: PathBuf,
177+
/// The size of the object database, counted quickly by packs only.
178+
pub(crate) odb_size: ByteSize,
179+
/// The amount of objects stored in the object database.
180+
pub(crate) num_objects: u64,
181+
/// The total amount of references, no matter which type.
182+
pub(crate) num_references: usize,
183+
}
184+
185+
impl Repo {
186+
pub(crate) fn try_from(repo: &gix::Repository) -> anyhow::Result<Self> {
187+
let num_references = repo.refs.iter()?.all()?.count();
188+
let num_objects = repo.objects.packed_object_count()?;
189+
let odb_size = ByteSize(
190+
std::fs::read_dir(repo.objects.store_ref().path().join("pack"))
191+
.map(|dir| {
192+
dir.filter_map(Result::ok)
193+
.filter_map(|e| e.metadata().ok())
194+
.filter_map(|m| m.is_file().then_some(m.len()))
195+
.sum()
196+
})
197+
.unwrap_or_default(),
198+
);
199+
200+
Ok(Repo {
201+
id: 0,
202+
path: repo.path().to_owned(),
203+
odb_size,
204+
num_objects,
205+
num_references,
206+
})
207+
}
208+
}
209+
46210
/// A version to be incremented whenever the database layout is changed, to refresh it automatically.
47211
const VERSION: usize = 1;
48212

@@ -67,9 +231,11 @@ pub mod db {
67231
},
68232
_ => {}
69233
}
234+
con.execute_batch("PRAGMA synchronous = OFF;")?;
70235
con.execute_batch(
71236
r#"
72237
CREATE TABLE if not exists runner(
238+
id integer PRIMARY KEY,
73239
vendor text,
74240
brand text,
75241
host_name text, -- this is just to help ID the runner
@@ -80,22 +246,29 @@ pub mod db {
80246
con.execute_batch(
81247
r#"
82248
CREATE TABLE if not exists corpus(
249+
id integer PRIMARY KEY,
83250
root text UNIQUE -- the root path of all repositories we want to consider, as canonicalized path
84251
)
85252
"#,
86253
)?;
87254
con.execute_batch(
88255
r#"
89256
CREATE TABLE if not exists repository(
90-
rela_path text UNIQUE, -- the path to the repository on disk, relative to the corpus root path, without leading `./` or `.\`
257+
id integer PRIMARY KEY,
258+
rela_path text, -- the path to the repository on disk, relative to the corpus root path, without leading `./` or `.\`
91259
corpus integer,
92-
FOREIGN KEY (corpus) REFERENCES corpus (rowid)
260+
odb_size integer, -- the object database size in bytes
261+
num_references integer, -- the total amount of references
262+
num_objects integer, -- the total amount of objects
263+
FOREIGN KEY (corpus) REFERENCES corpus (id)
264+
UNIQUE (rela_path, corpus)
93265
)
94266
"#,
95267
)?;
96268
con.execute_batch(
97269
r#"
98270
CREATE TABLE if not exists gitoxide_version(
271+
id integer PRIMARY KEY,
99272
version text UNIQUE -- the unique git version via gix describe
100273
)
101274
"#,
@@ -109,9 +282,9 @@ pub mod db {
109282
start_time integer,
110283
end_time integer, -- or NULL if not yet finished (either successfull or with failure)
111284
error text, -- or NULL if there was on error
112-
FOREIGN KEY (repository) REFERENCES repository (rowid),
113-
FOREIGN KEY (runner) REFERENCES runner (rowid),
114-
FOREIGN KEY (gitoxide_version) REFERENCES gitoxide_version (rowid)
285+
FOREIGN KEY (repository) REFERENCES repository (id),
286+
FOREIGN KEY (runner) REFERENCES runner (id),
287+
FOREIGN KEY (gitoxide_version) REFERENCES gitoxide_version (id)
115288
)
116289
"#,
117290
)?;
@@ -129,73 +302,30 @@ pub mod db {
129302
let vendor = Some(cpu.vendor_id().to_owned());
130303
let host = sys.host_name();
131304
let brand = Some(cpu.brand().to_owned());
132-
Ok(
133-
match self
134-
.con
135-
.query_row(
136-
"SELECT rowid FROM runner WHERE vendor = ?1 AND brand = ?2",
137-
[vendor.as_deref(), brand.as_deref()],
138-
|r| r.get(0),
139-
)
140-
.optional()?
141-
{
142-
Some(existing) => existing,
143-
None => {
144-
self.con.execute(
145-
"INSERT INTO runner (vendor, brand, host_name) VALUES (?1, ?2, ?3)",
146-
[vendor.as_deref(), brand.as_deref(), host.as_deref()],
147-
)?;
148-
self.con.query_row(
149-
"SELECT rowid FROM runner WHERE vendor = ?1 AND brand = ?2",
150-
[vendor, brand],
151-
|r| r.get(0),
152-
)?
153-
}
154-
},
155-
)
305+
Ok(self.con.query_row(
306+
"INSERT INTO runner (vendor, brand, host_name) VALUES (?1, ?2, ?3) \
307+
ON CONFLICT DO UPDATE SET vendor = vendor, brand = brand, host_name = ?3 RETURNING id",
308+
[vendor.as_deref(), brand.as_deref(), host.as_deref()],
309+
|r| r.get(0),
310+
)?)
156311
}
157312
pub(crate) fn corpus_id_or_insert(&self, path: &Path) -> anyhow::Result<Id> {
158313
let path = path.to_str().context("corpus root cannot contain illformed UTF-8")?;
159-
Ok(
160-
match self
161-
.con
162-
.query_row("SELECT rowid FROM corpus WHERE root = ?1", [path], |r| r.get(0))
163-
.optional()?
164-
{
165-
Some(existing) => existing,
166-
None => {
167-
self.con.execute("INSERT INTO corpus (root) VALUES (?1)", [path])?;
168-
self.con
169-
.query_row("SELECT rowid FROM corpus WHERE root = ?1", [path], |r| r.get(0))?
170-
}
171-
},
172-
)
314+
Ok(self.con.query_row(
315+
"INSERT INTO corpus (root) VALUES (?1) \
316+
ON CONFLICT DO UPDATE SET root = root RETURNING id",
317+
[path],
318+
|r| r.get(0),
319+
)?)
173320
}
174321
pub(crate) fn gitoxide_version_id_or_insert(&self) -> anyhow::Result<Id> {
175-
Ok(
176-
match self
322+
Ok(self
177323
.con
178324
.query_row(
179-
"SELECT rowid FROM gitoxide_version WHERE version = ?1",
325+
"INSERT INTO gitoxide_version (version) VALUES (?1) ON CONFLICT DO UPDATE SET version = version RETURNING id",
180326
[&self.gitoxide_version],
181327
|r| r.get(0),
182-
)
183-
.optional()?
184-
{
185-
Some(existing) => existing,
186-
None => {
187-
self.con.execute(
188-
"INSERT INTO gitoxide_version (version) VALUES (?1)",
189-
[&self.gitoxide_version],
190-
)?;
191-
self.con.query_row(
192-
"SELECT rowid FROM gitoxide_version WHERE version = ?1",
193-
[&self.gitoxide_version],
194-
|r| r.get(0),
195-
)?
196-
}
197-
},
198-
)
328+
)?)
199329
}
200330
}
201331
}

gitoxide-core/src/organize.rs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,7 @@ pub enum Mode {
1212
Simulate,
1313
}
1414

15-
fn find_git_repository_workdirs(
15+
pub fn find_git_repository_workdirs(
1616
root: impl AsRef<Path>,
1717
mut progress: impl Progress,
1818
debug: bool,

src/plumbing/main.rs

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -134,9 +134,9 @@ pub fn main() -> Result<()> {
134134
auto_verbose,
135135
progress,
136136
progress_keep_open,
137-
None,
137+
core::corpus::PROGRESS_RANGE,
138138
move |progress, _out, _err| {
139-
let engine = core::corpus::Engine::open_or_create(db, env!("GITOXIDE_VERSION").into(), progress)?;
139+
let mut engine = core::corpus::Engine::open_or_create(db, env!("GITOXIDE_VERSION").into(), progress)?;
140140
match cmd {
141141
crate::plumbing::options::corpus::SubCommands::Run => engine.run(path),
142142
}

0 commit comments

Comments
 (0)