Skip to content

Commit a4300c8

Browse files
committed
obtain a repository-list with classifiers
If the obtained list is empty, we update against that empty list which fills the repositories tables for the first time. Additionally we provide a new sub-command to refresh the list (additions only).
1 parent fcbda1d commit a4300c8

File tree

4 files changed

+205
-72
lines changed

4 files changed

+205
-72
lines changed

gitoxide-core/Cargo.toml

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -22,7 +22,8 @@ estimate-hours = ["dep:itertools", "dep:fs-err", "dep:crossbeam-channel", "dep:s
2222
## Gather information about repositories and store it in a database for easy querying.
2323
query = ["dep:rusqlite"]
2424
## Run algorithms on a corpus of repositories and store their results for later comparison and intelligence gathering.
25-
corpus = ["dep:rusqlite", "dep:sysinfo"]
25+
## *Note that* `organize` we need for finding git repositories fast.
26+
corpus = ["dep:rusqlite", "dep:sysinfo", "organize", "dep:crossbeam-channel"]
2627

2728
#! ### Mutually Exclusive Networking
2829
#! If both are set, _blocking-client_ will take precedence, allowing `--all-features` to be used.

gitoxide-core/src/corpus/mod.rs

Lines changed: 200 additions & 68 deletions
Original file line numberDiff line numberDiff line change
@@ -1,13 +1,25 @@
1+
pub const PROGRESS_RANGE: std::ops::RangeInclusive<u8> = 0..=3;
2+
13
pub struct Engine<P> {
24
progress: P,
35
con: rusqlite::Connection,
46
gitoxide_version: String,
57
}
68

9+
pub struct RunOutcome {
10+
/// the relative path to the repositories that could not be found on disk
11+
pub missing_repos_rela_paths: usize,
12+
}
13+
714
pub mod engine {
15+
use super::db;
816
use crate::corpus::Engine;
17+
use crate::organize::find_git_repository_workdirs;
918
use anyhow::Context;
10-
use std::path::PathBuf;
19+
use bytesize::ByteSize;
20+
use rusqlite::params;
21+
use std::path::{Path, PathBuf};
22+
use std::time::Instant;
1123

1224
pub(crate) type Id = u32;
1325

@@ -26,11 +38,126 @@ pub mod engine {
2638
}
2739

2840
/// Run on the existing set of repositories we have already seen or obtain them from `path` if there is none yet.
29-
pub fn run(&self, path: PathBuf) -> anyhow::Result<()> {
30-
let _corpus_id = self.corpus_id_or_insert(&path)?;
41+
pub fn run(&mut self, corpus_path: PathBuf) -> anyhow::Result<()> {
42+
let corpus_path = gix::path::realpath(corpus_path)?;
43+
let corpus_id = self.corpus_id_or_insert(&corpus_path)?;
3144
let _gitoxide_id = self.gitoxide_version_id_or_insert()?;
3245
let _runner_id = self.runner_id_or_insert()?;
33-
todo!()
46+
let _repos = self.find_repos_or_insert(&corpus_path, corpus_id)?;
47+
todo!("do run on repos")
48+
}
49+
50+
fn find_repos(&mut self, corpus_id: Id) -> anyhow::Result<Vec<db::Repo>> {
51+
self.progress.set_name("query db-repos");
52+
self.progress.init(None, gix::progress::count("repos"));
53+
54+
Ok(self
55+
.con
56+
.prepare(
57+
"SELECT id, rela_path, odb_size, num_objects, num_references FROM repository WHERE corpus = ?1",
58+
)?
59+
.query_map([corpus_id], |r| {
60+
Ok(db::Repo {
61+
id: r.get(0)?,
62+
path: r.get::<_, String>(1)?.into(),
63+
odb_size: ByteSize(r.get(2)?),
64+
num_objects: r.get(3)?,
65+
num_references: r.get(4)?,
66+
})
67+
})?
68+
.inspect(|_| self.progress.inc())
69+
.collect::<Result<_, _>>()?)
70+
}
71+
72+
fn refresh_repos(&mut self, corpus_path: &Path, corpus_id: Id) -> anyhow::Result<Vec<db::Repo>> {
73+
let start = Instant::now();
74+
self.progress.set_name("refresh");
75+
self.progress.init(None, gix::progress::count("repos"));
76+
77+
let repos = std::thread::scope({
78+
let progress = &mut self.progress;
79+
let con = &mut self.con;
80+
|scope| -> anyhow::Result<_> {
81+
let threads = std::thread::available_parallelism()
82+
.map(std::num::NonZeroUsize::get)
83+
.ok()
84+
.unwrap_or(1);
85+
let (path_tx, repo_rx) = {
86+
let (path_tx, path_rx) = crossbeam_channel::bounded(threads * 2);
87+
let (repo_tx, repo_rx) = std::sync::mpsc::channel::<(PathBuf, anyhow::Result<db::Repo>)>();
88+
(0..threads).for_each(|_| {
89+
scope.spawn({
90+
let path_rx = path_rx.clone();
91+
let repo_tx = repo_tx.clone();
92+
move || -> anyhow::Result<_> {
93+
for repo_path in path_rx {
94+
let res = (|| {
95+
let repo = gix::open_opts(&repo_path, gix::open::Options::isolated())?;
96+
db::Repo::try_from(&repo)
97+
})();
98+
repo_tx.send((repo_path, res))?;
99+
}
100+
Ok(())
101+
}
102+
});
103+
});
104+
(path_tx, repo_rx)
105+
};
106+
107+
let find_progress = progress.add_child("find");
108+
let write_db = scope.spawn(move || -> anyhow::Result<Vec<db::Repo>> {
109+
progress.set_name("write to DB");
110+
progress.init(None, gix::progress::count("repos"));
111+
let start = Instant::now();
112+
113+
let mut out = Vec::new();
114+
let transaction = con.transaction()?;
115+
let mut statement = transaction.prepare("INSERT INTO repository (rela_path, corpus, odb_size, num_objects, num_references) VALUES (?1, ?2, ?3, ?4, ?5)\
116+
ON CONFLICT DO UPDATE SET rela_path = rela_path, corpus = corpus, odb_size = ?3, num_objects = ?4, num_references = ?5\
117+
RETURNING id")?;
118+
for (repo_path, repo_res) in repo_rx {
119+
match repo_res {
120+
Ok(mut repo) => {
121+
let rela_path = repo.path.strip_prefix(corpus_path)?;
122+
repo.id = statement.query_row(params![rela_path.to_str().context("only valid UTF8 is allowed for repository paths")?, corpus_id, repo.odb_size.as_u64(), repo.num_objects, repo.num_references], |r| r.get(0))?;
123+
out.push(repo);
124+
progress.inc();
125+
}
126+
Err(err) => progress.fail(format!("{repo_path:?}: {err:#?}")),
127+
}
128+
}
129+
statement.finalize()?;
130+
transaction.commit()?;
131+
progress.show_throughput(start);
132+
Ok(out)
133+
});
134+
135+
let repos = gix::interrupt::Iter::new(
136+
find_git_repository_workdirs(corpus_path, find_progress, false, Some(threads)),
137+
|| anyhow::anyhow!("interrupted by user"),
138+
);
139+
for res in repos {
140+
let (repo_path, _kind) = res?;
141+
path_tx.send(repo_path)?;
142+
}
143+
drop(path_tx);
144+
write_db.join().expect("no panic")
145+
}
146+
})?;
147+
148+
self.progress.show_throughput(start);
149+
Ok(repos)
150+
}
151+
152+
fn find_repos_or_insert(&mut self, corpus_path: &Path, corpus_id: Id) -> anyhow::Result<Vec<db::Repo>> {
153+
let start = Instant::now();
154+
let repos = self.find_repos(corpus_id)?;
155+
if repos.is_empty() {
156+
self.refresh_repos(corpus_path, corpus_id)
157+
} else {
158+
self.progress.show_throughput(start);
159+
Ok(repos)
160+
}
34161
}
35162
}
36163
}
@@ -39,10 +166,49 @@ pub mod db {
39166
use crate::corpus::engine::Id;
40167
use crate::corpus::Engine;
41168
use anyhow::{bail, Context};
169+
use bytesize::ByteSize;
42170
use rusqlite::{params, OptionalExtension};
43-
use std::path::Path;
171+
use std::path::{Path, PathBuf};
44172
use sysinfo::{CpuExt, CpuRefreshKind, RefreshKind, SystemExt};
45173

174+
/// a husk of a repository
175+
pub(crate) struct Repo {
176+
pub(crate) id: Id,
177+
/// The full path to the repository on disk, not yet validated to exist.
178+
pub(crate) path: PathBuf,
179+
/// The size of the object database, counted quickly by packs only.
180+
pub(crate) odb_size: ByteSize,
181+
/// The amount of objects stored in the object database.
182+
pub(crate) num_objects: u64,
183+
/// The total amount of references, no matter which type.
184+
pub(crate) num_references: usize,
185+
}
186+
187+
impl Repo {
188+
pub(crate) fn try_from(repo: &gix::Repository) -> anyhow::Result<Self> {
189+
let num_references = repo.refs.iter()?.all()?.count();
190+
let num_objects = repo.objects.packed_object_count()?;
191+
let odb_size = ByteSize(
192+
std::fs::read_dir(repo.objects.store_ref().path().join("pack"))
193+
.map(|dir| {
194+
dir.filter_map(Result::ok)
195+
.filter_map(|e| e.metadata().ok())
196+
.filter_map(|m| m.is_file().then_some(m.len()))
197+
.sum()
198+
})
199+
.unwrap_or_default(),
200+
);
201+
202+
Ok(Repo {
203+
id: 0,
204+
path: repo.path().to_owned(),
205+
odb_size,
206+
num_objects,
207+
num_references,
208+
})
209+
}
210+
}
211+
46212
/// A version to be incremented whenever the database layout is changed, to refresh it automatically.
47213
const VERSION: usize = 1;
48214

@@ -67,9 +233,11 @@ pub mod db {
67233
},
68234
_ => {}
69235
}
236+
con.execute_batch("PRAGMA synchronous = OFF;")?;
70237
con.execute_batch(
71238
r#"
72239
CREATE TABLE if not exists runner(
240+
id integer PRIMARY KEY,
73241
vendor text,
74242
brand text,
75243
host_name text, -- this is just to help ID the runner
@@ -80,22 +248,29 @@ pub mod db {
80248
con.execute_batch(
81249
r#"
82250
CREATE TABLE if not exists corpus(
251+
id integer PRIMARY KEY,
83252
root text UNIQUE -- the root path of all repositories we want to consider, as canonicalized path
84253
)
85254
"#,
86255
)?;
87256
con.execute_batch(
88257
r#"
89258
CREATE TABLE if not exists repository(
90-
rela_path text UNIQUE, -- the path to the repository on disk, relative to the corpus root path, without leading `./` or `.\`
259+
id integer PRIMARY KEY,
260+
rela_path text, -- the path to the repository on disk, relative to the corpus root path, without leading `./` or `.\`
91261
corpus integer,
92-
FOREIGN KEY (corpus) REFERENCES corpus (rowid)
262+
odb_size integer, -- the object database size in bytes
263+
num_references integer, -- the total amount of references
264+
num_objects integer, -- the total amount of objects
265+
FOREIGN KEY (corpus) REFERENCES corpus (id)
266+
UNIQUE (rela_path, corpus)
93267
)
94268
"#,
95269
)?;
96270
con.execute_batch(
97271
r#"
98272
CREATE TABLE if not exists gitoxide_version(
273+
id integer PRIMARY KEY,
99274
version text UNIQUE -- the unique git version via gix describe
100275
)
101276
"#,
@@ -109,9 +284,9 @@ pub mod db {
109284
start_time integer,
110285
end_time integer, -- or NULL if not yet finished (either successfull or with failure)
111286
error text, -- or NULL if there was on error
112-
FOREIGN KEY (repository) REFERENCES repository (rowid),
113-
FOREIGN KEY (runner) REFERENCES runner (rowid),
114-
FOREIGN KEY (gitoxide_version) REFERENCES gitoxide_version (rowid)
287+
FOREIGN KEY (repository) REFERENCES repository (id),
288+
FOREIGN KEY (runner) REFERENCES runner (id),
289+
FOREIGN KEY (gitoxide_version) REFERENCES gitoxide_version (id)
115290
)
116291
"#,
117292
)?;
@@ -129,73 +304,30 @@ pub mod db {
129304
let vendor = Some(cpu.vendor_id().to_owned());
130305
let host = sys.host_name();
131306
let brand = Some(cpu.brand().to_owned());
132-
Ok(
133-
match self
134-
.con
135-
.query_row(
136-
"SELECT rowid FROM runner WHERE vendor = ?1 AND brand = ?2",
137-
[vendor.as_deref(), brand.as_deref()],
138-
|r| r.get(0),
139-
)
140-
.optional()?
141-
{
142-
Some(existing) => existing,
143-
None => {
144-
self.con.execute(
145-
"INSERT INTO runner (vendor, brand, host_name) VALUES (?1, ?2, ?3)",
146-
[vendor.as_deref(), brand.as_deref(), host.as_deref()],
147-
)?;
148-
self.con.query_row(
149-
"SELECT rowid FROM runner WHERE vendor = ?1 AND brand = ?2",
150-
[vendor, brand],
151-
|r| r.get(0),
152-
)?
153-
}
154-
},
155-
)
307+
Ok(self.con.query_row(
308+
"INSERT INTO runner (vendor, brand, host_name) VALUES (?1, ?2, ?3) \
309+
ON CONFLICT DO UPDATE SET vendor = vendor, brand = brand, host_name = ?3 RETURNING id",
310+
[vendor.as_deref(), brand.as_deref(), host.as_deref()],
311+
|r| r.get(0),
312+
)?)
156313
}
157314
pub(crate) fn corpus_id_or_insert(&self, path: &Path) -> anyhow::Result<Id> {
158315
let path = path.to_str().context("corpus root cannot contain illformed UTF-8")?;
159-
Ok(
160-
match self
161-
.con
162-
.query_row("SELECT rowid FROM corpus WHERE root = ?1", [path], |r| r.get(0))
163-
.optional()?
164-
{
165-
Some(existing) => existing,
166-
None => {
167-
self.con.execute("INSERT INTO corpus (root) VALUES (?1)", [path])?;
168-
self.con
169-
.query_row("SELECT rowid FROM corpus WHERE root = ?1", [path], |r| r.get(0))?
170-
}
171-
},
172-
)
316+
Ok(self.con.query_row(
317+
"INSERT INTO corpus (root) VALUES (?1) \
318+
ON CONFLICT DO UPDATE SET root = root RETURNING id",
319+
[path],
320+
|r| r.get(0),
321+
)?)
173322
}
174323
pub(crate) fn gitoxide_version_id_or_insert(&self) -> anyhow::Result<Id> {
175-
Ok(
176-
match self
324+
Ok(self
177325
.con
178326
.query_row(
179-
"SELECT rowid FROM gitoxide_version WHERE version = ?1",
327+
"INSERT INTO gitoxide_version (version) VALUES (?1) ON CONFLICT DO UPDATE SET version = version RETURNING id",
180328
[&self.gitoxide_version],
181329
|r| r.get(0),
182-
)
183-
.optional()?
184-
{
185-
Some(existing) => existing,
186-
None => {
187-
self.con.execute(
188-
"INSERT INTO gitoxide_version (version) VALUES (?1)",
189-
[&self.gitoxide_version],
190-
)?;
191-
self.con.query_row(
192-
"SELECT rowid FROM gitoxide_version WHERE version = ?1",
193-
[&self.gitoxide_version],
194-
|r| r.get(0),
195-
)?
196-
}
197-
},
198-
)
330+
)?)
199331
}
200332
}
201333
}

gitoxide-core/src/organize.rs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,7 @@ pub enum Mode {
1212
Simulate,
1313
}
1414

15-
fn find_git_repository_workdirs(
15+
pub fn find_git_repository_workdirs(
1616
root: impl AsRef<Path>,
1717
mut progress: impl Progress,
1818
debug: bool,

src/plumbing/main.rs

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -134,9 +134,9 @@ pub fn main() -> Result<()> {
134134
auto_verbose,
135135
progress,
136136
progress_keep_open,
137-
None,
137+
core::corpus::PROGRESS_RANGE,
138138
move |progress, _out, _err| {
139-
let engine = core::corpus::Engine::open_or_create(db, env!("GITOXIDE_VERSION").into(), progress)?;
139+
let mut engine = core::corpus::Engine::open_or_create(db, env!("GITOXIDE_VERSION").into(), progress)?;
140140
match cmd {
141141
crate::plumbing::options::corpus::SubCommands::Run => engine.run(path),
142142
}

0 commit comments

Comments
 (0)