Open
Description
The problem is that the callback wrapper in the walk()
implementation assumes utf-8 for the root name. However, this is not required by base Git or libgit2. Other git2-rs methods appear to correctly handle non-utf-8 names by allowing paths to be accessed as byte slices.
Here's a self-contained demonstration of the issue which also shows base git behavior:
Cargo.toml
[package]
name = "bad-encoding"
version = "0.1.0"
edition = "2021"
[dependencies]
anyhow = "1.0.80"
base64 = "0.21.7"
git2 = "0.18.2"
rand = "0.8.5"
src/main.rs
:
use std::{
path::{Path, PathBuf},
process::Command,
};
use anyhow::{anyhow, Context, Result};
use git2::{
ObjectType, Oid, Repository, RepositoryInitOptions, Signature, Tree, TreeBuilder,
TreeWalkResult,
};
use rand::RngCore;
fn main() -> Result<()> {
let mut rng = rand::thread_rng();
let temp_dir = create_temp_dir(&mut rng)?;
println!("Creating repo at {temp_dir:?}");
let mut options = RepositoryInitOptions::new();
// NOTE: We use a bare repo to avoid creating a working tree and
// materializing any pathological file names. Git, however, supports
// almost-arbitrary filenames internally (excluding embedded NUL and some
// other special characters, depending on the mode). libgit2 also supports
// arbitrary encoding with some exceptions:
// https://github.com/libgit2/libgit2/blob/54bfab93bffa4f98ee37e3730c8ed4061a1bc5db/src/util/fs_path.c#L1579.
options.bare(true).initial_head("main");
let repo = Repository::init_opts(&temp_dir, &options)?;
println!("Populating repo");
let spec = vec![
TreeSpec::blob(b"a.txt", b"Toplevel file. This is walkable because it's traversed _before_ the broken subtree."),
TreeSpec::blob(b"b\xc0.txt", b"Toplevel file. This is walkable as above, but it also has a filename which is not valid utf-8."),
// Note that the tree below (with an invalid utf-8 name) can be _listed_
// but not _traversed_ by the current walk implementation.
TreeSpec::tree(b"c\xc0", vec![
TreeSpec::blob(b"nested.txt", b"Not walkable due to the parent directory name being non-utf-8 bytes. This will abort the walk."),
]),
TreeSpec::blob(b"d.txt", b"Not walkable due to the broken tree above ending the walk."),
];
populate_repo(&repo, &spec)?;
println!("Repo populated");
println!();
println!("Walking via git2:");
walk_repo(&repo)?;
println!();
println!("Listing committed files using git command:");
list_repo(temp_dir.as_path())?;
Ok(())
}
fn create_temp_dir(rng: impl RngCore) -> Result<PathBuf> {
let mut rng = rng;
let toplevel = std::env::temp_dir();
let prefix = "git-repo";
for _ in 0..1000 {
let suffix = random_suffix(&mut rng);
let path = Path::join(toplevel.as_path(), format!("{prefix}-{suffix}.git"));
match std::fs::create_dir(path.as_path()) {
Ok(()) => return Ok(path),
Err(e) => match e.kind() {
std::io::ErrorKind::AlreadyExists => (),
_ => return Err(e.into()),
},
}
}
Err(anyhow!("could not allocate a unique random dir"))
}
fn random_suffix(rng: impl RngCore) -> String {
use base64::engine::general_purpose::URL_SAFE_NO_PAD;
use base64::Engine;
let mut rng = rng;
let mut buf = [0; 4];
rng.fill_bytes(&mut buf);
URL_SAFE_NO_PAD.encode(&buf)
}
fn populate_repo(repo: &Repository, spec: &[TreeSpec]) -> Result<()> {
let mut root = repo.treebuilder(None)?;
for item in spec {
add_to_tree(repo, &mut root, item)?;
}
let tree_oid = root.write()?;
let tree = repo.find_tree(tree_oid)?;
let signature = Signature::now("foo", "[email protected]")?;
let message = "Commit to hold reference";
repo.commit(Some("HEAD"), &signature, &signature, message, &tree, &[])?;
Ok(())
}
fn add_to_tree(repo: &Repository, root: &mut TreeBuilder, tree_spec: &TreeSpec) -> Result<()> {
match tree_spec {
TreeSpec::Tree { name, children } => {
let mut tree_builder = repo.treebuilder(None)?;
for child in children {
add_to_tree(repo, &mut tree_builder, child)?;
}
let oid = tree_builder.write()?;
root.insert(name, oid, 0o040000)?;
}
TreeSpec::Blob { name, contents } => {
let oid = repo.blob(contents)?;
root.insert(name, oid, 0o100644)?;
}
}
Ok(())
}
fn walk_repo(repo: &Repository) -> Result<()> {
let mut revwalk = repo.revwalk()?;
revwalk.push_head()?;
for oid in revwalk {
let oid = oid?;
describe_commit(&repo, oid).with_context(|| format!("could not inspect commit {oid}"))?;
}
Ok(())
}
fn describe_commit(repo: &Repository, oid: Oid) -> Result<()> {
let commit = repo.find_commit(oid)?;
let summary = commit
.summary()
.ok_or_else(|| anyhow!("could not open commit summary"))
.with_context(|| format!("reading oid {oid}"))?;
println!("{oid}: {summary}");
let tree = commit.tree()?;
inspect_tree(repo, &tree)?;
Ok(())
}
fn inspect_tree(repo: &Repository, tree: &Tree) -> Result<()> {
// Note that the signature below _assumes_ that `root` can always be coerced
// to `&str` (i.e., utf-8). Due to the implementation, in practice, the
// result is that subtrees containing non-utf-8 encodings are always
// silently skipped while walking. Note that basenames (`filename` below)
// are correctly handled as bytes due to `TreeEntry`'s interface.
tree.walk(git2::TreeWalkMode::PreOrder, |root, entry| {
let filename = String::from_utf8_lossy(entry.name_bytes());
let path = format!("{root}{filename}");
if let Ok(object) = entry.to_object(repo) {
match object.kind() {
Some(ObjectType::Blob) => {
let blob = object.as_blob().unwrap();
if blob.is_binary() {
println!(" {path}: <binary>")
} else {
println!(" {path}:");
let contents = String::from_utf8_lossy(blob.content());
for line in contents.lines() {
println!(" > {line}");
}
}
}
Some(ObjectType::Tree) => {
println!(" {path}: <tree>");
}
None => {
println!("{path} had no object type");
}
_ => {
println!("{path} was an unexpected object type (not a blob or tree)");
}
}
// Is it possible to return a result from the callback? It would be
// much better ergonomically if we could.
TreeWalkResult::Ok
} else {
println!("ERROR: could not get object for {path}");
TreeWalkResult::Abort
}
})?;
Ok(())
}
fn list_repo(path: &Path) -> Result<()> {
let mut child = Command::new("git")
.arg("-C")
.arg(path)
.arg("ls-tree")
.arg("-r")
.arg("main")
.spawn()?;
let result = child.wait()?;
if result.success() {
Ok(())
} else {
if let Some(code) = result.code() {
Err(anyhow!("git failed with status code {code}"))
} else {
Err(anyhow!("git failed"))
}
}
}
enum TreeSpec {
Tree {
name: &'static [u8],
children: Vec<TreeSpec>,
},
Blob {
name: &'static [u8],
contents: &'static [u8],
},
}
impl TreeSpec {
fn tree(name: &'static [u8], children: Vec<TreeSpec>) -> TreeSpec {
TreeSpec::Tree { name, children }
}
fn blob(name: &'static [u8], contents: &'static [u8]) -> TreeSpec {
TreeSpec::Blob { name, contents }
}
}
Sample invocation:
> cargo run --release
Finished release [optimized] target(s) in 0.23s
Running `target/release/bad-encoding`
Creating repo at "/var/folders/0h/1lcyrkv11ynfjq5w7bt0sl_00000gn/T/git-repo-kDBu5A.git"
Populating repo
Repo populated
Walking via git2:
6cdc2a2f7544639226efe4e872ddd2ec42fb70a0: Commit to hold reference
a.txt:
> Toplevel file. This is walkable because it's traversed _before_ the broken subtree.
b�.txt:
> Toplevel file. This is walkable as above, but it also has a filename which is not valid utf-8.
c�: <tree>
Listing committed files using git command:
100644 blob 596c677536cbdd42ed208abb4135afcb95bd967c a.txt
100644 blob 330e18bbaa623998f17ce703cdcf7158915c6bc5 "b\300.txt"
100644 blob 5fd5bb9cddb4076e8786e90670ba46e621174482 "c\300/nested.txt"
100644 blob 07bd6651bc96c9554dab80f1f15b867d4091116a d.txt
The example binary does not delete the bare git repo afterward so you can easily inspect it.
Metadata
Metadata
Assignees
Labels
No labels