Skip to content

Commit 8f795e8

Browse files
cesfahaniByron
authored andcommitted
feat: add basic connectivity check
Implement a simple connectivity check in a new `gix-fsck` crate, and add this to `gix` via a new `fsck` subcommand. Currently this is functionally equivalent to: `git rev-list --objects --quiet --missing=print` feat: add basic connectivity check - address review feedback
1 parent e47c46d commit 8f795e8

File tree

18 files changed

+378
-2
lines changed

18 files changed

+378
-2
lines changed

Diff for: Cargo.lock

+12
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

Diff for: Cargo.toml

+1
Original file line numberDiff line numberDiff line change
@@ -281,6 +281,7 @@ members = [
281281
"gix-archive",
282282
"gix-worktree-stream",
283283
"gix-revwalk",
284+
"gix-fsck",
284285

285286
"tests/tools",
286287

Diff for: README.md

+1
Original file line numberDiff line numberDiff line change
@@ -140,6 +140,7 @@ is usable to some extent.
140140
* [gix-tui](https://github.com/Byron/gitoxide/blob/main/crate-status.md#gix-tui)
141141
* [gix-tix](https://github.com/Byron/gitoxide/blob/main/crate-status.md#gix-tix)
142142
* [gix-bundle](https://github.com/Byron/gitoxide/blob/main/crate-status.md#gix-bundle)
143+
* [gix-fsck](https://github.com/Byron/gitoxide/blob/main/crate-status.md#gix-fsck)
143144

144145
### Stress Testing
145146
* [x] Verify huge packs

Diff for: crate-status.md

+13
Original file line numberDiff line numberDiff line change
@@ -775,6 +775,19 @@ See its [README.md](https://github.com/Byron/gitoxide/blob/main/gix-lock/README.
775775
* [x] validate submodule names
776776
* [x] [validate][tagname-validation] tag names
777777

778+
### gix-fsck
779+
* [x] validate connectivity when provided a specific commit as a starting point
780+
* [ ] validate connectivity of all `refs` in the index
781+
* [ ] progress reporting and interruptability
782+
* [ ] identify objects that exist but are not reference by any reference nodes (e.g. `refs` or a provided commit)
783+
* [ ] add support for various options
784+
* [ ] write dangling objects to the `.git/log-found` directory structure
785+
* [ ] `strict` mode, to check for tree objects with `g+w` permissions
786+
* [ ] consider reflog entries as reference nodes/heads
787+
* [ ] when reporting reachable objects, include _how_ they are reachable
788+
* [ ] which reference node(s) made them reachable
789+
* [ ] parent commit(s)
790+
778791
### gix-ref
779792
* [ ] Prepare code for arrival of longer hashes like Sha256. It's part of the [V2 proposal][reftable-v2] but should work for loose refs as well.
780793
* **Stores**

Diff for: gitoxide-core/Cargo.toml

+1
Original file line numberDiff line numberDiff line change
@@ -49,6 +49,7 @@ gix-pack-for-configuration-only = { package = "gix-pack", version = "^0.44.0", p
4949
gix-transport-configuration-only = { package = "gix-transport", version = "^0.38.0", path = "../gix-transport", default-features = false }
5050
gix-archive-for-configuration-only = { package = "gix-archive", version = "^0.6.0", path = "../gix-archive", optional = true, features = ["tar", "tar_gz"] }
5151
gix-status = { version = "^0.2.0", path = "../gix-status" }
52+
gix-fsck = { version = "^0.0.0", path = "../gix-fsck" }
5253
serde = { version = "1.0.114", optional = true, default-features = false, features = ["derive"] }
5354
anyhow = "1.0.42"
5455
thiserror = "1.0.34"

Diff for: gitoxide-core/src/repository/fsck.rs

+40
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,40 @@
1+
use std::io::{BufWriter, Write};
2+
3+
use anyhow::Context;
4+
use gix::{objs::Kind, ObjectId};
5+
6+
pub fn connectivity(mut repo: gix::Repository, spec: Option<String>, out: impl std::io::Write) -> anyhow::Result<()> {
7+
let mut out = BufWriter::with_capacity(64 * 1024, out);
8+
let spec = spec.unwrap_or("HEAD".into());
9+
10+
repo.object_cache_size_if_unset(4 * 1024 * 1024);
11+
// We expect to be finding a bunch of non-existent objects here - never refresh the ODB
12+
repo.objects.refresh_never();
13+
14+
let id = repo
15+
.rev_parse_single(spec.as_str())
16+
.context("Only single revisions are supported")?;
17+
let commits: gix::revision::Walk<'_> = id
18+
.object()?
19+
.peel_to_kind(gix::object::Kind::Commit)
20+
.context("Need commitish as starting point")?
21+
.id()
22+
.ancestors()
23+
.all()?;
24+
25+
let missing_cb = |oid: &ObjectId, kind: Kind| {
26+
writeln!(out, "{oid}: {kind}").expect("failed to write output");
27+
};
28+
let mut conn = gix_fsck::ConnectivityCheck::new(&repo.objects, missing_cb);
29+
30+
// Walk all commits, checking each one for connectivity
31+
for commit in commits {
32+
let commit = commit?;
33+
conn.check_commit(&commit.id);
34+
for parent in commit.parent_ids {
35+
conn.check_commit(&parent);
36+
}
37+
}
38+
39+
Ok(())
40+
}

Diff for: gitoxide-core/src/repository/mod.rs

+1
Original file line numberDiff line numberDiff line change
@@ -35,6 +35,7 @@ pub use clone::function::clone;
3535
pub use fetch::function::fetch;
3636

3737
pub mod commitgraph;
38+
pub mod fsck;
3839
pub mod index;
3940
pub mod mailmap;
4041
pub mod odb;

Diff for: gix-fsck/Cargo.toml

+22
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,22 @@
1+
[package]
2+
name = "gix-fsck"
3+
version = "0.0.0"
4+
repository = "https://github.com/Byron/gitoxide"
5+
authors = ["Cameron Esfahani <[email protected]>", "Sebastian Thiel <[email protected]>"]
6+
license = "MIT OR Apache-2.0"
7+
description = "Verifies the connectivity and validity of objects in the database"
8+
edition = "2021"
9+
include = ["src/**/*", "LICENSE-*", "CHANGELOG.md"]
10+
rust-version = "1.65"
11+
12+
[lib]
13+
doctest = false
14+
15+
[dependencies]
16+
gix-hash = { version = "^0.13.1", path = "../gix-hash" }
17+
gix-hashtable = { version = "^0.4.0", path = "../gix-hashtable" }
18+
gix-object = { version = "^0.38.0", path = "../gix-object" }
19+
20+
[dev-dependencies]
21+
gix-odb = { path = "../gix-odb" }
22+
gix-testtools = { path = "../tests/tools"}

Diff for: gix-fsck/Changelog.md

+6
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,6 @@
1+
# Changelog
2+
3+
All notable changes to this project will be documented in this file.
4+
5+
The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
6+
and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).

Diff for: gix-fsck/LICENSE-APACHE

+1
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
../LICENSE-APACHE

Diff for: gix-fsck/LICENSE-MIT

+1
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
../LICENSE-MIT

Diff for: gix-fsck/src/lib.rs

+127
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,127 @@
1+
//! A library for performing object database integrity and connectivity checks
2+
#![deny(rust_2018_idioms)]
3+
4+
use gix_hash::ObjectId;
5+
use gix_hashtable::HashSet;
6+
use gix_object::{tree::EntryMode, Exists, FindExt, Kind};
7+
8+
pub struct ConnectivityCheck<'a, T, F>
9+
where
10+
T: FindExt + Exists,
11+
F: FnMut(&ObjectId, Kind),
12+
{
13+
/// ODB handle to use for the check
14+
db: &'a T,
15+
/// Closure to invoke when a missing object is encountered
16+
missing_cb: F,
17+
/// Set of Object IDs already (or about to be) scanned during the check
18+
oid_set: HashSet,
19+
/// Single buffer for decoding objects from the ODB
20+
/// This is slightly faster than allocating throughout the connectivity check (and reduces the memory requirements)
21+
buf: Vec<u8>,
22+
}
23+
24+
impl<'a, T, F> ConnectivityCheck<'a, T, F>
25+
where
26+
T: FindExt + Exists,
27+
F: FnMut(&ObjectId, Kind),
28+
{
29+
/// Instantiate a connectivity check
30+
pub fn new(db: &'a T, missing_cb: F) -> ConnectivityCheck<'a, T, F> {
31+
ConnectivityCheck {
32+
db,
33+
missing_cb,
34+
oid_set: HashSet::default(),
35+
buf: Vec::new(),
36+
}
37+
}
38+
39+
/// Run the connectivity check on the provided commit object ID
40+
/// - This will walk the trees and blobs referenced by the commit and verify they exist in the ODB
41+
/// - Any objects previously encountered by this [`ConnectivityCheck`] instance will be skipped
42+
/// - Any referenced blobs that are not present in the ODB will result in a call to the `missing_cb`
43+
/// - Missing commits or trees will currently result in panic
44+
/// - TODO: consider how to handle a missing commit (invoke `missing_cb`, or possibly return a Result?)
45+
pub fn check_commit(&mut self, oid: &ObjectId) {
46+
// Attempt to insert the commit ID in the set, and if already present, return immediately
47+
if !self.oid_set.insert(*oid) {
48+
return;
49+
}
50+
// Obtain the commit's tree ID
51+
let tree_id = {
52+
let commit = self.db.find_commit(oid, &mut self.buf).expect("failed to find commit");
53+
commit.tree()
54+
};
55+
56+
// Attempt to insert the tree ID in the set, and if already present, return immediately
57+
if self.oid_set.insert(tree_id) {
58+
self.check_tree(&tree_id);
59+
}
60+
}
61+
62+
fn check_tree(&mut self, oid: &ObjectId) {
63+
let tree = match self.db.find_tree(oid, &mut self.buf) {
64+
Ok(tree) => tree,
65+
Err(_) => {
66+
// Tree is missing, so invoke `missing_cb`
67+
(self.missing_cb)(oid, Kind::Tree);
68+
return;
69+
}
70+
};
71+
72+
// Keeping separate sets for trees and blobs for now...
73+
// This is about a wash when compared to using a HashMap<ObjectID, Kind>
74+
struct TreeEntries {
75+
trees: HashSet<ObjectId>,
76+
blobs: HashSet<ObjectId>,
77+
}
78+
79+
// Build up a set of trees and a set of blobs
80+
let entries: TreeEntries = {
81+
let mut entries = TreeEntries {
82+
trees: HashSet::default(),
83+
blobs: HashSet::default(),
84+
};
85+
86+
// For each entry in the tree
87+
for entry_ref in tree.entries.iter() {
88+
match entry_ref.mode {
89+
EntryMode::Tree => {
90+
let tree_id = entry_ref.oid.to_owned();
91+
// Check if the tree has already been encountered
92+
if self.oid_set.insert(tree_id) {
93+
entries.trees.insert(tree_id);
94+
}
95+
}
96+
EntryMode::Blob | EntryMode::BlobExecutable | EntryMode::Link => {
97+
let blob_id = entry_ref.oid.to_owned();
98+
// Check if the blob has already been encountered
99+
if self.oid_set.insert(blob_id) {
100+
entries.blobs.insert(blob_id);
101+
}
102+
}
103+
EntryMode::Commit => {
104+
// This implies a submodule (OID is the commit hash of the submodule)
105+
// Skip it as it's not in this repository!
106+
}
107+
}
108+
}
109+
entries
110+
};
111+
112+
for tree_id in entries.trees.iter() {
113+
self.check_tree(tree_id);
114+
}
115+
for blob_id in entries.blobs.iter() {
116+
self.check_blob(blob_id);
117+
}
118+
}
119+
120+
fn check_blob(&mut self, oid: &ObjectId) {
121+
// Check if the blob is missing from the ODB
122+
if !self.db.exists(oid) {
123+
// Blob is missing, so invoke `missing_cb`
124+
(self.missing_cb)(oid, Kind::Blob);
125+
}
126+
}
127+
}

Diff for: gix-fsck/tests/connectivity/mod.rs

+84
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,84 @@
1+
use std::ops::Deref;
2+
3+
use gix_fsck::ConnectivityCheck;
4+
use gix_hash::ObjectId;
5+
use gix_hashtable::HashMap;
6+
use gix_object::Kind;
7+
use gix_testtools::once_cell::sync::Lazy;
8+
9+
use crate::hex_to_id;
10+
11+
fn check_missing<'a>(repo_name: &str, commits: impl IntoIterator<Item = &'a ObjectId>) -> HashMap<ObjectId, Kind> {
12+
let db = {
13+
let fixture_path = gix_testtools::scripted_fixture_read_only("make_test_repos.sh")
14+
.expect("fixture path")
15+
.join(repo_name)
16+
.join(".git")
17+
.join("objects");
18+
let mut db = gix_odb::at(fixture_path).expect("odb handle");
19+
db.refresh_never();
20+
db
21+
};
22+
23+
let mut missing: HashMap<ObjectId, Kind> = HashMap::default();
24+
let callback = |oid: &ObjectId, kind: Kind| {
25+
missing.try_insert(*oid, kind).expect("no duplicate oid");
26+
};
27+
28+
let mut check = ConnectivityCheck::new(&db, callback);
29+
for commit in commits.into_iter() {
30+
check.check_commit(commit);
31+
}
32+
33+
missing
34+
}
35+
36+
fn hex_to_ids<'a>(hex_ids: impl IntoIterator<Item = &'a str>) -> Vec<ObjectId> {
37+
hex_ids.into_iter().map(hex_to_id).collect()
38+
}
39+
40+
fn hex_to_objects<'a>(hex_ids: impl IntoIterator<Item = &'a str>, kind: Kind) -> HashMap<ObjectId, Kind> {
41+
hex_to_ids(hex_ids)
42+
.into_iter()
43+
.map(|id| (id, kind))
44+
.collect::<HashMap<_, _>>()
45+
}
46+
47+
// Get a `&Vec<ObjectID` for each commit in the test fixture repository
48+
fn all_commits<'a>() -> &'a Vec<ObjectId> {
49+
static ALL_COMMITS: Lazy<Vec<ObjectId>> = Lazy::new(|| {
50+
hex_to_ids(vec![
51+
"5d18db2e2aabadf7b914435ef34f2faf8b4546dd",
52+
"3a3dfaa55a515f3fb3a25751107bbb523af6a1b0",
53+
"734c926856a328d1168ffd7088532e0d1ad19bbe",
54+
])
55+
});
56+
ALL_COMMITS.deref()
57+
}
58+
59+
#[test]
60+
fn no_missing() {
61+
// The "base" repo is the original, and has every object present
62+
assert_eq!(check_missing("base", all_commits()), HashMap::default());
63+
}
64+
65+
#[test]
66+
fn missing_blobs() {
67+
// The "blobless" repo is cloned with `--filter=blob:none`, and is missing one blob
68+
let expected = hex_to_objects(vec!["c18147dc648481eeb65dc5e66628429a64843327"], Kind::Blob);
69+
assert_eq!(check_missing("blobless", all_commits()), expected);
70+
}
71+
72+
#[test]
73+
fn missing_trees() {
74+
// The "treeless" repo is cloned with `--filter=tree:0`, and is missing two trees
75+
// NOTE: This repo is also missing a blob, but we have no way of knowing that, as the tree referencing it is missing
76+
let expected = hex_to_objects(
77+
vec![
78+
"9561cfbae43c5e2accdfcd423378588dd10d827f",
79+
"fc264b3b6875a46e9031483aeb9994a1b897ffd3",
80+
],
81+
Kind::Tree,
82+
);
83+
assert_eq!(check_missing("treeless", all_commits()), expected);
84+
}
+1
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
make_test_repos.tar.xz

0 commit comments

Comments
 (0)