Skip to content

Commit 3ae1f2f

Browse files
committed
refactor(doc_loader): Refine HTML file filtering logic
Implement filtering based on duplicate basenames, keeping only the largest file for each basename. Ensure the root index.html is always included. Exclude files located within src/ (source-view) directories.
1 parent 5d9f4f0 commit 3ae1f2f

File tree

3 files changed

+119
-45
lines changed

3 files changed

+119
-45
lines changed

Cargo.lock

+1-1
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

Cargo.toml

+1-1
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
[package]
22
name = "rustdocs_mcp_server"
3-
version = "1.0.4"
3+
version = "1.1.0"
44
edition = "2024"
55

66
[dependencies]

src/doc_loader.rs

+117-43
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
use scraper::{Html, Selector};
2-
use std::{fs::{self, File, create_dir_all}, io::Write, path::PathBuf}; // Added PathBuf
2+
use std::{collections::HashMap, fs::{self, File, create_dir_all}, io::Write, path::PathBuf}; // Added PathBuf, HashMap
33
use cargo::core::resolver::features::CliFeatures;
44
// use cargo::core::SourceId; // Removed unused import
55
// use cargo::util::Filesystem; // Removed unused import
@@ -25,8 +25,7 @@ pub enum DocLoaderError {
2525
TempDirCreationFailed(std::io::Error),
2626
#[error("Cargo library error: {0}")]
2727
CargoLib(#[from] AnyhowError), // Re-add CargoLib variant
28-
#[error("Failed to strip prefix '{prefix}' from path '{path}': {source}")] // Improved error
29-
StripPrefix { prefix: PathBuf, path: PathBuf, source: std::path::StripPrefixError },
28+
// Removed unused StripPrefix variant
3029
}
3130

3231
// Simple struct to hold document content, maybe add path later if needed
@@ -36,6 +35,7 @@ pub struct Document {
3635
pub content: String,
3736
}
3837

38+
3939
/// Generates documentation for a given crate in a temporary directory,
4040
/// then loads and parses the HTML documents.
4141
/// Extracts text content from the main content area of rustdoc generated HTML.
@@ -44,23 +44,12 @@ pub fn load_documents(
4444
crate_version_req: &str,
4545
features: Option<&Vec<String>>, // Add optional features parameter
4646
) -> Result<Vec<Document>, DocLoaderError> {
47-
eprintln!(
48-
"[DEBUG] load_documents called with crate_name: '{}', crate_version_req: '{}', features: {:?}",
49-
crate_name, crate_version_req, features
50-
);
5147
let mut documents = Vec::new();
5248

5349
let temp_dir = tempdir().map_err(DocLoaderError::TempDirCreationFailed)?;
5450
let temp_dir_path = temp_dir.path();
5551
let temp_manifest_path = temp_dir_path.join("Cargo.toml");
5652

57-
eprintln!(
58-
"Generating documentation for crate '{}' (Version Req: '{}', Features: {:?}) in temporary directory: {}",
59-
crate_name,
60-
crate_version_req,
61-
features, // Log features
62-
temp_dir_path.display()
63-
);
6453

6554
// Create a temporary Cargo.toml using the version requirement and features
6655
let features_string = features
@@ -159,7 +148,6 @@ edition = "2021"
159148

160149
let docs_path = match (found_count, target_docs_path) {
161150
(1, Some(path)) => {
162-
eprintln!("[DEBUG] Confirmed unique documentation directory: {}", path.display());
163151
path
164152
},
165153
(0, _) => {
@@ -184,57 +172,143 @@ edition = "2021"
184172
let content_selector = Selector::parse("section#main-content.content")
185173
.map_err(|e| DocLoaderError::Selector(e.to_string()))?;
186174

187-
for entry in WalkDir::new(&docs_path)
175+
// --- Collect all HTML file paths first ---
176+
let all_html_paths: Vec<PathBuf> = WalkDir::new(&docs_path)
188177
.into_iter()
189-
.filter_map(Result::ok) // Ignore errors during iteration for now
190-
.filter(|e| !e.file_type().is_dir() && e.path().extension().is_some_and(|ext| ext == "html"))
191-
{
192-
let path = entry.path();
193-
// Calculate path relative to the docs_path root
194-
let relative_path = path.strip_prefix(&docs_path).map_err(|e| {
195-
// Provide more context in the error message using the new error variant
196-
DocLoaderError::StripPrefix {
197-
prefix: docs_path.to_path_buf(),
198-
path: path.to_path_buf(),
199-
source: e,
178+
.filter_map(Result::ok) // Ignore errors during iteration
179+
.filter(|e| {
180+
!e.file_type().is_dir() && e.path().extension().is_some_and(|ext| ext == "html")
181+
})
182+
.map(|e| e.into_path()) // Get the PathBuf
183+
.collect();
184+
185+
eprintln!("[DEBUG] Found {} total HTML files initially.", all_html_paths.len());
186+
187+
// --- Group files by basename ---
188+
let mut basename_groups: HashMap<String, Vec<PathBuf>> = HashMap::new();
189+
for path in all_html_paths {
190+
if let Some(filename_osstr) = path.file_name() {
191+
if let Some(filename_str) = filename_osstr.to_str() {
192+
basename_groups
193+
.entry(filename_str.to_string())
194+
.or_default()
195+
.push(path);
196+
} else {
197+
eprintln!("[WARN] Skipping file with non-UTF8 name: {}", path.display());
200198
}
201-
})?;
202-
let path_str = relative_path.to_string_lossy().to_string(); // Use the relative path
203-
// eprintln!("Processing file: {} (relative: {})", path.display(), path_str); // Updated debug log
199+
} else {
200+
eprintln!("[WARN] Skipping file with no name: {}", path.display());
201+
}
202+
}
203+
204+
// --- Initialize paths_to_process and explicitly add the root index.html if it exists ---
205+
let mut paths_to_process: Vec<PathBuf> = Vec::new();
206+
let root_index_path = docs_path.join("index.html");
207+
if root_index_path.is_file() {
208+
paths_to_process.push(root_index_path);
209+
}
210+
211+
// --- Filter based on duplicates and size ---
212+
// NOTE: Initialization of paths_to_process moved before this loop
213+
for (basename, mut paths) in basename_groups {
214+
// Always ignore index.html at this stage (except the root one added earlier)
215+
if basename == "index.html" {
216+
continue;
217+
}
218+
219+
// Also ignore files within source code view directories
220+
// Check the first path (they should share the problematic component if any)
221+
if paths.first().map_or(false, |p| p.components().any(|comp| comp.as_os_str() == "src")) {
222+
continue;
223+
}
224+
225+
226+
if paths.len() == 1 {
227+
// Single file with this basename (and not index.html), keep it
228+
paths_to_process.push(paths.remove(0));
229+
} else {
230+
// Multiple files with the same basename (duplicates)
231+
// Find the largest one by file size
232+
// Explicit type annotation needed for the error type in try_fold
233+
let largest_path_result: Result<Option<(PathBuf, u64)>, std::io::Error> = paths.into_iter().try_fold(None::<(PathBuf, u64)>, |largest, current| {
234+
let current_meta = fs::metadata(&current)?;
235+
let current_size = current_meta.len();
236+
match largest {
237+
None => Ok(Some((current, current_size))),
238+
Some((largest_path_so_far, largest_size_so_far)) => {
239+
if current_size > largest_size_so_far {
240+
Ok(Some((current, current_size)))
241+
} else {
242+
Ok(Some((largest_path_so_far, largest_size_so_far)))
243+
}
244+
}
245+
}
246+
});
247+
248+
match largest_path_result {
249+
Ok(Some((p, _size))) => {
250+
// eprintln!("[DEBUG] Duplicate basename '{}': Keeping largest file {}", basename, p.display());
251+
paths_to_process.push(p);
252+
}
253+
Ok(None) => {
254+
// This case should ideally not happen if the input `paths` was not empty,
255+
// but handle it defensively.
256+
eprintln!("[WARN] No files found for basename '{}' during size comparison.", basename);
257+
}
258+
Err(e) => {
259+
eprintln!("[WARN] Error getting metadata for basename '{}', skipping: {}", basename, e);
260+
// Decide if you want to skip the whole group or handle differently
261+
}
262+
}
263+
}
264+
}
265+
266+
eprintln!("[DEBUG] Filtered down to {} files to process.", paths_to_process.len());
267+
268+
269+
// --- Process the filtered list of files ---
270+
for path in paths_to_process {
271+
// Calculate path relative to the docs_path root
272+
let relative_path = match path.strip_prefix(&docs_path) {
273+
Ok(p) => p.to_path_buf(),
274+
Err(e) => {
275+
eprintln!("[WARN] Failed to strip prefix {} from {}: {}", docs_path.display(), path.display(), e);
276+
continue; // Skip if path manipulation fails
277+
}
278+
};
279+
let path_str = relative_path.to_string_lossy().to_string();
204280

205-
// eprintln!(" Reading file content..."); // Added
206-
let html_content = fs::read_to_string(path)?; // Still read from the absolute path
207-
// eprintln!(" Parsing HTML..."); // Added
281+
let html_content = match fs::read_to_string(&path) { // Read from the absolute path
282+
Ok(content) => content,
283+
Err(e) => {
284+
eprintln!("[WARN] Failed to read file {}: {}", path.display(), e);
285+
continue; // Skip this file if reading fails
286+
}
287+
};
208288

209-
// Parse the HTML document
210289
let document = Html::parse_document(&html_content);
211290

212-
// Select the main content element
213291
if let Some(main_content_element) = document.select(&content_selector).next() {
214-
// Extract all text nodes within the main content
215-
// eprintln!(" Extracting text content..."); // Added
216292
let text_content: String = main_content_element
217293
.text()
218294
.map(|s| s.trim())
219295
.filter(|s| !s.is_empty())
220296
.collect::<Vec<&str>>()
221-
.join("\n"); // Join text nodes with newlines
297+
.join("\n");
222298

223299
if !text_content.is_empty() {
224-
// eprintln!(" Extracted content ({} chars)", text_content.len()); // Uncommented and simplified
225300
documents.push(Document {
226301
path: path_str,
227302
content: text_content,
228303
});
229304
} else {
230-
// eprintln!("No text content found in main section for: {}", path.display()); // Verbose logging
305+
// eprintln!("[DEBUG] No text content found in main section for: {}", path.display());
231306
}
232307
} else {
233-
// eprintln!("'main-content' selector not found for: {}", path.display()); // Verbose logging
234-
// Optionally handle files without the main content selector differently
308+
// eprintln!("[DEBUG] 'main-content' selector not found for: {}", path.display());
235309
}
236310
}
237311

238-
eprintln!("Finished document loading. Found {} documents.", documents.len());
312+
eprintln!("Finished document loading. Found {} final documents.", documents.len());
239313
Ok(documents)
240314
}

0 commit comments

Comments
 (0)