1
1
use scraper:: { Html , Selector } ;
2
- use std:: { fs:: { self , File , create_dir_all} , io:: Write , path:: PathBuf } ; // Added PathBuf
2
+ use std:: { collections :: HashMap , fs:: { self , File , create_dir_all} , io:: Write , path:: PathBuf } ; // Added PathBuf, HashMap
3
3
use cargo:: core:: resolver:: features:: CliFeatures ;
4
4
// use cargo::core::SourceId; // Removed unused import
5
5
// use cargo::util::Filesystem; // Removed unused import
@@ -25,8 +25,7 @@ pub enum DocLoaderError {
25
25
TempDirCreationFailed ( std:: io:: Error ) ,
26
26
#[ error( "Cargo library error: {0}" ) ]
27
27
CargoLib ( #[ from] AnyhowError ) , // Re-add CargoLib variant
28
- #[ error( "Failed to strip prefix '{prefix}' from path '{path}': {source}" ) ] // Improved error
29
- StripPrefix { prefix : PathBuf , path : PathBuf , source : std:: path:: StripPrefixError } ,
28
+ // Removed unused StripPrefix variant
30
29
}
31
30
32
31
// Simple struct to hold document content, maybe add path later if needed
@@ -36,6 +35,7 @@ pub struct Document {
36
35
pub content : String ,
37
36
}
38
37
38
+
39
39
/// Generates documentation for a given crate in a temporary directory,
40
40
/// then loads and parses the HTML documents.
41
41
/// Extracts text content from the main content area of rustdoc generated HTML.
@@ -44,23 +44,12 @@ pub fn load_documents(
44
44
crate_version_req : & str ,
45
45
features : Option < & Vec < String > > , // Add optional features parameter
46
46
) -> Result < Vec < Document > , DocLoaderError > {
47
- eprintln ! (
48
- "[DEBUG] load_documents called with crate_name: '{}', crate_version_req: '{}', features: {:?}" ,
49
- crate_name, crate_version_req, features
50
- ) ;
51
47
let mut documents = Vec :: new ( ) ;
52
48
53
49
let temp_dir = tempdir ( ) . map_err ( DocLoaderError :: TempDirCreationFailed ) ?;
54
50
let temp_dir_path = temp_dir. path ( ) ;
55
51
let temp_manifest_path = temp_dir_path. join ( "Cargo.toml" ) ;
56
52
57
- eprintln ! (
58
- "Generating documentation for crate '{}' (Version Req: '{}', Features: {:?}) in temporary directory: {}" ,
59
- crate_name,
60
- crate_version_req,
61
- features, // Log features
62
- temp_dir_path. display( )
63
- ) ;
64
53
65
54
// Create a temporary Cargo.toml using the version requirement and features
66
55
let features_string = features
@@ -159,7 +148,6 @@ edition = "2021"
159
148
160
149
let docs_path = match ( found_count, target_docs_path) {
161
150
( 1 , Some ( path) ) => {
162
- eprintln ! ( "[DEBUG] Confirmed unique documentation directory: {}" , path. display( ) ) ;
163
151
path
164
152
} ,
165
153
( 0 , _) => {
@@ -184,57 +172,143 @@ edition = "2021"
184
172
let content_selector = Selector :: parse ( "section#main-content.content" )
185
173
. map_err ( |e| DocLoaderError :: Selector ( e. to_string ( ) ) ) ?;
186
174
187
- for entry in WalkDir :: new ( & docs_path)
175
+ // --- Collect all HTML file paths first ---
176
+ let all_html_paths: Vec < PathBuf > = WalkDir :: new ( & docs_path)
188
177
. into_iter ( )
189
- . filter_map ( Result :: ok) // Ignore errors during iteration for now
190
- . filter ( |e| !e. file_type ( ) . is_dir ( ) && e. path ( ) . extension ( ) . is_some_and ( |ext| ext == "html" ) )
191
- {
192
- let path = entry. path ( ) ;
193
- // Calculate path relative to the docs_path root
194
- let relative_path = path. strip_prefix ( & docs_path) . map_err ( |e| {
195
- // Provide more context in the error message using the new error variant
196
- DocLoaderError :: StripPrefix {
197
- prefix : docs_path. to_path_buf ( ) ,
198
- path : path. to_path_buf ( ) ,
199
- source : e,
178
+ . filter_map ( Result :: ok) // Ignore errors during iteration
179
+ . filter ( |e| {
180
+ !e. file_type ( ) . is_dir ( ) && e. path ( ) . extension ( ) . is_some_and ( |ext| ext == "html" )
181
+ } )
182
+ . map ( |e| e. into_path ( ) ) // Get the PathBuf
183
+ . collect ( ) ;
184
+
185
+ eprintln ! ( "[DEBUG] Found {} total HTML files initially." , all_html_paths. len( ) ) ;
186
+
187
+ // --- Group files by basename ---
188
+ let mut basename_groups: HashMap < String , Vec < PathBuf > > = HashMap :: new ( ) ;
189
+ for path in all_html_paths {
190
+ if let Some ( filename_osstr) = path. file_name ( ) {
191
+ if let Some ( filename_str) = filename_osstr. to_str ( ) {
192
+ basename_groups
193
+ . entry ( filename_str. to_string ( ) )
194
+ . or_default ( )
195
+ . push ( path) ;
196
+ } else {
197
+ eprintln ! ( "[WARN] Skipping file with non-UTF8 name: {}" , path. display( ) ) ;
200
198
}
201
- } ) ?;
202
- let path_str = relative_path. to_string_lossy ( ) . to_string ( ) ; // Use the relative path
203
- // eprintln!("Processing file: {} (relative: {})", path.display(), path_str); // Updated debug log
199
+ } else {
200
+ eprintln ! ( "[WARN] Skipping file with no name: {}" , path. display( ) ) ;
201
+ }
202
+ }
203
+
204
+ // --- Initialize paths_to_process and explicitly add the root index.html if it exists ---
205
+ let mut paths_to_process: Vec < PathBuf > = Vec :: new ( ) ;
206
+ let root_index_path = docs_path. join ( "index.html" ) ;
207
+ if root_index_path. is_file ( ) {
208
+ paths_to_process. push ( root_index_path) ;
209
+ }
210
+
211
+ // --- Filter based on duplicates and size ---
212
+ // NOTE: Initialization of paths_to_process moved before this loop
213
+ for ( basename, mut paths) in basename_groups {
214
+ // Always ignore index.html at this stage (except the root one added earlier)
215
+ if basename == "index.html" {
216
+ continue ;
217
+ }
218
+
219
+ // Also ignore files within source code view directories
220
+ // Check the first path (they should share the problematic component if any)
221
+ if paths. first ( ) . map_or ( false , |p| p. components ( ) . any ( |comp| comp. as_os_str ( ) == "src" ) ) {
222
+ continue ;
223
+ }
224
+
225
+
226
+ if paths. len ( ) == 1 {
227
+ // Single file with this basename (and not index.html), keep it
228
+ paths_to_process. push ( paths. remove ( 0 ) ) ;
229
+ } else {
230
+ // Multiple files with the same basename (duplicates)
231
+ // Find the largest one by file size
232
+ // Explicit type annotation needed for the error type in try_fold
233
+ let largest_path_result: Result < Option < ( PathBuf , u64 ) > , std:: io:: Error > = paths. into_iter ( ) . try_fold ( None :: < ( PathBuf , u64 ) > , |largest, current| {
234
+ let current_meta = fs:: metadata ( & current) ?;
235
+ let current_size = current_meta. len ( ) ;
236
+ match largest {
237
+ None => Ok ( Some ( ( current, current_size) ) ) ,
238
+ Some ( ( largest_path_so_far, largest_size_so_far) ) => {
239
+ if current_size > largest_size_so_far {
240
+ Ok ( Some ( ( current, current_size) ) )
241
+ } else {
242
+ Ok ( Some ( ( largest_path_so_far, largest_size_so_far) ) )
243
+ }
244
+ }
245
+ }
246
+ } ) ;
247
+
248
+ match largest_path_result {
249
+ Ok ( Some ( ( p, _size) ) ) => {
250
+ // eprintln!("[DEBUG] Duplicate basename '{}': Keeping largest file {}", basename, p.display());
251
+ paths_to_process. push ( p) ;
252
+ }
253
+ Ok ( None ) => {
254
+ // This case should ideally not happen if the input `paths` was not empty,
255
+ // but handle it defensively.
256
+ eprintln ! ( "[WARN] No files found for basename '{}' during size comparison." , basename) ;
257
+ }
258
+ Err ( e) => {
259
+ eprintln ! ( "[WARN] Error getting metadata for basename '{}', skipping: {}" , basename, e) ;
260
+ // Decide if you want to skip the whole group or handle differently
261
+ }
262
+ }
263
+ }
264
+ }
265
+
266
+ eprintln ! ( "[DEBUG] Filtered down to {} files to process." , paths_to_process. len( ) ) ;
267
+
268
+
269
+ // --- Process the filtered list of files ---
270
+ for path in paths_to_process {
271
+ // Calculate path relative to the docs_path root
272
+ let relative_path = match path. strip_prefix ( & docs_path) {
273
+ Ok ( p) => p. to_path_buf ( ) ,
274
+ Err ( e) => {
275
+ eprintln ! ( "[WARN] Failed to strip prefix {} from {}: {}" , docs_path. display( ) , path. display( ) , e) ;
276
+ continue ; // Skip if path manipulation fails
277
+ }
278
+ } ;
279
+ let path_str = relative_path. to_string_lossy ( ) . to_string ( ) ;
204
280
205
- // eprintln!(" Reading file content..."); // Added
206
- let html_content = fs:: read_to_string ( path) ?; // Still read from the absolute path
207
- // eprintln!(" Parsing HTML..."); // Added
281
+ let html_content = match fs:: read_to_string ( & path) { // Read from the absolute path
282
+ Ok ( content) => content,
283
+ Err ( e) => {
284
+ eprintln ! ( "[WARN] Failed to read file {}: {}" , path. display( ) , e) ;
285
+ continue ; // Skip this file if reading fails
286
+ }
287
+ } ;
208
288
209
- // Parse the HTML document
210
289
let document = Html :: parse_document ( & html_content) ;
211
290
212
- // Select the main content element
213
291
if let Some ( main_content_element) = document. select ( & content_selector) . next ( ) {
214
- // Extract all text nodes within the main content
215
- // eprintln!(" Extracting text content..."); // Added
216
292
let text_content: String = main_content_element
217
293
. text ( )
218
294
. map ( |s| s. trim ( ) )
219
295
. filter ( |s| !s. is_empty ( ) )
220
296
. collect :: < Vec < & str > > ( )
221
- . join ( "\n " ) ; // Join text nodes with newlines
297
+ . join ( "\n " ) ;
222
298
223
299
if !text_content. is_empty ( ) {
224
- // eprintln!(" Extracted content ({} chars)", text_content.len()); // Uncommented and simplified
225
300
documents. push ( Document {
226
301
path : path_str,
227
302
content : text_content,
228
303
} ) ;
229
304
} else {
230
- // eprintln!("No text content found in main section for: {}", path.display()); // Verbose logging
305
+ // eprintln!("[DEBUG] No text content found in main section for: {}", path.display());
231
306
}
232
307
} else {
233
- // eprintln!("'main-content' selector not found for: {}", path.display()); // Verbose logging
234
- // Optionally handle files without the main content selector differently
308
+ // eprintln!("[DEBUG] 'main-content' selector not found for: {}", path.display());
235
309
}
236
310
}
237
311
238
- eprintln ! ( "Finished document loading. Found {} documents." , documents. len( ) ) ;
312
+ eprintln ! ( "Finished document loading. Found {} final documents." , documents. len( ) ) ;
239
313
Ok ( documents)
240
314
}
0 commit comments