@@ -611,65 +611,86 @@ def copy_file(
    with open(dest_file, "w") as f:
        f.write(content)

+# 7/26/24:
+# Corrected the scrub_file function.
+# Local files and URLs are now handled separately.
+# URL includes of files within the openshift-docs repo now generate an error.
+# Files outside the openshift-docs repo are downloaded directly, with error handling for connection issues.


def scrub_file(info, book_src_dir, src_file, tag=None, cwd=None):
    """
    Scrubs a file and returns the cleaned file contents.
    """
    base_src_file = src_file.replace(info["src_dir"] + "/", "")

-    # added 1/Sep/2020
-    # to allow loading files like json and yaml from external sources, this
-    # procedure loads the file recognizing that it starts with http
-    # it then checks if it exists or not, and if it exists, returns the raw
-    # data that it finds.
-    if base_src_file.startswith("https://raw.githubusercontent.com/openshift/"):
+    if base_src_file.startswith("https://raw.githubusercontent.com/"):
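+        # The include target is a raw GitHub URL rather than a local file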
+        # Disallow URL inclusion from openshift-docs
+        if "openshift-docs" in base_src_file:
+            log.error(
+                "Inclusion of files within the openshift-docs repository by URL is not supported: %s",
+                base_src_file,
+            )
+            list_of_errors.append(
+                f"Inclusion of files within the openshift-docs repository by URL is not supported: {base_src_file}"
+            )
+            return ""  # Skip processing this file
+
+        # Allow includes only from specific organizations
+        if not any(org in base_src_file for org in ["openshift/", "redhatofficial/"]):
+            log.error(
+                "Inclusion of files from unauthorized repositories by URL is not supported: %s",
+                base_src_file,
+            )
+            list_of_errors.append(
+                f"Inclusion of files from unauthorized repositories by URL is not supported: {base_src_file}"
+            )
+            return ""  # Skip processing this file
+
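+        # Fetch the allowed external include over HTTP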
        try:
            response = requests.get(base_src_file)
-            if response:
-                return response.text
+            if response.status_code == 200:
+                # Only apply scrubbing if the external file is AsciiDoc
+                if base_src_file.endswith(".adoc"):
+                    return scrub_content(response.text, info, book_src_dir, src_file, tag, cwd)
+                else:
+                    return response.text
            else:
-                raise ConnectionError("Malformed URL")
-        except Exception as exception:
-            log.error("An include file wasn't found: %s", base_src_file)
-            list_of_errors.append(f"An include file wasn't found: {base_src_file}")
-            sys.exit(-1)
-
-    # Get a list of predefined custom title ids for the file
-    title_ids = TITLE_IDS.get(base_src_file, {})
+                raise ConnectionError(f"Failed to download file from {base_src_file}")
+        except Exception as e:
+            log.error(f"Error fetching external include: {base_src_file} - {e}")
+            list_of_errors.append(f"Error fetching external include: {base_src_file}")
+            return ""  # Skip processing this file

-    # Read in the source content
+    # Local file processing
    with open(src_file, "r") as f:
        src_file_content = f.readlines()

-    # Scrub the content
-    content = ""
+    return scrub_content("".join(src_file_content), info, book_src_dir, src_file, tag, cwd)
+
+def scrub_content(content, info, book_src_dir, src_file, tag=None, cwd=None):
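+    """
+    Scrub already-loaded content and return the cleaned text: inject section
+    and custom title ids, fix duplicate ids and known-bad links, and run
+    fix_links over the result.
+    """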
+    base_src_file = src_file.replace(info["src_dir"] + "/", "")
+    title_ids = TITLE_IDS.get(base_src_file, {})

    header_found = content_found = False
    current_id = None
-    for line in src_file_content:
-        # Ignore any leading blank lines, before any meaningful content is found
+    scrubbed_content = ""
+
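+    # splitlines(True) keeps each line's trailing newline so lines can be
+    # appended to the output unchanged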
+    for line in content.splitlines(True):
        if line.strip() == "" and not content_found:
            continue

-        # Check if the line should be included in the output
        if include_line(line):
            content_found = True
-
-            # Setup the document header content/id
            if not header_found and line.strip() != "" and line.startswith("="):
                header_found = True
-
                if (
                    info["all_in_one"]
                    and base_src_file in ALL_IN_ONE_SCRAP_TITLE
                    and line.startswith("= ")
                ):
                    continue
-                # Add a section id if one doesn't exist, so we have something to link to
                elif current_id is None and src_file in info["file_to_id_map"]:
                    file_id = info["file_to_id_map"][src_file]
-                    content += "[[" + file_id + "]]\n"
-                # Add a custom title id, if one is needed
+                    scrubbed_content += "[[" + file_id + "]]\n"
                elif line.startswith("=") and current_id is None:
                    for title in title_ids:
                        title_re = (
@@ -678,32 +699,26 @@ def scrub_file(info, book_src_dir, src_file, tag=None, cwd=None):
                            + "( (anchor|\[).*?)?(\n)?$"
                        )
                        if re.match(title_re, line):
- content += "[[" + title_ids [title ] + "]]\n "
702
+ scrubbed_content += "[[" + title_ids [title ] + "]]\n "
682
703
683
-            # Set the current id based on the line content
            if current_id is None and ID_RE.match(line.strip()):
                current_id = line.strip()
            elif current_id is not None and line.strip != "":
                current_id = None

-            # Add the line to the processed content
-            content += line
+            scrubbed_content += line

-    # Fix up any duplicate ids
    if base_src_file in DUPLICATE_IDS:
        for duplicate_id, new_id in list(DUPLICATE_IDS[base_src_file].items()):
-            content = content.replace("[[" + duplicate_id + "]]", "[[" + new_id + "]]")
+            scrubbed_content = scrubbed_content.replace("[[" + duplicate_id + "]]", "[[" + new_id + "]]")

-    # Replace incorrect links with correct ones
    if base_src_file in INCORRECT_LINKS:
        for incorrect_link, fixed_link in list(INCORRECT_LINKS[base_src_file].items()):
-            content = content.replace(incorrect_link, fixed_link)
+            scrubbed_content = scrubbed_content.replace(incorrect_link, fixed_link)

-    # Fix up the links
-    content = fix_links(content, info, book_src_dir, src_file, tag=tag, cwd=cwd)
-
-    return content
+    scrubbed_content = fix_links(scrubbed_content, info, book_src_dir, src_file, tag=tag, cwd=cwd)

+    return scrubbed_content


def include_line(line):
    """