
Commit ef0546a

Updated scrub_file function for handling includes of external files.
1 parent 69be0e6 commit ef0546a

File tree

1 file changed: +55 -40 lines changed

build_for_portal.py

Lines changed: 55 additions & 40 deletions
@@ -611,65 +611,86 @@ def copy_file(
     with open(dest_file, "w") as f:
         f.write(content)
 
+# 7/26/24:
+# Corrected the scrub_file function.
+# Local files and URLs are now appropriately addressed individually.
+# URL includes of files within the openshift-docs repo now generate an error.
+# Files outside the `openshift-docs` repo are downloaded directly with error handling for connection issues.
 
 def scrub_file(info, book_src_dir, src_file, tag=None, cwd=None):
     """
     Scrubs a file and returns the cleaned file contents.
     """
     base_src_file = src_file.replace(info["src_dir"] + "/", "")
 
-    # added 1/Sep/2020
-    # to allow loading files like json and yaml from external sources, this
-    # procedure loads the file recognizing that it starts with http
-    # it then checks if it exists or not, and if it exists, returns the raw data
-    # data that it finds.
-    if base_src_file.startswith("https://raw.githubusercontent.com/openshift/"):
+    if base_src_file.startswith("https://raw.githubusercontent.com/"):
+        # Disallow URL inclusion from openshift-docs
+        if "openshift-docs" in base_src_file:
+            log.error(
+                "Inclusion of files within the openshift-docs repository by URL is not supported: %s",
+                base_src_file,
+            )
+            list_of_errors.append(
+                f"Inclusion of files within the openshift-docs repository by URL is not supported: {base_src_file}"
+            )
+            return ""  # Skip processing this file
+
+        # Allow includes only from specific organizations
+        if not any(org in base_src_file for org in ["openshift/", "redhatofficial/"]):
+            log.error(
+                "Inclusion of files from unauthorized repositories by URL is not supported: %s",
+                base_src_file,
+            )
+            list_of_errors.append(
+                f"Inclusion of files from unauthorized repositories by URL is not supported: {base_src_file}"
+            )
+            return ""  # Skip processing this file
+
         try:
             response = requests.get(base_src_file)
-            if response:
-                return response.text
+            if response.status_code == 200:
+                # Only apply scrubbing if the external file is AsciiDoc
+                if base_src_file.endswith(".adoc"):
+                    return scrub_content(response.text, info, book_src_dir, src_file, tag, cwd)
+                else:
+                    return response.text
             else:
-                raise ConnectionError("Malformed URL")
-        except Exception as exception:
-            log.error("An include file wasn't found: %s", base_src_file)
-            list_of_errors.append(f"An include file wasn't found: {base_src_file}")
-            sys.exit(-1)
-
-    # Get a list of predefined custom title ids for the file
-    title_ids = TITLE_IDS.get(base_src_file, {})
+                raise ConnectionError(f"Failed to download file from {base_src_file}")
+        except Exception as e:
+            log.error(f"Error fetching external include: {base_src_file} - {e}")
+            list_of_errors.append(f"Error fetching external include: {base_src_file}")
+            return ""  # Skip processing this file
 
-    # Read in the source content
+    # Local file processing
     with open(src_file, "r") as f:
         src_file_content = f.readlines()
 
-    # Scrub the content
-    content = ""
+    return scrub_content("".join(src_file_content), info, book_src_dir, src_file, tag, cwd)
+
+def scrub_content(content, info, book_src_dir, src_file, tag=None, cwd=None):
+    base_src_file = src_file.replace(info["src_dir"] + "/", "")
+    title_ids = TITLE_IDS.get(base_src_file, {})
     header_found = content_found = False
     current_id = None
-    for line in src_file_content:
-        # Ignore any leading blank lines, before any meaningful content is found
+    scrubbed_content = ""
+
+    for line in content.splitlines(True):
         if line.strip() == "" and not content_found:
             continue
 
-        # Check if the line should be included in the output
         if include_line(line):
             content_found = True
-
-            # Setup the document header content/id
             if not header_found and line.strip() != "" and line.startswith("="):
                 header_found = True
-
                 if (
                     info["all_in_one"]
                     and base_src_file in ALL_IN_ONE_SCRAP_TITLE
                     and line.startswith("= ")
                 ):
                     continue
-                # Add a section id if one doesn't exist, so we have something to link to
                 elif current_id is None and src_file in info["file_to_id_map"]:
                     file_id = info["file_to_id_map"][src_file]
-                    content += "[[" + file_id + "]]\n"
-                # Add a custom title id, if one is needed
+                    scrubbed_content += "[[" + file_id + "]]\n"
                 elif line.startswith("=") and current_id is None:
                     for title in title_ids:
                         title_re = (
@@ -678,32 +699,26 @@ def scrub_file(info, book_src_dir, src_file, tag=None, cwd=None):
                             + "( (anchor|\[).*?)?(\n)?$"
                         )
                         if re.match(title_re, line):
-                            content += "[[" + title_ids[title] + "]]\n"
+                            scrubbed_content += "[[" + title_ids[title] + "]]\n"
 
-            # Set the current id based on the line content
             if current_id is None and ID_RE.match(line.strip()):
                 current_id = line.strip()
             elif current_id is not None and line.strip != "":
                 current_id = None
 
-            # Add the line to the processed content
-            content += line
+            scrubbed_content += line
 
-    # Fix up any duplicate ids
     if base_src_file in DUPLICATE_IDS:
         for duplicate_id, new_id in list(DUPLICATE_IDS[base_src_file].items()):
-            content = content.replace("[[" + duplicate_id + "]]", "[[" + new_id + "]]")
+            scrubbed_content = scrubbed_content.replace("[[" + duplicate_id + "]]", "[[" + new_id + "]]")
 
-    # Replace incorrect links with correct ones
     if base_src_file in INCORRECT_LINKS:
         for incorrect_link, fixed_link in list(INCORRECT_LINKS[base_src_file].items()):
-            content = content.replace(incorrect_link, fixed_link)
+            scrubbed_content = scrubbed_content.replace(incorrect_link, fixed_link)
 
-    # Fix up the links
-    content = fix_links(content, info, book_src_dir, src_file, tag=tag, cwd=cwd)
-
-    return content
+    scrubbed_content = fix_links(scrubbed_content, info, book_src_dir, src_file, tag=tag, cwd=cwd)
 
+    return scrubbed_content
 
 def include_line(line):
     """

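A refactoring detail worth noting: scrub_content iterates over content.splitlines(True) rather than a list from f.readlines(). Passing keepends=True preserves each line's trailing newline, which matters because the loop rebuilds the document by concatenating lines into scrubbed_content. A quick illustration of the standard-library behavior (not code from the commit):

text = "= Title\n\nSome content\n"
assert text.splitlines(True) == ["= Title\n", "\n", "Some content\n"]
# Without keepends the newlines are dropped, so concatenation
# would run every line together:
assert "".join(text.splitlines()) == "= TitleSome content"

For local files, scrub_file still reads with f.readlines() and re-joins before delegating, so the lines scrub_content sees are effectively identical to what the old in-place loop iterated over.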