
Commit ef0546a

Updated scrub_file function for handling includes of external files.
1 parent 69be0e6 commit ef0546a

File tree

1 file changed: +55 -40 lines changed

build_for_portal.py

Lines changed: 55 additions & 40 deletions
@@ -611,65 +611,86 @@ def copy_file(
     with open(dest_file, "w") as f:
         f.write(content)
 
+# 7/26/24:
+# Corrected the scrub_file function.
+# Local files and URLs are now appropriately addressed individually.
+# URL includes of files within the openshift-docs repo now generate an error.
+# Files outside the `openshift-docs` repo are downloaded directly with error handling for connection issues.
 
 def scrub_file(info, book_src_dir, src_file, tag=None, cwd=None):
     """
     Scrubs a file and returns the cleaned file contents.
     """
     base_src_file = src_file.replace(info["src_dir"] + "/", "")
 
-    # added 1/Sep/2020
-    # to allow loading files like json and yaml from external sources, this
-    # procedure loads the file recognizing that it starts with http
-    # it then checks if it exists or not, and if it exists, returns the raw data
-    # data that it finds.
-    if base_src_file.startswith("https://raw.githubusercontent.com/openshift/"):
+    if base_src_file.startswith("https://raw.githubusercontent.com/"):
+        # Disallow URL inclusion from openshift-docs
+        if "openshift-docs" in base_src_file:
+            log.error(
+                "Inclusion of files within the openshift-docs repository by URL is not supported: %s",
+                base_src_file,
+            )
+            list_of_errors.append(
+                f"Inclusion of files within the openshift-docs repository by URL is not supported: {base_src_file}"
+            )
+            return ""  # Skip processing this file
+
+        # Allow includes only from specific organizations
+        if not any(org in base_src_file for org in ["openshift/", "redhatofficial/"]):
+            log.error(
+                "Inclusion of files from unauthorized repositories by URL is not supported: %s",
+                base_src_file,
+            )
+            list_of_errors.append(
+                f"Inclusion of files from unauthorized repositories by URL is not supported: {base_src_file}"
+            )
+            return ""  # Skip processing this file
+
         try:
             response = requests.get(base_src_file)
-            if response:
-                return response.text
+            if response.status_code == 200:
+                # Only apply scrubbing if the external file is AsciiDoc
+                if base_src_file.endswith(".adoc"):
+                    return scrub_content(response.text, info, book_src_dir, src_file, tag, cwd)
+                else:
+                    return response.text
             else:
-                raise ConnectionError("Malformed URL")
-        except Exception as exception:
-            log.error("An include file wasn't found: %s", base_src_file)
-            list_of_errors.append(f"An include file wasn't found: {base_src_file}")
-            sys.exit(-1)
-
-    # Get a list of predefined custom title ids for the file
-    title_ids = TITLE_IDS.get(base_src_file, {})
+                raise ConnectionError(f"Failed to download file from {base_src_file}")
+        except Exception as e:
+            log.error(f"Error fetching external include: {base_src_file} - {e}")
+            list_of_errors.append(f"Error fetching external include: {base_src_file}")
+            return ""  # Skip processing this file
 
-    # Read in the source content
+    # Local file processing
     with open(src_file, "r") as f:
         src_file_content = f.readlines()
 
-    # Scrub the content
-    content = ""
+    return scrub_content("".join(src_file_content), info, book_src_dir, src_file, tag, cwd)
+
+def scrub_content(content, info, book_src_dir, src_file, tag=None, cwd=None):
+    base_src_file = src_file.replace(info["src_dir"] + "/", "")
+    title_ids = TITLE_IDS.get(base_src_file, {})
     header_found = content_found = False
     current_id = None
-    for line in src_file_content:
-        # Ignore any leading blank lines, before any meaningful content is found
+    scrubbed_content = ""
+
+    for line in content.splitlines(True):
         if line.strip() == "" and not content_found:
             continue
 
-        # Check if the line should be included in the output
         if include_line(line):
             content_found = True
-
-            # Setup the document header content/id
             if not header_found and line.strip() != "" and line.startswith("="):
                 header_found = True
-
                 if (
                     info["all_in_one"]
                     and base_src_file in ALL_IN_ONE_SCRAP_TITLE
                     and line.startswith("= ")
                 ):
                     continue
-                # Add a section id if one doesn't exist, so we have something to link to
                 elif current_id is None and src_file in info["file_to_id_map"]:
                     file_id = info["file_to_id_map"][src_file]
-                    content += "[[" + file_id + "]]\n"
-                # Add a custom title id, if one is needed
+                    scrubbed_content += "[[" + file_id + "]]\n"
                 elif line.startswith("=") and current_id is None:
                     for title in title_ids:
                         title_re = (
@@ -678,32 +699,26 @@ def scrub_file(info, book_src_dir, src_file, tag=None, cwd=None):
                             + "( (anchor|\[).*?)?(\n)?$"
                         )
                         if re.match(title_re, line):
-                            content += "[[" + title_ids[title] + "]]\n"
+                            scrubbed_content += "[[" + title_ids[title] + "]]\n"
 
-            # Set the current id based on the line content
             if current_id is None and ID_RE.match(line.strip()):
                 current_id = line.strip()
             elif current_id is not None and line.strip != "":
                 current_id = None
 
-            # Add the line to the processed content
-            content += line
+            scrubbed_content += line
 
-    # Fix up any duplicate ids
     if base_src_file in DUPLICATE_IDS:
         for duplicate_id, new_id in list(DUPLICATE_IDS[base_src_file].items()):
-            content = content.replace("[[" + duplicate_id + "]]", "[[" + new_id + "]]")
+            scrubbed_content = scrubbed_content.replace("[[" + duplicate_id + "]]", "[[" + new_id + "]]")
 
-    # Replace incorrect links with correct ones
     if base_src_file in INCORRECT_LINKS:
         for incorrect_link, fixed_link in list(INCORRECT_LINKS[base_src_file].items()):
-            content = content.replace(incorrect_link, fixed_link)
+            scrubbed_content = scrubbed_content.replace(incorrect_link, fixed_link)
 
-    # Fix up the links
-    content = fix_links(content, info, book_src_dir, src_file, tag=tag, cwd=cwd)
-
-    return content
+    scrubbed_content = fix_links(scrubbed_content, info, book_src_dir, src_file, tag=tag, cwd=cwd)
 
+    return scrubbed_content
 
 def include_line(line):
     """

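A refactoring detail worth noting: scrub_content iterates over content.splitlines(True) rather than a list from f.readlines(). Passing keepends=True preserves each line's trailing newline, which matters because the loop rebuilds the document by concatenating lines into scrubbed_content. A quick illustration of the standard-library behavior (not code from the commit):

text = "= Title\n\nSome content\n"
assert text.splitlines(True) == ["= Title\n", "\n", "Some content\n"]
# Without keepends the newlines are dropped, so concatenation
# would run every line together:
assert "".join(text.splitlines()) == "= TitleSome content"

For local files, scrub_file still reads with f.readlines() and re-joins before delegating, so the lines scrub_content sees are effectively identical to what the old in-place loop iterated over.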