Skip to content

Commit f8dbd24

Browse files
authored
fix and catch unintended uses of inline HTML (#1716)
* fix missing text in "KMS Provider" section of the Client Side Encryption spec due to less-than symbol
* manual audit of all less-than symbols followed by a letter
* reformat python scripts using "black"
* let scripts detect fenced code inside block-quotes
* add a pre-commit script to check HTML tags against allowed patterns
* replace HTTP links with equivalent HTTPS
1 parent 21c1427 commit f8dbd24

File tree

13 files changed

+190
-115
lines changed

13 files changed

+190
-115
lines changed

.pre-commit-config.yaml

+5
Original file line numberDiff line numberDiff line change
@@ -45,6 +45,11 @@ repos:
4545
types: [markdown]
4646
language: system
4747
entry: python3 scripts/check_links.py
48+
- id: markdown-html-check
49+
name: markdown-html-check
50+
types: [markdown]
51+
language: system
52+
entry: python3 scripts/check_md_html.py
4853

4954
- repo: https://github.com/tcort/markdown-link-check
5055
rev: v3.12.2

scripts/check_links.py

+14-8
Original file line numberDiff line numberDiff line change
@@ -1,23 +1,29 @@
1-
import sys
1+
import sys, re
2+
23
fname = sys.argv[-1]
34

5+
# Roughly detect fenced code even inside block quotes
6+
fenced_code = re.compile(r"^\s*(>\s+)*```")
7+
48
# Check for markdown links that got improperly line wrapped.
59
in_code_block = False
610
with open(fname) as fid:
711
for line in fid:
812
# Ignore code blocks.
9-
if line.strip().startswith('```'):
13+
if fenced_code.match(line):
1014
in_code_block = not in_code_block
1115
if in_code_block:
1216
continue
13-
id0 = line.index('[') if '[' in line else -1
14-
id1 = line.index(']') if ']' in line else -1
15-
id2 = line.index('(') if '(' in line else -1
16-
id3 = line.index(')') if ')' in line else -1
17+
id0 = line.index("[") if "[" in line else -1
18+
id1 = line.index("]") if "]" in line else -1
19+
id2 = line.index("(") if "(" in line else -1
20+
id3 = line.index(")") if ")" in line else -1
1721
if id1 == -1 or id2 == -1 or id3 == -1:
1822
continue
1923
if id2 < id1 or id3 < id2:
2024
continue
2125
if id0 == -1:
22-
print('*** Malformed link in line:', line, fname)
23-
sys.exit(1)
26+
print("*** Malformed link in line:", line, fname)
27+
sys.exit(1)
28+
29+
assert not in_code_block

scripts/check_md_html.py

+55
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,55 @@
"""Pre-commit check: fail when markdown uses HTML outside the allowed patterns.

Usage: python3 scripts/check_md_html.py FILE [FILE ...]

Every path given on the command line is checked; the exit status is 1 if any
file contains unexpected HTML or leaves a fenced code block open, else 0.
"""
import re
import sys

# Check for allowed HTML elements in markdown.
# Ignores inline and fenced code, but intentionally doesn't ignore backslash
# escaping. (For compatibility, we want to avoid unintentional inline HTML
# even on markdown implementations where "\<" escapes are not supported.)
#
# The prefix consumes any mix of plain characters and complete inline-code
# spans, so a "<" is examined wherever it appears outside inline code -- not
# only before the first backtick or directly after a code span. Text after an
# unmatched backtick is not examined.
disallowed_re = re.compile(
    r"""
    (?: [^`] | `[^`]+` )*
    <(?!
        - |
        /p> |
        /span> |
        /sub> |
        /sup> |
        /table> |
        /td> |
        /tr> |
        \d |
        \s |
        \w+@(\w+\.)+\w+> | # Cover email addresses in license files
        = |
        br> |
        https:// | # Cover HTTPS links but not HTTP
        p> |
        span[\s>] |
        sub> |
        sup> |
        table[\s>] |
        td[\s>] |
        tr> |
        !-- )
    """,
    re.VERBOSE,
)

# Roughly detect fenced code even inside block quotes
fenced_code = re.compile(r"^\s*(>\s+)*```")


def check_lines(lines):
    """Scan markdown lines for disallowed HTML outside fenced code.

    Args:
        lines: iterable of text lines (e.g. an open file).

    Returns:
        A ``(offending_line, fence_left_open)`` tuple. ``offending_line`` is
        the first line matching ``disallowed_re`` or ``None``;
        ``fence_left_open`` is True when the input ended inside a fenced code
        block (only meaningful when no offending line was found).
    """
    in_code_block = False
    for line in lines:
        # A fence line toggles the state; the opening fence and everything up
        # to (but not including) the closing fence is skipped.
        if fenced_code.match(line):
            in_code_block = not in_code_block
        if in_code_block:
            continue
        if disallowed_re.match(line):
            return line, False
    return None, in_code_block


def check_file(fname):
    """Check one markdown file; print a diagnostic and return False on failure."""
    # Read as UTF-8 explicitly so the result does not depend on the locale.
    with open(fname, encoding="utf-8") as fid:
        offending, fence_left_open = check_lines(fid)
    if offending is not None:
        print("*** Markdown contains unexpected HTML in line:", offending, fname)
        return False
    if fence_left_open:
        # Explicit check instead of ``assert``, which is stripped under -O.
        print("*** Markdown contains an unterminated fenced code block:", fname)
        return False
    return True


def main(argv=None):
    """Check every file named in ``argv[1:]``; return the process exit status.

    pre-commit passes all matching staged files as positional arguments, so
    each one is checked rather than only the last.
    """
    if argv is None:
        argv = sys.argv
    ok = True
    for fname in argv[1:]:
        # Keep going after a failure so every bad file is reported in one run.
        ok = check_file(fname) and ok
    return 0 if ok else 1


if __name__ == "__main__":
    sys.exit(main())

scripts/generate_index.py

+7-6
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
11
import os
22
from pathlib import Path
3+
34
source = Path(__file__).resolve().parent.parent / "source"
45
source = source.resolve()
56
info = {}
@@ -9,21 +10,21 @@
910
continue
1011
if "node_modules" in relpath:
1112
continue
12-
if p.name in ['index.md']:
13+
if p.name in ["index.md"]:
1314
continue
14-
fpath = relpath + '/' + p.name
15+
fpath = relpath + "/" + p.name
1516
name = None
1617
with p.open() as fid:
1718
for line in fid:
1819
if line.startswith("# "):
19-
name = line.replace('# ', '').strip()
20+
name = line.replace("# ", "").strip()
2021
break
2122
if name is None:
22-
raise ValueError(f'Could not find name for {fpath}')
23+
raise ValueError(f"Could not find name for {fpath}")
2324
info[name] = fpath
2425

2526
index_file = source / "index.md"
2627
with index_file.open("w") as fid:
27-
fid.write('# MongoDB Specifications\n\n')
28+
fid.write("# MongoDB Specifications\n\n")
2829
for name in sorted(info):
29-
fid.write(f'- [{name}]({info[name]})\n')
30+
fid.write(f"- [{name}]({info[name]})\n")

scripts/migrate_to_md.py

+65-57
Original file line numberDiff line numberDiff line change
@@ -7,18 +7,20 @@
77
import subprocess
88

99
if len(sys.argv) < 2:
10-
print('Must provide a path to an RST file')
10+
print("Must provide a path to an RST file")
1111
sys.exit(1)
1212

1313
path = Path(sys.argv[1])
1414

1515
# Ensure git history for the md file.
16-
md_file = str(path).replace('.rst', '.md')
17-
subprocess.check_call(['git', 'mv', path, md_file])
18-
subprocess.check_call(['git', 'add', md_file])
19-
subprocess.check_call(['git', 'commit', '--no-verify', '-m', f'Rename {path} to {md_file}'])
20-
subprocess.check_call(['git', 'checkout', 'HEAD~1', path])
21-
subprocess.check_call(['git', 'add', path])
16+
md_file = str(path).replace(".rst", ".md")
17+
subprocess.check_call(["git", "mv", path, md_file])
18+
subprocess.check_call(["git", "add", md_file])
19+
subprocess.check_call(
20+
["git", "commit", "--no-verify", "-m", f"Rename {path} to {md_file}"]
21+
)
22+
subprocess.check_call(["git", "checkout", "HEAD~1", path])
23+
subprocess.check_call(["git", "add", path])
2224

2325
# Get the contents of the file.
2426
with path.open() as fid:
@@ -31,45 +33,47 @@
3133
"""
3234

3335
# Update the RST file with a stub pointer to the MD file.
34-
if not path.name == 'README.rst':
36+
if not path.name == "README.rst":
3537
new_body = TEMPLATE.format(os.path.basename(md_file))
36-
with path.open('w') as fid:
37-
fid.write(''.join(new_body))
38+
with path.open("w") as fid:
39+
fid.write("".join(new_body))
3840

3941
# Pre-process the file.
40-
for (i, line) in enumerate(lines):
42+
for i, line in enumerate(lines):
4143
# Replace curly quotes with regular quotes.
42-
line = line.replace('”', '"')
43-
line = line.replace('“', '"')
44-
line = line.replace('’', "'")
45-
line = line.replace('‘', "'")
44+
line = line.replace("”", '"')
45+
line = line.replace("“", '"')
46+
line = line.replace("’", "'")
47+
line = line.replace("‘", "'")
4648
lines[i] = line
4749

4850
# Replace the colon fence blocks with bullets,
4951
# e.g. :Status:, :deprecated:, :changed:.
5052
# This also includes the changelog entries.
51-
match = re.match(r':(\S+):(.*)', line)
53+
match = re.match(r":(\S+):(.*)", line)
5254
if match:
5355
name, value = match.groups()
54-
lines[i] = f'- {name.capitalize()}:{value}\n'
56+
lines[i] = f"- {name.capitalize()}:{value}\n"
5557

5658
# Handle "":Minimum Server Version:"" as a block quote.
57-
if line.strip().startswith(':Minimum Server Version:'):
58-
lines[i] = '- ' + line.strip()[1:] + ''
59+
if line.strip().startswith(":Minimum Server Version:"):
60+
lines[i] = "- " + line.strip()[1:] + ""
5961

6062
# Remove the "".. contents::" block - handled by GitHub UI.
61-
if line.strip() == '.. contents::':
62-
lines[i] = ''
63+
if line.strip() == ".. contents::":
64+
lines[i] = ""
6365

6466
# Run pandoc and capture output.
65-
proc = subprocess.Popen(['pandoc', '-f', 'rst', '-t', 'gfm'], stdin=subprocess.PIPE, stdout=subprocess.PIPE)
66-
data = ''.join(lines).encode('utf8')
67+
proc = subprocess.Popen(
68+
["pandoc", "-f", "rst", "-t", "gfm"], stdin=subprocess.PIPE, stdout=subprocess.PIPE
69+
)
70+
data = "".join(lines).encode("utf8")
6771
outs, _ = proc.communicate(data)
68-
data = outs.decode('utf8')
72+
data = outs.decode("utf8")
6973

7074
# Fix the strings that were missing backticks.
71-
data = re.sub(r'<span\W+class="title-ref">', '`', data, flags=re.MULTILINE)
72-
data = data.replace('</span>', '`')
75+
data = re.sub(r'<span\W+class="title-ref">', "`", data, flags=re.MULTILINE)
76+
data = data.replace("</span>", "`")
7377

7478
# Handle div blocks that were created.
7579
# These are admonition blocks, convert to new GFM format.
@@ -79,55 +83,55 @@
7983
in_changelog_first = False
8084
lines = data.splitlines()
8185
new_lines = []
82-
for (i, line) in enumerate(lines):
83-
match = re.match(r'<div class="(\S+)">',line)
86+
for i, line in enumerate(lines):
87+
match = re.match(r'<div class="(\S+)">', line)
8488
if not in_block_outer and match:
8589
in_block_outer = True
86-
new_lines.append(f'> [!{match.groups()[0].upper()}]')
90+
new_lines.append(f"> [!{match.groups()[0].upper()}]")
8791
continue
88-
if line.strip() == '</div>':
92+
if line.strip() == "</div>":
8993
if in_block_outer:
9094
in_block_outer = False
9195
in_block_inner = True
9296
elif in_block_inner:
9397
in_block_inner = False
9498
continue
9599
if in_block_inner:
96-
line = '> ' + line.strip()
100+
line = "> " + line.strip()
97101

98102
if in_changelog_first:
99-
today = datetime.date.today().strftime('%Y-%m-%d')
100-
line = f'\n- {today}: Migrated from reStructuredText to Markdown.'
103+
today = datetime.date.today().strftime("%Y-%m-%d")
104+
line = f"\n- {today}: Migrated from reStructuredText to Markdown."
101105
in_changelog_first = False
102106

103-
if line.strip() == '## Changelog':
107+
if line.strip() == "## Changelog":
104108
in_changelog_first = True
105109

106110
if not in_block_outer:
107-
new_lines.append(line)
111+
new_lines.append(line)
108112

109113

110114
# Write the new content to the markdown file.
111-
with open(md_file, 'w') as fid:
112-
fid.write('\n'.join(new_lines))
115+
with open(md_file, "w") as fid:
116+
fid.write("\n".join(new_lines))
113117

114118
# Handle links in other files.
115-
# We accept relative path links or links to master
119+
# We accept relative path links or links to master
116120
# (https://github.com/mongodb/specifications/blob/master/source/...)
117121
# and rewrite them to use appropriate md links.
118122
# If the link is malformed we ignore and print an error.
119123
target = path.name
120124
curr = path
121-
while curr.parent.name != 'source':
122-
target = f'{curr.parent.name}/{target}'
125+
while curr.parent.name != "source":
126+
target = f"{curr.parent.name}/{target}"
123127
curr = curr.parent
124-
suffix = fr'\S*/{target}'
125-
rel_pattern = re.compile(fr'(\.\.{suffix})')
126-
md_pattern = re.compile(fr'(\(http{suffix})')
127-
html_pattern = re.compile(f'(http{suffix})')
128-
abs_pattern = re.compile(f'(/source{suffix})')
128+
suffix = rf"\S*/{target}"
129+
rel_pattern = re.compile(rf"(\.\.{suffix})")
130+
md_pattern = re.compile(rf"(\(http{suffix})")
131+
html_pattern = re.compile(f"(http{suffix})")
132+
abs_pattern = re.compile(f"(/source{suffix})")
129133
for p in Path("source").rglob("*"):
130-
if p.suffix not in ['.rst', '.md']:
134+
if p.suffix not in [".rst", ".md"]:
131135
continue
132136
with p.open() as fid:
133137
lines = fid.readlines()
@@ -141,16 +145,20 @@
141145
new_line = line.replace(matchstr, relpath)
142146
elif re.search(md_pattern, line):
143147
matchstr = re.search(md_pattern, line).groups()[0]
144-
if not matchstr.startswith('(https://github.com/mongodb/specifications/blob/master/source'):
145-
print('*** Error in link: ', matchstr, p)
148+
if not matchstr.startswith(
149+
"(https://github.com/mongodb/specifications/blob/master/source"
150+
):
151+
print("*** Error in link: ", matchstr, p)
146152
else:
147-
new_line = line.replace(matchstr, f'({relpath}')
153+
new_line = line.replace(matchstr, f"({relpath}")
148154
elif re.search(html_pattern, line):
149155
matchstr = re.search(html_pattern, line).groups()[0]
150-
if not matchstr.startswith('https://github.com/mongodb/specifications/blob/master/source'):
151-
print('*** Error in link: ', matchstr, p)
156+
if not matchstr.startswith(
157+
"https://github.com/mongodb/specifications/blob/master/source"
158+
):
159+
print("*** Error in link: ", matchstr, p)
152160
else:
153-
new_line = line.replace(matchstr, f'{relpath}')
161+
new_line = line.replace(matchstr, f"{relpath}")
154162
elif re.search(abs_pattern, line):
155163
matchstr = re.search(abs_pattern, line).groups()[0]
156164
new_line = line.replace(matchstr, relpath)
@@ -160,11 +168,11 @@
160168
new_lines.append(new_line)
161169

162170
if changed_lines:
163-
with p.open('w') as fid:
171+
with p.open("w") as fid:
164172
fid.writelines(new_lines)
165-
print('-' * 80)
166-
print(f'Updated link(s) in {p}...')
167-
print(' ' + '\n '.join(changed_lines))
173+
print("-" * 80)
174+
print(f"Updated link(s) in {p}...")
175+
print(" " + "\n ".join(changed_lines))
168176

169-
print('Created markdown file:')
177+
print("Created markdown file:")
170178
print(md_file)

0 commit comments

Comments
 (0)