-
Notifications
You must be signed in to change notification settings - Fork 5.6k
/
Copy pathcheck_for_noexcept.py
387 lines (309 loc) · 14 KB
/
check_for_noexcept.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
#!/usr/bin/env python3
"""
Check for changes in noexcept usage between the given revisions and output a file that
can be used by evergreen to post a github commit check annotating a PR with the results.
The algorithm here basically handles two cases: (1) straightforward cases where
you're "just" adding or removing noexcept from an existing function; and (2)
some cases where you're adding a new noexcept function or functions. Case (1)
works like this: if we have a diff block (that is, a block of addition or
deletion) that's a single line and mentions noexcept on only one side, count
it. Case (2) works like this: if noexcept is mentioned on the b-side of the
diff and not the a-side, assume new noexcept code has been added, and count it.
"""
import difflib
import json
import logging
import re
import sys
from dataclasses import asdict, dataclass
from enum import Enum, auto
from typing import Annotated, Any, Optional
import git
import structlog
import typer
from structlog.stdlib import LoggerFactory
structlog.configure(logger_factory=LoggerFactory())
LOGGER = structlog.getLogger(__name__)
DOCUMENTATION_URL = (
"https://github.com/10gen/mongo/blob/master/docs/exception_architecture.md#using-noexcept"
)
# These fields go into the checks tab, they do not appear inline in the "Files Changed" tab.
# This appears whether or not the check flags any noexcept changes.
TITLE_TEXT = "Changes in noexcept usage"
SUMMARY_TEXT = (
"This check looks for changes in noexcept usage in order to link users to the "
"guidelines for its usage [here](" + DOCUMENTATION_URL + ") "
"when such changes are identified."
)
# These fields go in the annotations in the "Files Changed" tab. These fields are rendered
# verbatim (i.e., markdown doesn't work).
ANNOTATION_TITLE_TEXT = TITLE_TEXT
ANNOTATION_MESSAGE_TEXT = (
"A change in noexcept usage has been detected.\n"
"As noexcept is a frequently-misunderstood feature,\n"
"please refer to the documentation on noexcept usage\n"
"linked below to evaluate if this change is appropriate. "
"\n\n"
"This message will only be posted once per PR, even if\n"
"there are multiple detected changes to noexcept usage. "
"\n\n"
"This may be a false positive, such as on move operations.\n"
"If so, disregard this message. "
"\n\n"
"Documentation can be found here: " + DOCUMENTATION_URL
)
def configure_logging() -> None:
logging.basicConfig(
format="[%(asctime)s - %(name)s - %(levelname)s] %(message)s",
level=logging.INFO,
stream=sys.stderr,
)
def get_merge_base(repo: git.Repo, base, rev) -> git.DiffIndex:
merge_bases = repo.merge_base(rev, base)
if len(merge_bases) == 0:
raise RuntimeError(f"No common merge base between {base} and {rev}")
assert len(merge_bases) == 1
return merge_bases[0]
def get_git_diff(repo: git.Repo, base: git.Commit, rev) -> git.DiffIndex:
return base.diff(rev)
class ChangeKind(Enum):
ADDITION = auto()
REMOVAL = auto()
@dataclass
class UnboundNoexceptChange:
kind: ChangeKind
is_certain: bool
index: int
def bind_to_line(self, line: int):
return LineBoundNoexceptChange(**asdict(self), line=line)
@dataclass
class LineBoundNoexceptChange(UnboundNoexceptChange):
line: int
def bind_to_file(self, file: str):
return FileBoundNoexceptChange(**asdict(self), file=file)
@dataclass
class FileBoundNoexceptChange(LineBoundNoexceptChange):
file: str
def make_annotation(self) -> dict[str, Any]:
return {
"path": self.file,
"annotation_level": "notice",
"title": ANNOTATION_TITLE_TEXT,
"message": ANNOTATION_MESSAGE_TEXT,
"raw_details": "",
"start_line": self.line,
"end_line": self.line,
}
def get_line_for_char(text: str, i: int) -> int:
# The first line (before any newlines) is line 1 rather than line 0.
return text[:i].count("\n") + 1
def analyze_uncertain_insdel(insdel: str, kind: ChangeKind) -> list[UnboundNoexceptChange]:
return [
UnboundNoexceptChange(kind=kind, is_certain=False, index=match.start())
for match in re.finditer("noexcept", insdel)
]
def analyze_insdel(insdel: str, kind: ChangeKind) -> list[UnboundNoexceptChange]:
# If the inserted or deleted string is a single line, we'll assume the presence of noexcept
# indicates a true insertion or deletion of noexcept.
if insdel.count("noexcept") == 1 and insdel.count("\n") == 0:
return [UnboundNoexceptChange(kind=kind, is_certain=True, index=insdel.find("noexcept"))]
return analyze_uncertain_insdel(insdel, kind)
def analyze_deletion(deleted: str) -> list[UnboundNoexceptChange]:
return analyze_insdel(deleted, ChangeKind.REMOVAL)
def analyze_insertion(inserted: str) -> list[UnboundNoexceptChange]:
return analyze_insdel(inserted, ChangeKind.ADDITION)
def analyze_edit(deleted: str, inserted: str) -> list[UnboundNoexceptChange]:
if "noexcept" not in inserted:
return analyze_deletion(deleted)
if "noexcept" not in deleted:
return analyze_insertion(inserted)
# Both sides mention noexcept - we have far less certainty.
deletions = analyze_uncertain_insdel(deleted, ChangeKind.REMOVAL)
insertions = analyze_uncertain_insdel(inserted, ChangeKind.ADDITION)
return insertions + deletions
# This is a wrapper around difflib.SeqeuenceMatcher's get_matching_blocks.
# SequenceMatcher synchronizes the sequences quite aggressively when diffing a character sequence,
# often inserting a very small match in the middle of what we'd like to consider a single edit.
# For example, considering the strings "int f() noexcept const" and "int f() const",
# it might align them like this:
# int f() noexcept const
# int f() c onst
# producing two insertions of "noex" and "ept", making it hard for us to analyze individual edits.
# This wrapper performs the match at the level of words and translates back to a character-based
# match sequence. The matching ignores whitespace, so matching regions may be of different sizes
# on each side and may indeed differ in whitespace.
def get_matching_blocks(lhs: str, rhs: str) -> list[tuple[int, int, int, int]]:
def skip_whitespace(s: str, i: int) -> int:
while i < len(s) and s[i].isspace():
i += 1
return i
def skip_word(s: str, word: str, i: int) -> int:
for char in word:
assert i < len(s)
assert s[i] == char
i += 1
return i
def advance(s: str, word: str, i: int) -> int:
i = skip_whitespace(s, i)
i = skip_word(s, word, i)
i = skip_whitespace(s, i)
return i
lhs_words = lhs.split()
rhs_words = rhs.split()
s = difflib.SequenceMatcher(None, lhs_words, rhs_words, autojunk=False)
matching_blocks = s.get_matching_blocks()
lhs_word_index = 0
lhs_char_index = 0
rhs_word_index = 0
rhs_char_index = 0
new_blocks = []
for block in matching_blocks:
if block.size == 0:
# The last block is a dummy block a and b set to the sequence lengths and size 0.
# We don't return such a block since we don't need it.
break
while lhs_word_index < block.a:
lhs_char_index = advance(lhs, lhs_words[lhs_word_index], lhs_char_index)
lhs_word_index += 1
while rhs_word_index < block.b:
rhs_char_index = advance(rhs, rhs_words[rhs_word_index], rhs_char_index)
rhs_word_index += 1
block_lhs_start = lhs_char_index
while lhs_word_index < block.a + block.size:
lhs_char_index = advance(lhs, lhs_words[lhs_word_index], lhs_char_index)
lhs_word_index += 1
block_rhs_start = rhs_char_index
while rhs_word_index < block.b + block.size:
rhs_char_index = advance(rhs, rhs_words[rhs_word_index], rhs_char_index)
rhs_word_index += 1
new_blocks.append((block_lhs_start, lhs_char_index, block_rhs_start, rhs_char_index))
return new_blocks
def get_nonmatching_blocks(lhs: str, rhs: str) -> list[tuple[int, int, int, int]]:
matches = get_matching_blocks(lhs, rhs)
lhs_i = 0
rhs_i = 0
blocks = []
for lhs_start, lhs_end, rhs_start, rhs_end in matches:
if lhs_i < lhs_start or rhs_i < rhs_start:
blocks.append((lhs_i, lhs_start, rhs_i, rhs_start))
lhs_i = lhs_end
rhs_i = rhs_end
if lhs_i < len(lhs) or rhs_i < len(rhs):
blocks.append((lhs_i, len(lhs), rhs_i, len(rhs)))
return blocks
def analyze_text_diff(lhs: str, rhs: str) -> list[LineBoundNoexceptChange]:
blocks = get_nonmatching_blocks(lhs, rhs)
changes = []
for lhs_start, lhs_end, rhs_start, rhs_end in blocks:
deleted = lhs[lhs_start:lhs_end]
inserted = rhs[rhs_start:rhs_end]
block_changes = analyze_edit(deleted, inserted)
# Line number always refers to the rhs of the diff (post-changes). Annotations can only
# be put on that side.
rhs_start_line = get_line_for_char(rhs, rhs_start)
# Indices are relative to the start of inserted/deleted, convert them to be relative
# to lhs/rhs.
for change in block_changes:
if change.kind == ChangeKind.ADDITION:
# For additions, we bind to the line where noexcept appears.
change.index += rhs_start
changes += [change.bind_to_line(get_line_for_char(rhs, change.index))]
elif change.kind == ChangeKind.REMOVAL:
# For removals, we bind to the start of the diff block due to the constraint that
# annotations go on the right-hand side.
change.index += lhs_start
changes += [change.bind_to_line(rhs_start_line)]
return changes
def is_allowed_path(path: str) -> bool:
if not path.startswith("src/mongo"):
return False
if not (path.endswith(".h") or path.endswith(".cpp")):
return False
return True
def analyze_single_git_diff(diff: git.Diff) -> list[LineBoundNoexceptChange]:
# Blobs can be None. For example, if the diff is a file deletion, the b_blob will be None.
# In that case, we want the None to be treated as empty string for our text diff.
def decode(blob: git.Blob | None) -> str:
if not blob:
return ""
return blob.data_stream.read().decode("utf-8")
if not is_allowed_path(diff.a_path):
return []
if not is_allowed_path(diff.b_path):
return []
return analyze_text_diff(decode(diff.a_blob), decode(diff.b_blob))
def analyze_git_diff(diffs: list[git.Diff]) -> list[FileBoundNoexceptChange]:
changes = []
for diff in diffs:
diff_changes = analyze_single_git_diff(diff)
changes += [change.bind_to_file(diff.b_path) for change in diff_changes]
return changes
def make_check_result(changes: list[FileBoundNoexceptChange]) -> dict[str, Any]:
annotations = [change.make_annotation() for change in changes]
return {
"title": TITLE_TEXT,
"summary": SUMMARY_TEXT,
"text": "",
"annotations": annotations,
}
def check_for_noexcept_changes(diffs: list[git.Diff]) -> dict[str, Any]:
changes = analyze_git_diff(diffs)
# If we're certain that noexcept was added or removed in some places, make annotations based
# on the changes we're sure of.
certain_changes = [change for change in changes if change.is_certain]
if len(certain_changes) > 0:
return make_check_result(certain_changes)
# We've detected changes involving noexcept, but we're not certain that they're meaningful.
# For example, removing a noexcept function altogether would be picked up as a noexcept
# deletion, but it's not one we want to report on.
# Conservatively, we'll only report on uncertain noexcept changes if there are additions of
# noexcept with _no_ removals. This is still not perfect - we might pick up comments, or
# someone declaring a noexcept function that already has a definition elsewhere, for example.
additions = [change for change in changes if change.kind == ChangeKind.ADDITION]
removals = [change for change in changes if change.kind == ChangeKind.REMOVAL]
if len(additions) > 0 and len(removals) == 0:
return make_check_result(additions)
# There are either no changes to noexcept or a mix of added and removed noexcepts. We'd need a
# deeper analysis to figure out the mixed case.
return make_check_result([])
def main(
revision: Annotated[
str, typer.Argument(help="Git revision to check for noexcept changes")
] = "HEAD",
base: Annotated[
Optional[str], typer.Option("--base", "-b", help="Git revision to compare against")
] = None,
output_path: Annotated[
str, typer.Option("--output", "-o", help="Path at which to output github annotations")
] = "../github_annotations.json",
max_annotations: Annotated[
int, typer.Option("--limit", help="Maximum number of annotations to output")
] = 1,
):
"""
Check for changes in noexcept usage between the given revisions and output a file that
can be used by evergreen to post a github commit check annotating a PR with the results.
"""
configure_logging()
repo = git.Repo(".")
if base is None:
base = repo.head.commit
merge_base = get_merge_base(repo, base, revision)
diffs = get_git_diff(repo, merge_base, revision)
LOGGER.info(
"Checking for noexcept changes",
base=str(base),
merge_base=str(merge_base),
revision=revision,
)
check_result = check_for_noexcept_changes(diffs)
# Cap annotations to the given limit - we only want to output one in the commit queue, but
# for testing it's useful to see all of them.
if "annotations" in check_result:
annotations = check_result.get("annotations", [])
if max_annotations > 0 and len(annotations) > max_annotations:
check_result["annotations"] = annotations[:max_annotations]
with open(output_path, "w") as f:
json.dump(check_result, f, indent=4)
if __name__ == "__main__":
typer.run(main)