Skip to content

Commit 579b58d

Browse files
doublethefish authored and Pierre-Sassoulas committed
mapreduce| Adds map/reduce functionality to SimilarChecker
Before adding a new mixin, this proves the concept works, adding tests as examples of how this would work in the main linter. The idea here is that, because `check_parallel()` uses a multiprocess `map` function, the natural follow-on is to use a `reduce` paradigm. This should demonstrate that.
1 parent 854a7f5 commit 579b58d

File tree

4 files changed

+267
-1
lines changed

4 files changed

+267
-1
lines changed

pylint/checkers/similar.py

+29-1
Original file line numberDiff line numberDiff line change
@@ -160,6 +160,20 @@ def _iter_sims(self):
160160
for lineset2 in self.linesets[idx + 1 :]:
161161
yield from self._find_common(lineset, lineset2)
162162

163+
def get_map_data(self):
    """Return the data this instance contributes to a map/reduce process.

    The payload is this instance's linesets, i.e. all the file information
    that is later used for vectorisation.
    """
    collected_linesets = self.linesets
    return collected_linesets
170+
171+
def combine_mapreduce_data(self, linesets_collection):
    """Flatten mapped data back into a single list of linesets.

    The partner function of get_map_data(): ``linesets_collection`` is an
    iterable whose items are themselves iterables of linesets; every
    lineset is gathered, in order, into ``self.linesets``.
    """
    flattened = []
    for mapped_linesets in linesets_collection:
        # Each item is one get_map_data() result; keep its internal order.
        flattened.extend(mapped_linesets)
    self.linesets = flattened
176+
163177

164178
def stripped_lines(lines, ignore_comments, ignore_docstrings, ignore_imports):
165179
"""return lines with leading/trailing whitespace and any ignored code
@@ -352,7 +366,7 @@ def __init__(self, linter=None):
352366
def set_option(self, optname, value, action=None, optdict=None):
353367
"""method called to set an option (registered in the options list)
354368
355-
overridden to report options setting to Similar
369+
Overridden to report options setting to Similar
356370
"""
357371
BaseChecker.set_option(self, optname, value, action, optdict)
358372
if optname == "min-similarity-lines":
@@ -402,6 +416,20 @@ def close(self):
402416
stats["nb_duplicated_lines"] = duplicated
403417
stats["percent_duplicated_lines"] = total and duplicated * 100.0 / total
404418

419+
def get_map_data(self):
    """Pass-through override that delegates to Similar.get_map_data()."""
    mapped = Similar.get_map_data(self)
    return mapped
422+
423+
@classmethod
def reduce_map_data(cls, linter, data):
    """Recombine mapped data into one checker and run its reporting.

    The partner function of get_map_data(): a new SimilarChecker is built
    for ``linter``, opened, loaded with every lineset contained in ``data``
    and then closed.
    """
    merged_checker = SimilarChecker(linter)
    merged_checker.open()
    Similar.combine_mapreduce_data(merged_checker, linesets_collection=data)
    merged_checker.close()
432+
405433

406434
def register(linter):
407435
"""required method to auto register this checker """

tests/checkers/unittest_similar.py

+139
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,8 @@
2121
import pytest
2222

2323
from pylint.checkers import similar
24+
from pylint.lint import PyLinter
25+
from pylint.testutils import GenericTestReporter as Reporter
2426

2527
INPUT = Path(__file__).parent / ".." / "input"
2628
SIMILAR1 = str(INPUT / "similar1")
@@ -234,3 +236,140 @@ def test_no_args():
234236
assert ex.code == 1
235237
else:
236238
pytest.fail("not system exit")
239+
240+
241+
def test_get_map_data():
    """Tests that a SimilarChecker respects the MapReduceMixin interface

    Each input file is fed to its own SimilarChecker (the 'map' step) and the
    linesets returned by get_map_data() are checked, file by file, against
    the stripped lines expected for that file.
    """
    linter = PyLinter(reporter=Reporter())

    # Add a parallel checker to ensure it can map and reduce
    linter.register_checker(similar.SimilarChecker(linter))

    source_streams = (
        str(INPUT / "similar_lines_a.py"),
        str(INPUT / "similar_lines_b.py"),
    )
    # One tuple of stripped lines per entry in source_streams, in the same order.
    expected_linelists = (
        (
            "",
            "",
            "",
            "",
            "",
            "",
            "def adipiscing(elit):",
            'etiam = "id"',
            'dictum = "purus,"',
            'vitae = "pretium"',
            'neque = "Vivamus"',
            'nec = "ornare"',
            'tortor = "sit"',
            "return etiam, dictum, vitae, neque, nec, tortor",
            "",
            "",
            "class Amet:",
            "def similar_function_3_lines(self, tellus):",
            "agittis = 10",
            "tellus *= 300",
            "return agittis, tellus",
            "",
            "def lorem(self, ipsum):",
            'dolor = "sit"',
            'amet = "consectetur"',
            "return (lorem, dolor, amet)",
            "",
            "def similar_function_5_lines(self, similar):",
            "some_var = 10",
            "someother_var *= 300",
            'fusce = "sit"',
            'amet = "tortor"',
            "return some_var, someother_var, fusce, amet",
            "",
            'def __init__(self, moleskie, lectus="Mauris", ac="pellentesque"):',
            'metus = "ut"',
            'lobortis = "urna."',
            'Integer = "nisl"',
            '(mauris,) = "interdum"',
            'non = "odio"',
            'semper = "aliquam"',
            'malesuada = "nunc."',
            'iaculis = "dolor"',
            'facilisis = "ultrices"',
            'vitae = "ut."',
            "",
            "return (",
            "metus,",
            "lobortis,",
            "Integer,",
            "mauris,",
            "non,",
            "semper,",
            "malesuada,",
            "iaculis,",
            "facilisis,",
            "vitae,",
            ")",
            "",
            "def similar_function_3_lines(self, tellus):",
            "agittis = 10",
            "tellus *= 300",
            "return agittis, tellus",
        ),
        (
            "",
            "",
            "",
            "",
            "",
            "",
            "",
            "class Nulla:",
            'tortor = "ultrices quis porta in"',
            'sagittis = "ut tellus"',
            "",
            "def pulvinar(self, blandit, metus):",
            "egestas = [mauris for mauris in zip(blandit, metus)]",
            "neque = (egestas, blandit)",
            "",
            "def similar_function_5_lines(self, similar):",
            "some_var = 10",
            "someother_var *= 300",
            'fusce = "sit"',
            'amet = "tortor"',
            'iaculis = "dolor"',
            "return some_var, someother_var, fusce, amet, iaculis, iaculis",
            "",
            "",
            "def tortor(self):",
            "ultrices = 2",
            'quis = ultricies * "porta"',
            "return ultricies, quis",
            "",
            "",
            "class Commodo:",
            "def similar_function_3_lines(self, tellus):",
            "agittis = 10",
            "tellus *= 300",
            'laoreet = "commodo "',
            "return agittis, tellus, laoreet",
        ),
    )

    data = []

    # Manually perform a 'map' type function
    for source_fname in source_streams:
        sim = similar.SimilarChecker(linter)
        # Explicit encoding so the fixture is decoded the same way on every
        # platform instead of depending on the locale's default.
        with open(source_fname, encoding="utf-8") as stream:
            sim.append_stream(source_fname, stream)
        # The map bit, can you tell? ;)
        data.extend(sim.get_map_data())

    assert len(expected_linelists) == len(data)
    for source_fname, expected_lines, lineset_obj in zip(
        source_streams, expected_linelists, data
    ):
        assert source_fname == lineset_obj.name
        # There doesn't seem to be a faster way of doing this, yet.
        lines = tuple(line for _, line in lineset_obj.enumerate_stripped())
        assert tuple(expected_lines) == lines

tests/input/similar_lines_a.py

+63
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,63 @@
1+
""" A file designed to have lines of similarity when compared to similar_lines_b
2+
3+
We use lorm-ipsum to generate 'random' code. """
4+
# Copyright (c) 2020 Frank Harrison <[email protected]>
5+
6+
7+
def adipiscing(elit):
8+
etiam = "id"
9+
dictum = "purus,"
10+
vitae = "pretium"
11+
neque = "Vivamus"
12+
nec = "ornare"
13+
tortor = "sit"
14+
return etiam, dictum, vitae, neque, nec, tortor
15+
16+
17+
class Amet:
18+
def similar_function_3_lines(self, tellus): # line same #1
19+
agittis = 10 # line same #2
20+
tellus *= 300 # line same #3
21+
return agittis, tellus # line diff
22+
23+
def lorem(self, ipsum):
24+
dolor = "sit"
25+
amet = "consectetur"
26+
return (lorem, dolor, amet)
27+
28+
def similar_function_5_lines(self, similar): # line same #1
29+
some_var = 10 # line same #2
30+
someother_var *= 300 # line same #3
31+
fusce = "sit" # line same #4
32+
amet = "tortor" # line same #5
33+
return some_var, someother_var, fusce, amet # line diff
34+
35+
def __init__(self, moleskie, lectus="Mauris", ac="pellentesque"):
36+
metus = "ut"
37+
lobortis = "urna."
38+
Integer = "nisl"
39+
(mauris,) = "interdum"
40+
non = "odio"
41+
semper = "aliquam"
42+
malesuada = "nunc."
43+
iaculis = "dolor"
44+
facilisis = "ultrices"
45+
vitae = "ut."
46+
47+
return (
48+
metus,
49+
lobortis,
50+
Integer,
51+
mauris,
52+
non,
53+
semper,
54+
malesuada,
55+
iaculis,
56+
facilisis,
57+
vitae,
58+
)
59+
60+
def similar_function_3_lines(self, tellus): # line same #1
61+
agittis = 10 # line same #2
62+
tellus *= 300 # line same #3
63+
return agittis, tellus # line diff

tests/input/similar_lines_b.py

+36
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,36 @@
1+
""" The sister file of similar_lines_a, another file designed to have lines of
2+
similarity when compared to its sister file
3+
4+
As with the sister file, we use lorm-ipsum to generate 'random' code. """
5+
# Copyright (c) 2020 Frank Harrison <[email protected]>
6+
7+
8+
class Nulla:
9+
tortor = "ultrices quis porta in"
10+
sagittis = "ut tellus"
11+
12+
def pulvinar(self, blandit, metus):
13+
egestas = [mauris for mauris in zip(blandit, metus)]
14+
neque = (egestas, blandit)
15+
16+
def similar_function_5_lines(self, similar): # line same #1
17+
some_var = 10 # line same #2
18+
someother_var *= 300 # line same #3
19+
fusce = "sit" # line same #4
20+
amet = "tortor" # line same #5
21+
iaculis = "dolor" # line diff
22+
return some_var, someother_var, fusce, amet, iaculis, iaculis # line diff
23+
24+
25+
def tortor(self):
26+
ultrices = 2
27+
quis = ultricies * "porta"
28+
return ultricies, quis
29+
30+
31+
class Commodo:
32+
def similar_function_3_lines(self, tellus): # line same #1
33+
agittis = 10 # line same #2
34+
tellus *= 300 # line same #3
35+
laoreet = "commodo " # line diff
36+
return agittis, tellus, laoreet # line diff

0 commit comments

Comments
 (0)