From 4d720be929b5783b03c4fa745584e086a56cca7e Mon Sep 17 00:00:00 2001
From: Frank Harrison
Date: Fri, 27 Mar 2020 07:39:29 +0000
Subject: [PATCH 1/3] mapreduce| chore| Updating ChangeLog/Contributors/Release notes

---
 CONTRIBUTORS.txt     | 2 ++
 ChangeLog            | 4 ++++
 doc/whatsnew/2.6.rst | 2 ++
 3 files changed, 8 insertions(+)

diff --git a/CONTRIBUTORS.txt b/CONTRIBUTORS.txt
index e3013b912a..662830776c 100644
--- a/CONTRIBUTORS.txt
+++ b/CONTRIBUTORS.txt
@@ -433,3 +433,5 @@ contributors:
 * Gergely Kalmár: contributor
 
 * Batuhan Taskaya: contributor
+
+* Frank Harrison (doublethefish): contributor
diff --git a/ChangeLog b/ChangeLog
index 718db35da4..734d262f9b 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -107,6 +107,10 @@ Release date: TBA
 
   Close #1879
 
+* Fixes duplicate-code detection not working with -j2+
+
+  Close #3314
+
 What's New in Pylint 2.6.0?
 ===========================
diff --git a/doc/whatsnew/2.6.rst b/doc/whatsnew/2.6.rst
index 13929ecc71..cce1caa43f 100644
--- a/doc/whatsnew/2.6.rst
+++ b/doc/whatsnew/2.6.rst
@@ -58,3 +58,5 @@ Other Changes
 .. _the migration guide in isort documentation: https://timothycrosley.github.io/isort/docs/upgrade_guides/5.0.0/#known_standard_library
 
 * `len-as-conditions` is now triggered only for classes that are inheriting directly from list, dict, or set and not implementing the `__bool__` function, or from generators like range or list/dict/set comprehension. This should reduce the false positive for other classes, like pandas's DataFrame or numpy's Array.
+
+* Fixes duplicate code detection for --jobs=2+

From 4d07153fd413db2c0b4f3d37aab9cac19a255c35 Mon Sep 17 00:00:00 2001
From: Frank Harrison
Date: Thu, 26 Mar 2020 10:50:55 +0000
Subject: [PATCH 2/3] mapreduce| Adds map/reduce functionality to SimilarChecker

Before adding a new mixin, this proves the concept works, adding tests as
examples of how this would work in the main linter.

The idea here is that, because `check_parallel()` uses a multiprocess `map`
function, the natural follow-on is to use a `reduce` paradigm. This should
demonstrate that.
---
 pylint/checkers/similar.py         |  30 ++++++-
 tests/checkers/unittest_similar.py | 139 +++++++++++++++++++++++++++++
 tests/input/similar_lines_a.py     |  63 +++++++++++++
 tests/input/similar_lines_b.py     |  36 ++++++++
 4 files changed, 267 insertions(+), 1 deletion(-)
 create mode 100644 tests/input/similar_lines_a.py
 create mode 100644 tests/input/similar_lines_b.py

diff --git a/pylint/checkers/similar.py b/pylint/checkers/similar.py
index 82f79e8cc8..3ac071bb3c 100644
--- a/pylint/checkers/similar.py
+++ b/pylint/checkers/similar.py
@@ -160,6 +160,20 @@ def _iter_sims(self):
         for lineset2 in self.linesets[idx + 1 :]:
             yield from self._find_common(lineset, lineset2)
 
+    def get_map_data(self):
+        """Returns the data we can use for a map/reduce process
+
+        In this case we are returning this instance's Linesets, that is all file
+        information that will later be used for vectorisation.
+ """ + return self.linesets + + def combine_mapreduce_data(self, linesets_collection): + """Reduces and recombines data into a format that we can report on + + The partner function of get_map_data()""" + self.linesets = [line for lineset in linesets_collection for line in lineset] + def stripped_lines(lines, ignore_comments, ignore_docstrings, ignore_imports): """return lines with leading/trailing whitespace and any ignored code @@ -352,7 +366,7 @@ def __init__(self, linter=None): def set_option(self, optname, value, action=None, optdict=None): """method called to set an option (registered in the options list) - overridden to report options setting to Similar + Overridden to report options setting to Similar """ BaseChecker.set_option(self, optname, value, action, optdict) if optname == "min-similarity-lines": @@ -402,6 +416,20 @@ def close(self): stats["nb_duplicated_lines"] = duplicated stats["percent_duplicated_lines"] = total and duplicated * 100.0 / total + def get_map_data(self): + """ Passthru override """ + return Similar.get_map_data(self) + + @classmethod + def reduce_map_data(cls, linter, data): + """Reduces and recombines data into a format that we can report on + + The partner function of get_map_data()""" + recombined = SimilarChecker(linter) + recombined.open() + Similar.combine_mapreduce_data(recombined, linesets_collection=data) + recombined.close() + def register(linter): """required method to auto register this checker """ diff --git a/tests/checkers/unittest_similar.py b/tests/checkers/unittest_similar.py index ed4af2f5cd..ebc5c3ba1c 100644 --- a/tests/checkers/unittest_similar.py +++ b/tests/checkers/unittest_similar.py @@ -21,6 +21,8 @@ import pytest from pylint.checkers import similar +from pylint.lint import PyLinter +from pylint.testutils import GenericTestReporter as Reporter INPUT = Path(__file__).parent / ".." 
/ "input" SIMILAR1 = str(INPUT / "similar1") @@ -234,3 +236,140 @@ def test_no_args(): assert ex.code == 1 else: pytest.fail("not system exit") + + +def test_get_map_data(): + """Tests that a SimilarChecker respects the MapReduceMixin interface""" + linter = PyLinter(reporter=Reporter()) + + # Add a parallel checker to ensure it can map and reduce + linter.register_checker(similar.SimilarChecker(linter)) + + source_streams = ( + str(INPUT / "similar_lines_a.py"), + str(INPUT / "similar_lines_b.py"), + ) + expected_linelists = ( + ( + "", + "", + "", + "", + "", + "", + "def adipiscing(elit):", + 'etiam = "id"', + 'dictum = "purus,"', + 'vitae = "pretium"', + 'neque = "Vivamus"', + 'nec = "ornare"', + 'tortor = "sit"', + "return etiam, dictum, vitae, neque, nec, tortor", + "", + "", + "class Amet:", + "def similar_function_3_lines(self, tellus):", + "agittis = 10", + "tellus *= 300", + "return agittis, tellus", + "", + "def lorem(self, ipsum):", + 'dolor = "sit"', + 'amet = "consectetur"', + "return (lorem, dolor, amet)", + "", + "def similar_function_5_lines(self, similar):", + "some_var = 10", + "someother_var *= 300", + 'fusce = "sit"', + 'amet = "tortor"', + "return some_var, someother_var, fusce, amet", + "", + 'def __init__(self, moleskie, lectus="Mauris", ac="pellentesque"):', + 'metus = "ut"', + 'lobortis = "urna."', + 'Integer = "nisl"', + '(mauris,) = "interdum"', + 'non = "odio"', + 'semper = "aliquam"', + 'malesuada = "nunc."', + 'iaculis = "dolor"', + 'facilisis = "ultrices"', + 'vitae = "ut."', + "", + "return (", + "metus,", + "lobortis,", + "Integer,", + "mauris,", + "non,", + "semper,", + "malesuada,", + "iaculis,", + "facilisis,", + "vitae,", + ")", + "", + "def similar_function_3_lines(self, tellus):", + "agittis = 10", + "tellus *= 300", + "return agittis, tellus", + ), + ( + "", + "", + "", + "", + "", + "", + "", + "class Nulla:", + 'tortor = "ultrices quis porta in"', + 'sagittis = "ut tellus"', + "", + "def pulvinar(self, blandit, metus):", + "egestas = [mauris for mauris in zip(blandit, metus)]", + "neque = (egestas, blandit)", + "", + "def similar_function_5_lines(self, similar):", + "some_var = 10", + "someother_var *= 300", + 'fusce = "sit"', + 'amet = "tortor"', + 'iaculis = "dolor"', + "return some_var, someother_var, fusce, amet, iaculis, iaculis", + "", + "", + "def tortor(self):", + "ultrices = 2", + 'quis = ultricies * "porta"', + "return ultricies, quis", + "", + "", + "class Commodo:", + "def similar_function_3_lines(self, tellus):", + "agittis = 10", + "tellus *= 300", + 'laoreet = "commodo "', + "return agittis, tellus, laoreet", + ), + ) + + data = [] + + # Manually perform a 'map' type function + for source_fname in source_streams: + sim = similar.SimilarChecker(linter) + with open(source_fname) as stream: + sim.append_stream(source_fname, stream) + # The map bit, can you tell? ;) + data.extend(sim.get_map_data()) + + assert len(expected_linelists) == len(data) + for source_fname, expected_lines, lineset_obj in zip( + source_streams, expected_linelists, data + ): + assert source_fname == lineset_obj.name + # There doesn't seem to be a faster way of doing this, yet. 
+        lines = (line for idx, line in lineset_obj.enumerate_stripped())
+        assert tuple(expected_lines) == tuple(lines)
diff --git a/tests/input/similar_lines_a.py b/tests/input/similar_lines_a.py
new file mode 100644
index 0000000000..65a72a79d5
--- /dev/null
+++ b/tests/input/similar_lines_a.py
@@ -0,0 +1,63 @@
+""" A file designed to have lines of similarity when compared to similar_lines_b
+
+We use lorem-ipsum to generate 'random' code. """
+# Copyright (c) 2020 Frank Harrison
+
+
+def adipiscing(elit):
+    etiam = "id"
+    dictum = "purus,"
+    vitae = "pretium"
+    neque = "Vivamus"
+    nec = "ornare"
+    tortor = "sit"
+    return etiam, dictum, vitae, neque, nec, tortor
+
+
+class Amet:
+    def similar_function_3_lines(self, tellus):  # line same #1
+        agittis = 10  # line same #2
+        tellus *= 300  # line same #3
+        return agittis, tellus  # line diff
+
+    def lorem(self, ipsum):
+        dolor = "sit"
+        amet = "consectetur"
+        return (lorem, dolor, amet)
+
+    def similar_function_5_lines(self, similar):  # line same #1
+        some_var = 10  # line same #2
+        someother_var *= 300  # line same #3
+        fusce = "sit"  # line same #4
+        amet = "tortor"  # line same #5
+        return some_var, someother_var, fusce, amet  # line diff
+
+    def __init__(self, moleskie, lectus="Mauris", ac="pellentesque"):
+        metus = "ut"
+        lobortis = "urna."
+        Integer = "nisl"
+        (mauris,) = "interdum"
+        non = "odio"
+        semper = "aliquam"
+        malesuada = "nunc."
+        iaculis = "dolor"
+        facilisis = "ultrices"
+        vitae = "ut."
+
+        return (
+            metus,
+            lobortis,
+            Integer,
+            mauris,
+            non,
+            semper,
+            malesuada,
+            iaculis,
+            facilisis,
+            vitae,
+        )
+
+    def similar_function_3_lines(self, tellus):  # line same #1
+        agittis = 10  # line same #2
+        tellus *= 300  # line same #3
+        return agittis, tellus  # line diff
diff --git a/tests/input/similar_lines_b.py b/tests/input/similar_lines_b.py
new file mode 100644
index 0000000000..21634883d8
--- /dev/null
+++ b/tests/input/similar_lines_b.py
@@ -0,0 +1,36 @@
+""" The sister file of similar_lines_a, another file designed to have lines of
+similarity when compared to its sister file
+
+As with the sister file, we use lorem-ipsum to generate 'random' code. """
+# Copyright (c) 2020 Frank Harrison
+
+
+class Nulla:
+    tortor = "ultrices quis porta in"
+    sagittis = "ut tellus"
+
+    def pulvinar(self, blandit, metus):
+        egestas = [mauris for mauris in zip(blandit, metus)]
+        neque = (egestas, blandit)
+
+    def similar_function_5_lines(self, similar):  # line same #1
+        some_var = 10  # line same #2
+        someother_var *= 300  # line same #3
+        fusce = "sit"  # line same #4
+        amet = "tortor"  # line same #5
+        iaculis = "dolor"  # line diff
+        return some_var, someother_var, fusce, amet, iaculis, iaculis  # line diff
+
+
+def tortor(self):
+    ultrices = 2
+    quis = ultricies * "porta"
+    return ultricies, quis
+
+
+class Commodo:
+    def similar_function_3_lines(self, tellus):  # line same #1
+        agittis = 10  # line same #2
+        tellus *= 300  # line same #3
+        laoreet = "commodo "  # line diff
+        return agittis, tellus, laoreet  # line diff

From 6653dd72b8eda680d423465ab336c2f4ca2fe2a4 Mon Sep 17 00:00:00 2001
From: Frank Harrison
Date: Thu, 26 Mar 2020 11:41:22 +0000
Subject: [PATCH 3/3] mapreduce| Fixes -jN for map/reduce Checkers (e.g. SimilarChecker)

This integrates the map/reduce functionality into lint.parallel.check_parallel().

We previously had `map` being invoked; here we add `reduce` support, as sketched
below.
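In rough outline - a sketch of the paradigm only, where `map_phase` and
`reduce_phase` are illustrative helper names rather than functions added by
this patch - the flow is:

    from collections import defaultdict

    def map_phase(worker_linters):
        # Each worker's linter contributes per-checker data, keyed by checker
        # name so that it can be recombined across process boundaries.
        collated = defaultdict(list)
        for linter in worker_linters:
            for checker in linter.get_checkers():
                if hasattr(checker, "get_map_data"):  # opted-in checkers only
                    collated[checker.name].append(checker.get_map_data())
        return collated

    def reduce_phase(linter, collated):
        # Back in the parent process, hand each opted-in checker the combined
        # data gathered from every worker.
        for checker in linter.get_checkers():
            if checker.name in collated:
                checker.reduce_map_data(linter, collated[checker.name])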
We do this by collecting the map-data by worker and then passing it to a
reducer function on the Checker object, if available - determined by whether
they conform to the `mapreduce_checker.MapReduceMixin` mixin interface or not.

This allows Checker objects to function across file-streams when using
multiprocessing/-j2+. For example, SimilarChecker needs to be able to compare
data across all files.

The tests, which we also add here, check that a Checker instance returns and
reports the expected data and errors, such as error messages and stats - at
least in an exit-ok (0) situation.

On a personal note: as we are copying more data across process boundaries, I
suspect that the memory implications of this might cause issues for large
projects already running with -jN and duplicate code detection on. That said,
given that it takes a long time to lint large code bases, that is an issue for
the [near?] future and likely to be part of the performance work. Either way,
let's get it working first and deal with memory and performance considerations
later - I say this as there are many quick wins we can make here, e.g.
file-batching, hashing lines, data compression and so on.
---
 pylint/checkers/__init__.py          | 10 ++++-
 pylint/checkers/mapreduce_checker.py | 18 +++++++++
 pylint/checkers/similar.py           |  4 +-
 pylint/lint/parallel.py              | 55 +++++++++++++++++++++++-----
 tests/test_check_parallel.py         | 27 +++++++++++---
 tests/test_self.py                   | 12 ++++--
 6 files changed, 105 insertions(+), 21 deletions(-)
 create mode 100644 pylint/checkers/mapreduce_checker.py

diff --git a/pylint/checkers/__init__.py b/pylint/checkers/__init__.py
index bfde22b3d7..31d2df5226 100644
--- a/pylint/checkers/__init__.py
+++ b/pylint/checkers/__init__.py
@@ -10,6 +10,7 @@
 # Copyright (c) 2018-2019 Pierre Sassoulas
 # Copyright (c) 2018 ssolanki
 # Copyright (c) 2019 Bruno P. Kinoshita
+# Copyright (c) 2020 Frank Harrison
 
 # Licensed under the GPL: https://www.gnu.org/licenses/old-licenses/gpl-2.0.html
 # For details: https://github.com/PyCQA/pylint/blob/master/COPYING
@@ -43,6 +44,7 @@
 """
 
 from pylint.checkers.base_checker import BaseChecker, BaseTokenChecker
+from pylint.checkers.mapreduce_checker import MapReduceMixin
 from pylint.utils import register_plugins
 
 
@@ -64,4 +66,10 @@ def initialize(linter):
     register_plugins(linter, __path__[0])
 
 
-__all__ = ["BaseChecker", "BaseTokenChecker", "initialize", "register_plugins"]
+__all__ = [
+    "BaseChecker",
+    "BaseTokenChecker",
+    "initialize",
+    "MapReduceMixin",
+    "register_plugins",
+]
diff --git a/pylint/checkers/mapreduce_checker.py b/pylint/checkers/mapreduce_checker.py
new file mode 100644
index 0000000000..17a1090bf2
--- /dev/null
+++ b/pylint/checkers/mapreduce_checker.py
@@ -0,0 +1,18 @@
+# Copyright (c) 2020 Frank Harrison
+
+# Licensed under the GPL: https://www.gnu.org/licenses/old-licenses/gpl-2.0.html
+# For details: https://github.com/PyCQA/pylint/blob/master/COPYING
+import abc
+
+
+class MapReduceMixin(metaclass=abc.ABCMeta):
+    """A mixin designed to allow multiprocess/threaded runs of a Checker"""
+
+    @abc.abstractmethod
+    def get_map_data(self):
+        """Returns mergeable/reducible data that will be examined"""
+
+    @classmethod
+    @abc.abstractmethod
+    def reduce_map_data(cls, linter, data):
+        """For a given Checker, receives data for all mapped runs"""
diff --git a/pylint/checkers/similar.py b/pylint/checkers/similar.py
index 3ac071bb3c..1f817ada25 100644
--- a/pylint/checkers/similar.py
+++ b/pylint/checkers/similar.py
@@ -31,7 +31,7 @@
 
 import astroid
 
-from pylint.checkers import BaseChecker, table_lines_from_stats
+from pylint.checkers import BaseChecker, MapReduceMixin, table_lines_from_stats
 from pylint.interfaces import IRawChecker
 from pylint.reporters.ureports.nodes import Table
 from pylint.utils import decoding_stream
@@ -302,7 +302,7 @@ def report_similarities(sect, stats, old_stats):
 
 
 # wrapper to get a pylint checker from the similar class
-class SimilarChecker(BaseChecker, Similar):
+class SimilarChecker(BaseChecker, Similar, MapReduceMixin):
     """checks for similarities and duplicated code. This computation may be
     memory / CPU intensive, so you should disable it if you experiment some
     problems.
diff --git a/pylint/lint/parallel.py b/pylint/lint/parallel.py
index fa1e65d8f2..fe9ce06051 100644
--- a/pylint/lint/parallel.py
+++ b/pylint/lint/parallel.py
@@ -67,28 +67,59 @@ def _worker_check_single_file(file_item):
 
     _worker_linter.open()
     _worker_linter.check_single_file(name, filepath, modname)
-
+    mapreduce_data = collections.defaultdict(list)
+    for checker in _worker_linter.get_checkers():
+        try:
+            data = checker.get_map_data()
+        except AttributeError:
+            continue
+        mapreduce_data[checker.name].append(data)
     msgs = [_get_new_args(m) for m in _worker_linter.reporter.messages]
     _worker_linter.reporter.reset()
     return (
+        id(multiprocessing.current_process()),
         _worker_linter.current_name,
         filepath,
         _worker_linter.file_state.base_name,
         msgs,
        _worker_linter.stats,
        _worker_linter.msg_status,
+        mapreduce_data,
     )
 
 
+def _merge_mapreduce_data(linter, all_mapreduce_data):
+    """Merges map/reduce data across workers, invoking relevant APIs on checkers"""
+    # First collate the data, preparing it so we can send it to the checkers for
+    # validation. The intent here is to collect all the mapreduce data for all checker-
+    # runs across processes - that will then be passed to a static method on the
+    # checkers to be reduced and further processed.
+    collated_map_reduce_data = collections.defaultdict(list)
+    for linter_data in all_mapreduce_data.values():
+        for run_data in linter_data:
+            for checker_name, data in run_data.items():
+                collated_map_reduce_data[checker_name].extend(data)
+
+    # Send the data to checkers that support/require consolidated data
+    original_checkers = linter.get_checkers()
+    for checker in original_checkers:
+        if checker.name in collated_map_reduce_data:
+            # Assume that if the checker has returned map/reduce data, it also has
+            # the reducer function
+            checker.reduce_map_data(linter, collated_map_reduce_data[checker.name])
+
+
 def check_parallel(linter, jobs, files, arguments=None):
-    """Use the given linter to lint the files with given amount of workers (jobs)"""
-    # The reporter does not need to be passed to worker processess, i.e. the reporter does
-    # not need to be pickleable
+    """Use the given linter to lint the files with the given number of workers (jobs)
+
+    This splits the work filestream-by-filestream. If you need to do work across
+    multiple files, as in the similarity checker, then inherit from MapReduceMixin
+    and implement the map/reduce functionality."""
+    # The reporter does not need to be passed to worker processes, i.e. the reporter does
+    # not need to be pickleable
     original_reporter = linter.reporter
     linter.reporter = None
 
     # The linter is inherited by all the pool's workers, i.e. the linter
-    # is identical to the linter object here. This is requred so that
+    # is identical to the linter object here. This is required so that
     # a custom PyLinter object can be used.
     initializer = functools.partial(_worker_initialize, arguments=arguments)
     pool = multiprocessing.Pool(jobs, initializer=initializer, initargs=[linter])
@@ -97,30 +128,36 @@ def check_parallel(linter, jobs, files, arguments=None):
     # correct reporter
     linter.set_reporter(original_reporter)
     linter.open()
-
-    all_stats = []
-
     try:
+        all_stats = []
+        all_mapreduce_data = collections.defaultdict(list)
+
+        # Maps each file to be worked on by a single _worker_check_single_file() call,
+        # collecting any map/reduce data by checker module so that we can 'reduce' it
+        # later.
         for (
+            worker_idx,  # used to merge map/reduce data across workers
             module,
             file_path,
             base_name,
             messages,
             stats,
             msg_status,
+            mapreduce_data,
        ) in pool.imap_unordered(_worker_check_single_file, files):
             linter.file_state.base_name = base_name
             linter.set_current_module(module, file_path)
             for msg in messages:
                 msg = Message(*msg)
                 linter.reporter.handle_message(msg)
-
             all_stats.append(stats)
+            all_mapreduce_data[worker_idx].append(mapreduce_data)
             linter.msg_status |= msg_status
     finally:
         pool.close()
         pool.join()
 
+    _merge_mapreduce_data(linter, all_mapreduce_data)
     linter.stats = _merge_stats(all_stats)
 
     # Insert stats data to local checkers.
diff --git a/tests/test_check_parallel.py b/tests/test_check_parallel.py index e8f67f4b65..c45b0b3b9d 100644 --- a/tests/test_check_parallel.py +++ b/tests/test_check_parallel.py @@ -103,9 +103,17 @@ def test_worker_check_single_file_uninitialised(self): def test_worker_check_single_file_no_checkers(self): linter = PyLinter(reporter=Reporter()) worker_initialize(linter=linter) - (name, _, _, msgs, stats, msg_status) = worker_check_single_file( - _gen_file_data() - ) + + ( + _, # proc-id + name, + _, # file_path + _, # base_name + msgs, + stats, + msg_status, + _, # mapreduce_data + ) = worker_check_single_file(_gen_file_data()) assert name == "--test-file_data-name-0--" assert [] == msgs no_errors_status = 0 @@ -140,9 +148,16 @@ def test_worker_check_sequential_checker(self): # Add the only checker we care about in this test linter.register_checker(SequentialTestChecker(linter)) - (name, _, _, msgs, stats, msg_status) = worker_check_single_file( - _gen_file_data() - ) + ( + _, # proc-id + name, + _, # file_path + _, # base_name + msgs, + stats, + msg_status, + _, # mapreduce_data + ) = worker_check_single_file(_gen_file_data()) # Ensure we return the same data as the single_file_no_checkers test assert name == "--test-file_data-name-0--" diff --git a/tests/test_self.py b/tests/test_self.py index cd138e7892..e8f9f848ff 100644 --- a/tests/test_self.py +++ b/tests/test_self.py @@ -46,7 +46,7 @@ import pytest -from pylint.constants import MAIN_CHECKER_NAME +from pylint.constants import MAIN_CHECKER_NAME, MSG_TYPES_STATUS from pylint.lint import Run from pylint.reporters import JSONReporter from pylint.reporters.text import BaseReporter, ColorizedTextReporter, TextReporter @@ -243,13 +243,19 @@ def test_no_out_encoding(self): ) def test_parallel_execution(self): + out = StringIO() self._runtest( [ "-j 2", join(HERE, "functional", "a", "arguments.py"), - join(HERE, "functional", "a", "arguments.py"), ], - code=2, + out=out, + # We expect similarities to fail and an error + code=MSG_TYPES_STATUS["E"], + ) + assert ( + "Unexpected keyword argument 'fourth' in function call" + in out.getvalue().strip() ) def test_parallel_execution_missing_arguments(self):
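For reviewers, a minimal sketch of what opting into the new mixin looks like
for a custom checker. `FileCountChecker` and its stats key are hypothetical
examples, not part of these patches, and the sketch assumes a raw checker so
that process_module() runs once per file:

    from pylint.checkers import BaseChecker, MapReduceMixin
    from pylint.interfaces import IRawChecker

    class FileCountChecker(BaseChecker, MapReduceMixin):
        # Hypothetical checker: counts files linted across all -jN workers
        __implements__ = (IRawChecker,)
        name = "file-count-example"

        def open(self):
            # Called before each run; reset this run's partial state
            self._files = 0

        def process_module(self, node):
            # Runs in whichever worker process handles the file
            self._files += 1

        def get_map_data(self):
            # The 'map' half: return this run's partial data; results are
            # collated per checker name across all worker processes
            return self._files

        @classmethod
        def reduce_map_data(cls, linter, data):
            # The 'reduce' half: `data` is the list of every get_map_data()
            # result for this checker, combined in the parent process
            linter.stats["total_files_seen"] = sum(data)

With -j2+, each worker reports its partial count via get_map_data() and the
parent process combines them in reduce_map_data() - mirroring how
SimilarChecker recombines its linesets above.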