Skip to content

Commit bcfd6e7

Browse files
committed
mapreduce| Fixes -jN for map/reduce Checkers (e.g. SimilarChecker)
This integrate the map/reduce functionality into lint.check_process(). We previously had `map` being invoked, here we add `reduce` support. We do this by collecting the map-data by worker and then passing it to a reducer function on the Checker object, if available - determined by whether they confirm to the `mapreduce_checker.MapReduceMixin` mixin interface or nor. This allows Checker objects to function across file-streams when using multiprocessing/-j2+. For example SimilarChecker needs to be able to compare data across all files. The tests, that we also add here, check that a Checker instance returns and reports expected data and errors, such as error-messages and stats - at least in a exit-ok (0) situation. On a personal note, as we are copying more data across process boundaries, I suspect that the memory implications of this might cause issues for large projects already running with -jN and duplicate code detection on. That said, given that it takes a long time to perform lints of large code bases that is an issue for the [near?] future and likely to be part of the performance work. Either way but let's get it working first and deal with memory and perforamnce considerations later - I say this as there are many quick wins we can make here, e.g. file-batching, hashing lines, data compression and so on.
1 parent a7d4e47 commit bcfd6e7

File tree

6 files changed

+318
-33
lines changed

6 files changed

+318
-33
lines changed

pylint/checkers/__init__.py

+2
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@
77
# Copyright (c) 2015 Ionel Cristian Maries <[email protected]>
88
# Copyright (c) 2016 Moises Lopez <[email protected]>
99
# Copyright (c) 2017-2018 Bryce Guinta <[email protected]>
10+
# Copyright (c) 2020 Frank Harrison <[email protected]>
1011

1112
# Licensed under the GPL: https://www.gnu.org/licenses/old-licenses/gpl-2.0.html
1213
# For details: https://github.com/PyCQA/pylint/blob/master/COPYING
@@ -40,6 +41,7 @@
4041
"""
4142

4243
from pylint.checkers.base_checker import BaseChecker, BaseTokenChecker
44+
from pylint.checkers.mapreduce_checker import MapReduceMixin
4345
from pylint.utils import register_plugins
4446

4547

pylint/checkers/mapreduce_checker.py

+18
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,18 @@
1+
# Copyright (c) 2020 Frank Harrison <[email protected]>
2+
3+
# Licensed under the GPL: https://www.gnu.org/licenses/old-licenses/gpl-2.0.html
4+
# For details: https://github.com/PyCQA/pylint/blob/master/COPYING
5+
import abc
6+
7+
8+
class MapReduceMixin(metaclass=abc.ABCMeta):
    """A mixin designed to allow multiprocess/threaded runs of a Checker.

    Checkers inheriting this mixin can run their per-file analysis in worker
    processes (``map``) and then consolidate the collected data in the parent
    process (``reduce``), e.g. for checks that must compare data across files.
    """

    @abc.abstractmethod
    def get_map_data(self):
        """Return mergeable/reducible data gathered by this checker's run.

        Called in each worker after its files have been checked; the returned
        value must be picklable so it can cross the process boundary.
        """

    @classmethod
    @abc.abstractmethod
    def reduce_map_data(cls, linter, data):
        """Receive and consolidate the map data from all mapped runs.

        :param linter: the parent-process linter, used for reporting.
        :param data: list of the values returned by ``get_map_data`` across
            all workers/files for this checker.
        """

pylint/checkers/similar.py

+2-2
Original file line numberDiff line numberDiff line change
@@ -25,7 +25,7 @@
2525

2626
import astroid
2727

28-
from pylint.checkers import BaseChecker, table_lines_from_stats
28+
from pylint.checkers import BaseChecker, MapReduceMixin, table_lines_from_stats
2929
from pylint.interfaces import IRawChecker
3030
from pylint.reporters.ureports.nodes import Table
3131
from pylint.utils import decoding_stream
@@ -292,7 +292,7 @@ def report_similarities(sect, stats, old_stats):
292292

293293

294294
# wrapper to get a pylint checker from the similar class
295-
class SimilarChecker(BaseChecker, Similar):
295+
class SimilarChecker(BaseChecker, Similar, MapReduceMixin):
296296
"""checks for similarities and duplicated code. This computation may be
297297
memory / CPU intensive, so you should disable it if you experiment some
298298
problems.

pylint/lint.py

+54-8
Original file line numberDiff line numberDiff line change
@@ -39,6 +39,7 @@
3939
# Copyright (c) 2018 Yuval Langer <[email protected]>
4040
# Copyright (c) 2018 Nick Drozd <[email protected]>
4141
# Copyright (c) 2018 kapsh <[email protected]>
42+
# Copyright (c) 2020 Frank Harrison <[email protected]>
4243

4344
# Licensed under the GPL: https://www.gnu.org/licenses/old-licenses/gpl-2.0.html
4445
# For details: https://github.com/PyCQA/pylint/blob/master/COPYING
@@ -1280,14 +1281,18 @@ def _report_evaluation(self):
12801281

12811282

12821283
def check_parallel(linter, jobs, files, arguments=None):
1283-
"""Use the given linter to lint the files with given amount of workers (jobs)"""
1284-
# The reporter does not need to be passed to worker processess, i.e. the reporter does
1284+
"""Use the given linter to lint the files with given amount of workers (jobs)
1285+
1286+
This splits the work filestream-by-filestream. If you need to do work across
1287+
multiple files, as in the similarity-checker, then inherit from MapReduceMixin and
1288+
implement the map/reduce mixin functionality """
1289+
# The reporter does not need to be passed to worker processes, i.e. the reporter does
12851290
# not need to be pickleable
12861291
original_reporter = linter.reporter
12871292
linter.reporter = None
12881293

12891294
# The linter is inherited by all the pool's workers, i.e. the linter
1290-
# is identical to the linter object here. This is requred so that
1295+
# is identical to the linter object here. This is required so that
12911296
# a custom PyLinter object can be used.
12921297
initializer = functools.partial(_worker_initialize, arguments=arguments)
12931298
with multiprocessing.Pool(jobs, initializer=initializer, initargs=[linter]) as pool:
@@ -1298,19 +1303,29 @@ def check_parallel(linter, jobs, files, arguments=None):
12981303
linter.open()
12991304

13001305
all_stats = []
1301-
1302-
# Maps each file to be worked on by a single _worker_check_single_file() call
1303-
for module, messages, stats, msg_status in pool.imap_unordered(
1304-
_worker_check_single_file, files
1305-
):
1306+
all_mapreduce_data = collections.defaultdict(list)
1307+
1308+
# Maps each file to be worked on by a single _worker_check_single_file() call,
1309+
# collecting any map/reduce data by checker module so that we can 'reduce' it
1310+
# later.
1311+
for (
1312+
worker_idx, # used to merge map/reduce data across workers
1313+
module,
1314+
messages,
1315+
stats,
1316+
msg_status,
1317+
mapreduce_data,
1318+
) in pool.imap_unordered(_worker_check_single_file, files):
13061319
linter.set_current_module(module)
13071320
for msg in messages:
13081321
msg = Message(*msg)
13091322
linter.reporter.handle_message(msg)
13101323

13111324
all_stats.append(stats)
1325+
all_mapreduce_data[worker_idx].append(mapreduce_data)
13121326
linter.msg_status |= msg_status
13131327

1328+
_merge_mapreduce_data(linter, all_mapreduce_data)
13141329
linter.stats = _merge_stats(all_stats)
13151330

13161331
# Insert stats data to local checkers.
@@ -1343,15 +1358,46 @@ def _worker_check_single_file(file_item):
13431358
_worker_linter.open()
13441359
_worker_linter.check_single_file(name, filepath, modname)
13451360

1361+
mapreduce_data = collections.defaultdict(list)
1362+
for checker in _worker_linter.get_checkers():
1363+
try:
1364+
data = checker.get_map_data()
1365+
except AttributeError:
1366+
continue
1367+
mapreduce_data[checker.name].append(data)
1368+
13461369
msgs = [_get_new_args(m) for m in _worker_linter.reporter.messages]
13471370
return (
1371+
id(multiprocessing.current_process()),
13481372
_worker_linter.current_name,
13491373
msgs,
13501374
_worker_linter.stats,
13511375
_worker_linter.msg_status,
1376+
mapreduce_data,
13521377
)
13531378

13541379

1380+
def _merge_mapreduce_data(linter, all_mapreduce_data):
1381+
""" Merges map/reduce data across workers, invoking relevant APIs on checkers """
1382+
# First collate the data, preparing it so we can send it to the checkers for
1383+
# validation. The intent here is to collect all the mapreduce data for all checker-
1384+
# runs across processes - that will then be passed to a static method on the
1385+
# checkers to be reduced and further processed.
1386+
collated_map_reduce_data = collections.defaultdict(list)
1387+
for linter_data in all_mapreduce_data.values():
1388+
for run_data in linter_data:
1389+
for checker_name, data in run_data.items():
1390+
collated_map_reduce_data[checker_name].extend(data)
1391+
1392+
# Send the data to checkers that support/require consolidated data
1393+
original_checkers = linter.get_checkers()
1394+
for checker in original_checkers:
1395+
if checker.name in collated_map_reduce_data:
1396+
# Assume that if the check has returned map/reduce data that it has the
1397+
# reducer function
1398+
checker.reduce_map_data(linter, collated_map_reduce_data[checker.name])
1399+
1400+
13551401
# some reporting functions ####################################################
13561402

13571403

0 commit comments

Comments
 (0)