refactor: use sets to collect data

nedbat · nedbat · commit db235732dd9a · 2021-08-15T08:26:36.000-04:00
Coverage.py predates sets as a built-in data structure, so the file data
collection has long been dicts with None as the values.  Sets are
available to us now (since Python 2.4 in 2004, which coverage.py dropped
support for in 2014!), so we use sets.
diff --git a/coverage/ctracer/datastack.h b/coverage/ctracer/datastack.h
@@ -12,7 +12,7 @@
  * possible.
  */
 typedef struct DataStackEntry {
-    /* The current file_data dictionary. Owned. */
+    /* The current file_data set. Owned. */
     PyObject * file_data;
 
     /* The disposition object for this frame. A borrowed instance of CFileDisposition. */
diff --git a/coverage/ctracer/tracer.c b/coverage/ctracer/tracer.c
@@ -182,7 +182,7 @@ CTracer_record_pair(CTracer *self, int l1, int l2)
         goto error;
     }
 
-    if (PyDict_SetItem(self->pcur_entry->file_data, t, Py_None) < 0) {
+    if (PySet_Add(self->pcur_entry->file_data, t) < 0) {
         goto error;
     }
 
@@ -504,7 +504,7 @@ CTracer_handle_call(CTracer *self, PyFrameObject *frame)
             if (PyErr_Occurred()) {
                 goto error;
             }
-            file_data = PyDict_New();
+            file_data = PySet_New(NULL);
             if (file_data == NULL) {
                 goto error;
             }
@@ -674,7 +674,7 @@ CTracer_handle_line(CTracer *self, PyFrameObject *frame)
                             goto error;
                         }
 
-                        ret2 = PyDict_SetItem(self->pcur_entry->file_data, this_line, Py_None);
+                        ret2 = PySet_Add(self->pcur_entry->file_data, this_line);
                         Py_DECREF(this_line);
                         if (ret2 < 0) {
                             goto error;
diff --git a/coverage/ctracer/tracer.h b/coverage/ctracer/tracer.h
@@ -39,15 +39,14 @@ typedef struct CTracer {
     PyObject * context;
 
     /*
-        The data stack is a stack of dictionaries.  Each dictionary collects
+        The data stack is a stack of sets.  Each set collects
         data for a single source file.  The data stack parallels the call stack:
         each call pushes the new frame's file data onto the data stack, and each
         return pops file data off.
 
-        The file data is a dictionary whose form depends on the tracing options.
-        If tracing arcs, the keys are line number pairs.  If not tracing arcs,
-        the keys are line numbers.  In both cases, the value is irrelevant
-        (None).
+        The file data is a set whose form depends on the tracing options.
+        If tracing arcs, the values are line number pairs.  If not tracing arcs,
+        the values are line numbers.
     */
 
     DataStack data_stack;           /* Used if we aren't doing concurrency. */
diff --git a/coverage/pytracer.py b/coverage/pytracer.py
@@ -48,7 +48,7 @@ def __init__(self):
         # The threading module to use, if any.
         self.threading = None
 
-        self.cur_file_dict = None
+        self.cur_file_data = None
         self.last_line = 0          # int, but uninitialized.
         self.cur_file_name = None
         self.context = None
@@ -113,18 +113,18 @@ def _trace(self, frame, event, arg_unused):
                     self.log(">", f.f_code.co_filename, f.f_lineno, f.f_code.co_name, f.f_trace)
                     f = f.f_back
             sys.settrace(None)
-            self.cur_file_dict, self.cur_file_name, self.last_line, self.started_context = (
+            self.cur_file_data, self.cur_file_name, self.last_line, self.started_context = (
                 self.data_stack.pop()
             )
             return None
 
         if self.last_exc_back:
             if frame == self.last_exc_back:
                 # Someone forgot a return event.
-                if self.trace_arcs and self.cur_file_dict:
+                if self.trace_arcs and self.cur_file_data:
                     pair = (self.last_line, -self.last_exc_firstlineno)
-                    self.cur_file_dict[pair] = None
-                self.cur_file_dict, self.cur_file_name, self.last_line, self.started_context = (
+                    self.cur_file_data.add(pair)
+                self.cur_file_data, self.cur_file_name, self.last_line, self.started_context = (
                     self.data_stack.pop()
                 )
             self.last_exc_back = None
@@ -150,7 +150,7 @@ def _trace(self, frame, event, arg_unused):
             self._activity = True
             self.data_stack.append(
                 (
-                    self.cur_file_dict,
+                    self.cur_file_data,
                     self.cur_file_name,
                     self.last_line,
                     self.started_context,
@@ -163,12 +163,12 @@ def _trace(self, frame, event, arg_unused):
                 disp = self.should_trace(filename, frame)
                 self.should_trace_cache[filename] = disp
 
-            self.cur_file_dict = None
+            self.cur_file_data = None
             if disp.trace:
                 tracename = disp.source_filename
                 if tracename not in self.data:
-                    self.data[tracename] = {}
-                self.cur_file_dict = self.data[tracename]
+                    self.data[tracename] = set()
+                self.cur_file_data = self.data[tracename]
             # The call event is really a "start frame" event, and happens for
             # function calls and re-entering generators.  The f_lasti field is
             # -1 for calls, and a real offset for generators.  Use <0 as the
@@ -179,25 +179,25 @@ def _trace(self, frame, event, arg_unused):
                 self.last_line = frame.f_lineno
         elif event == 'line':
             # Record an executed line.
-            if self.cur_file_dict is not None:
+            if self.cur_file_data is not None:
                 lineno = frame.f_lineno
 
                 if self.trace_arcs:
-                    self.cur_file_dict[(self.last_line, lineno)] = None
+                    self.cur_file_data.add((self.last_line, lineno))
                 else:
-                    self.cur_file_dict[lineno] = None
+                    self.cur_file_data.add(lineno)
                 self.last_line = lineno
         elif event == 'return':
-            if self.trace_arcs and self.cur_file_dict:
+            if self.trace_arcs and self.cur_file_data:
                 # Record an arc leaving the function, but beware that a
                 # "return" event might just mean yielding from a generator.
                 # Jython seems to have an empty co_code, so just assume return.
                 code = frame.f_code.co_code
                 if (not code) or code[frame.f_lasti] != YIELD_VALUE:
                     first = frame.f_code.co_firstlineno
-                    self.cur_file_dict[(self.last_line, -first)] = None
+                    self.cur_file_data.add((self.last_line, -first))
             # Leaving this function, pop the filename stack.
-            self.cur_file_dict, self.cur_file_name, self.last_line, self.started_context = (
+            self.cur_file_data, self.cur_file_name, self.last_line, self.started_context = (
                 self.data_stack.pop()
             )
             # Leaving a context?
diff --git a/coverage/sqldata.py b/coverage/sqldata.py
@@ -450,9 +450,9 @@ def data_filename(self):
     def add_lines(self, line_data):
         """Add measured line data.
 
-        `line_data` is a dictionary mapping file names to dictionaries::
+        `line_data` is a dictionary mapping file names to iterables of ints::
 
-            { filename: { lineno: None, ... }, ...}
+            { filename: { line1, line2, ... }, ...}
 
         """
         if self._debug.should('dataop'):
@@ -483,9 +483,10 @@ def add_lines(self, line_data):
     def add_arcs(self, arc_data):
         """Add measured arc data.
 
-        `arc_data` is a dictionary mapping file names to dictionaries::
+        `arc_data` is a dictionary mapping file names to iterables of pairs of
+        ints::
 
-            { filename: { (l1,l2): None, ... }, ...}
+            { filename: { (l1,l2), (l1,l2), ... }, ...}
 
         """
         if self._debug.should('dataop'):
diff --git a/tests/test_data.py b/tests/test_data.py
@@ -24,33 +24,24 @@
 
 
 LINES_1 = {
-    'a.py': {1: None, 2: None},
-    'b.py': {3: None},
+    'a.py': {1, 2},
+    'b.py': {3},
 }
 SUMMARY_1 = {'a.py': 2, 'b.py': 1}
 MEASURED_FILES_1 = ['a.py', 'b.py']
 A_PY_LINES_1 = [1, 2]
 B_PY_LINES_1 = [3]
 
 LINES_2 = {
-    'a.py': {1: None, 5: None},
-    'c.py': {17: None},
+    'a.py': {1, 5},
+    'c.py': {17},
 }
 SUMMARY_1_2 = {'a.py': 3, 'b.py': 1, 'c.py': 1}
 MEASURED_FILES_1_2 = ['a.py', 'b.py', 'c.py']
 
 ARCS_3 = {
-    'x.py': {
-        (-1, 1): None,
-        (1, 2): None,
-        (2, 3): None,
-        (3, -1): None,
-    },
-    'y.py': {
-        (-1, 17): None,
-        (17, 23): None,
-        (23, -1): None,
-    },
+    'x.py': {(-1, 1), (1, 2), (2, 3), (3, -1)},
+    'y.py': {(-1, 17), (17, 23), (23, -1)},
 }
 X_PY_ARCS_3 = [(-1, 1), (1, 2), (2, 3), (3, -1)]
 Y_PY_ARCS_3 = [(-1, 17), (17, 23), (23, -1)]
@@ -60,15 +51,8 @@
 Y_PY_LINES_3 = [17, 23]
 
 ARCS_4 = {
-    'x.py': {
-        (-1, 2): None,
-        (2, 5): None,
-        (5, -1): None,
-    },
-    'z.py': {
-        (-1, 1000): None,
-        (1000, -1): None,
-    },
+    'x.py': {(-1, 2), (2, 5), (5, -1)},
+    'z.py': {(-1, 1000), (1000, -1)},
 }
 SUMMARY_3_4 = {'x.py': 4, 'y.py': 2, 'z.py': 1}
 MEASURED_FILES_3_4 = ['x.py', 'y.py', 'z.py']
@@ -103,6 +87,16 @@ def assert_arcs3_data(self, covdata):
         assert covdata.has_arcs()
 
 
+def dicts_from_sets(file_data):
+    """Convert a dict of sets into a dict of dicts.
+
+    Before 6.0, file data was a dict with None as the values.  In 6.0, file
+    data is a set.  SqlData all along only cared that it was an iterable.
+    This function helps us test that the old dict format still works.
+    """
+    return {k: dict.fromkeys(v) for k, v in file_data.items()}
+
+
 class CoverageDataTest(DataTestHelpers, CoverageTest):
     """Test cases for CoverageData."""
 
@@ -130,14 +124,16 @@ def test_empty_arc_data_is_false(self):
         covdata.add_arcs({})
         assert not covdata
 
-    def test_adding_lines(self):
+    @pytest.mark.parametrize("lines", [LINES_1, dicts_from_sets(LINES_1)])
+    def test_adding_lines(self, lines):
         covdata = CoverageData()
-        covdata.add_lines(LINES_1)
+        covdata.add_lines(lines)
         self.assert_lines1_data(covdata)
 
-    def test_adding_arcs(self):
+    @pytest.mark.parametrize("arcs", [ARCS_3, dicts_from_sets(ARCS_3)])
+    def test_adding_arcs(self, arcs):
         covdata = CoverageData()
-        covdata.add_arcs(ARCS_3)
+        covdata.add_arcs(arcs)
         self.assert_arcs3_data(covdata)
 
     def test_ok_to_add_lines_twice(self):
@@ -212,20 +208,22 @@ def test_contexts_by_lineno_with_lines(self):
         covdata.add_lines(LINES_1)
         assert covdata.contexts_by_lineno('a.py') == {1: ['test_a'], 2: ['test_a']}
 
-    def test_no_duplicate_lines(self):
+    @pytest.mark.parametrize("lines", [LINES_1, dicts_from_sets(LINES_1)])
+    def test_no_duplicate_lines(self, lines):
         covdata = CoverageData()
         covdata.set_context("context1")
-        covdata.add_lines(LINES_1)
+        covdata.add_lines(lines)
         covdata.set_context("context2")
-        covdata.add_lines(LINES_1)
+        covdata.add_lines(lines)
         assert covdata.lines('a.py') == A_PY_LINES_1
 
-    def test_no_duplicate_arcs(self):
+    @pytest.mark.parametrize("arcs", [ARCS_3, dicts_from_sets(ARCS_3)])
+    def test_no_duplicate_arcs(self, arcs):
         covdata = CoverageData()
         covdata.set_context("context1")
-        covdata.add_arcs(ARCS_3)
+        covdata.add_arcs(arcs)
         covdata.set_context("context2")
-        covdata.add_arcs(ARCS_3)
+        covdata.add_arcs(arcs)
         assert covdata.arcs('x.py') == X_PY_ARCS_3
 
     def test_no_arcs_vs_unmeasured_file(self):

Original file line number	Diff line number	Diff line change
`@@ -182,7 +182,7 @@ CTracer_record_pair(CTracer *self, int l1, int l2)`
`182`	`182`	`goto error;`
`183`	`183`	`}`
`184`	`184`
`185`		`- if (PyDict_SetItem(self->pcur_entry->file_data, t, Py_None) < 0) {`
	`185`	`+ if (PySet_Add(self->pcur_entry->file_data, t) < 0) {`
`186`	`186`	`goto error;`
`187`	`187`	`}`
`188`	`188`
`@@ -504,7 +504,7 @@ CTracer_handle_call(CTracer *self, PyFrameObject *frame)`
`504`	`504`	`if (PyErr_Occurred()) {`
`505`	`505`	`goto error;`
`506`	`506`	`}`
`507`		`- file_data = PyDict_New();`
	`507`	`+ file_data = PySet_New(NULL);`
`508`	`508`	`if (file_data == NULL) {`
`509`	`509`	`goto error;`
`510`	`510`	`}`
`@@ -674,7 +674,7 @@ CTracer_handle_line(CTracer *self, PyFrameObject *frame)`
`674`	`674`	`goto error;`
`675`	`675`	`}`
`676`	`676`
`677`		`- ret2 = PyDict_SetItem(self->pcur_entry->file_data, this_line, Py_None);`
	`677`	`+ ret2 = PySet_Add(self->pcur_entry->file_data, this_line);`
`678`	`678`	`Py_DECREF(this_line);`
`679`	`679`	`if (ret2 < 0) {`
`680`	`680`	`goto error;`