datafold · sirupsen · Jun 22, 2022 · Jun 21, 2022 · Jun 21, 2022 · Jun 22, 2022
diff --git a/data_diff/__main__.py b/data_diff/__main__.py
@@ -1,5 +1,6 @@
 import sys
 import time
+import json
 import logging
 from itertools import islice
 
@@ -50,6 +51,7 @@
 @click.option("--max-age", default=None, help="Considers only rows younger than specified. See --min-age.")
 @click.option("-s", "--stats", is_flag=True, help="Print stats instead of a detailed diff")
 @click.option("-d", "--debug", is_flag=True, help="Print debug info")
+@click.option("--json", 'json_output', is_flag=True, help="Print JSONL output for machine readability")
 @click.option("-v", "--verbose", is_flag=True, help="Print extra info")
 @click.option("-i", "--interactive", is_flag=True, help="Confirm queries, implies --debug")
 @click.option("--keep-column-case", is_flag=True, help="Don't use the schema to fix the case of given column names.")
@@ -80,6 +82,7 @@ def main(
     interactive,
     threads,
     keep_column_case,
+    json_output,
 ):
     if limit and stats:
         print("Error: cannot specify a limit when using the -s/--stats switch")
@@ -144,17 +147,35 @@ def main(
     if stats:
         diff = list(diff_iter)
         unique_diff_count = len({i[0] for _, i in diff})
-        table1_count = differ.stats.get("table1_count")
-        percent = 100 * unique_diff_count / (table1_count or 1)
-        print(f"Diff-Total: {len(diff)} changed rows out of {table1_count}")
-        print(f"Diff-Percent: {percent:.4f}%")
+        max_table_count = max(differ.stats["table1_count"], differ.stats["table2_count"])
+        percent = 100 * unique_diff_count / (max_table_count or 1)
         plus = len([1 for op, _ in diff if op == "+"])
         minus = len([1 for op, _ in diff if op == "-"])
-        print(f"Diff-Split: +{plus}  -{minus}")
+
+        if json_output:
+            json_output = {
+                "different_rows": len(diff),
+                "different_percent": percent,
+                "different_+": plus,
+                "different_-": minus,
+                "total": max_table_count,
+            }
+            print(json.dumps(json_output))
+        else:
+            print(f"Diff-Total: {len(diff)} changed rows out of {max_table_count}")
+            print(f"Diff-Percent: {percent:.14f}%")
+            print(f"Diff-Split: +{plus}  -{minus}")
     else:
-        for op, key in diff_iter:
+        for op, columns in diff_iter:
             color = COLOR_SCHEME[op]
-            rich.print(f"[{color}]{op} {key!r}[/{color}]")
+
+            if json_output:
+                jsonl = json.dumps([op, list(columns)])
+                rich.print(f"[{color}]{jsonl}[/{color}]")
+            else:
+                text = f"{op} {', '.join(columns)}"
+                rich.print(f"[{color}]{text}[/{color}]")
+
             sys.stdout.flush()
 
     end = time.time()

diff --git a/data_diff/diff_tables.py b/data_diff/diff_tables.py
@@ -381,6 +381,14 @@ def _bisect_and_diff_tables(self, table1, table2, level=0, max_rows=None):
         if max_rows < self.bisection_threshold:
             rows1, rows2 = self._threaded_call("get_values", [table1, table2])
             diff = list(diff_sets(rows1, rows2))
+
+            # The initial bisection_threshold is larger than the row count. Normally
+            # we always checksum and count segments, even if we get the values. At the
+            # first level (level 0), however, that is not the case, so count the rows here.
+            if level == 0:
+                self.stats["table1_count"] = len(rows1)
+                self.stats["table2_count"] = len(rows2)
+
             logger.info(". " * level + f"Diff found {len(diff)} different rows.")
             self.stats["rows_downloaded"] = self.stats.get("rows_downloaded", 0) + max(len(rows1), len(rows2))
             yield from diff
@@ -421,6 +429,7 @@ def _diff_tables(self, table1, table2, level=0, segment_index=None, segment_coun
 
         if level == 1:
             self.stats["table1_count"] = self.stats.get("table1_count", 0) + count1
+            self.stats["table2_count"] = self.stats.get("table2_count", 0) + count2
 
         if checksum1 != checksum2:
             yield from self._bisect_and_diff_tables(table1, table2, level=level, max_rows=max(count1, count2))

diff --git a/tests/test_diff_tables.py b/tests/test_diff_tables.py
@@ -155,6 +155,8 @@ def test_diff_small_tables(self):
         diff = list(self.differ.diff_tables(self.table, self.table2))
         expected = [("-", ("2", time + ".000000"))]
         self.assertEqual(expected, diff)
+        self.assertEqual(2, self.differ.stats["table1_count"])
+        self.assertEqual(1, self.differ.stats["table2_count"])
 
     def test_diff_table_above_bisection_threshold(self):
         time = "2022-01-01 00:00:00"
@@ -176,6 +178,8 @@ def test_diff_table_above_bisection_threshold(self):
         diff = list(self.differ.diff_tables(self.table, self.table2))
         expected = [("-", ("5", time + ".000000"))]
         self.assertEqual(expected, diff)
+        self.assertEqual(5, self.differ.stats["table1_count"])
+        self.assertEqual(4, self.differ.stats["table2_count"])
 
     def test_return_empty_array_when_same(self):
         time = "2022-01-01 00:00:00"