From f09a86625990ec466c1e9a2ff4a2e32c5881aee5 Mon Sep 17 00:00:00 2001
From: Simon Eskildsen <sirup@sirupsen.com>
Date: Tue, 21 Jun 2022 13:47:50 -0400
Subject: [PATCH 1/3] cli: output diff as jsonl, stats as json

---
 data_diff/__main__.py    | 18 ++++++++++++++----
 data_diff/diff_tables.py |  8 +++++++-
 2 files changed, 21 insertions(+), 5 deletions(-)

diff --git a/data_diff/__main__.py b/data_diff/__main__.py
index 5b357153..93ccf231 100644
--- a/data_diff/__main__.py
+++ b/data_diff/__main__.py
@@ -1,5 +1,6 @@
 import sys
 import time
+import json
 import logging
 from itertools import islice
 
@@ -146,15 +147,24 @@ def main(
         unique_diff_count = len({i[0] for _, i in diff})
         table1_count = differ.stats.get("table1_count")
         percent = 100 * unique_diff_count / (table1_count or 1)
-        print(f"Diff-Total: {len(diff)} changed rows out of {table1_count}")
-        print(f"Diff-Percent: {percent:.4f}%")
         plus = len([1 for op, _ in diff if op == "+"])
         minus = len([1 for op, _ in diff if op == "-"])
-        print(f"Diff-Split: +{plus}  -{minus}")
+
+        count = differ.stats["table_count"]
+        diff = {
+            "different_rows": len(diff),
+            "different_percent": percent,
+            "different_+": plus,
+            "different_-": minus,
+            "total": count,
+        }
+
+        print(json.dumps(diff, indent=2))
     else:
         for op, key in diff_iter:
             color = COLOR_SCHEME[op]
-            rich.print(f"[{color}]{op} {key!r}[/{color}]")
+            jsonl = json.dumps([op, list(key)])
+            rich.print(f"[{color}]{jsonl}[/{color}]")
             sys.stdout.flush()
 
     end = time.time()
diff --git a/data_diff/diff_tables.py b/data_diff/diff_tables.py
index 4087b49d..05d082fa 100644
--- a/data_diff/diff_tables.py
+++ b/data_diff/diff_tables.py
@@ -381,6 +381,12 @@ def _bisect_and_diff_tables(self, table1, table2, level=0, max_rows=None):
         if max_rows < self.bisection_threshold:
             rows1, rows2 = self._threaded_call("get_values", [table1, table2])
             diff = list(diff_sets(rows1, rows2))
+
+            # This happens when the initial bisection threshold is larger than
+            # the table itself.
+            if level == 0 and not self.stats.get("table_count", False):
+                self.stats["table_count"] = self.stats.get("table_count", 0) + max(len(rows1), len(rows2))
+
             logger.info(". " * level + f"Diff found {len(diff)} different rows.")
             self.stats["rows_downloaded"] = self.stats.get("rows_downloaded", 0) + max(len(rows1), len(rows2))
             yield from diff
@@ -420,7 +426,7 @@ def _diff_tables(self, table1, table2, level=0, segment_index=None, segment_coun
             return
 
         if level == 1:
-            self.stats["table1_count"] = self.stats.get("table1_count", 0) + count1
+            self.stats["table_count"] = self.stats.get("table_count", 0) + max(count1, count2)
 
         if checksum1 != checksum2:
             yield from self._bisect_and_diff_tables(table1, table2, level=level, max_rows=max(count1, count2))

From 94d1419091ee4c6266a4ea66713ff8ee280d9376 Mon Sep 17 00:00:00 2001
From: Simon Eskildsen <sirup@sirupsen.com>
Date: Tue, 21 Jun 2022 14:36:24 -0400
Subject: [PATCH 2/3] cli: add --json for stats, table1 + table2 counts

---
 data_diff/__main__.py    | 29 +++++++++++++++++------------
 data_diff/diff_tables.py |  8 +++++---
 2 files changed, 22 insertions(+), 15 deletions(-)

diff --git a/data_diff/__main__.py b/data_diff/__main__.py
index 93ccf231..fc32899e 100644
--- a/data_diff/__main__.py
+++ b/data_diff/__main__.py
@@ -51,6 +51,7 @@
 @click.option("--max-age", default=None, help="Considers only rows younger than specified. See --min-age.")
 @click.option("-s", "--stats", is_flag=True, help="Print stats instead of a detailed diff")
 @click.option("-d", "--debug", is_flag=True, help="Print debug info")
+@click.option("--json", 'json_output', is_flag=True, help="Print JSON output for --stats")
 @click.option("-v", "--verbose", is_flag=True, help="Print extra info")
 @click.option("-i", "--interactive", is_flag=True, help="Confirm queries, implies --debug")
 @click.option("--keep-column-case", is_flag=True, help="Don't use the schema to fix the case of given column names.")
@@ -81,6 +82,7 @@ def main(
     interactive,
     threads,
     keep_column_case,
+    json_output,
 ):
     if limit and stats:
         print("Error: cannot specify a limit when using the -s/--stats switch")
@@ -145,21 +147,24 @@ def main(
     if stats:
         diff = list(diff_iter)
         unique_diff_count = len({i[0] for _, i in diff})
-        table1_count = differ.stats.get("table1_count")
-        percent = 100 * unique_diff_count / (table1_count or 1)
+        max_table_count = max(differ.stats["table1_count"], differ.stats["table2_count"])
+        percent = 100 * unique_diff_count / (max_table_count or 1)
         plus = len([1 for op, _ in diff if op == "+"])
         minus = len([1 for op, _ in diff if op == "-"])
 
-        count = differ.stats["table_count"]
-        diff = {
-            "different_rows": len(diff),
-            "different_percent": percent,
-            "different_+": plus,
-            "different_-": minus,
-            "total": count,
-        }
-
-        print(json.dumps(diff, indent=2))
+        if json_output:
+            json_output = {
+                "different_rows": len(diff),
+                "different_percent": percent,
+                "different_+": plus,
+                "different_-": minus,
+                "total": max_table_count,
+            }
+            print(json.dumps(json_output, indent=2))
+        else:
+            print(f"Diff-Total: {len(diff)} changed rows out of {max_table_count}")
+            print(f"Diff-Percent: {percent:.14f}%")
+            print(f"Diff-Split: +{plus}  -{minus}")
     else:
         for op, key in diff_iter:
             color = COLOR_SCHEME[op]
diff --git a/data_diff/diff_tables.py b/data_diff/diff_tables.py
index 05d082fa..48099c8f 100644
--- a/data_diff/diff_tables.py
+++ b/data_diff/diff_tables.py
@@ -384,8 +384,9 @@ def _bisect_and_diff_tables(self, table1, table2, level=0, max_rows=None):
 
             # This happens when the initial bisection threshold is larger than
             # the table itself.
-            if level == 0 and not self.stats.get("table_count", False):
-                self.stats["table_count"] = self.stats.get("table_count", 0) + max(len(rows1), len(rows2))
+            if level == 0 and not self.stats.get("table1_count", False):
+                self.stats["table1_count"] = self.stats.get("table1_count", 0) + len(rows1)
+                self.stats["table2_count"] = self.stats.get("table2_count", 0) + len(rows2)
 
             logger.info(". " * level + f"Diff found {len(diff)} different rows.")
             self.stats["rows_downloaded"] = self.stats.get("rows_downloaded", 0) + max(len(rows1), len(rows2))
@@ -426,7 +427,8 @@ def _diff_tables(self, table1, table2, level=0, segment_index=None, segment_coun
             return
 
         if level == 1:
-            self.stats["table_count"] = self.stats.get("table_count", 0) + max(count1, count2)
+            self.stats["table1_count"] = self.stats.get("table_count1", 0) + count1
+            self.stats["table2_count"] = self.stats.get("table_count2", 0) + count2
 
         if checksum1 != checksum2:
             yield from self._bisect_and_diff_tables(table1, table2, level=level, max_rows=max(count1, count2))

From 37b47a0bdde598ce3c22a1e4f76e79c68c994dc4 Mon Sep 17 00:00:00 2001
From: Simon Eskildsen <sirup@sirupsen.com>
Date: Wed, 22 Jun 2022 09:44:57 -0400
Subject: [PATCH 3/3] cli: only json from standard diff with --json

---
 data_diff/__main__.py     | 16 +++++++++++-----
 data_diff/diff_tables.py  | 15 ++++++++-------
 tests/test_diff_tables.py |  4 ++++
 3 files changed, 23 insertions(+), 12 deletions(-)

diff --git a/data_diff/__main__.py b/data_diff/__main__.py
index fc32899e..6ee6992c 100644
--- a/data_diff/__main__.py
+++ b/data_diff/__main__.py
@@ -51,7 +51,7 @@
 @click.option("--max-age", default=None, help="Considers only rows younger than specified. See --min-age.")
 @click.option("-s", "--stats", is_flag=True, help="Print stats instead of a detailed diff")
 @click.option("-d", "--debug", is_flag=True, help="Print debug info")
-@click.option("--json", 'json_output', is_flag=True, help="Print JSON output for --stats")
+@click.option("--json", 'json_output', is_flag=True, help="Print JSONL output for machine readability")
 @click.option("-v", "--verbose", is_flag=True, help="Print extra info")
 @click.option("-i", "--interactive", is_flag=True, help="Confirm queries, implies --debug")
 @click.option("--keep-column-case", is_flag=True, help="Don't use the schema to fix the case of given column names.")
@@ -160,16 +160,22 @@ def main(
                 "different_-": minus,
                 "total": max_table_count,
             }
-            print(json.dumps(json_output, indent=2))
+            print(json.dumps(json_output))
         else:
             print(f"Diff-Total: {len(diff)} changed rows out of {max_table_count}")
             print(f"Diff-Percent: {percent:.14f}%")
             print(f"Diff-Split: +{plus}  -{minus}")
     else:
-        for op, key in diff_iter:
+        for op, columns in diff_iter:
             color = COLOR_SCHEME[op]
-            jsonl = json.dumps([op, list(key)])
-            rich.print(f"[{color}]{jsonl}[/{color}]")
+
+            if json_output:
+                jsonl = json.dumps([op, list(columns)])
+                rich.print(f"[{color}]{jsonl}[/{color}]")
+            else:
+                text = f"{op} {', '.join(columns)}"
+                rich.print(f"[{color}]{text}[/{color}]")
+
             sys.stdout.flush()
 
     end = time.time()
diff --git a/data_diff/diff_tables.py b/data_diff/diff_tables.py
index 48099c8f..ddad1ef1 100644
--- a/data_diff/diff_tables.py
+++ b/data_diff/diff_tables.py
@@ -382,11 +382,12 @@ def _bisect_and_diff_tables(self, table1, table2, level=0, max_rows=None):
             rows1, rows2 = self._threaded_call("get_values", [table1, table2])
             diff = list(diff_sets(rows1, rows2))
 
-            # This happens when the initial bisection threshold is larger than
-            # the table itself.
-            if level == 0 and not self.stats.get("table1_count", False):
-                self.stats["table1_count"] = self.stats.get("table1_count", 0) + len(rows1)
-                self.stats["table2_count"] = self.stats.get("table2_count", 0) + len(rows2)
+            # The initial bisection_threshold is larger than the row count.
+            # Segments are normally checksummed and counted even when their
+            # values are fetched, but not at the first level, so count here.
+            if level == 0:
+                self.stats["table1_count"] = len(rows1)
+                self.stats["table2_count"] = len(rows2)
 
             logger.info(". " * level + f"Diff found {len(diff)} different rows.")
             self.stats["rows_downloaded"] = self.stats.get("rows_downloaded", 0) + max(len(rows1), len(rows2))
@@ -427,8 +428,8 @@ def _diff_tables(self, table1, table2, level=0, segment_index=None, segment_coun
             return
 
         if level == 1:
-            self.stats["table1_count"] = self.stats.get("table_count1", 0) + count1
-            self.stats["table2_count"] = self.stats.get("table_count2", 0) + count2
+            self.stats["table1_count"] = self.stats.get("table1_count", 0) + count1
+            self.stats["table2_count"] = self.stats.get("table2_count", 0) + count2
 
         if checksum1 != checksum2:
             yield from self._bisect_and_diff_tables(table1, table2, level=level, max_rows=max(count1, count2))
diff --git a/tests/test_diff_tables.py b/tests/test_diff_tables.py
index a457081d..f2c9da19 100644
--- a/tests/test_diff_tables.py
+++ b/tests/test_diff_tables.py
@@ -155,6 +155,8 @@ def test_diff_small_tables(self):
         diff = list(self.differ.diff_tables(self.table, self.table2))
         expected = [("-", ("2", time + ".000000"))]
         self.assertEqual(expected, diff)
+        self.assertEqual(2, self.differ.stats["table1_count"])
+        self.assertEqual(1, self.differ.stats["table2_count"])
 
     def test_diff_table_above_bisection_threshold(self):
         time = "2022-01-01 00:00:00"
@@ -176,6 +178,8 @@ def test_diff_table_above_bisection_threshold(self):
         diff = list(self.differ.diff_tables(self.table, self.table2))
         expected = [("-", ("5", time + ".000000"))]
         self.assertEqual(expected, diff)
+        self.assertEqual(5, self.differ.stats["table1_count"])
+        self.assertEqual(4, self.differ.stats["table2_count"])
 
     def test_return_empty_array_when_same(self):
         time = "2022-01-01 00:00:00"