diff --git a/data_diff/__main__.py b/data_diff/__main__.py index 5b357153..6ee6992c 100644 --- a/data_diff/__main__.py +++ b/data_diff/__main__.py @@ -1,5 +1,6 @@ import sys import time +import json import logging from itertools import islice @@ -50,6 +51,7 @@ @click.option("--max-age", default=None, help="Considers only rows younger than specified. See --min-age.") @click.option("-s", "--stats", is_flag=True, help="Print stats instead of a detailed diff") @click.option("-d", "--debug", is_flag=True, help="Print debug info") +@click.option("--json", 'json_output', is_flag=True, help="Print JSONL output for machine readability") @click.option("-v", "--verbose", is_flag=True, help="Print extra info") @click.option("-i", "--interactive", is_flag=True, help="Confirm queries, implies --debug") @click.option("--keep-column-case", is_flag=True, help="Don't use the schema to fix the case of given column names.") @@ -80,6 +82,7 @@ def main( interactive, threads, keep_column_case, + json_output, ): if limit and stats: print("Error: cannot specify a limit when using the -s/--stats switch") @@ -144,17 +147,35 @@ def main( if stats: diff = list(diff_iter) unique_diff_count = len({i[0] for _, i in diff}) - table1_count = differ.stats.get("table1_count") - percent = 100 * unique_diff_count / (table1_count or 1) - print(f"Diff-Total: {len(diff)} changed rows out of {table1_count}") - print(f"Diff-Percent: {percent:.4f}%") + max_table_count = max(differ.stats["table1_count"], differ.stats["table2_count"]) + percent = 100 * unique_diff_count / (max_table_count or 1) plus = len([1 for op, _ in diff if op == "+"]) minus = len([1 for op, _ in diff if op == "-"]) - print(f"Diff-Split: +{plus} -{minus}") + + if json_output: + json_output = { + "different_rows": len(diff), + "different_percent": percent, + "different_+": plus, + "different_-": minus, + "total": max_table_count, + } + print(json.dumps(json_output)) + else: + print(f"Diff-Total: {len(diff)} changed rows out of {max_table_count}") + print(f"Diff-Percent: {percent:.14f}%") + print(f"Diff-Split: +{plus} -{minus}") else: - for op, key in diff_iter: + for op, columns in diff_iter: color = COLOR_SCHEME[op] - rich.print(f"[{color}]{op} {key!r}[/{color}]") + + if json_output: + jsonl = json.dumps([op, list(columns)]) + rich.print(f"[{color}]{jsonl}[/{color}]") + else: + text = f"{op} {', '.join(columns)}" + rich.print(f"[{color}]{text}[/{color}]") + sys.stdout.flush() end = time.time() diff --git a/data_diff/diff_tables.py b/data_diff/diff_tables.py index 4087b49d..ddad1ef1 100644 --- a/data_diff/diff_tables.py +++ b/data_diff/diff_tables.py @@ -381,6 +381,14 @@ def _bisect_and_diff_tables(self, table1, table2, level=0, max_rows=None): if max_rows < self.bisection_threshold: rows1, rows2 = self._threaded_call("get_values", [table1, table2]) diff = list(diff_sets(rows1, rows2)) + + # Initial bisection_threshold larger than count. Normally we always + # checksum and count segments, even if we get the values. At the + # first level, however, that won't be true. + if level == 0: + self.stats["table1_count"] = len(rows1) + self.stats["table2_count"] = len(rows2) + logger.info(". " * level + f"Diff found {len(diff)} different rows.") self.stats["rows_downloaded"] = self.stats.get("rows_downloaded", 0) + max(len(rows1), len(rows2)) yield from diff @@ -421,6 +429,7 @@ def _diff_tables(self, table1, table2, level=0, segment_index=None, segment_coun if level == 1: self.stats["table1_count"] = self.stats.get("table1_count", 0) + count1 + self.stats["table2_count"] = self.stats.get("table2_count", 0) + count2 if checksum1 != checksum2: yield from self._bisect_and_diff_tables(table1, table2, level=level, max_rows=max(count1, count2)) diff --git a/tests/test_diff_tables.py b/tests/test_diff_tables.py index a457081d..f2c9da19 100644 --- a/tests/test_diff_tables.py +++ b/tests/test_diff_tables.py @@ -155,6 +155,8 @@ def test_diff_small_tables(self): diff = list(self.differ.diff_tables(self.table, self.table2)) expected = [("-", ("2", time + ".000000"))] self.assertEqual(expected, diff) + self.assertEqual(2, self.differ.stats["table1_count"]) + self.assertEqual(1, self.differ.stats["table2_count"]) def test_diff_table_above_bisection_threshold(self): time = "2022-01-01 00:00:00" @@ -176,6 +178,8 @@ def test_diff_table_above_bisection_threshold(self): diff = list(self.differ.diff_tables(self.table, self.table2)) expected = [("-", ("5", time + ".000000"))] self.assertEqual(expected, diff) + self.assertEqual(5, self.differ.stats["table1_count"]) + self.assertEqual(4, self.differ.stats["table2_count"]) def test_return_empty_array_when_same(self): time = "2022-01-01 00:00:00"