From f09a86625990ec466c1e9a2ff4a2e32c5881aee5 Mon Sep 17 00:00:00 2001 From: Simon Eskildsen Date: Tue, 21 Jun 2022 13:47:50 -0400 Subject: [PATCH 1/3] cli: output diff as jsonl, stats as json --- data_diff/__main__.py | 18 ++++++++++++++---- data_diff/diff_tables.py | 8 +++++++- 2 files changed, 21 insertions(+), 5 deletions(-) diff --git a/data_diff/__main__.py b/data_diff/__main__.py index 5b357153..93ccf231 100644 --- a/data_diff/__main__.py +++ b/data_diff/__main__.py @@ -1,5 +1,6 @@ import sys import time +import json import logging from itertools import islice @@ -146,15 +147,24 @@ def main( unique_diff_count = len({i[0] for _, i in diff}) table1_count = differ.stats.get("table1_count") percent = 100 * unique_diff_count / (table1_count or 1) - print(f"Diff-Total: {len(diff)} changed rows out of {table1_count}") - print(f"Diff-Percent: {percent:.4f}%") plus = len([1 for op, _ in diff if op == "+"]) minus = len([1 for op, _ in diff if op == "-"]) - print(f"Diff-Split: +{plus} -{minus}") + + count = differ.stats["table_count"] + diff = { + "different_rows": len(diff), + "different_percent": percent, + "different_+": plus, + "different_-": minus, + "total": count, + } + + print(json.dumps(diff, indent=2)) else: for op, key in diff_iter: color = COLOR_SCHEME[op] - rich.print(f"[{color}]{op} {key!r}[/{color}]") + jsonl = json.dumps([op, list(key)]) + rich.print(f"[{color}]{jsonl}[/{color}]") sys.stdout.flush() end = time.time() diff --git a/data_diff/diff_tables.py b/data_diff/diff_tables.py index 4087b49d..05d082fa 100644 --- a/data_diff/diff_tables.py +++ b/data_diff/diff_tables.py @@ -381,6 +381,12 @@ def _bisect_and_diff_tables(self, table1, table2, level=0, max_rows=None): if max_rows < self.bisection_threshold: rows1, rows2 = self._threaded_call("get_values", [table1, table2]) diff = list(diff_sets(rows1, rows2)) + + # This happens when the initial bisection threshold is larger than + # the table itself. + if level == 0 and not self.stats.get("table_count", False): + self.stats["table_count"] = self.stats.get("table_count", 0) + max(len(rows1), len(rows2)) + logger.info(". " * level + f"Diff found {len(diff)} different rows.") self.stats["rows_downloaded"] = self.stats.get("rows_downloaded", 0) + max(len(rows1), len(rows2)) yield from diff @@ -420,7 +426,7 @@ def _diff_tables(self, table1, table2, level=0, segment_index=None, segment_coun return if level == 1: - self.stats["table1_count"] = self.stats.get("table1_count", 0) + count1 + self.stats["table_count"] = self.stats.get("table_count", 0) + max(count1, count2) if checksum1 != checksum2: yield from self._bisect_and_diff_tables(table1, table2, level=level, max_rows=max(count1, count2)) From 94d1419091ee4c6266a4ea66713ff8ee280d9376 Mon Sep 17 00:00:00 2001 From: Simon Eskildsen Date: Tue, 21 Jun 2022 14:36:24 -0400 Subject: [PATCH 2/3] cli: add --json for stats, table1 + table2 counts --- data_diff/__main__.py | 29 +++++++++++++++++------------ data_diff/diff_tables.py | 8 +++++--- 2 files changed, 22 insertions(+), 15 deletions(-) diff --git a/data_diff/__main__.py b/data_diff/__main__.py index 93ccf231..fc32899e 100644 --- a/data_diff/__main__.py +++ b/data_diff/__main__.py @@ -51,6 +51,7 @@ @click.option("--max-age", default=None, help="Considers only rows younger than specified. See --min-age.") @click.option("-s", "--stats", is_flag=True, help="Print stats instead of a detailed diff") @click.option("-d", "--debug", is_flag=True, help="Print debug info") +@click.option("--json", 'json_output', is_flag=True, help="Print JSON output for --stats") @click.option("-v", "--verbose", is_flag=True, help="Print extra info") @click.option("-i", "--interactive", is_flag=True, help="Confirm queries, implies --debug") @click.option("--keep-column-case", is_flag=True, help="Don't use the schema to fix the case of given column names.") @@ -81,6 +82,7 @@ def main( interactive, threads, keep_column_case, + json_output, ): if limit and stats: print("Error: cannot specify a limit when using the -s/--stats switch") @@ -145,21 +147,24 @@ def main( if stats: diff = list(diff_iter) unique_diff_count = len({i[0] for _, i in diff}) - table1_count = differ.stats.get("table1_count") - percent = 100 * unique_diff_count / (table1_count or 1) + max_table_count = max(differ.stats["table1_count"], differ.stats["table2_count"]) + percent = 100 * unique_diff_count / (max_table_count or 1) plus = len([1 for op, _ in diff if op == "+"]) minus = len([1 for op, _ in diff if op == "-"]) - count = differ.stats["table_count"] - diff = { - "different_rows": len(diff), - "different_percent": percent, - "different_+": plus, - "different_-": minus, - "total": count, - } - - print(json.dumps(diff, indent=2)) + if json_output: + json_output = { + "different_rows": len(diff), + "different_percent": percent, + "different_+": plus, + "different_-": minus, + "total": max_table_count, + } + print(json.dumps(json_output, indent=2)) + else: + print(f"Diff-Total: {len(diff)} changed rows out of {max_table_count}") + print(f"Diff-Percent: {percent:.14f}%") + print(f"Diff-Split: +{plus} -{minus}") else: for op, key in diff_iter: color = COLOR_SCHEME[op] diff --git a/data_diff/diff_tables.py b/data_diff/diff_tables.py index 05d082fa..48099c8f 100644 --- a/data_diff/diff_tables.py +++ b/data_diff/diff_tables.py @@ -384,8 +384,9 @@ def _bisect_and_diff_tables(self, table1, table2, level=0, max_rows=None): # This happens when the initial bisection threshold is larger than # the table itself. - if level == 0 and not self.stats.get("table_count", False): - self.stats["table_count"] = self.stats.get("table_count", 0) + max(len(rows1), len(rows2)) + if level == 0 and not self.stats.get("table1_count", False): + self.stats["table1_count"] = self.stats.get("table1_count", 0) + len(rows1) + self.stats["table2_count"] = self.stats.get("table2_count", 0) + len(rows2) logger.info(". " * level + f"Diff found {len(diff)} different rows.") self.stats["rows_downloaded"] = self.stats.get("rows_downloaded", 0) + max(len(rows1), len(rows2)) @@ -426,7 +427,8 @@ def _diff_tables(self, table1, table2, level=0, segment_index=None, segment_coun return if level == 1: - self.stats["table_count"] = self.stats.get("table_count", 0) + max(count1, count2) + self.stats["table1_count"] = self.stats.get("table_count1", 0) + count1 + self.stats["table2_count"] = self.stats.get("table_count2", 0) + count2 if checksum1 != checksum2: yield from self._bisect_and_diff_tables(table1, table2, level=level, max_rows=max(count1, count2)) From 37b47a0bdde598ce3c22a1e4f76e79c68c994dc4 Mon Sep 17 00:00:00 2001 From: Simon Eskildsen Date: Wed, 22 Jun 2022 09:44:57 -0400 Subject: [PATCH 3/3] cli: only json from standard diff with --json --- data_diff/__main__.py | 16 +++++++++++----- data_diff/diff_tables.py | 15 ++++++++------- tests/test_diff_tables.py | 4 ++++ 3 files changed, 23 insertions(+), 12 deletions(-) diff --git a/data_diff/__main__.py b/data_diff/__main__.py index fc32899e..6ee6992c 100644 --- a/data_diff/__main__.py +++ b/data_diff/__main__.py @@ -51,7 +51,7 @@ @click.option("--max-age", default=None, help="Considers only rows younger than specified. See --min-age.") @click.option("-s", "--stats", is_flag=True, help="Print stats instead of a detailed diff") @click.option("-d", "--debug", is_flag=True, help="Print debug info") -@click.option("--json", 'json_output', is_flag=True, help="Print JSON output for --stats") +@click.option("--json", 'json_output', is_flag=True, help="Print JSONL output for machine readability") @click.option("-v", "--verbose", is_flag=True, help="Print extra info") @click.option("-i", "--interactive", is_flag=True, help="Confirm queries, implies --debug") @click.option("--keep-column-case", is_flag=True, help="Don't use the schema to fix the case of given column names.") @@ -160,16 +160,22 @@ def main( "different_-": minus, "total": max_table_count, } - print(json.dumps(json_output, indent=2)) + print(json.dumps(json_output)) else: print(f"Diff-Total: {len(diff)} changed rows out of {max_table_count}") print(f"Diff-Percent: {percent:.14f}%") print(f"Diff-Split: +{plus} -{minus}") else: - for op, key in diff_iter: + for op, columns in diff_iter: color = COLOR_SCHEME[op] - jsonl = json.dumps([op, list(key)]) - rich.print(f"[{color}]{jsonl}[/{color}]") + + if json_output: + jsonl = json.dumps([op, list(columns)]) + rich.print(f"[{color}]{jsonl}[/{color}]") + else: + text = f"{op} {', '.join(columns)}" + rich.print(f"[{color}]{text}[/{color}]") + sys.stdout.flush() end = time.time() diff --git a/data_diff/diff_tables.py b/data_diff/diff_tables.py index 48099c8f..ddad1ef1 100644 --- a/data_diff/diff_tables.py +++ b/data_diff/diff_tables.py @@ -382,11 +382,12 @@ def _bisect_and_diff_tables(self, table1, table2, level=0, max_rows=None): rows1, rows2 = self._threaded_call("get_values", [table1, table2]) diff = list(diff_sets(rows1, rows2)) - # This happens when the initial bisection threshold is larger than - # the table itself. - if level == 0 and not self.stats.get("table1_count", False): - self.stats["table1_count"] = self.stats.get("table1_count", 0) + len(rows1) - self.stats["table2_count"] = self.stats.get("table2_count", 0) + len(rows2) + # Initial bisection_threshold larger than count. Normally we always + # checksum and count segments, even if we get the values. At the + # first level, however, that won't be true. + if level == 0: + self.stats["table1_count"] = len(rows1) + self.stats["table2_count"] = len(rows2) logger.info(". " * level + f"Diff found {len(diff)} different rows.") self.stats["rows_downloaded"] = self.stats.get("rows_downloaded", 0) + max(len(rows1), len(rows2)) @@ -427,8 +428,8 @@ def _diff_tables(self, table1, table2, level=0, segment_index=None, segment_coun return if level == 1: - self.stats["table1_count"] = self.stats.get("table_count1", 0) + count1 - self.stats["table2_count"] = self.stats.get("table_count2", 0) + count2 + self.stats["table1_count"] = self.stats.get("table1_count", 0) + count1 + self.stats["table2_count"] = self.stats.get("table2_count", 0) + count2 if checksum1 != checksum2: yield from self._bisect_and_diff_tables(table1, table2, level=level, max_rows=max(count1, count2)) diff --git a/tests/test_diff_tables.py b/tests/test_diff_tables.py index a457081d..f2c9da19 100644 --- a/tests/test_diff_tables.py +++ b/tests/test_diff_tables.py @@ -155,6 +155,8 @@ def test_diff_small_tables(self): diff = list(self.differ.diff_tables(self.table, self.table2)) expected = [("-", ("2", time + ".000000"))] self.assertEqual(expected, diff) + self.assertEqual(2, self.differ.stats["table1_count"]) + self.assertEqual(1, self.differ.stats["table2_count"]) def test_diff_table_above_bisection_threshold(self): time = "2022-01-01 00:00:00" @@ -176,6 +178,8 @@ def test_diff_table_above_bisection_threshold(self): diff = list(self.differ.diff_tables(self.table, self.table2)) expected = [("-", ("5", time + ".000000"))] self.assertEqual(expected, diff) + self.assertEqual(5, self.differ.stats["table1_count"]) + self.assertEqual(4, self.differ.stats["table2_count"]) def test_return_empty_array_when_same(self): time = "2022-01-01 00:00:00"