Skip to content
This repository was archived by the owner on May 17, 2024. It is now read-only.

Commit e926428

Browse files
committed
cli: output diff as jsonl, stats as json
1 parent 2907d15 commit e926428

File tree

2 files changed

+21
-5
lines changed

2 files changed

+21
-5
lines changed

data_diff/__main__.py

Lines changed: 14 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -1,5 +1,6 @@
11
import sys
22
import time
3+
import json
34
import logging
45
from itertools import islice
56

@@ -146,15 +147,24 @@ def main(
146147
unique_diff_count = len({i[0] for _, i in diff})
147148
table1_count = differ.stats.get("table1_count")
148149
percent = 100 * unique_diff_count / (table1_count or 1)
149-
print(f"Diff-Total: {len(diff)} changed rows out of {table1_count}")
150-
print(f"Diff-Percent: {percent:.4f}%")
151150
plus = len([1 for op, _ in diff if op == "+"])
152151
minus = len([1 for op, _ in diff if op == "-"])
153-
print(f"Diff-Split: +{plus} -{minus}")
152+
153+
count = differ.stats["table_count"]
154+
diff = {
155+
"different_rows": len(diff),
156+
"different_percent": percent,
157+
"different_+": plus,
158+
"different_-": minus,
159+
"total": count,
160+
}
161+
162+
print(json.dumps(diff, indent=2))
154163
else:
155164
for op, key in diff_iter:
156165
color = COLOR_SCHEME[op]
157-
rich.print(f"[{color}]{op} {key!r}[/{color}]")
166+
jsonl = json.dumps([op] + list(key))
167+
rich.print(f"[{color}]{jsonl}[/{color}]")
158168
sys.stdout.flush()
159169

160170
end = time.time()

data_diff/diff_tables.py

Lines changed: 7 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -381,6 +381,12 @@ def _bisect_and_diff_tables(self, table1, table2, level=0, max_rows=None):
381381
if max_rows < self.bisection_threshold:
382382
rows1, rows2 = self._threaded_call("get_values", [table1, table2])
383383
diff = list(diff_sets(rows1, rows2))
384+
385+
# This happens when the initial bisection threshold is larger than
386+
# the table itself.
387+
if level == 0 and not self.stats.get("table_count", False):
388+
self.stats["table_count"] = self.stats.get("table_count", 0) + max(len(rows1), len(rows2))
389+
384390
logger.info(". " * level + f"Diff found {len(diff)} different rows.")
385391
self.stats["rows_downloaded"] = self.stats.get("rows_downloaded", 0) + max(len(rows1), len(rows2))
386392
yield from diff
@@ -420,7 +426,7 @@ def _diff_tables(self, table1, table2, level=0, segment_index=None, segment_coun
420426
return
421427

422428
if level == 1:
423-
self.stats["table1_count"] = self.stats.get("table1_count", 0) + count1
429+
self.stats["table_count"] = self.stats.get("table_count", 0) + max(count1, count2)
424430

425431
if checksum1 != checksum2:
426432
yield from self._bisect_and_diff_tables(table1, table2, level=level, max_rows=max(count1, count2))

0 commit comments

Comments
 (0)