Skip to content
This repository was archived by the owner on May 17, 2024. It is now read-only.

cli: output diff as jsonl, stats as json #90

Merged
merged 3 commits into from
Jun 22, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
35 changes: 28 additions & 7 deletions data_diff/__main__.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
import sys
import time
import json
import logging
from itertools import islice

Expand Down Expand Up @@ -50,6 +51,7 @@
@click.option("--max-age", default=None, help="Considers only rows younger than specified. See --min-age.")
@click.option("-s", "--stats", is_flag=True, help="Print stats instead of a detailed diff")
@click.option("-d", "--debug", is_flag=True, help="Print debug info")
@click.option("--json", 'json_output', is_flag=True, help="Print JSONL output for machine readability")
@click.option("-v", "--verbose", is_flag=True, help="Print extra info")
@click.option("-i", "--interactive", is_flag=True, help="Confirm queries, implies --debug")
@click.option("--keep-column-case", is_flag=True, help="Don't use the schema to fix the case of given column names.")
Expand Down Expand Up @@ -80,6 +82,7 @@ def main(
interactive,
threads,
keep_column_case,
json_output,
):
if limit and stats:
print("Error: cannot specify a limit when using the -s/--stats switch")
Expand Down Expand Up @@ -144,17 +147,35 @@ def main(
if stats:
diff = list(diff_iter)
unique_diff_count = len({i[0] for _, i in diff})
table1_count = differ.stats.get("table1_count")
percent = 100 * unique_diff_count / (table1_count or 1)
print(f"Diff-Total: {len(diff)} changed rows out of {table1_count}")
print(f"Diff-Percent: {percent:.4f}%")
max_table_count = max(differ.stats["table1_count"], differ.stats["table2_count"])
percent = 100 * unique_diff_count / (max_table_count or 1)
plus = len([1 for op, _ in diff if op == "+"])
minus = len([1 for op, _ in diff if op == "-"])
print(f"Diff-Split: +{plus} -{minus}")

if json_output:
json_output = {
"different_rows": len(diff),
"different_percent": percent,
"different_+": plus,
"different_-": minus,
"total": max_table_count,
}
print(json.dumps(json_output))
else:
print(f"Diff-Total: {len(diff)} changed rows out of {max_table_count}")
print(f"Diff-Percent: {percent:.14f}%")
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

BTW, I stopped rounding so aggressively here: with, e.g., 1 different row out of 100M entries, the previous rounding (4 decimal places) just showed 0.

print(f"Diff-Split: +{plus} -{minus}")
else:
for op, key in diff_iter:
for op, columns in diff_iter:
color = COLOR_SCHEME[op]
rich.print(f"[{color}]{op} {key!r}[/{color}]")

if json_output:
jsonl = json.dumps([op, list(columns)])
rich.print(f"[{color}]{jsonl}[/{color}]")
else:
text = f"{op} {', '.join(columns)}"
rich.print(f"[{color}]{text}[/{color}]")

sys.stdout.flush()

end = time.time()
Expand Down
9 changes: 9 additions & 0 deletions data_diff/diff_tables.py
Original file line number Diff line number Diff line change
Expand Up @@ -381,6 +381,14 @@ def _bisect_and_diff_tables(self, table1, table2, level=0, max_rows=None):
if max_rows < self.bisection_threshold:
rows1, rows2 = self._threaded_call("get_values", [table1, table2])
diff = list(diff_sets(rows1, rows2))

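# The initial bisection_threshold may be larger than the row count. Normally
# we checksum and count every segment, even when we end up fetching the
# values. At the first level (level == 0), however, no checksum/count has
# been done yet, so record the table counts from the downloaded rows.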
if level == 0:
self.stats["table1_count"] = len(rows1)
self.stats["table2_count"] = len(rows2)

logger.info(". " * level + f"Diff found {len(diff)} different rows.")
self.stats["rows_downloaded"] = self.stats.get("rows_downloaded", 0) + max(len(rows1), len(rows2))
yield from diff
Expand Down Expand Up @@ -421,6 +429,7 @@ def _diff_tables(self, table1, table2, level=0, segment_index=None, segment_coun

if level == 1:
self.stats["table1_count"] = self.stats.get("table1_count", 0) + count1
self.stats["table2_count"] = self.stats.get("table2_count", 0) + count2

if checksum1 != checksum2:
yield from self._bisect_and_diff_tables(table1, table2, level=level, max_rows=max(count1, count2))
Expand Down
4 changes: 4 additions & 0 deletions tests/test_diff_tables.py
Original file line number Diff line number Diff line change
Expand Up @@ -155,6 +155,8 @@ def test_diff_small_tables(self):
diff = list(self.differ.diff_tables(self.table, self.table2))
expected = [("-", ("2", time + ".000000"))]
self.assertEqual(expected, diff)
self.assertEqual(2, self.differ.stats["table1_count"])
self.assertEqual(1, self.differ.stats["table2_count"])

def test_diff_table_above_bisection_threshold(self):
time = "2022-01-01 00:00:00"
Expand All @@ -176,6 +178,8 @@ def test_diff_table_above_bisection_threshold(self):
diff = list(self.differ.diff_tables(self.table, self.table2))
expected = [("-", ("5", time + ".000000"))]
self.assertEqual(expected, diff)
self.assertEqual(5, self.differ.stats["table1_count"])
self.assertEqual(4, self.differ.stats["table2_count"])

def test_return_empty_array_when_same(self):
time = "2022-01-01 00:00:00"
Expand Down