Skip to content
This repository was archived by the owner on May 17, 2024. It is now read-only.

Commit 449c6bf

Browse files
authored
Merge pull request #90 from datafold/jsonl-output
2 parents ee6c593 + 37b47a0 commit 449c6bf

File tree

3 files changed

+41
-7
lines changed

3 files changed

+41
-7
lines changed

data_diff/__main__.py

Lines changed: 28 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
11
import sys
22
import time
3+
import json
34
import logging
45
from itertools import islice
56

@@ -50,6 +51,7 @@
5051
@click.option("--max-age", default=None, help="Considers only rows younger than specified. See --min-age.")
5152
@click.option("-s", "--stats", is_flag=True, help="Print stats instead of a detailed diff")
5253
@click.option("-d", "--debug", is_flag=True, help="Print debug info")
54+
@click.option("--json", 'json_output', is_flag=True, help="Print JSONL output for machine readability")
5355
@click.option("-v", "--verbose", is_flag=True, help="Print extra info")
5456
@click.option("-i", "--interactive", is_flag=True, help="Confirm queries, implies --debug")
5557
@click.option("--keep-column-case", is_flag=True, help="Don't use the schema to fix the case of given column names.")
@@ -80,6 +82,7 @@ def main(
8082
interactive,
8183
threads,
8284
keep_column_case,
85+
json_output,
8386
):
8487
if limit and stats:
8588
print("Error: cannot specify a limit when using the -s/--stats switch")
@@ -144,17 +147,35 @@ def main(
144147
if stats:
145148
diff = list(diff_iter)
146149
unique_diff_count = len({i[0] for _, i in diff})
147-
table1_count = differ.stats.get("table1_count")
148-
percent = 100 * unique_diff_count / (table1_count or 1)
149-
print(f"Diff-Total: {len(diff)} changed rows out of {table1_count}")
150-
print(f"Diff-Percent: {percent:.4f}%")
150+
max_table_count = max(differ.stats["table1_count"], differ.stats["table2_count"])
151+
percent = 100 * unique_diff_count / (max_table_count or 1)
151152
plus = len([1 for op, _ in diff if op == "+"])
152153
minus = len([1 for op, _ in diff if op == "-"])
153-
print(f"Diff-Split: +{plus} -{minus}")
154+
155+
if json_output:
156+
json_output = {
157+
"different_rows": len(diff),
158+
"different_percent": percent,
159+
"different_+": plus,
160+
"different_-": minus,
161+
"total": max_table_count,
162+
}
163+
print(json.dumps(json_output))
164+
else:
165+
print(f"Diff-Total: {len(diff)} changed rows out of {max_table_count}")
166+
print(f"Diff-Percent: {percent:.14f}%")
167+
print(f"Diff-Split: +{plus} -{minus}")
154168
else:
155-
for op, key in diff_iter:
169+
for op, columns in diff_iter:
156170
color = COLOR_SCHEME[op]
157-
rich.print(f"[{color}]{op} {key!r}[/{color}]")
171+
172+
if json_output:
173+
jsonl = json.dumps([op, list(columns)])
174+
rich.print(f"[{color}]{jsonl}[/{color}]")
175+
else:
176+
text = f"{op} {', '.join(columns)}"
177+
rich.print(f"[{color}]{text}[/{color}]")
178+
158179
sys.stdout.flush()
159180

160181
end = time.time()

data_diff/diff_tables.py

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -403,6 +403,14 @@ def _bisect_and_diff_tables(self, table1, table2, level=0, max_rows=None):
403403
if max_rows < self.bisection_threshold:
404404
rows1, rows2 = self._threaded_call("get_values", [table1, table2])
405405
diff = list(diff_sets(rows1, rows2))
406+
407+
# The initial bisection_threshold is larger than the row count. Normally we always
408+
# checksum and count segments, even if we get the values. At the
409+
# first level, however, no checksum/count has been taken yet, so record the counts here.
410+
if level == 0:
411+
self.stats["table1_count"] = len(rows1)
412+
self.stats["table2_count"] = len(rows2)
413+
406414
logger.info(". " * level + f"Diff found {len(diff)} different rows.")
407415
self.stats["rows_downloaded"] = self.stats.get("rows_downloaded", 0) + max(len(rows1), len(rows2))
408416
yield from diff
@@ -443,6 +451,7 @@ def _diff_tables(self, table1, table2, level=0, segment_index=None, segment_coun
443451

444452
if level == 1:
445453
self.stats["table1_count"] = self.stats.get("table1_count", 0) + count1
454+
self.stats["table2_count"] = self.stats.get("table2_count", 0) + count2
446455

447456
if checksum1 != checksum2:
448457
yield from self._bisect_and_diff_tables(table1, table2, level=level, max_rows=max(count1, count2))

tests/test_diff_tables.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -171,6 +171,8 @@ def test_diff_small_tables(self):
171171
diff = list(self.differ.diff_tables(self.table, self.table2))
172172
expected = [("-", ("2", time + ".000000"))]
173173
self.assertEqual(expected, diff)
174+
self.assertEqual(2, self.differ.stats["table1_count"])
175+
self.assertEqual(1, self.differ.stats["table2_count"])
174176

175177
def test_diff_table_above_bisection_threshold(self):
176178
time = "2022-01-01 00:00:00"
@@ -192,6 +194,8 @@ def test_diff_table_above_bisection_threshold(self):
192194
diff = list(self.differ.diff_tables(self.table, self.table2))
193195
expected = [("-", ("5", time + ".000000"))]
194196
self.assertEqual(expected, diff)
197+
self.assertEqual(5, self.differ.stats["table1_count"])
198+
self.assertEqual(4, self.differ.stats["table2_count"])
195199

196200
def test_return_empty_array_when_same(self):
197201
time = "2022-01-01 00:00:00"

0 commit comments

Comments
 (0)