Skip to content
This repository was archived by the owner on May 17, 2024. It is now read-only.

Commit c268120

Browse files
committed
Refactor into normalize_timestamp() normalize_number()
1 parent c7367ba commit c268120

File tree

1 file changed

+112
-111
lines changed

1 file changed

+112
-111
lines changed

data_diff/database.py

Lines changed: 112 additions & 111 deletions
Original file line numberDiff line numberDiff line change
@@ -131,6 +131,10 @@ def __post_init__(self):
131131
class UnknownColType(ColType):
132132
text: str
133133

134+
def __post_init__(self):
135+
logger.warn(f"Column of type '{self.text}' has no compatibility handling. "
136+
"If encoding/formatting differs between databases, it may result in false positives.")
137+
134138

135139
class AbstractDatabase(ABC):
136140
@abstractmethod
@@ -173,16 +177,24 @@ def close(self):
173177
"Close connection(s) to the database instance. Querying will stop functioning."
174178
...
175179

180+
176181
@abstractmethod
177-
def normalize_value_by_type(value: str, coltype: ColType) -> str:
178-
"""Creates an SQL expression, that converts 'value' to a normalized representation.
182+
def normalize_timestamp(self, value: str, coltype: ColType) -> str:
183+
"""Creates an SQL expression, that converts 'value' to a normalized timestamp.
179184
180-
The returned expression must accept any SQL value, and return a string.
185+
The returned expression must accept any SQL datetime/timestamp, and return a string.
186+
187+
Date format: "YYYY-MM-DD HH:mm:SS.FFFFFF"
188+
189+
Precision of dates should be rounded up/down according to coltype.rounds
190+
"""
191+
...
181192

182-
- Dates are expected in the format:
183-
"YYYY-MM-DD HH:mm:SS.FFFFFF"
193+
@abstractmethod
194+
def normalize_number(self, value: str, coltype: ColType) -> str:
195+
"""Creates an SQL expression, that converts 'value' to a normalized number.
184196
185-
Rounded up/down according to coltype.rounds
197+
The returned expression must accept any SQL int/numeric/float, and return a string.
186198
187199
- Floats/Decimals are expected in the format
188200
"I.P"
@@ -191,14 +203,31 @@ def normalize_value_by_type(value: str, coltype: ColType) -> str:
191203
and must be at least one digit (0).
192204
P is the fractional digits, the amount of which is specified with
193205
coltype.precision. Trailing zeroes may be necessary.
206+
If P is 0, the dot is omitted.
194207
195208
Note: This precision is different than the one used by databases. For decimals,
196-
it's the same as "numeric_scale", and for floats, who use binary precision,
197-
it can be calculated as log10(2**p)
209+
it's the same as ``numeric_scale``, and for floats, who use binary precision,
210+
it can be calculated as ``log10(2**numeric_precision)``.
211+
"""
212+
...
213+
214+
def normalize_value_by_type(self, value: str, coltype: ColType) -> str:
215+
"""Creates an SQL expression, that converts 'value' to a normalized representation.
216+
217+
The returned expression must accept any SQL value, and return a string.
218+
219+
The default implementation dispatches to a method according to ``coltype``:
198220
221+
TemporalType -> normalize_timestamp()
222+
NumericType -> normalize_number()
223+
-else- -> to_string()
199224
200225
"""
201-
...
226+
if isinstance(coltype, TemporalType):
227+
return self.normalize_timestamp(value, coltype)
228+
elif isinstance(coltype, NumericType):
229+
return self.normalize_number(value, coltype)
230+
return self.to_string(f"{value}")
202231

203232

204233
class Database(AbstractDatabase):
@@ -404,27 +433,16 @@ def md5_to_int(self, s: str) -> str:
404433
def to_string(self, s: str):
405434
return f"{s}::varchar"
406435

407-
def normalize_value_by_type(self, value: str, coltype: ColType) -> str:
408-
if isinstance(coltype, TemporalType):
409-
# if coltype.precision == 0:
410-
# return f"to_char({value}::timestamp(0), 'YYYY-mm-dd HH24:MI:SS')"
411-
# if coltype.precision == 3:
412-
# return f"to_char({value}, 'YYYY-mm-dd HH24:MI:SS.US')"
413-
# elif coltype.precision == 6:
414-
# return f"to_char({value}::timestamp({coltype.precision}), 'YYYY-mm-dd HH24:MI:SS.US')"
415-
# else:
416-
# # Postgres/Redshift doesn't support arbitrary precision
417-
# raise TypeError(f"Bad precision for {type(self).__name__}: {coltype})")
418-
if coltype.rounds:
419-
return f"to_char({value}::timestamp({coltype.precision}), 'YYYY-mm-dd HH24:MI:SS.US')"
420-
else:
421-
timestamp6 = f"to_char({value}::timestamp(6), 'YYYY-mm-dd HH24:MI:SS.US')"
422-
return f"RPAD(LEFT({timestamp6}, {TIMESTAMP_PRECISION_POS+coltype.precision}), {TIMESTAMP_PRECISION_POS+6}, '0')"
423436

424-
elif isinstance(coltype, NumericType):
425-
value = f"{value}::decimal(38, {coltype.precision})"
437+
def normalize_timestamp(self, value: str, coltype: ColType) -> str:
438+
if coltype.rounds:
439+
return f"to_char({value}::timestamp({coltype.precision}), 'YYYY-mm-dd HH24:MI:SS.US')"
426440

427-
return self.to_string(f"{value}")
441+
timestamp6 = f"to_char({value}::timestamp(6), 'YYYY-mm-dd HH24:MI:SS.US')"
442+
return f"RPAD(LEFT({timestamp6}, {TIMESTAMP_PRECISION_POS+coltype.precision}), {TIMESTAMP_PRECISION_POS+6}, '0')"
443+
444+
def normalize_number(self, value: str, coltype: ColType) -> str:
445+
return self.to_string(f"{value}::decimal(38, {coltype.precision})")
428446

429447

430448
class Presto(Database):
@@ -463,25 +481,19 @@ def _query(self, sql_code: str) -> list:
463481
def close(self):
464482
self._conn.close()
465483

466-
def normalize_value_by_type(self, value: str, coltype: ColType) -> str:
467-
if isinstance(coltype, TemporalType):
468-
if coltype.rounds:
469-
if coltype.precision > 3:
470-
pass
471-
s = f"date_format(cast({value} as timestamp(6)), '%Y-%m-%d %H:%i:%S.%f')"
472-
else:
473-
s = f"date_format(cast({value} as timestamp(6)), '%Y-%m-%d %H:%i:%S.%f')"
474-
# datetime = f"date_format(cast({value} as timestamp(6), '%Y-%m-%d %H:%i:%S.%f'))"
475-
# datetime = self.to_string(f"cast({value} as datetime(6))")
484+
def normalize_timestamp(self, value: str, coltype: ColType) -> str:
485+
# TODO
486+
if coltype.rounds:
487+
s = f"date_format(cast({value} as timestamp(6)), '%Y-%m-%d %H:%i:%S.%f')"
488+
else:
489+
s = f"date_format(cast({value} as timestamp(6)), '%Y-%m-%d %H:%i:%S.%f')"
476490

477-
return (
478-
f"RPAD(RPAD({s}, {TIMESTAMP_PRECISION_POS+coltype.precision}, '.'), {TIMESTAMP_PRECISION_POS+6}, '0')"
479-
)
480-
481-
elif isinstance(coltype, NumericType):
482-
value = f"cast({value} as decimal(38,{coltype.precision}))"
491+
return (
492+
f"RPAD(RPAD({s}, {TIMESTAMP_PRECISION_POS+coltype.precision}, '.'), {TIMESTAMP_PRECISION_POS+6}, '0')"
493+
)
483494

484-
return self.to_string(value)
495+
def normalize_number(self, value: str, coltype: ColType) -> str:
496+
return self.to_string(f"cast({value} as decimal(38,{coltype.precision}))")
485497

486498
def select_table_schema(self, path: DbPath) -> str:
487499
schema, table = self._normalize_table_path(path)
@@ -566,18 +578,16 @@ def md5_to_int(self, s: str) -> str:
566578
def to_string(self, s: str):
567579
return f"cast({s} as char)"
568580

569-
def normalize_value_by_type(self, value: str, coltype: ColType) -> str:
570-
if isinstance(coltype, TemporalType):
571-
if coltype.rounds:
572-
return self.to_string(f"cast( cast({value} as datetime({coltype.precision})) as datetime(6))")
573-
else:
574-
s = self.to_string(f"cast({value} as datetime(6))")
575-
return f"RPAD(RPAD({s}, {TIMESTAMP_PRECISION_POS+coltype.precision}, '.'), {TIMESTAMP_PRECISION_POS+6}, '0')"
581+
def normalize_timestamp(self, value: str, coltype: ColType) -> str:
582+
if coltype.rounds:
583+
return self.to_string(f"cast( cast({value} as datetime({coltype.precision})) as datetime(6))")
576584

577-
elif isinstance(coltype, NumericType):
578-
value = f"cast({value} as decimal(38,{coltype.precision}))"
585+
s = self.to_string(f"cast({value} as datetime(6))")
586+
return f"RPAD(RPAD({s}, {TIMESTAMP_PRECISION_POS+coltype.precision}, '.'), {TIMESTAMP_PRECISION_POS+6}, '0')"
587+
588+
def normalize_number(self, value: str, coltype: ColType) -> str:
589+
return self.to_string(f"cast({value} as decimal(38, {coltype.precision}))")
579590

580-
return self.to_string(f"{value}")
581591

582592

583593
class Oracle(ThreadedDatabase):
@@ -622,16 +632,15 @@ def select_table_schema(self, path: DbPath) -> str:
622632
f" FROM USER_TAB_COLUMNS WHERE table_name = '{table.upper()}'"
623633
)
624634

625-
def normalize_value_by_type(self, value: str, coltype: ColType) -> str:
626-
if isinstance(coltype, TemporalType):
627-
return f"to_char(cast({value} as timestamp({coltype.precision})), 'YYYY-MM-DD HH24:MI:SS.FF6')"
628-
elif isinstance(coltype, NumericType):
629-
# FM999.9990
630-
format_str = "FM" + "9" * (38 - coltype.precision)
631-
if coltype.precision:
632-
format_str += "0." + "9" * (coltype.precision - 1) + "0"
633-
return f"to_char({value}, '{format_str}')"
634-
return self.to_string(f"{value}")
635+
def normalize_timestamp(self, value: str, coltype: ColType) -> str:
636+
return f"to_char(cast({value} as timestamp({coltype.precision})), 'YYYY-MM-DD HH24:MI:SS.FF6')"
637+
638+
def normalize_number(self, value: str, coltype: ColType) -> str:
639+
# FM999.9990
640+
format_str = "FM" + "9" * (38 - coltype.precision)
641+
if coltype.precision:
642+
format_str += "0." + "9" * (coltype.precision - 1) + "0"
643+
return f"to_char({value}, '{format_str}')"
635644

636645
def _parse_type(
637646
self, type_repr: str, datetime_precision: int = None, numeric_precision: int = None, numeric_scale: int = None
@@ -682,27 +691,25 @@ def _convert_db_precision_to_digits(self, p: int) -> int:
682691
def md5_to_int(self, s: str) -> str:
683692
return f"strtol(substring(md5({s}), {1+MD5_HEXDIGITS-CHECKSUM_HEXDIGITS}), 16)::decimal(38)"
684693

685-
def normalize_value_by_type(self, value: str, coltype: ColType) -> str:
686-
if isinstance(coltype, TemporalType):
687-
if coltype.rounds:
688-
timestamp = f"{value}::timestamp(6)"
689-
# Get seconds since epoch. Redshift doesn't support milli- or micro-seconds.
690-
secs = f"timestamp 'epoch' + round(extract(epoch from {timestamp})::decimal(38)"
691-
# Get the milliseconds from timestamp.
692-
ms = f"extract(ms from {timestamp})"
693-
# Get the microseconds from timestamp, without the milliseconds!
694-
us = f"extract(us from {timestamp})"
695-
# epoch = Total time since epoch in microseconds.
696-
epoch = f"{secs}*1000000 + {ms}*1000 + {us}"
697-
timestamp6 = f"to_char({epoch}, -6+{coltype.precision}) * interval '0.000001 seconds', 'YYYY-mm-dd HH24:MI:SS.US')"
698-
else:
699-
timestamp6 = f"to_char({value}::timestamp(6), 'YYYY-mm-dd HH24:MI:SS.US')"
700-
return f"RPAD(LEFT({timestamp6}, {TIMESTAMP_PRECISION_POS+coltype.precision}), {TIMESTAMP_PRECISION_POS+6}, '0')"
694+
def normalize_timestamp(self, value: str, coltype: ColType) -> str:
695+
if coltype.rounds:
696+
timestamp = f"{value}::timestamp(6)"
697+
# Get seconds since epoch. Redshift doesn't support milli- or micro-seconds.
698+
secs = f"timestamp 'epoch' + round(extract(epoch from {timestamp})::decimal(38)"
699+
# Get the milliseconds from timestamp.
700+
ms = f"extract(ms from {timestamp})"
701+
# Get the microseconds from timestamp, without the milliseconds!
702+
us = f"extract(us from {timestamp})"
703+
# epoch = Total time since epoch in microseconds.
704+
epoch = f"{secs}*1000000 + {ms}*1000 + {us}"
705+
timestamp6 = f"to_char({epoch}, -6+{coltype.precision}) * interval '0.000001 seconds', 'YYYY-mm-dd HH24:MI:SS.US')"
706+
else:
707+
timestamp6 = f"to_char({value}::timestamp(6), 'YYYY-mm-dd HH24:MI:SS.US')"
708+
return f"RPAD(LEFT({timestamp6}, {TIMESTAMP_PRECISION_POS+coltype.precision}), {TIMESTAMP_PRECISION_POS+6}, '0')"
701709

702-
elif isinstance(coltype, NumericType):
703-
value = f"{value}::decimal(38,{coltype.precision})"
710+
def normalize_number(self, value: str, coltype: ColType) -> str:
711+
return self.to_string(f"{value}::decimal(38,{coltype.precision})")
704712

705-
return self.to_string(f"{value}")
706713

707714

708715
class MsSQL(ThreadedDatabase):
@@ -794,25 +801,22 @@ def select_table_schema(self, path: DbPath) -> str:
794801
f"WHERE table_name = '{table}' AND table_schema = '{schema}'"
795802
)
796803

797-
def normalize_value_by_type(self, value: str, coltype: ColType) -> str:
798-
if isinstance(coltype, TemporalType):
799-
if coltype.rounds:
800-
timestamp = f"timestamp_micros(cast(round(unix_micros(cast({value} as timestamp))/1000000, {coltype.precision})*1000000 as int))"
801-
return f"FORMAT_TIMESTAMP('%F %H:%M:%E6S', {timestamp})"
802-
else:
803-
if coltype.precision == 0:
804-
return f"FORMAT_TIMESTAMP('%F %H:%M:%S.000000, {value})"
805-
elif coltype.precision == 6:
806-
return f"FORMAT_TIMESTAMP('%F %H:%M:%E6S', {value})"
804+
def normalize_timestamp(self, value: str, coltype: ColType) -> str:
805+
if coltype.rounds:
806+
timestamp = f"timestamp_micros(cast(round(unix_micros(cast({value} as timestamp))/1000000, {coltype.precision})*1000000 as int))"
807+
return f"FORMAT_TIMESTAMP('%F %H:%M:%E6S', {timestamp})"
807808

808-
timestamp6 = f"FORMAT_TIMESTAMP('%F %H:%M:%E6S', {value})"
809-
return f"RPAD(LEFT({timestamp6}, {TIMESTAMP_PRECISION_POS+coltype.precision}), {TIMESTAMP_PRECISION_POS+6}, '0')"
809+
if coltype.precision == 0:
810+
return f"FORMAT_TIMESTAMP('%F %H:%M:%S.000000, {value})"
811+
elif coltype.precision == 6:
812+
return f"FORMAT_TIMESTAMP('%F %H:%M:%E6S', {value})"
810813

811-
elif isinstance(coltype, NumericType):
812-
# value = f"cast({value} as decimal)"
813-
return f"format('%.{coltype.precision}f', cast({value} as decimal))"
814+
timestamp6 = f"FORMAT_TIMESTAMP('%F %H:%M:%E6S', {value})"
815+
return f"RPAD(LEFT({timestamp6}, {TIMESTAMP_PRECISION_POS+coltype.precision}), {TIMESTAMP_PRECISION_POS+6}, '0')"
814816

815-
return self.to_string(f"{value}")
817+
def normalize_number(self, value: str, coltype: ColType) -> str:
818+
# value = f"cast({value} as decimal)"
819+
return f"format('%.{coltype.precision}f', cast({value} as decimal))"
816820

817821
def parse_table_name(self, name: str) -> DbPath:
818822
path = parse_table_name(name)
@@ -886,19 +890,16 @@ def select_table_schema(self, path: DbPath) -> str:
886890
schema, table = self._normalize_table_path(path)
887891
return super().select_table_schema((schema, table))
888892

889-
def normalize_value_by_type(self, value: str, coltype: ColType) -> str:
890-
if isinstance(coltype, TemporalType):
891-
if coltype.rounds:
892-
timestamp = f"to_timestamp(round(date_part(epoch_nanosecond, {value}::timestamp(9))/1000000000, {coltype.precision}))"
893-
else:
894-
timestamp = f"cast({value} as timestamp({coltype.precision}))"
893+
def normalize_timestamp(self, value: str, coltype: ColType) -> str:
894+
if coltype.rounds:
895+
timestamp = f"to_timestamp(round(date_part(epoch_nanosecond, {value}::timestamp(9))/1000000000, {coltype.precision}))"
896+
else:
897+
timestamp = f"cast({value} as timestamp({coltype.precision}))"
895898

896-
return f"to_char({timestamp}, 'YYYY-MM-DD HH24:MI:SS.FF6')"
899+
return f"to_char({timestamp}, 'YYYY-MM-DD HH24:MI:SS.FF6')"
897900

898-
elif isinstance(coltype, NumericType):
899-
value = f"cast({value} as decimal(38, {coltype.precision}))"
900-
901-
return self.to_string(f"{value}")
901+
def normalize_number(self, value: str, coltype: ColType) -> str:
902+
return self.to_string(f"cast({value} as decimal(38, {coltype.precision}))")
902903

903904

904905
@dataclass

0 commit comments

Comments
 (0)