@@ -131,6 +131,10 @@ def __post_init__(self):
131
131
class UnknownColType (ColType ):
132
132
text : str
133
133
134
+ def __post_init__ (self ):
135
+ logger .warn (f"Column of type '{ self .text } ' has no compatibility handling. "
136
+ "If encoding/formatting differs between databases, it may result in false positives." )
137
+
134
138
135
139
class AbstractDatabase (ABC ):
136
140
@abstractmethod
@@ -173,16 +177,24 @@ def close(self):
173
177
"Close connection(s) to the database instance. Querying will stop functioning."
174
178
...
175
179
180
+
176
181
@abstractmethod
177
- def normalize_value_by_type ( value : str , coltype : ColType ) -> str :
178
- """Creates an SQL expression, that converts 'value' to a normalized representation .
182
+ def normalize_timestamp ( self , value : str , coltype : ColType ) -> str :
183
+ """Creates an SQL expression, that converts 'value' to a normalized timestamp .
179
184
180
- The returned expression must accept any SQL value, and return a string.
185
+ The returned expression must accept any SQL datetime/timestamp, and return a string.
186
+
187
+ Date format: "YYYY-MM-DD HH:mm:SS.FFFFFF"
188
+
189
+ Precision of dates should be rounded up/down according to coltype.rounds
190
+ """
191
+ ...
181
192
182
- - Dates are expected in the format:
183
- "YYYY-MM-DD HH:mm:SS.FFFFFF"
193
+ @abstractmethod
194
+ def normalize_number (self , value : str , coltype : ColType ) -> str :
195
+ """Creates an SQL expression, that converts 'value' to a normalized number.
184
196
185
- Rounded up/down according to coltype.rounds
197
+ The returned expression must accept any SQL int/numeric/float, and return a string.
186
198
187
199
- Floats/Decimals are expected in the format
188
200
"I.P"
@@ -191,14 +203,31 @@ def normalize_value_by_type(value: str, coltype: ColType) -> str:
191
203
and must be at least one digit (0).
192
204
P is the fractional digits, the amount of which is specified with
193
205
coltype.precision. Trailing zeroes may be necessary.
206
+ If P is 0, the dot is omitted.
194
207
195
208
Note: This precision is different than the one used by databases. For decimals,
196
- it's the same as "numeric_scale", and for floats, who use binary precision,
197
- it can be calculated as log10(2**p)
209
+ it's the same as ``numeric_scale``, and for floats, who use binary precision,
210
+ it can be calculated as ``log10(2**numeric_precision)``.
211
+ """
212
+ ...
213
+
214
+ def normalize_value_by_type (self , value : str , coltype : ColType ) -> str :
215
+ """Creates an SQL expression, that converts 'value' to a normalized representation.
216
+
217
+ The returned expression must accept any SQL value, and return a string.
218
+
219
+ The default implementation dispatches to a method according to ``coltype``:
198
220
221
+ TemporalType -> normalize_timestamp()
222
+ NumericType -> normalize_number()
223
+ -else- -> to_string()
199
224
200
225
"""
201
- ...
226
+ if isinstance (coltype , TemporalType ):
227
+ return self .normalize_timestamp (value , coltype )
228
+ elif isinstance (coltype , NumericType ):
229
+ return self .normalize_number (value , coltype )
230
+ return self .to_string (f"{ value } " )
202
231
203
232
204
233
class Database (AbstractDatabase ):
@@ -404,27 +433,16 @@ def md5_to_int(self, s: str) -> str:
404
433
def to_string (self , s : str ):
405
434
return f"{ s } ::varchar"
406
435
407
- def normalize_value_by_type (self , value : str , coltype : ColType ) -> str :
408
- if isinstance (coltype , TemporalType ):
409
- # if coltype.precision == 0:
410
- # return f"to_char({value}::timestamp(0), 'YYYY-mm-dd HH24:MI:SS')"
411
- # if coltype.precision == 3:
412
- # return f"to_char({value}, 'YYYY-mm-dd HH24:MI:SS.US')"
413
- # elif coltype.precision == 6:
414
- # return f"to_char({value}::timestamp({coltype.precision}), 'YYYY-mm-dd HH24:MI:SS.US')"
415
- # else:
416
- # # Postgres/Redshift doesn't support arbitrary precision
417
- # raise TypeError(f"Bad precision for {type(self).__name__}: {coltype})")
418
- if coltype .rounds :
419
- return f"to_char({ value } ::timestamp({ coltype .precision } ), 'YYYY-mm-dd HH24:MI:SS.US')"
420
- else :
421
- timestamp6 = f"to_char({ value } ::timestamp(6), 'YYYY-mm-dd HH24:MI:SS.US')"
422
- return f"RPAD(LEFT({ timestamp6 } , { TIMESTAMP_PRECISION_POS + coltype .precision } ), { TIMESTAMP_PRECISION_POS + 6 } , '0')"
423
436
424
- elif isinstance (coltype , NumericType ):
425
- value = f"{ value } ::decimal(38, { coltype .precision } )"
437
+ def normalize_timestamp (self , value : str , coltype : ColType ) -> str :
438
+ if coltype .rounds :
439
+ return f"to_char({ value } ::timestamp({ coltype .precision } ), 'YYYY-mm-dd HH24:MI:SS.US')"
426
440
427
- return self .to_string (f"{ value } " )
441
+ timestamp6 = f"to_char({ value } ::timestamp(6), 'YYYY-mm-dd HH24:MI:SS.US')"
442
+ return f"RPAD(LEFT({ timestamp6 } , { TIMESTAMP_PRECISION_POS + coltype .precision } ), { TIMESTAMP_PRECISION_POS + 6 } , '0')"
443
+
444
+ def normalize_number (self , value : str , coltype : ColType ) -> str :
445
+ return self .to_string (f"{ value } ::decimal(38, { coltype .precision } )" )
428
446
429
447
430
448
class Presto (Database ):
@@ -463,25 +481,19 @@ def _query(self, sql_code: str) -> list:
463
481
def close (self ):
464
482
self ._conn .close ()
465
483
466
- def normalize_value_by_type (self , value : str , coltype : ColType ) -> str :
467
- if isinstance (coltype , TemporalType ):
468
- if coltype .rounds :
469
- if coltype .precision > 3 :
470
- pass
471
- s = f"date_format(cast({ value } as timestamp(6)), '%Y-%m-%d %H:%i:%S.%f')"
472
- else :
473
- s = f"date_format(cast({ value } as timestamp(6)), '%Y-%m-%d %H:%i:%S.%f')"
474
- # datetime = f"date_format(cast({value} as timestamp(6), '%Y-%m-%d %H:%i:%S.%f'))"
475
- # datetime = self.to_string(f"cast({value} as datetime(6))")
484
+ def normalize_timestamp (self , value : str , coltype : ColType ) -> str :
485
+ # TODO
486
+ if coltype .rounds :
487
+ s = f"date_format(cast({ value } as timestamp(6)), '%Y-%m-%d %H:%i:%S.%f')"
488
+ else :
489
+ s = f"date_format(cast({ value } as timestamp(6)), '%Y-%m-%d %H:%i:%S.%f')"
476
490
477
- return (
478
- f"RPAD(RPAD({ s } , { TIMESTAMP_PRECISION_POS + coltype .precision } , '.'), { TIMESTAMP_PRECISION_POS + 6 } , '0')"
479
- )
480
-
481
- elif isinstance (coltype , NumericType ):
482
- value = f"cast({ value } as decimal(38,{ coltype .precision } ))"
491
+ return (
492
+ f"RPAD(RPAD({ s } , { TIMESTAMP_PRECISION_POS + coltype .precision } , '.'), { TIMESTAMP_PRECISION_POS + 6 } , '0')"
493
+ )
483
494
484
- return self .to_string (value )
495
+ def normalize_number (self , value : str , coltype : ColType ) -> str :
496
+ return self .to_string (f"cast({ value } as decimal(38,{ coltype .precision } ))" )
485
497
486
498
def select_table_schema (self , path : DbPath ) -> str :
487
499
schema , table = self ._normalize_table_path (path )
@@ -566,18 +578,16 @@ def md5_to_int(self, s: str) -> str:
566
578
def to_string (self , s : str ):
567
579
return f"cast({ s } as char)"
568
580
569
- def normalize_value_by_type (self , value : str , coltype : ColType ) -> str :
570
- if isinstance (coltype , TemporalType ):
571
- if coltype .rounds :
572
- return self .to_string (f"cast( cast({ value } as datetime({ coltype .precision } )) as datetime(6))" )
573
- else :
574
- s = self .to_string (f"cast({ value } as datetime(6))" )
575
- return f"RPAD(RPAD({ s } , { TIMESTAMP_PRECISION_POS + coltype .precision } , '.'), { TIMESTAMP_PRECISION_POS + 6 } , '0')"
581
+ def normalize_timestamp (self , value : str , coltype : ColType ) -> str :
582
+ if coltype .rounds :
583
+ return self .to_string (f"cast( cast({ value } as datetime({ coltype .precision } )) as datetime(6))" )
576
584
577
- elif isinstance (coltype , NumericType ):
578
- value = f"cast({ value } as decimal(38,{ coltype .precision } ))"
585
+ s = self .to_string (f"cast({ value } as datetime(6))" )
586
+ return f"RPAD(RPAD({ s } , { TIMESTAMP_PRECISION_POS + coltype .precision } , '.'), { TIMESTAMP_PRECISION_POS + 6 } , '0')"
587
+
588
+ def normalize_number (self , value : str , coltype : ColType ) -> str :
589
+ return self .to_string (f"cast({ value } as decimal(38, { coltype .precision } ))" )
579
590
580
- return self .to_string (f"{ value } " )
581
591
582
592
583
593
class Oracle (ThreadedDatabase ):
@@ -622,16 +632,15 @@ def select_table_schema(self, path: DbPath) -> str:
622
632
f" FROM USER_TAB_COLUMNS WHERE table_name = '{ table .upper ()} '"
623
633
)
624
634
625
- def normalize_value_by_type (self , value : str , coltype : ColType ) -> str :
626
- if isinstance (coltype , TemporalType ):
627
- return f"to_char(cast({ value } as timestamp({ coltype .precision } )), 'YYYY-MM-DD HH24:MI:SS.FF6')"
628
- elif isinstance (coltype , NumericType ):
629
- # FM999.9990
630
- format_str = "FM" + "9" * (38 - coltype .precision )
631
- if coltype .precision :
632
- format_str += "0." + "9" * (coltype .precision - 1 ) + "0"
633
- return f"to_char({ value } , '{ format_str } ')"
634
- return self .to_string (f"{ value } " )
635
+ def normalize_timestamp (self , value : str , coltype : ColType ) -> str :
636
+ return f"to_char(cast({ value } as timestamp({ coltype .precision } )), 'YYYY-MM-DD HH24:MI:SS.FF6')"
637
+
638
+ def normalize_number (self , value : str , coltype : ColType ) -> str :
639
+ # FM999.9990
640
+ format_str = "FM" + "9" * (38 - coltype .precision )
641
+ if coltype .precision :
642
+ format_str += "0." + "9" * (coltype .precision - 1 ) + "0"
643
+ return f"to_char({ value } , '{ format_str } ')"
635
644
636
645
def _parse_type (
637
646
self , type_repr : str , datetime_precision : int = None , numeric_precision : int = None , numeric_scale : int = None
@@ -682,27 +691,25 @@ def _convert_db_precision_to_digits(self, p: int) -> int:
682
691
def md5_to_int (self , s : str ) -> str :
683
692
return f"strtol(substring(md5({ s } ), { 1 + MD5_HEXDIGITS - CHECKSUM_HEXDIGITS } ), 16)::decimal(38)"
684
693
685
- def normalize_value_by_type (self , value : str , coltype : ColType ) -> str :
686
- if isinstance (coltype , TemporalType ):
687
- if coltype .rounds :
688
- timestamp = f"{ value } ::timestamp(6)"
689
- # Get seconds since epoch. Redshift doesn't support milli- or micro-seconds.
690
- secs = f"timestamp 'epoch' + round(extract(epoch from { timestamp } )::decimal(38)"
691
- # Get the milliseconds from timestamp.
692
- ms = f"extract(ms from { timestamp } )"
693
- # Get the microseconds from timestamp, without the milliseconds!
694
- us = f"extract(us from { timestamp } )"
695
- # epoch = Total time since epoch in microseconds.
696
- epoch = f"{ secs } *1000000 + { ms } *1000 + { us } "
697
- timestamp6 = f"to_char({ epoch } , -6+{ coltype .precision } ) * interval '0.000001 seconds', 'YYYY-mm-dd HH24:MI:SS.US')"
698
- else :
699
- timestamp6 = f"to_char({ value } ::timestamp(6), 'YYYY-mm-dd HH24:MI:SS.US')"
700
- return f"RPAD(LEFT({ timestamp6 } , { TIMESTAMP_PRECISION_POS + coltype .precision } ), { TIMESTAMP_PRECISION_POS + 6 } , '0')"
694
+ def normalize_timestamp (self , value : str , coltype : ColType ) -> str :
695
+ if coltype .rounds :
696
+ timestamp = f"{ value } ::timestamp(6)"
697
+ # Get seconds since epoch. Redshift doesn't support milli- or micro-seconds.
698
+ secs = f"timestamp 'epoch' + round(extract(epoch from { timestamp } )::decimal(38)"
699
+ # Get the milliseconds from timestamp.
700
+ ms = f"extract(ms from { timestamp } )"
701
+ # Get the microseconds from timestamp, without the milliseconds!
702
+ us = f"extract(us from { timestamp } )"
703
+ # epoch = Total time since epoch in microseconds.
704
+ epoch = f"{ secs } *1000000 + { ms } *1000 + { us } "
705
+ timestamp6 = f"to_char({ epoch } , -6+{ coltype .precision } ) * interval '0.000001 seconds', 'YYYY-mm-dd HH24:MI:SS.US')"
706
+ else :
707
+ timestamp6 = f"to_char({ value } ::timestamp(6), 'YYYY-mm-dd HH24:MI:SS.US')"
708
+ return f"RPAD(LEFT({ timestamp6 } , { TIMESTAMP_PRECISION_POS + coltype .precision } ), { TIMESTAMP_PRECISION_POS + 6 } , '0')"
701
709
702
- elif isinstance ( coltype , NumericType ) :
703
- value = f"{ value } ::decimal(38,{ coltype .precision } )"
710
+ def normalize_number ( self , value : str , coltype : ColType ) -> str :
711
+ return self . to_string ( f"{ value } ::decimal(38,{ coltype .precision } )" )
704
712
705
- return self .to_string (f"{ value } " )
706
713
707
714
708
715
class MsSQL (ThreadedDatabase ):
@@ -794,25 +801,22 @@ def select_table_schema(self, path: DbPath) -> str:
794
801
f"WHERE table_name = '{ table } ' AND table_schema = '{ schema } '"
795
802
)
796
803
797
- def normalize_value_by_type (self , value : str , coltype : ColType ) -> str :
798
- if isinstance (coltype , TemporalType ):
799
- if coltype .rounds :
800
- timestamp = f"timestamp_micros(cast(round(unix_micros(cast({ value } as timestamp))/1000000, { coltype .precision } )*1000000 as int))"
801
- return f"FORMAT_TIMESTAMP('%F %H:%M:%E6S', { timestamp } )"
802
- else :
803
- if coltype .precision == 0 :
804
- return f"FORMAT_TIMESTAMP('%F %H:%M:%S.000000, { value } )"
805
- elif coltype .precision == 6 :
806
- return f"FORMAT_TIMESTAMP('%F %H:%M:%E6S', { value } )"
804
+ def normalize_timestamp (self , value : str , coltype : ColType ) -> str :
805
+ if coltype .rounds :
806
+ timestamp = f"timestamp_micros(cast(round(unix_micros(cast({ value } as timestamp))/1000000, { coltype .precision } )*1000000 as int))"
807
+ return f"FORMAT_TIMESTAMP('%F %H:%M:%E6S', { timestamp } )"
807
808
808
- timestamp6 = f"FORMAT_TIMESTAMP('%F %H:%M:%E6S', { value } )"
809
- return f"RPAD(LEFT({ timestamp6 } , { TIMESTAMP_PRECISION_POS + coltype .precision } ), { TIMESTAMP_PRECISION_POS + 6 } , '0')"
809
+ if coltype .precision == 0 :
810
+ return f"FORMAT_TIMESTAMP('%F %H:%M:%S.000000, { value } )"
811
+ elif coltype .precision == 6 :
812
+ return f"FORMAT_TIMESTAMP('%F %H:%M:%E6S', { value } )"
810
813
811
- elif isinstance (coltype , NumericType ):
812
- # value = f"cast({value} as decimal)"
813
- return f"format('%.{ coltype .precision } f', cast({ value } as decimal))"
814
+ timestamp6 = f"FORMAT_TIMESTAMP('%F %H:%M:%E6S', { value } )"
815
+ return f"RPAD(LEFT({ timestamp6 } , { TIMESTAMP_PRECISION_POS + coltype .precision } ), { TIMESTAMP_PRECISION_POS + 6 } , '0')"
814
816
815
- return self .to_string (f"{ value } " )
817
+ def normalize_number (self , value : str , coltype : ColType ) -> str :
818
+ # value = f"cast({value} as decimal)"
819
+ return f"format('%.{ coltype .precision } f', cast({ value } as decimal))"
816
820
817
821
def parse_table_name (self , name : str ) -> DbPath :
818
822
path = parse_table_name (name )
@@ -886,19 +890,16 @@ def select_table_schema(self, path: DbPath) -> str:
886
890
schema , table = self ._normalize_table_path (path )
887
891
return super ().select_table_schema ((schema , table ))
888
892
889
- def normalize_value_by_type (self , value : str , coltype : ColType ) -> str :
890
- if isinstance (coltype , TemporalType ):
891
- if coltype .rounds :
892
- timestamp = f"to_timestamp(round(date_part(epoch_nanosecond, { value } ::timestamp(9))/1000000000, { coltype .precision } ))"
893
- else :
894
- timestamp = f"cast({ value } as timestamp({ coltype .precision } ))"
893
+ def normalize_timestamp (self , value : str , coltype : ColType ) -> str :
894
+ if coltype .rounds :
895
+ timestamp = f"to_timestamp(round(date_part(epoch_nanosecond, { value } ::timestamp(9))/1000000000, { coltype .precision } ))"
896
+ else :
897
+ timestamp = f"cast({ value } as timestamp({ coltype .precision } ))"
895
898
896
- return f"to_char({ timestamp } , 'YYYY-MM-DD HH24:MI:SS.FF6')"
899
+ return f"to_char({ timestamp } , 'YYYY-MM-DD HH24:MI:SS.FF6')"
897
900
898
- elif isinstance (coltype , NumericType ):
899
- value = f"cast({ value } as decimal(38, { coltype .precision } ))"
900
-
901
- return self .to_string (f"{ value } " )
901
+ def normalize_number (self , value : str , coltype : ColType ) -> str :
902
+ return self .to_string (f"cast({ value } as decimal(38, { coltype .precision } ))" )
902
903
903
904
904
905
@dataclass
0 commit comments