1
1
from dataclasses import dataclass
2
2
from typing import List
3
3
4
+ import covidcast
5
+ import numpy as np
4
6
import pandas as pd
5
7
6
8
@dataclass
@@ -27,33 +29,87 @@ def to_md(self):
27
29
message = self .message , updated = self .last_updated .strftime ("%Y-%m-%d" ))
28
30
29
31
def check_source (data_source , meta , params , grace ):
30
- """Iterate over all signals from a source and check if they exceed max age."""
32
+ """Iterate over all signals from a source and check for problems.
33
+
34
+ Possible problems:
35
+
36
+ - Newest available data exceeds max age.
37
+ - Gap between subsequent data points exceeds max gap.
38
+
39
+ For example, consider a source with a max age of 5 days and max gap of 1
40
+ day. If today is 2020-10-15, and the latest available data is from
41
+ 2020-10-09, the max age is exceeded. If there is no data available on
42
+ 2020-10-07, but there is on 2020-10-06 and 2020-10-08, there is a gap of 2
43
+ days and the max gap is exceeded.
44
+
45
+ The gap window controls how much data we check for gaps -- a gap window of
46
+ 10 days means we check the most recent 10 days of data. Defaults to 7.
47
+
48
+ """
31
49
32
50
source_config = params [data_source ]
51
+ gap_window = pd .Timedelta (days = source_config .get ("gap_window" , 7 ))
52
+ max_allowed_gap = source_config .get ("max_gap" , 1 )
33
53
34
54
signals = meta [meta .data_source == data_source ]
35
55
36
56
now = pd .Timestamp .now ()
37
57
38
- complaints = {}
58
+ age_complaints = {}
59
+ gap_complaints = {}
39
60
40
61
for _ , row in signals .iterrows ():
41
62
if "retired-signals" in source_config and \
42
63
row ["signal" ] in source_config ["retired-signals" ]:
43
64
continue
44
65
66
+ # Check max age
45
67
age = (now - row ["max_time" ]).days
46
68
47
69
if age > source_config ["max_age" ] + grace :
48
- if row ["signal" ] not in complaints :
49
- complaints [row ["signal" ]] = Complaint (
70
+ if row ["signal" ] not in age_complaints :
71
+ age_complaints [row ["signal" ]] = Complaint (
50
72
"is more than {age} days old" .format (age = age ),
51
73
data_source ,
52
74
row ["signal" ],
53
75
[row ["geo_type" ]],
54
76
row ["max_time" ],
55
77
source_config ["maintainers" ])
56
78
else :
57
- complaints [row ["signal" ]].geo_types .append (row ["geo_type" ])
79
+ age_complaints [row ["signal" ]].geo_types .append (row ["geo_type" ])
80
+
81
+ # Check max gap
82
+ if max_allowed_gap == - 1 :
83
+ # No gap detection for this source
84
+ continue
85
+
86
+ latest_data = covidcast .signal (
87
+ data_source , row ["signal" ],
88
+ start_day = row ["max_time" ] - gap_window ,
89
+ end_day = row ["max_time" ],
90
+ geo_type = row ["geo_type" ]
91
+ )
92
+
93
+ # convert numpy datetime values to pandas datetimes and then to
94
+ # datetime.date, so we can work with timedeltas after
95
+ unique_dates = [pd .to_datetime (val ).date ()
96
+ for val in latest_data ["time_value" ].unique ()]
97
+
98
+ gap_days = [(day - prev_day ).days
99
+ for day , prev_day in zip (unique_dates [1 :], unique_dates [:- 1 ])]
100
+ gap = max (gap_days )
101
+
102
+ if gap > max_allowed_gap :
103
+ if row ["signal" ] not in gap_complaints :
104
+ gap_complaints [row ["signal" ]] = Complaint (
105
+ "has a {gap}-day gap of missing data in its most recent "
106
+ "{gap_window} days of data" .format (gap = gap , gap_window = gap_window .days ),
107
+ data_source ,
108
+ row ["signal" ],
109
+ [row ["geo_type" ]],
110
+ row ["max_time" ],
111
+ source_config ["maintainers" ])
112
+ else :
113
+ gap_complaints [row ["signal" ]].geo_types .append (row ["geo_type" ])
58
114
59
- return list (complaints .values ())
115
+ return list (age_complaints . values ()) + list ( gap_complaints .values ())
0 commit comments