Skip to content

Commit 0783ebd

Browse files
authored
Merge pull request #327 from cmu-delphi/sir-gapdetector
Add a gap detector to Sir Complains-a-lot
2 parents 261503a + 39df546 commit 0783ebd

File tree

3 files changed

+72
-7
lines changed

3 files changed

+72
-7
lines changed

sir_complainsalot/delphi_sir_complainsalot/check_source.py

+62-6
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,8 @@
11
from dataclasses import dataclass
22
from typing import List
33

4+
import covidcast
5+
import numpy as np
46
import pandas as pd
57

68
@dataclass
@@ -27,33 +29,87 @@ def to_md(self):
2729
message=self.message, updated=self.last_updated.strftime("%Y-%m-%d"))
2830

2931
def check_source(data_source, meta, params, grace):
    """Iterate over all signals from a source and check for problems.

    Possible problems:

    - Newest available data exceeds max age.
    - Gap between subsequent data points exceeds max gap.

    For example, consider a source with a max age of 5 days and max gap of 1
    day. If today is 2020-10-15, and the latest available data is from
    2020-10-09, the max age is exceeded. If there is no data available on
    2020-10-07, but there is on 2020-10-06 and 2020-10-08, there is a gap of 2
    days and the max gap is exceeded.

    The gap window controls how much data we check for gaps -- a gap window of
    10 days means we check the most recent 10 days of data. Defaults to 7.

    Parameters
    ----------
    data_source:
        Name of the source to check; used to select rows of `meta` and to
        look up the source's configuration in `params`.
    meta:
        Data frame with (at least) the columns `data_source`, `signal`,
        `geo_type` and `max_time`, one row per signal and geo type.
    params:
        Mapping from source name to its configuration. Each configuration
        must provide `max_age` and `maintainers`, and may provide
        `gap_window` (default 7), `max_gap` (default 1; -1 disables gap
        detection) and `retired-signals` (signals to skip entirely).
    grace:
        Extra days added to `max_age` before an age complaint is raised.

    Returns
    -------
    list
        One Complaint per signal and problem type (age complaints first,
        then gap complaints), each listing every affected geo type.
    """
    source_config = params[data_source]
    gap_window = pd.Timedelta(days=source_config.get("gap_window", 7))
    max_allowed_gap = source_config.get("max_gap", 1)
    retired_signals = source_config.get("retired-signals", ())

    signals = meta[meta.data_source == data_source]

    now = pd.Timestamp.now()

    age_complaints = {}
    gap_complaints = {}

    for _, row in signals.iterrows():
        if row["signal"] in retired_signals:
            continue

        # Check max age
        age = (now - row["max_time"]).days

        if age > source_config["max_age"] + grace:
            _record_complaint(
                age_complaints,
                "is more than {age} days old".format(age=age),
                data_source, row, source_config)

        # Check max gap
        if max_allowed_gap == -1:
            # No gap detection for this source
            continue

        latest_data = covidcast.signal(
            data_source, row["signal"],
            start_day=row["max_time"] - gap_window,
            end_day=row["max_time"],
            geo_type=row["geo_type"]
        )

        # The API client may hand back None when nothing matches the query;
        # without any rows there is nothing to measure a gap between.
        if latest_data is None:
            continue

        gap = _largest_gap_days(latest_data["time_value"].unique())

        # Fewer than two distinct dates in the window: no pair of data points
        # exists, so skip rather than crash the whole run on an empty max().
        if gap is None:
            continue

        if gap > max_allowed_gap:
            _record_complaint(
                gap_complaints,
                "has a {gap}-day gap of missing data in its most recent "
                "{gap_window} days of data".format(gap=gap, gap_window=gap_window.days),
                data_source, row, source_config)

    return list(age_complaints.values()) + list(gap_complaints.values())


def _largest_gap_days(time_values):
    """Return the largest gap in days between consecutive distinct dates.

    Returns None when fewer than two distinct dates are given, because no gap
    can be measured in that case.
    """
    # Convert numpy datetime values to pandas datetimes and then to
    # datetime.date so subtraction yields timedeltas. Sort explicitly:
    # unique() preserves order of appearance, not chronological order.
    dates = sorted({pd.to_datetime(val).date() for val in time_values})
    if len(dates) < 2:
        return None
    return max((day - prev_day).days for prev_day, day in zip(dates, dates[1:]))


def _record_complaint(complaints, message, data_source, row, source_config):
    """Create the signal's complaint, or add this row's geo type to it."""
    signal = row["signal"]
    if signal not in complaints:
        complaints[signal] = Complaint(
            message,
            data_source,
            signal,
            [row["geo_type"]],
            row["max_time"],
            source_config["maintainers"])
    else:
        complaints[signal].geo_types.append(row["geo_type"])

sir_complainsalot/delphi_sir_complainsalot/run.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -22,7 +22,7 @@ def run_module():
2222

2323
complaints = []
2424
for data_source in params["sources"].keys():
25-
complaints.extend(check_source(data_source, meta, params["sources"], params.get("grace",0)))
25+
complaints.extend(check_source(data_source, meta, params["sources"], params.get("grace", 0)))
2626

2727
if len(complaints) > 0:
2828
for complaint in complaints:

sir_complainsalot/params.json.template

+9
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,11 @@
66
"max_age": 5,
77
"maintainers": ["U010VE2T51N"]
88
},
9+
"hospital-admissions": {
10+
"max_age": 5,
11+
"maintainers": ["U010VE2T51N"],
12+
"retired-signals": ["smoothed_covid19", "smoothed_adj_covid19"]
13+
},
914
"ght": {
1015
"max_age": 5,
1116
"maintainers": ["U010VE2T51N"]
@@ -14,6 +19,10 @@
1419
"max_age": 2,
1520
"maintainers": ["UUCGWMJ5P"]
1621
},
22+
"usa-facts": {
23+
"max_age": 2,
24+
"maintainers": ["UUCGWMJ5P"]
25+
},
1726
"safegraph": {
1827
"max_age": 4,
1928
"maintainers": ["U010VE2T51N"]

0 commit comments

Comments
 (0)