Commit 35c1385

Nans google:
* add missing columns

1 parent 0b7103a

File tree

3 files changed: +42 -30 lines

google_symptoms/delphi_google_symptoms/run.py
google_symptoms/tests/test_run.py
google_symptoms/tests/test_smooth.py

Diff for: google_symptoms/delphi_google_symptoms/run.py

+28 -6

@@ -10,11 +10,12 @@
 import covidcast
 
 import numpy as np
-from pandas import to_datetime
+import pandas as pd
 from delphi_utils import (
     create_export_csv,
     geomap,
-    get_structured_logger
+    get_structured_logger,
+    Nans
 )
 from delphi_utils.validator.utils import lag_converter
 
@@ -24,6 +25,26 @@
 from .pull import pull_gs_data
 
 
+def add_nancodes(df, smoother):
+    """Add nancodes to the dataframe."""
+    idx = pd.IndexSlice
+
+    # Default missingness codes
+    df["missing_val"] = Nans.NOT_MISSING
+    df["missing_se"] = Nans.NOT_APPLICABLE
+    df["missing_sample_size"] = Nans.NOT_APPLICABLE
+
+    # Mark early smoothing entries as data insufficient
+    if smoother == "smoothed":
+        df.sort_index(inplace=True)
+        min_time_value = df.index.min()[0] + 5 * pd.Timedelta(days=1)
+        df.loc[idx[:min_time_value, :], "missing_val"] = Nans.CENSORED
+
+    # Mark any remaining nans with unknown
+    remaining_nans_mask = df["val"].isnull() & df["missing_val"].eq(Nans.NOT_MISSING)
+    df.loc[remaining_nans_mask, "missing_val"] = Nans.OTHER
+    return df
+
 def run_module(params):
     """
     Run Google Symptoms module.
@@ -71,7 +92,7 @@ def run_module(params):
     # Select the larger number of days. Prevents validator from complaining about missing dates,
     # and backfills in case of an outage.
     num_export_days = max(
-        (datetime.today() - to_datetime(min(gs_metadata.max_time))).days + 1,
+        (datetime.today() - pd.to_datetime(min(gs_metadata.max_time))).days + 1,
         params["validation"]["common"].get("span_length", 14) + global_max_expected_lag
     )
 
@@ -108,8 +129,7 @@ def run_module(params):
         ).transform(SMOOTHERS_MAP[smoother][0])
         df["se"] = np.nan
        df["sample_size"] = np.nan
-        # Drop early entries where data insufficient for smoothing
-        df = df.loc[~df["val"].isnull(), :]
+        df = add_nancodes(df, smoother)
         df = df.reset_index()
         sensor_name = "_".join([smoother, "search"])
 
@@ -121,7 +141,9 @@ def run_module(params):
             start_date=SMOOTHERS_MAP[smoother][1](export_start_date),
             metric=metric.lower(),
             geo_res=geo_res,
-            sensor=sensor_name)
+            sensor=sensor_name,
+            logger=logger)
+
         if not exported_csv_dates.empty:
             logger.info("Exported CSV",
                         csv_export_count=exported_csv_dates.size,
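For reference, a quick usage sketch (not part of the commit) of what the new add_nancodes helper does to a toy (timestamp, geo_id) panel. It assumes the indicator package and delphi_utils are importable; the first six days of the smoothed signal are NaN, mimicking a 7-day smoother without a full window, and come back flagged as Nans.CENSORED, while se and sample_size are flagged NOT_APPLICABLE throughout.

import numpy as np
import pandas as pd

from delphi_google_symptoms.run import add_nancodes

# Toy panel: two geos over ten days, with the first six days of "val" missing
# because the smoother has not yet seen a full window.
dates = pd.date_range("2020-08-01", "2020-08-10")
index = pd.MultiIndex.from_product([dates, ["ca", "ny"]],
                                   names=["timestamp", "geo_id"])
df = pd.DataFrame(
    {"val": np.where(index.get_level_values(0) < dates[6], np.nan, 1.0)},
    index=index,
)
df["se"] = np.nan
df["sample_size"] = np.nan

out = add_nancodes(df, "smoothed")
print(out["missing_val"].value_counts())
# Expect CENSORED for the first six days per geo (min date + 5 days, inclusive)
# and NOT_MISSING afterwards; no rows fall through to OTHER in this example.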

Diff for: google_symptoms/tests/test_run.py

+13 -23

@@ -7,35 +7,25 @@
 
 class TestRun:
     def test_output_files_exist(self, run_as_module):
-        csv_files = listdir("receiving")
+        csv_files = set(listdir("receiving"))
 
-        dates = [
-            "20200801",
-            "20200802",
-            "20200803",
-            "20200804",
-            "20200805",
-            "20200806",
-            "20200807",
-            "20200808",
-            "20200809",
-            "20200810",
-            "20200811"
-        ]
-        geos = ["county", "state", "hhs", "nation"]
+        dates = [d.strftime("%Y%m%d") for d in pd.date_range("20200726", "20200811")]
+        geos = ["county", "state", "hhs", "msa", "hrr", "nation"]
         metrics = ["anosmia", "ageusia", "sum_anosmia_ageusia"]
         smoother = ["raw", "smoothed"]
 
-        expected_files = []
-        for date, geo, metric, smoother in product(dates, geos, metrics, smoother):
-            nf = "_".join([date, geo, metric, smoother, "research"]) + ".csv"
-            expected_files.append(nf)
+        expected_files = {
+            f"{date}_{geo}_{metric}_{smoother}_search.csv"
+            for date, geo, metric, smoother in product(dates, geos, metrics, smoother)
+        }
 
-        set(csv_files) == set(expected_files)
+        assert csv_files == expected_files
 
-    def test_output_file_format(self, run_as_module):
         df = pd.read_csv(
             join("receiving", "20200810_state_anosmia_smoothed_search.csv")
         )
-        assert (df.columns.values == [
-            "geo_id", "val", "se", "sample_size"]).all()
+        expected_columns = [
+            "geo_id", "val", "se", "sample_size",
+            "missing_val", "missing_se", "missing_sample_size"
+        ]
+        assert (df.columns.values == expected_columns).all()
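As a sanity check on the rewritten expectation, a small standalone sketch (reusing the same dates, geos, metrics, and smoothers as the test) that counts and previews the export file names the assertion now covers:

from itertools import product

import pandas as pd

dates = [d.strftime("%Y%m%d") for d in pd.date_range("20200726", "20200811")]
geos = ["county", "state", "hhs", "msa", "hrr", "nation"]
metrics = ["anosmia", "ageusia", "sum_anosmia_ageusia"]
smoothers = ["raw", "smoothed"]

expected_files = {
    f"{date}_{geo}_{metric}_{smoother}_search.csv"
    for date, geo, metric, smoother in product(dates, geos, metrics, smoothers)
}
print(len(expected_files))        # 17 dates * 6 geos * 3 metrics * 2 smoothers = 612
print(sorted(expected_files)[0])  # 20200726_county_ageusia_raw_search.csv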

Diff for: google_symptoms/tests/test_smooth.py

+1 -1

@@ -23,5 +23,5 @@ def test_output_files_smoothed(self, run_as_module):
         raw = raw.groupby('geo_id')['val'].sum()/7.0
         df = pd.merge(smoothed, raw, on='geo_id',
                       suffixes=('_smoothed', '_raw'))
-
+        df = df.dropna(subset=["val_smoothed"])
         assert np.allclose(df['val_smoothed'].values, df['val_raw'].values)
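A note on why the added dropna is needed: with the nancode change the early smoothed rows keep their NaN values instead of being dropped upstream, and np.allclose treats any NaN as a mismatch, so the comparison would fail without first removing those rows. A tiny illustration:

import numpy as np

a = np.array([np.nan, 1.0, 2.0])   # smoothed values with an early-window NaN
b = np.array([1.0, 1.0, 2.0])      # raw weekly averages
print(np.allclose(a, b))           # False: NaN never compares equal
print(np.allclose(a[1:], b[1:]))   # True once the NaN row is excluded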
