Skip to content

Commit 3db576f

Browse files
committed
Nans google:
* add missing columns
1 parent 4ef3389 commit 3db576f

File tree

3 files changed: 32 additions and 7 deletions.

Diff for: google_symptoms/delphi_google_symptoms/run.py

+26-4
Original file line numberDiff line numberDiff line change
@@ -10,10 +10,12 @@
1010
import covidcast
1111

1212
import numpy as np
13+
import pandas as pd
1314
from delphi_utils import (
1415
create_export_csv,
1516
geomap,
16-
get_structured_logger
17+
get_structured_logger,
18+
Nans
1719
)
1820

1921
from .constants import (METRICS, COMBINED_METRIC,
@@ -22,6 +24,26 @@
2224
from .pull import pull_gs_data
2325

2426

27+
def add_nancodes(df, smoother):
    """Add missingness-code columns to the dataframe.

    Three columns are written: ``missing_val``, ``missing_se`` and
    ``missing_sample_size``. ``missing_se`` and ``missing_sample_size`` are
    always set to ``Nans.NOT_APPLICABLE``. ``missing_val`` starts as
    ``Nans.NOT_MISSING`` and is then overridden for early smoothing entries
    and for any remaining null values.

    Parameters
    ----------
    df: pd.DataFrame
        Frame with a ``val`` column. When ``smoother`` is "smoothed", the
        first level of the index is used as the time value.
        NOTE(review): assumes that level is datetime-like so a Timedelta can
        be added to it -- confirm against the caller.
    smoother: str
        Name of the smoother. Only "smoothed" triggers the early-entry
        marking.

    Returns
    -------
    pd.DataFrame
        The same frame that was passed in; it is modified in place (and
        sorted by index when ``smoother`` is "smoothed").
    """
    idx = pd.IndexSlice

    # Default missingness codes
    df["missing_val"] = Nans.NOT_MISSING
    df["missing_se"] = Nans.NOT_APPLICABLE
    df["missing_sample_size"] = Nans.NOT_APPLICABLE

    # Mark early smoothing entries as data insufficient.
    # An empty frame is skipped: the minimum of an empty index cannot be
    # subscripted with [0], so there is no start date to measure from.
    if smoother == "smoothed" and not df.empty:
        # Label-based slicing on a MultiIndex requires a sorted index.
        df.sort_index(inplace=True)
        # Label slices include both endpoints, so this covers the earliest
        # time value plus the five days that follow it.
        min_time_value = df.index.min()[0] + 5 * pd.Timedelta(days=1)
        df.loc[idx[:min_time_value, :], "missing_val"] = Nans.PRIVACY

    # Mark any remaining nans with unknown
    remaining_nans_mask = df["val"].isnull() & df["missing_val"].eq(Nans.NOT_MISSING)
    df.loc[remaining_nans_mask, "missing_val"] = Nans.UNKNOWN
    return df
46+
2547
def run_module(params):
2648
"""
2749
Run Google Symptoms module.
@@ -92,8 +114,7 @@ def run_module(params):
92114
).transform(SMOOTHERS_MAP[smoother][0])
93115
df["se"] = np.nan
94116
df["sample_size"] = np.nan
95-
# Drop early entries where data insufficient for smoothing
96-
df = df.loc[~df["val"].isnull(), :]
117+
df = add_nancodes(df, smoother)
97118
df = df.reset_index()
98119
sensor_name = "_".join([smoother, "search"])
99120

@@ -105,7 +126,8 @@ def run_module(params):
105126
start_date=SMOOTHERS_MAP[smoother][1](export_start_date),
106127
metric=metric.lower(),
107128
geo_res=geo_res,
108-
sensor=sensor_name)
129+
sensor=sensor_name,
130+
logger=logger)
109131

110132
if not exported_csv_dates.empty:
111133
csv_export_count += exported_csv_dates.size

Diff for: google_symptoms/tests/test_run.py

+5-2
Original file line numberDiff line numberDiff line change
@@ -37,5 +37,8 @@ def test_output_file_format(self, run_as_module):
3737
df = pd.read_csv(
3838
join("receiving", "20200810_state_anosmia_smoothed_search.csv")
3939
)
40-
assert (df.columns.values == [
41-
"geo_id", "val", "se", "sample_size"]).all()
40+
expected_columns = [
41+
"geo_id", "val", "se", "sample_size",
42+
"missing_val", "missing_se", "missing_sample_size"
43+
]
44+
assert (df.columns.values == expected_columns).all()

Diff for: google_symptoms/tests/test_smooth.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -23,5 +23,5 @@ def test_output_files_smoothed(self, run_as_module):
2323
raw = raw.groupby('geo_id')['val'].sum()/7.0
2424
df = pd.merge(smoothed, raw, on='geo_id',
2525
suffixes=('_smoothed', '_raw'))
26-
26+
df = df.dropna(subset=["val_smoothed"])
2727
assert np.allclose(df['val_smoothed'].values, df['val_raw'].values)

0 commit comments

Comments
 (0)