Nans google:

dshemetov · dshemetov · commit 3db576fae7ca · 2021-04-27T12:34:06.000-07:00
* add missing columns
diff --git a/google_symptoms/delphi_google_symptoms/run.py b/google_symptoms/delphi_google_symptoms/run.py
@@ -10,10 +10,12 @@
 import covidcast
 
 import numpy as np
+import pandas as pd
 from delphi_utils import (
     create_export_csv,
     geomap,
-    get_structured_logger
+    get_structured_logger,
+    Nans
 )
 
 from .constants import (METRICS, COMBINED_METRIC,
@@ -22,6 +24,26 @@
 from .pull import pull_gs_data
 
 
+def add_nancodes(df, smoother):
+    """Add nancodes to the dataframe."""
+    idx = pd.IndexSlice
+
+    # Default missingness codes
+    df["missing_val"] = Nans.NOT_MISSING
+    df["missing_se"] = Nans.NOT_APPLICABLE
+    df["missing_sample_size"] = Nans.NOT_APPLICABLE
+
+    # Mark early smoothing entries as data insufficient
+    if smoother == "smoothed":
+        df.sort_index(inplace=True)
+        min_time_value = df.index.min()[0] + 5 * pd.Timedelta(days=1)
+        df.loc[idx[:min_time_value, :], "missing_val"] = Nans.PRIVACY
+
+    # Mark any remaining nans with unknown
+    remaining_nans_mask = df["val"].isnull() & df["missing_val"].eq(Nans.NOT_MISSING)
+    df.loc[remaining_nans_mask, "missing_val"] = Nans.UNKNOWN
+    return df
+
 def run_module(params):
     """
     Run Google Symptoms module.
@@ -92,8 +114,7 @@ def run_module(params):
                                            ).transform(SMOOTHERS_MAP[smoother][0])
             df["se"] = np.nan
             df["sample_size"] = np.nan
-            # Drop early entries where data insufficient for smoothing
-            df = df.loc[~df["val"].isnull(), :]
+            df = add_nancodes(df, smoother)
             df = df.reset_index()
             sensor_name = "_".join([smoother, "search"])
 
@@ -105,7 +126,8 @@ def run_module(params):
                 start_date=SMOOTHERS_MAP[smoother][1](export_start_date),
                 metric=metric.lower(),
                 geo_res=geo_res,
-                sensor=sensor_name)
+                sensor=sensor_name,
+                logger=logger)
 
             if not exported_csv_dates.empty:
                 csv_export_count += exported_csv_dates.size
diff --git a/google_symptoms/tests/test_run.py b/google_symptoms/tests/test_run.py
@@ -37,5 +37,8 @@ def test_output_file_format(self, run_as_module):
         df = pd.read_csv(
             join("receiving", "20200810_state_anosmia_smoothed_search.csv")
         )
-        assert (df.columns.values == [
-                "geo_id", "val", "se", "sample_size"]).all()
+        expected_columns = [
+            "geo_id", "val", "se", "sample_size",
+            "missing_val", "missing_se", "missing_sample_size"
+        ]
+        assert (df.columns.values == expected_columns).all()
diff --git a/google_symptoms/tests/test_smooth.py b/google_symptoms/tests/test_smooth.py
@@ -23,5 +23,5 @@ def test_output_files_smoothed(self, run_as_module):
         raw = raw.groupby('geo_id')['val'].sum()/7.0
         df = pd.merge(smoothed, raw, on='geo_id',
                       suffixes=('_smoothed', '_raw'))
-
+        df = df.dropna(subset=["val_smoothed"])
         assert np.allclose(df['val_smoothed'].values, df['val_raw'].values)