Commit 35c1385

Nans google:
* add missing columns

1 parent 0b7103a

File tree

3 files changed: +42 -30 lines

google_symptoms/delphi_google_symptoms/run.py
google_symptoms/tests/test_run.py
google_symptoms/tests/test_smooth.py

Diff for: google_symptoms/delphi_google_symptoms/run.py

+28 -6

@@ -10,11 +10,12 @@
 import covidcast
 
 import numpy as np
-from pandas import to_datetime
+import pandas as pd
 from delphi_utils import (
     create_export_csv,
     geomap,
-    get_structured_logger
+    get_structured_logger,
+    Nans
 )
 from delphi_utils.validator.utils import lag_converter
 
@@ -24,6 +25,26 @@
 from .pull import pull_gs_data
 
 
+def add_nancodes(df, smoother):
+    """Add nancodes to the dataframe."""
+    idx = pd.IndexSlice
+
+    # Default missingness codes
+    df["missing_val"] = Nans.NOT_MISSING
+    df["missing_se"] = Nans.NOT_APPLICABLE
+    df["missing_sample_size"] = Nans.NOT_APPLICABLE
+
+    # Mark early smoothing entries as data insufficient
+    if smoother == "smoothed":
+        df.sort_index(inplace=True)
+        min_time_value = df.index.min()[0] + 5 * pd.Timedelta(days=1)
+        df.loc[idx[:min_time_value, :], "missing_val"] = Nans.CENSORED
+
+    # Mark any remaining nans with unknown
+    remaining_nans_mask = df["val"].isnull() & df["missing_val"].eq(Nans.NOT_MISSING)
+    df.loc[remaining_nans_mask, "missing_val"] = Nans.OTHER
+    return df
+
 def run_module(params):
     """
     Run Google Symptoms module.
@@ -71,7 +92,7 @@ def run_module(params):
     # Select the larger number of days. Prevents validator from complaining about missing dates,
     # and backfills in case of an outage.
     num_export_days = max(
-        (datetime.today() - to_datetime(min(gs_metadata.max_time))).days + 1,
+        (datetime.today() - pd.to_datetime(min(gs_metadata.max_time))).days + 1,
         params["validation"]["common"].get("span_length", 14) + global_max_expected_lag
     )
 
@@ -108,8 +129,7 @@ def run_module(params):
         ).transform(SMOOTHERS_MAP[smoother][0])
         df["se"] = np.nan
        df["sample_size"] = np.nan
-        # Drop early entries where data insufficient for smoothing
-        df = df.loc[~df["val"].isnull(), :]
+        df = add_nancodes(df, smoother)
         df = df.reset_index()
         sensor_name = "_".join([smoother, "search"])
 
@@ -121,7 +141,9 @@ def run_module(params):
             start_date=SMOOTHERS_MAP[smoother][1](export_start_date),
             metric=metric.lower(),
             geo_res=geo_res,
-            sensor=sensor_name)
+            sensor=sensor_name,
+            logger=logger)
+
         if not exported_csv_dates.empty:
             logger.info("Exported CSV",
                         csv_export_count=exported_csv_dates.size,
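For reference, a quick usage sketch (not part of the commit) of what the new add_nancodes helper does to a toy (timestamp, geo_id) panel. It assumes the indicator package and delphi_utils are importable; the first six days of the smoothed signal are NaN, mimicking a 7-day smoother without a full window, and come back flagged as Nans.CENSORED, while se and sample_size are flagged NOT_APPLICABLE throughout.

import numpy as np
import pandas as pd

from delphi_google_symptoms.run import add_nancodes

# Toy panel: two geos over ten days, with the first six days of "val" missing
# because the smoother has not yet seen a full window.
dates = pd.date_range("2020-08-01", "2020-08-10")
index = pd.MultiIndex.from_product([dates, ["ca", "ny"]],
                                   names=["timestamp", "geo_id"])
df = pd.DataFrame(
    {"val": np.where(index.get_level_values(0) < dates[6], np.nan, 1.0)},
    index=index,
)
df["se"] = np.nan
df["sample_size"] = np.nan

out = add_nancodes(df, "smoothed")
print(out["missing_val"].value_counts())
# Expect CENSORED for the first six days per geo (min date + 5 days, inclusive)
# and NOT_MISSING afterwards; no rows fall through to OTHER in this example.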

Diff for: google_symptoms/tests/test_run.py

+13 -23

@@ -7,35 +7,25 @@
 
 class TestRun:
     def test_output_files_exist(self, run_as_module):
-        csv_files = listdir("receiving")
+        csv_files = set(listdir("receiving"))
 
-        dates = [
-            "20200801",
-            "20200802",
-            "20200803",
-            "20200804",
-            "20200805",
-            "20200806",
-            "20200807",
-            "20200808",
-            "20200809",
-            "20200810",
-            "20200811"
-        ]
-        geos = ["county", "state", "hhs", "nation"]
+        dates = [d.strftime("%Y%m%d") for d in pd.date_range("20200726", "20200811")]
+        geos = ["county", "state", "hhs", "msa", "hrr", "nation"]
         metrics = ["anosmia", "ageusia", "sum_anosmia_ageusia"]
         smoother = ["raw", "smoothed"]
 
-        expected_files = []
-        for date, geo, metric, smoother in product(dates, geos, metrics, smoother):
-            nf = "_".join([date, geo, metric, smoother, "research"]) + ".csv"
-            expected_files.append(nf)
+        expected_files = {
+            f"{date}_{geo}_{metric}_{smoother}_search.csv"
+            for date, geo, metric, smoother in product(dates, geos, metrics, smoother)
+        }
 
-        set(csv_files) == set(expected_files)
+        assert csv_files == expected_files
 
-    def test_output_file_format(self, run_as_module):
         df = pd.read_csv(
             join("receiving", "20200810_state_anosmia_smoothed_search.csv")
         )
-        assert (df.columns.values == [
-            "geo_id", "val", "se", "sample_size"]).all()
+        expected_columns = [
+            "geo_id", "val", "se", "sample_size",
+            "missing_val", "missing_se", "missing_sample_size"
+        ]
+        assert (df.columns.values == expected_columns).all()
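As a sanity check on the rewritten expectation, a small standalone sketch (reusing the same dates, geos, metrics, and smoothers as the test) that counts and previews the export file names the assertion now covers:

from itertools import product

import pandas as pd

dates = [d.strftime("%Y%m%d") for d in pd.date_range("20200726", "20200811")]
geos = ["county", "state", "hhs", "msa", "hrr", "nation"]
metrics = ["anosmia", "ageusia", "sum_anosmia_ageusia"]
smoothers = ["raw", "smoothed"]

expected_files = {
    f"{date}_{geo}_{metric}_{smoother}_search.csv"
    for date, geo, metric, smoother in product(dates, geos, metrics, smoothers)
}
print(len(expected_files))        # 17 dates * 6 geos * 3 metrics * 2 smoothers = 612
print(sorted(expected_files)[0])  # 20200726_county_ageusia_raw_search.csv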

Diff for: google_symptoms/tests/test_smooth.py

+1 -1

@@ -23,5 +23,5 @@ def test_output_files_smoothed(self, run_as_module):
         raw = raw.groupby('geo_id')['val'].sum()/7.0
         df = pd.merge(smoothed, raw, on='geo_id',
                       suffixes=('_smoothed', '_raw'))
-
+        df = df.dropna(subset=["val_smoothed"])
         assert np.allclose(df['val_smoothed'].values, df['val_raw'].values)
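A note on why the added dropna is needed: with the nancode change the early smoothed rows keep their NaN values instead of being dropped upstream, and np.allclose treats any NaN as a mismatch, so the comparison would fail without first removing those rows. A tiny illustration:

import numpy as np

a = np.array([np.nan, 1.0, 2.0])   # smoothed values with an early-window NaN
b = np.array([1.0, 1.0, 2.0])      # raw weekly averages
print(np.allclose(a, b))           # False: NaN never compares equal
print(np.allclose(a[1:], b[1:]))   # True once the NaN row is excluded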
