@@ -1,14 +1,59 @@
-import covidcast
+"""Internal functions for creating Safegraph indicator."""
+import datetime
+import os
+from typing import List
 import numpy as np
 import pandas as pd
+import covidcast
 
 from delphi_utils import GeoMapper
 
-from .constants import HOME_DWELL, COMPLETELY_HOME, FULL_TIME_WORK, PART_TIME_WORK
+from .constants import HOME_DWELL, COMPLETELY_HOME, FULL_TIME_WORK, PART_TIME_WORK, GEO_RESOLUTIONS
 
 # Magic number for modular arithmetic; CBG -> FIPS
 MOD = 10000000
 
+# Base file name for raw data CSVs.
+CSV_NAME = 'social-distancing.csv.gz'
+
+def validate(df):
+    """Confirms that a data frame has only one date."""
+    timestamps = df['date_range_start'].apply(date_from_timestamp)
+    assert len(timestamps.unique()) == 1
+
+
+def date_from_timestamp(timestamp) -> datetime.date:
+    """Extracts the date from a timestamp beginning with {YYYY}-{MM}-{DD}T."""
+    return datetime.date.fromisoformat(timestamp.split('T')[0])
+
+
+def files_in_past_week(current_filename) -> List[str]:
+    """Constructs file paths from previous 6 days.
+    Parameters
+    ----------
+    current_filename: str
+        name of CSV file. Must be of the form
+        {path}/{YYYY}/{MM}/{DD}/{YYYY}-{MM}-{DD}-{CSV_NAME}
+    Returns
+    -------
+    List of file names corresponding to the 6 days prior to YYYY-MM-DD.
+    """
+    path, year, month, day, _ = current_filename.rsplit('/', 4)
+    current_date = datetime.date(int(year), int(month), int(day))
+    one_day = datetime.timedelta(days=1)
+    for _ in range(1, 7):
+        current_date = current_date - one_day
+        date_str = current_date.isoformat()
+        date_path = date_str.replace('-', '/')
+        new_filename = f'{path}/{date_path}/{date_str}-{CSV_NAME}'
+        yield new_filename
+
+
+def add_suffix(signals, suffix):
+    """Adds `suffix` to every element of `signals`."""
+    return [s + suffix for s in signals]
+
+
 def add_prefix(signal_names, wip_signal, prefix: str):
     """Adds prefix to signal if there is a WIP signal
     Parameters
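A quick sketch of what the new `files_in_past_week` helper produces, using a hypothetical `raw_data` archive root that follows the documented layout (the import path `delphi_safegraph.process` is an assumption, not stated in this diff):

```python
# Hypothetical usage; 'raw_data' is a stand-in for the real archive root.
from delphi_safegraph.process import files_in_past_week

paths = list(files_in_past_week(
    'raw_data/2020/06/12/2020-06-12-social-distancing.csv.gz'))
# paths[0] == 'raw_data/2020/06/11/2020-06-11-social-distancing.csv.gz'
# ...
# paths[5] == 'raw_data/2020/06/06/2020-06-06-social-distancing.csv.gz'
```

Note that although the annotation says `List[str]`, the function body uses `yield`, so it is a generator and call sites wrap it in `list()`.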
@@ -43,7 +88,7 @@ def add_prefix(signal_names, wip_signal, prefix: str):
     ]
     raise ValueError("Supply True | False or '' or [] | list()")
 
-# Check if the signal name is public
+
 def public_signal(signal_):
     """Checks if the signal name is already public using COVIDcast
     Parameters
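The body of `public_signal` sits outside this hunk, but its docstring points at the COVIDcast metadata. One plausible shape of such a check, using the `covidcast` client's `metadata()` call; this is a sketch of the idea, not the function's actual body:

```python
import covidcast

def public_signal_sketch(signal_):
    """Rough approximation: a signal is public if the API metadata lists it."""
    epidata = covidcast.metadata()  # DataFrame of all published signals
    return signal_ in set(epidata['signal'])
```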
@@ -90,32 +135,29 @@ def construct_signals(cbg_df, signal_names):
     """
 
     # Preparation
-    cbg_df['timestamp'] = cbg_df['date_range_start'].apply(
-        lambda x: str(x).split('T')[0])
     cbg_df['county_fips'] = (cbg_df['origin_census_block_group'] // MOD).apply(
         lambda x: f'{int(x):05d}')
 
     # Transformation: create signal not available in raw data
     for signal in signal_names:
-        if signal.endswith(FULL_TIME_WORK):
+        if FULL_TIME_WORK in signal:
             cbg_df[signal] = (cbg_df['full_time_work_behavior_devices']
                               / cbg_df['device_count'])
-        elif signal.endswith(COMPLETELY_HOME):
+        elif COMPLETELY_HOME in signal:
             cbg_df[signal] = (cbg_df['completely_home_device_count']
                               / cbg_df['device_count'])
-        elif signal.endswith(PART_TIME_WORK):
+        elif PART_TIME_WORK in signal:
             cbg_df[signal] = (cbg_df['part_time_work_behavior_devices']
                               / cbg_df['device_count'])
-        elif signal.endswith(HOME_DWELL):
+        elif HOME_DWELL in signal:
             cbg_df[signal] = (cbg_df['median_home_dwell_time'])
 
-
     # Subsetting
-    return cbg_df[['timestamp', 'county_fips'] + signal_names]
+    return cbg_df[['county_fips'] + signal_names]
 
 
 def aggregate(df, signal_names, geo_resolution='county'):
-    '''Aggregate signals to appropriate resolution and produce standard errors.
+    """Aggregate signals to appropriate resolution and produce standard errors.
     Parameters
     ----------
     df: pd.DataFrame
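The switch from `str.endswith` to substring matching matters once `add_prefix` and `add_suffix` decorate the base names: a weekly-averaged WIP signal no longer ends with the base constant, it only contains it. A small illustration, assuming `COMPLETELY_HOME` holds a base name such as 'completely_home_prop' (the constant's real value lives in `constants.py`, outside this diff):

```python
# Assumed base-name value; the actual constant is defined in .constants.
COMPLETELY_HOME = 'completely_home_prop'

# Name as built by add_prefix(...) and add_suffix(...):
signal = 'wip_' + COMPLETELY_HOME + '_7d_avg'
assert not signal.endswith(COMPLETELY_HOME)  # old check misses it
assert COMPLETELY_HOME in signal             # new substring check matches
```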
@@ -130,9 +172,8 @@ def aggregate(df, signal_names, geo_resolution='county'):
     pd.DataFrame:
         DataFrame with one row per geo_id, with columns for the individual
         signals, standard errors, and sample sizes.
-    '''
+    """
     # Prepare geo resolution
-    GEO_RESOLUTION = ('county', 'state')
     if geo_resolution == 'county':
         df['geo_id'] = df['county_fips']
     elif geo_resolution == 'state':
@@ -144,18 +185,14 @@ def aggregate(df, signal_names, geo_resolution='county'):
                               new_col='geo_id',
                               dropna=False)
     else:
-        raise ValueError(f'`geo_resolution` must be one of {GEO_RESOLUTION}.')
+        raise ValueError(
+            f'`geo_resolution` must be one of {GEO_RESOLUTIONS}.')
 
     # Aggregation and signal creation
-    df_mean = df.groupby(['geo_id', 'timestamp'])[
-        signal_names
-    ].mean()
-    df_sd = df.groupby(['geo_id', 'timestamp'])[
-        signal_names
-    ].std()
-    df_n = df.groupby(['geo_id', 'timestamp'])[
-        signal_names
-    ].count()
+    grouped_df = df.groupby(['geo_id'])[signal_names]
+    df_mean = grouped_df.mean()
+    df_sd = grouped_df.std()
+    df_n = grouped_df.count()
     agg_df = pd.DataFrame.join(df_mean, df_sd,
                                lsuffix='_mean', rsuffix='_sd')
     agg_df = pd.DataFrame.join(agg_df, df_n.rename({
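For intuition, the rewritten aggregation reduces to one grouped selection reused for mean, standard deviation, and count. A toy sketch with made-up values (the real signal columns come from `construct_signals`):

```python
import numpy as np
import pandas as pd

df = pd.DataFrame({
    'geo_id': ['42003', '42003', '42005'],       # toy county FIPS codes
    'completely_home_prop': [0.20, 0.40, 0.30],  # made-up signal values
})
grouped_df = df.groupby(['geo_id'])[['completely_home_prop']]
df_mean, df_sd, df_n = grouped_df.mean(), grouped_df.std(), grouped_df.count()
# Joining yields *_mean and *_sd columns per geo_id; a standard error
# would then follow as df_sd / np.sqrt(df_n).
print(df_mean.join(df_sd, lsuffix='_mean', rsuffix='_sd'))
```

Dropping `timestamp` from the groupby is what makes the weekly averaging work: when `process_window` receives the full week of frames, the mean pools all seven days per geo_id.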
@@ -167,39 +204,96 @@ def aggregate(df, signal_names, geo_resolution='county'):
     return agg_df.reset_index()
 
 
-def process(fname, signal_names, geo_resolutions, export_dir):
-    '''Process an input census block group-level CSV and export it. Assumes
-    that the input file has _only_ one date of data.
+def process_window(df_list: List[pd.DataFrame],
+                   signal_names: List[str],
+                   geo_resolutions: List[str],
+                   export_dir: str):
+    """Processes a list of input census block group-level data frames as a
+    single data set and exports it. Assumes each data frame has _only_ one
+    date of data.
     Parameters
     ----------
-    export_dir
-        path where the output files are saved
-    signal_names : List[str]
+    df_list: List[pd.DataFrame]
+        list of census block group-level frames.
+    signal_names: List[str]
         signal names to be processed
-    fname: str
-        Input filename.
     geo_resolutions: List[str]
         List of geo resolutions to export the data.
+    export_dir
+        path where the output files are saved
     Returns
     -------
-    None
-    '''
-    cbg_df = construct_signals(pd.read_csv(fname), signal_names)
-    unique_date = cbg_df['timestamp'].unique()
-    if len(unique_date) != 1:
-        raise ValueError(f'More than one timestamp found in input file {fname}.')
-    date = unique_date[0].replace('-', '')
+    None. One file is written per (signal, resolution) pair containing the
+    aggregated data from `df_list`.
+    """
+    for df in df_list:
+        validate(df)
+    date = date_from_timestamp(df_list[0].at[0, 'date_range_start'])
+    cbg_df = pd.concat(construct_signals(df, signal_names) for df in df_list)
     for geo_res in geo_resolutions:
-        df = aggregate(cbg_df, signal_names, geo_res)
+        aggregated_df = aggregate(cbg_df, signal_names, geo_res)
         for signal in signal_names:
-            df_export = df[
+            df_export = aggregated_df[
                 ['geo_id']
                 + [f'{signal}_{x}' for x in ('mean', 'se', 'n')]
-                ].rename({
+            ].rename({
                 f'{signal}_mean': 'val',
                 f'{signal}_se': 'se',
                 f'{signal}_n': 'sample_size',
             }, axis=1)
             df_export.to_csv(f'{export_dir}/{date}_{geo_res}_{signal}.csv',
                              na_rep='NA',
                              index=False, )
+
+
+def process(current_filename: str,
+            previous_filenames: List[str],
+            signal_names: List[str],
+            wip_signal,
+            geo_resolutions: List[str],
+            export_dir: str):
+    """Creates and exports signals corresponding both to a single day and
+    to averages over the previous week.
+    Parameters
+    ----------
+    current_filename: str
+        path to file holding the target date's data.
+    previous_filenames: List[str]
+        paths to files holding data from each day in the week preceding the
+        target date.
+    signal_names: List[str]
+        signal names to be processed for a single date.
+        A second version of each such signal named {SIGNAL}_7d_avg will be
+        created averaging {SIGNAL} over the past 7 days.
+    wip_signal : List[str] or bool
+        a list of wip signals: [], OR
+        all signals in the registry: True OR
+        only signals that have never been published: False
+    geo_resolutions: List[str]
+        List of geo resolutions to export the data.
+    export_dir
+        path where the output files are saved.
+    Returns
+    -------
+    None. For each (signal, resolution) pair, one file is written for the
+    single date values to {export_dir}/{date}_{resolution}_{signal}.csv and
+    one for the data averaged over the previous week to
+    {export_dir}/{date}_{resolution}_{signal}_7d_avg.csv.
+    """
+    past_week = [pd.read_csv(current_filename)]
+    for fname in previous_filenames:
+        if os.path.exists(fname):
+            past_week.append(pd.read_csv(fname))
+
+    # First process the current file alone...
+    process_window(past_week[:1],
+                   add_prefix(signal_names, wip_signal, 'wip_'),
+                   geo_resolutions,
+                   export_dir)
+    # ...then as part of the whole window.
+    process_window(past_week,
+                   add_prefix(add_suffix(signal_names, '_7d_avg'),
+                              wip_signal,
+                              'wip_'),
+                   geo_resolutions,
+                   export_dir)
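Putting the pieces together, a hypothetical driver for one day of data; the import path, archive location, and signal list are placeholders rather than anything this patch pins down:

```python
# Sketch only: module path, file locations, and signal names are assumptions.
from delphi_safegraph.process import files_in_past_week, process

CURRENT = 'raw_data/2020/06/12/2020-06-12-social-distancing.csv.gz'
process(CURRENT,
        list(files_in_past_week(CURRENT)),
        signal_names=['completely_home_prop'],  # placeholder signal list
        wip_signal=[],                          # treat no signals as WIP
        geo_resolutions=['county', 'state'],
        export_dir='./receiving')
```

Since `process` checks `os.path.exists` before reading each previous day's file, missing days early in the archive simply shrink the averaging window instead of raising.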