1
- import covidcast
1
+ """Internal functions for creating Safegraph indicator."""
2
+ import datetime
3
+ import os
4
+ from typing import List
2
5
import numpy as np
3
6
import pandas as pd
7
+ import covidcast
4
8
5
9
from .constants import HOME_DWELL , COMPLETELY_HOME , FULL_TIME_WORK , PART_TIME_WORK
6
- from .geo import FIPS_TO_STATE
10
+ from .geo import FIPS_TO_STATE , VALID_GEO_RESOLUTIONS
7
11
8
12
# Magic number for modular arithmetic; CBG -> FIPS
9
13
MOD = 10000000
10
14
15
+ # Base file name for raw data CSVs.
16
+ CSV_NAME = 'social-distancing.csv.gz'
17
+
18
def validate(df):
    """Confirm that a data frame contains exactly one date.

    Parameters
    ----------
    df: pd.DataFrame
        data frame with a 'date_range_start' column of timestamps
        beginning with {YYYY}-{MM}-{DD}T.

    Raises
    ------
    ValueError
        if more than one distinct date appears in `df`.
    """
    timestamps = df['date_range_start'].apply(date_from_timestamp)
    # Raise explicitly instead of `assert`: assertions are stripped when
    # Python runs with optimizations enabled (-O), which would silently
    # disable this validation.
    if len(timestamps.unique()) != 1:
        raise ValueError('input data frame must contain exactly one date')
22
+
23
+
24
def date_from_timestamp(timestamp) -> datetime.date:
    """Extracts the date from a timestamp beginning with {YYYY}-{MM}-{DD}T."""
    date_part, _, _ = timestamp.partition('T')
    return datetime.date.fromisoformat(date_part)
27
+
28
+
29
def files_in_past_week(current_filename) -> List[str]:
    """Constructs file paths from the previous 6 days.

    Parameters
    ----------
    current_filename: str
        name of CSV file. Must be of the form
        {path}/{YYYY}/{MM}/{DD}/{YYYY}-{MM}-{DD}-{CSV_NAME}

    Returns
    -------
    List of file names corresponding to the 6 days prior to YYYY-MM-DD,
    most recent first.
    """
    # NOTE: previously this function was a generator despite being
    # annotated and documented as returning a list; it now returns the
    # list its contract promises (callers that iterate are unaffected).
    path, year, month, day, _ = current_filename.rsplit('/', 4)
    current_date = datetime.date(int(year), int(month), int(day))
    one_day = datetime.timedelta(days=1)
    filenames = []
    for days_back in range(1, 7):
        previous_date = current_date - days_back * one_day
        date_str = previous_date.isoformat()
        # Directory layout mirrors the date: {YYYY}/{MM}/{DD}.
        date_path = date_str.replace('-', '/')
        filenames.append(f'{path}/{date_path}/{date_str}-{CSV_NAME}')
    return filenames
49
+
50
+
51
def add_suffix(signals, suffix):
    """Returns a new list with `suffix` appended to each name in `signals`."""
    return [name + suffix for name in signals]
54
+
55
+
11
56
def add_prefix (signal_names , wip_signal , prefix : str ):
12
57
"""Adds prefix to signal if there is a WIP signal
13
58
Parameters
@@ -42,7 +87,7 @@ def add_prefix(signal_names, wip_signal, prefix: str):
42
87
]
43
88
raise ValueError ("Supply True | False or '' or [] | list()" )
44
89
45
- # Check if the signal name is public
90
+
46
91
def public_signal (signal_ ):
47
92
"""Checks if the signal name is already public using COVIDcast
48
93
Parameters
@@ -89,32 +134,29 @@ def construct_signals(cbg_df, signal_names):
89
134
"""
90
135
91
136
# Preparation
92
- cbg_df ['timestamp' ] = cbg_df ['date_range_start' ].apply (
93
- lambda x : str (x ).split ('T' )[0 ])
94
137
cbg_df ['county_fips' ] = (cbg_df ['origin_census_block_group' ] // MOD ).apply (
95
138
lambda x : f'{ int (x ):05d} ' )
96
139
97
140
# Transformation: create signal not available in raw data
98
141
for signal in signal_names :
99
- if signal . endswith ( FULL_TIME_WORK ) :
142
+ if FULL_TIME_WORK in signal :
100
143
cbg_df [signal ] = (cbg_df ['full_time_work_behavior_devices' ]
101
144
/ cbg_df ['device_count' ])
102
- elif signal . endswith ( COMPLETELY_HOME ) :
145
+ elif COMPLETELY_HOME in signal :
103
146
cbg_df [signal ] = (cbg_df ['completely_home_device_count' ]
104
147
/ cbg_df ['device_count' ])
105
- elif signal . endswith ( PART_TIME_WORK ) :
148
+ elif PART_TIME_WORK in signal :
106
149
cbg_df [signal ] = (cbg_df ['part_time_work_behavior_devices' ]
107
150
/ cbg_df ['device_count' ])
108
- elif signal . endswith ( HOME_DWELL ) :
151
+ elif HOME_DWELL in signal :
109
152
cbg_df [signal ] = (cbg_df ['median_home_dwell_time' ])
110
153
111
-
112
154
# Subsetting
113
- return cbg_df [['timestamp' , ' county_fips' ] + signal_names ]
155
+ return cbg_df [['county_fips' ] + signal_names ]
114
156
115
157
116
158
def aggregate (df , signal_names , geo_resolution = 'county' ):
117
- ''' Aggregate signals to appropriate resolution and produce standard errors.
159
+ """ Aggregate signals to appropriate resolution and produce standard errors.
118
160
Parameters
119
161
----------
120
162
df: pd.DataFrame
@@ -129,27 +171,22 @@ def aggregate(df, signal_names, geo_resolution='county'):
129
171
pd.DataFrame:
130
172
DataFrame with one row per geo_id, with columns for the individual
131
173
signals, standard errors, and sample sizes.
132
- '''
174
+ """
133
175
# Prepare geo resolution
134
- GEO_RESOLUTION = ('county' , 'state' )
135
176
if geo_resolution == 'county' :
136
177
df ['geo_id' ] = df ['county_fips' ]
137
178
elif geo_resolution == 'state' :
138
179
df ['geo_id' ] = df ['county_fips' ].apply (lambda x :
139
180
FIPS_TO_STATE [x [:2 ]])
140
181
else :
141
- raise ValueError (f'`geo_resolution` must be one of { GEO_RESOLUTION } .' )
182
+ raise ValueError (
183
+ f'`geo_resolution` must be one of { VALID_GEO_RESOLUTIONS } .' )
142
184
143
185
# Aggregation and signal creation
144
- df_mean = df .groupby (['geo_id' , 'timestamp' ])[
145
- signal_names
146
- ].mean ()
147
- df_sd = df .groupby (['geo_id' , 'timestamp' ])[
148
- signal_names
149
- ].std ()
150
- df_n = df .groupby (['geo_id' , 'timestamp' ])[
151
- signal_names
152
- ].count ()
186
+ grouped_df = df .groupby (['geo_id' ])[signal_names ]
187
+ df_mean = grouped_df .mean ()
188
+ df_sd = grouped_df .std ()
189
+ df_n = grouped_df .count ()
153
190
agg_df = pd .DataFrame .join (df_mean , df_sd ,
154
191
lsuffix = '_mean' , rsuffix = '_sd' )
155
192
agg_df = pd .DataFrame .join (agg_df , df_n .rename ({
@@ -161,39 +198,96 @@ def aggregate(df, signal_names, geo_resolution='county'):
161
198
return agg_df .reset_index ()
162
199
163
200
164
- def process (fname , signal_names , geo_resolutions , export_dir ):
165
- '''Process an input census block group-level CSV and export it. Assumes
166
- that the input file has _only_ one date of data.
201
def process_window(df_list: List[pd.DataFrame],
                   signal_names: List[str],
                   geo_resolutions: List[str],
                   export_dir: str):
    """Processes a list of input census block group-level data frames as a
    single data set and exports it. Assumes each data frame has _only_ one
    date of data.

    Parameters
    ----------
    df_list: List[pd.DataFrame]
        list of census block group-level frames.
    signal_names: List[str]
        signal names to be processed
    geo_resolutions: List[str]
        List of geo resolutions to export the data.
    export_dir: str
        path where the output files are saved

    Returns
    -------
    None. One file is written per (signal, resolution) pair containing the
    aggregated data from `df_list`.
    """
    for df in df_list:
        validate(df)
    # Every frame holds a single date, so the window's date can be read
    # from the first row of the first frame.
    date = date_from_timestamp(df_list[0].at[0, 'date_range_start'])
    cbg_df = pd.concat(construct_signals(df, signal_names) for df in df_list)
    for geo_res in geo_resolutions:
        aggregated_df = aggregate(cbg_df, signal_names, geo_res)
        for signal in signal_names:
            # Select geo_id plus this signal's mean/se/count columns and
            # rename them to the standard export schema.
            df_export = aggregated_df[
                ['geo_id']
                + [f'{signal}_{x}' for x in ('mean', 'se', 'n')]
            ].rename({
                f'{signal}_mean': 'val',
                f'{signal}_se': 'se',
                f'{signal}_n': 'sample_size',
            }, axis=1)
            df_export.to_csv(f'{export_dir}/{date}_{geo_res}_{signal}.csv',
                             na_rep='NA',
                             index=False)
241
+
242
+
243
def process(current_filename: str,
            previous_filenames: List[str],
            signal_names: List[str],
            wip_signal,
            geo_resolutions: List[str],
            export_dir: str):
    """Creates and exports signals corresponding both to a single day as well
    as averaged over the previous week.

    Parameters
    ----------
    current_filename: str
        path to file holding the target date's data.
    previous_filenames: List[str]
        paths to files holding data from each day in the week preceding the
        target date.
    signal_names: List[str]
        signal names to be processed for a single date.
        A second version of each such signal named {SIGNAL}_7d_avg will be
        created averaging {SIGNAL} over the past 7 days.
    wip_signal : List[str] or bool
        a list of wip signals: [], OR
        all signals in the registry: True OR
        only signals that have never been published: False
    geo_resolutions: List[str]
        List of geo resolutions to export the data.
    export_dir
        path where the output files are saved.

    Returns
    -------
    None. For each (signal, resolution) pair, one file is written for the
    single date values to {export_dir}/{date}_{resolution}_{signal}.csv and
    one for the data averaged over the previous week to
    {export_dir}/{date}_{resolution}_{signal}_7d_avg.csv.
    """
    # The target date's frame always leads the window; prior days are
    # included only when their files exist on disk.
    past_week = [pd.read_csv(current_filename)]
    past_week.extend(pd.read_csv(fname)
                     for fname in previous_filenames
                     if os.path.exists(fname))

    # First process the current file alone...
    single_day_signals = add_prefix(signal_names, wip_signal, 'wip_')
    process_window(past_week[:1],
                   single_day_signals,
                   geo_resolutions,
                   export_dir)
    # ...then as part of the whole window.
    weekly_signals = add_prefix(add_suffix(signal_names, '_7d_avg'),
                                wip_signal,
                                'wip_')
    process_window(past_week,
                   weekly_signals,
                   geo_resolutions,
                   export_dir)
0 commit comments