Skip to content

Commit c9fea0a

Browse files
authored
Merge pull request #310 from cmu-delphi/geoutil_state_extension
Extend geocode utility to actually support the state to state mappings
2 parents 7f07c41 + 84422b6 commit c9fea0a

File tree

9 files changed

+70
-33205
lines changed

9 files changed

+70
-33205
lines changed

_delphi_utils_python/delphi_utils/geomap.py

+24-3
Original file line numberDiff line numberDiff line change
@@ -77,6 +77,11 @@ class GeoMapper:
7777
==========
7878
The main GeoMapper object loads and stores crosswalk dataframes on-demand.
7979
80+
When replacing a geocode with a new one, an aggregation step is performed on the data columns
81+
to merge entries (i.e. in the case of a many to one mapping or a weighted mapping). This
82+
requires a specification of the data columns, which are assumed to be all the columns that
83+
are not the geocodes or the date column specified in date_col.
84+
8085
Example 1: to add a new column with a new geocode, possibly with weights:
8186
> gmpr = GeoMapper()
8287
> df = gmpr.add_geocode(df, "fips", "zip", from_col="fips", new_col="geo_id",
@@ -305,7 +310,12 @@ def add_geocode(
305310
)
306311

307312
# state codes are all stored in one table
308-
if new_code in state_codes:
313+
if from_code in state_codes and new_code in state_codes:
314+
crosswalk = self._load_crosswalk(from_code="state", to_code="state")
315+
crosswalk = crosswalk.rename(
316+
columns={from_code: from_col, new_code: new_col}
317+
)
318+
elif new_code in state_codes:
309319
crosswalk = self._load_crosswalk(from_code=from_code, to_code="state")
310320
crosswalk = crosswalk.rename(
311321
columns={from_code: from_col, new_code: new_col}
@@ -322,9 +332,13 @@ def add_geocode(
322332
df = df.merge(crosswalk, left_on=from_col, right_on=from_col, how="left")
323333

324334
# Drop extra state columns
325-
if new_code in state_codes:
335+
if new_code in state_codes and not from_code in state_codes:
326336
state_codes.remove(new_code)
327337
df.drop(columns=state_codes, inplace=True)
338+
elif new_code in state_codes and from_code in state_codes:
339+
state_codes.remove(new_code)
340+
state_codes.remove(from_code)
341+
df.drop(columns=state_codes, inplace=True)
328342

329343
return df
330344

@@ -361,6 +375,9 @@ def replace_geocode(
361375
new_code: {'fips', 'zip', 'state_code', 'state_id', 'state_name', 'hrr', 'msa',
362376
'hhs_region_number'}
363377
Specifies the geocode type of the data in new_col.
378+
date_col: str or None, default "date"
379+
Specifies which column contains the date values. Used for value aggregation.
380+
If None, then the aggregation is done only on new_col.
364381
data_cols: list, default None
365382
A list of data column names to aggregate when doing a weighted coding. If set to
366383
None, then all the columns are used except for date_col and new_col.
@@ -389,7 +406,11 @@ def replace_geocode(
389406
# Multiply and aggregate (this automatically zeros NAs)
390407
df[data_cols] = df[data_cols].multiply(df["weight"], axis=0)
391408
df.drop("weight", axis=1, inplace=True)
392-
df = df.groupby([date_col, new_col]).sum().reset_index()
409+
410+
if not date_col is None:
411+
df = df.groupby([date_col, new_col]).sum().reset_index()
412+
else:
413+
df = df.groupby([new_col]).sum().reset_index()
393414
return df
394415

395416
def add_population_column(self, data, geocode_type, geocode_col=None):

_delphi_utils_python/tests/test_geomap.py

+30-4
Original file line numberDiff line numberDiff line change
@@ -278,11 +278,13 @@ def test_zip_to_state_id(self):
278278
def test_add_population_column(self):
279279
gmpr = GeoMapper()
280280
new_data = gmpr.add_population_column(self.fips_data_3, "fips")
281-
assert new_data["population"].sum() == 274963
281+
assert new_data.shape == (5, 5)
282282
new_data = gmpr.add_population_column(self.zip_data, "zip")
283-
assert new_data["population"].sum() == 274902
283+
assert new_data.shape == (6, 5)
284284
with pytest.raises(ValueError):
285285
new_data = gmpr.add_population_column(self.zip_data, "hrr")
286+
new_data = gmpr.add_population_column(self.fips_data_5, "fips")
287+
assert new_data.shape == (4, 5)
286288

287289
def test_add_geocode(self):
288290
gmpr = GeoMapper()
@@ -382,13 +384,20 @@ def test_add_geocode(self):
382384
new_data2 = gmpr.add_geocode(new_data, "state_code", "hhs_region_number")
383385
assert new_data2["hhs_region_number"].unique().size == 2
384386

387+
# state_name -> state_id
388+
new_data = gmpr.replace_geocode(self.zip_data, "zip", "state_name")
389+
new_data2 = gmpr.add_geocode(new_data, "state_name", "state_id")
390+
assert new_data2.shape == (4, 5)
391+
new_data2 = gmpr.replace_geocode(new_data, "state_name", "state_id", new_col="abbr")
392+
assert "abbr" in new_data2.columns
393+
385394
# fips -> nation
386-
new_data = gmpr.replace_geocode(self.fips_data_5, "fips", "nation")
395+
new_data = gmpr.replace_geocode(self.fips_data_5, "fips", "nation", new_col="NATION")
387396
assert new_data.equals(
388397
pd.DataFrame().from_dict(
389398
{
390399
"date": {0: pd.Timestamp("2018-01-01 00:00:00")},
391-
"nation": {0: "us"},
400+
"NATION": {0: "us"},
392401
"count": {0: 10024.0},
393402
"total": {0: 100006.0},
394403
}
@@ -411,6 +420,23 @@ def test_add_geocode(self):
411420
)
412421
)
413422

423+
# hrr -> nation (expected to raise ValueError)
424+
with pytest.raises(ValueError):
425+
new_data = gmpr.replace_geocode(self.zip_data, "zip", "hrr")
426+
new_data2 = gmpr.replace_geocode(new_data, "hrr", "nation")
427+
414428
# fips -> hrr (dropna=True/False check)
415429
assert not gmpr.add_geocode(self.fips_data_3, "fips", "hrr").isna().any().any()
416430
assert gmpr.add_geocode(self.fips_data_3, "fips", "hrr", dropna=False).isna().any().any()
431+
432+
# fips -> hrr (date_col=None check)
433+
new_data = gmpr.replace_geocode(self.fips_data_5.drop(columns=["date"]), "fips", "hrr", date_col=None)
434+
assert new_data.equals(
435+
pd.DataFrame().from_dict(
436+
{
437+
'hrr': {0: '1', 1: '183', 2: '184', 3: '382', 4: '7'},
438+
'count': {0: 1.772347174163783, 1: 7157.392403522299, 2: 2863.607596477701, 3: 1.0, 4: 0.22765282583621685},
439+
'total': {0: 3.544694348327566, 1: 71424.64801363471, 2: 28576.35198636529, 3: 1.0, 4: 0.4553056516724337}
440+
}
441+
)
442+
)

cdc_covidnet/delphi_cdc_covidnet/__init__.py

-1
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,6 @@
1010

1111
from . import run
1212
from . import api_config
13-
from . import geo_maps
1413
from . import update_sensor
1514
from . import covidnet
1615

cdc_covidnet/delphi_cdc_covidnet/geo_maps.py

-45
This file was deleted.

cdc_covidnet/delphi_cdc_covidnet/run.py

-1
Original file line numberDiff line numberDiff line change
@@ -54,7 +54,6 @@ def run_module():
5454
state_files,
5555
mmwr_info,
5656
params["export_dir"],
57-
params["static_file_dir"],
5857
start_date,
5958
end_date)
6059

cdc_covidnet/delphi_cdc_covidnet/update_sensor.py

+15-9
Original file line numberDiff line numberDiff line change
@@ -12,11 +12,10 @@
1212
import numpy as np
1313
import pandas as pd
1414

15-
from delphi_utils import read_params
15+
from delphi_utils import read_params, GeoMapper
1616
import covidcast
1717
from .api_config import APIConfig
1818
from .covidnet import CovidNet
19-
from .geo_maps import GeoMaps
2019
from .constants import SIGNALS
2120

2221
def write_to_csv(data: pd.DataFrame, out_name: str, output_path: str):
@@ -49,17 +48,18 @@ def write_to_csv(data: pd.DataFrame, out_name: str, output_path: str):
4948

5049

5150
def update_sensor(
52-
state_files: List[str], mmwr_info: pd.DataFrame,
53-
output_path: str, static_path: str,
54-
start_date: datetime, end_date: datetime) -> pd.DataFrame:
51+
state_files: List[str],
52+
mmwr_info: pd.DataFrame,
53+
output_path: str,
54+
start_date: datetime,
55+
end_date: datetime) -> pd.DataFrame:
5556
"""
5657
Generate sensor values, and write to csv format.
5758
5859
Args:
5960
state_files: List of JSON files representing COVID-NET hospitalization data for each state
6061
mmwr_info: Mappings from MMWR week to actual dates, as a pd.DataFrame
6162
output_path: Path to write the csvs to
62-
static_path: Path for the static geographic fiels
6363
start_date: First sensor date (datetime.datetime)
6464
end_date: Last sensor date (datetime.datetime)
6565
@@ -85,9 +85,15 @@ def update_sensor(
8585
]
8686

8787
# Set state id to two-letter abbreviation
88-
geo_map = GeoMaps(static_path)
89-
hosp_df = geo_map.state_name_to_abbr(hosp_df)
90-
88+
gmpr = GeoMapper()
89+
hosp_df = gmpr.add_geocode(hosp_df,
90+
from_col=APIConfig.STATE_COL,
91+
from_code="state_name",
92+
new_code="state_id",
93+
dropna=False)
94+
# To use the original column name, reassign original column and drop new one
95+
hosp_df[APIConfig.STATE_COL] = hosp_df["state_id"].str.upper()
96+
hosp_df.drop("state_id", axis=1, inplace=True)
9197
assert not hosp_df.duplicated(["date", "geo_id"]).any(), "Non-unique (date, geo_id) pairs"
9298
hosp_df.set_index(["date", "geo_id"], inplace=True)
9399

0 commit comments

Comments
 (0)