Skip to content

Commit 5aeb5e2

Browse files
committed
Generalize iea_web read step in a function
1 parent ec52514 commit 5aeb5e2

File tree

1 file changed

+20
-17
lines changed

1 file changed

+20
-17
lines changed

message_ix_models/tools/iea_web.py

Lines changed: 20 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -29,6 +29,23 @@
2929
NROWS = 1e7
3030

3131

32+
def _read(base_path=None, **kwargs) -> pd.DataFrame:
    """Read the IEA data file into a data frame.

    The file named by ``FILE`` is looked up under `base_path`; when `base_path` is not
    given (or is falsy), ``package_data_path("iea")`` is used instead.

    All keyword arguments are passed through to :func:`pandas.read_csv`. If the caller
    does not supply ``nrows``, a warning is logged and ``nrows=NROWS`` is used.
    """
    if not base_path:
        base_path = package_data_path("iea")
    path = base_path.joinpath(FILE)

    log.info(f"Read {path}")

    # An explicit row limit from the caller is respected as-is
    if "nrows" in kwargs:
        return pd.read_csv(path, **kwargs)

    # No limit requested. Only one of the following two options may be active:
    #
    # (a) cap the number of rows read, and say so.
    log.warning(f"Development; only load {NROWS:.0f} observations")
    #
    # (b) read everything using the pyarrow engine; fails (exhausts memory):
    #     kwargs.setdefault("engine", "pyarrow")
    return pd.read_csv(path, nrows=NROWS, **kwargs)
47+
48+
3249
@cached
3350
def load_data(base_path=None) -> pd.DataFrame:
3451
"""Load data from the IEA World Energy Balances.
@@ -53,25 +70,13 @@ def load_data(base_path=None) -> pd.DataFrame:
5370
- unit
5471
- flag
5572
"""
56-
base_path = base_path or package_data_path("iea")
57-
path = base_path.joinpath(FILE)
58-
59-
log.warning(f"Development; only {NROWS} loaded")
60-
61-
return pd.read_csv(path, usecols=COLUMNS.keys(), nrows=NROWS).rename(
62-
columns=COLUMNS
63-
)
73+
return _read(base_path, usecols=COLUMNS.keys()).rename(columns=COLUMNS)
6474

6575

6676
def generate_code_lists(base_path: Path = None) -> None:
6777
"""Extract structure from the data itself."""
68-
base_path = base_path or package_data_path("iea")
69-
path = base_path.joinpath(FILE)
70-
71-
log.info(f"Extract structure from {path}")
72-
7378
# 'Peek' at the data to inspect the column headers
74-
peek = pd.read_csv(path, nrows=1)
79+
peek = _read(base_path, nrows=1)
7580
unit_id_column = peek.columns[0]
7681

7782
# Country names that are already in pycountry
@@ -97,13 +102,11 @@ def _check1(value):
97102
(unit_id_column, "Unit"),
98103
("Flag Codes", "Flags"),
99104
]:
100-
log.warning(f"Development; only {NROWS} loaded")
101-
102105
# - Re-read the data, only two columns; slower, but less overhead
103106
# - Drop empty rows and duplicates.
104107
# - Drop 'trivial' values, where the name and id are identical.
105108
df = (
106-
pd.read_csv(path, usecols=[id, name], nrows=NROWS)
109+
_read(base_path, usecols=[id, name])
107110
.set_axis(["id", "name"], axis=1)
108111
.dropna(how="all")
109112
.drop_duplicates()

0 commit comments

Comments
 (0)