29
29
# Row limit used during development: `_read` passes this value as `nrows` to
# `pd.read_csv` whenever the caller does not supply `nrows` itself.
NROWS = 1e7
30
30
31
31
32
def _read(base_path=None, **kwargs) -> pd.DataFrame:
    """Read the data file ``FILE`` with :func:`pandas.read_csv`.

    Parameters
    ----------
    base_path : optional
        Directory containing ``FILE``. When falsy, ``package_data_path("iea")``
        is used instead.
    **kwargs
        Forwarded unchanged to :func:`pandas.read_csv`. If ``nrows`` is not
        among them, a warning is logged and ``nrows`` is set to ``NROWS``.

    Returns
    -------
    pandas.DataFrame
        The result of :func:`pandas.read_csv` on the resolved file path.
    """
    directory = base_path if base_path else package_data_path("iea")
    source = directory.joinpath(FILE)

    log.info(f"Read {source} ")

    options = dict(kwargs)
    if "nrows" not in options:
        # Development switch: keep exactly one of (a) or (b) active.
        #
        # (a) Cap the number of rows that are read.
        log.warning(f"Development; only load {NROWS:.0f} observations")
        options["nrows"] = NROWS

        # (b) Read everything using the pyarrow engine; currently fails
        #     (exhausts memory).
        # options.setdefault("engine", "pyarrow")

    return pd.read_csv(source, **options)
47
+
48
+
32
49
@cached
33
50
def load_data (base_path = None ) -> pd .DataFrame :
34
51
"""Load data from the IEA World Energy Balances.
@@ -53,25 +70,13 @@ def load_data(base_path=None) -> pd.DataFrame:
53
70
- unit
54
71
- flag
55
72
"""
56
- base_path = base_path or package_data_path ("iea" )
57
- path = base_path .joinpath (FILE )
58
-
59
- log .warning (f"Development; only { NROWS } loaded" )
60
-
61
- return pd .read_csv (path , usecols = COLUMNS .keys (), nrows = NROWS ).rename (
62
- columns = COLUMNS
63
- )
73
+ return _read (base_path , usecols = COLUMNS .keys ()).rename (columns = COLUMNS )
64
74
65
75
66
76
def generate_code_lists (base_path : Path = None ) -> None :
67
77
"""Extract structure from the data itself."""
68
- base_path = base_path or package_data_path ("iea" )
69
- path = base_path .joinpath (FILE )
70
-
71
- log .info (f"Extract structure from { path } " )
72
-
73
78
# 'Peek' at the data to inspect the column headers
74
- peek = pd . read_csv ( path , nrows = 1 )
79
+ peek = _read ( base_path , nrows = 1 )
75
80
unit_id_column = peek .columns [0 ]
76
81
77
82
# Country names that are already in pycountry
@@ -97,13 +102,11 @@ def _check1(value):
97
102
(unit_id_column , "Unit" ),
98
103
("Flag Codes" , "Flags" ),
99
104
]:
100
- log .warning (f"Development; only { NROWS } loaded" )
101
-
102
105
# - Re-read the data, only two columns; slower, but less overhead
103
106
# - Drop empty rows and duplicates.
104
107
# - Drop 'trivial' values, where the name and id are identical.
105
108
df = (
106
- pd . read_csv ( path , usecols = [id , name ], nrows = NROWS )
109
+ _read ( base_path , usecols = [id , name ])
107
110
.set_axis (["id" , "name" ], axis = 1 )
108
111
.dropna (how = "all" )
109
112
.drop_duplicates ()
0 commit comments