Add .iea_web.fuzz_data() and tests

khaeru · khaeru · commit 7537165c089d · 2024-01-17T15:10:11.000+01:00
diff --git a/message_ix_models/tests/test_tools.py b/message_ix_models/tests/test_tools.py
@@ -33,3 +33,8 @@ def test_generate_code_lists(test_context, tmp_path):
 
     # generate_code_lists() runs
     iea_web.generate_code_lists(tmp_path)
+
+
+def test_fuzz_data(test_context, tmp_path):
+    # Smoke test: fuzz_data() runs without raising; its result is not inspected
+    iea_web.fuzz_data(target_path=tmp_path)
diff --git a/message_ix_models/tools/iea_web.py b/message_ix_models/tools/iea_web.py
@@ -2,6 +2,7 @@
 import logging
 from pathlib import Path
 
+import numpy as np
 import pandas as pd
 import yaml
 from iam_units import registry
@@ -24,9 +25,9 @@
 
 #: File name containing data.
 FILE = "WBAL_12052022124930839.csv"
-FILE = "cac5fa90-en.zip"
+# FILE = "cac5fa90-en.zip"
 
-NROWS = 1e7
+NROWS = 1e6
 
 
 def _read(base_path=None, **kwargs) -> pd.DataFrame:
@@ -142,3 +143,18 @@ def _check1(value):
         cl_path.write_text(yaml.dump(data))
 
 
+def fuzz_data(base_path=None, target_path=None):
+    """Generate a fuzzed subset of the data for testing; not yet written to file."""
+    df = _read(base_path)
+
+    # - Keep 2 rows (first and last; presumably periods) per (flow, product, country).
+    # - Replace "Value" with np.random.rand() draws, i.e. uniform on [0, 1).
+    df = (
+        df.groupby(["FLOW", "PRODUCT", "COUNTRY"])
+        .take([0, -1])
+        .reset_index(drop=True)
+        .assign(Value=lambda df: np.random.rand(len(df)))
+    )
+
+    # TODO Write the result to file; `target_path` is unused until this is done.
+    # path = (target_path or package_data_path("iea")).joinpath(f"fuzzed-{FILE}")