Implement DataSource.get_t0_datetimes(). Simplify time.get_t0_datetimes(). Change README so it reflects the fact that we're no longer supporting on-the-fly loading. Tests pass. Still need to simplify DataModule._get_datetimes() and remove fill_30_minutes_timestamps_to_5_minutes(). #204 #213

JackKelly · JackKelly · commit 064237190940 · 2021-10-12T14:48:54.000Z
diff --git a/README.md b/README.md
@@ -1,16 +1,20 @@
 # nowcasting_dataset
-A multi-process data loader for PyTorch which aligns three separate datasets:
+Pre-prepare batches of data for use in machine learning training.
+
+This code combines several data sources including:
 
 * Satellite imagery (EUMETSAT SEVIRI RSS 5-minutely data of UK)
 * Numerical Weather Predictions (NWPs.  UK Met Office UKV model from CEDA)
 * Solar PV power timeseries data (from PVOutput.org, downloaded using
   our [pvoutput Python
   code](https://github.com/openclimatefix/pvoutput).)
+* Topographic data.
+* The Sun's azimuth and angle.
 
+# History of nowcasting_dataset
 When we first started writing `nowcasting_dataset`, our intention was
 to load and align data from the first three datasets listed above on-the-fly during ML
-training.  And `nowcasting_dataset` can still be used that way!  But
-it just isn't quite fast enough to keep a modern GPU constantly fed
+training.  But it just isn't quite fast enough to keep a modern GPU constantly fed
 with data when loading multiple satellite channels and multiple NWP
 parameters.  So, now, this code is used to pre-prepare thousands of
 batches, and save these batches to disk, each as a separate NetCDF
@@ -78,4 +82,7 @@ To test using the full dataset on Google Cloud, add the `--use_cloud_data` switc
 
 # Documentation
 
-Please see the [`Example` class](https://github.com/openclimatefix/nowcasting_dataset/blob/main/nowcasting_dataset/dataset/example.py) for documentation about the different data fields in each example / batch.
+Please see the `data_sources/<modality>/<modality>_model.py` files
+(where `<modality>` is one of {datetime, metadata, gsp, nwp, pv,
+satellite, sun, topographic}) for documentation about the different
+data fields in each example / batch.
diff --git a/notebooks/design.ipynb b/notebooks/design.ipynb
diff --git a/nowcasting_dataset/data_sources/data_source.py b/nowcasting_dataset/data_sources/data_source.py
@@ -30,6 +30,9 @@ class DataSource:
         will consist of a single timestep at t0.
       convert_to_numpy: Whether or not to convert each example to numpy.
       sample_period_minutes: The time delta between each data point
+
+    Attributes ending in `_len` are sequence lengths represented as integer numbers of timesteps.
+    Attributes ending in `_dur` are sequence durations represented as pd.Timedeltas.
     """
 
     history_minutes: int
@@ -39,7 +42,9 @@ class DataSource:
     def __post_init__(self):
         """ Post Init """
         self.sample_period_minutes = self._get_sample_period_minutes()
+        self.sample_period_dur = pd.Timedelta(self.sample_period_minutes, unit="minutes")
 
+        # TODO: Do we still need all these different representations of sequence lengths? #219
         self.history_len = self.history_minutes // self.sample_period_minutes
         self.forecast_len = self.forecast_minutes // self.sample_period_minutes
 
@@ -56,12 +61,11 @@ def __post_init__(self):
 
         # Plus 1 because neither history_len nor forecast_len include t0.
         self._total_seq_len = self.history_len + self.forecast_len + 1
-        self._history_dur = nd_time.timesteps_to_duration(
-            self.history_len, self.sample_period_minutes
-        )
-        self._forecast_dur = nd_time.timesteps_to_duration(
-            self.forecast_len, self.sample_period_minutes
-        )
+
+        self._history_dur = pd.Timedelta(self.history_minutes, unit="minutes")
+        self._forecast_dur = pd.Timedelta(self.forecast_minutes, unit="minutes")
+        # Add sample_period_dur because neither history_dur nor forecast_dur include t0.
+        self._total_seq_dur = self._history_dur + self._forecast_dur + self.sample_period_dur
 
     def _get_start_dt(self, t0_dt: pd.Timestamp) -> pd.Timestamp:
         return t0_dt - self._history_dur
@@ -132,6 +136,28 @@ def datetime_index(self) -> pd.DatetimeIndex:
         # of a list of datetimes (e.g. for DatetimeDataSource).
         raise NotImplementedError()
 
+    def get_t0_datetimes(self) -> pd.DatetimeIndex:
+        """Get all the valid t0 datetimes.
+
+        In each example timeseries, t0 is the datetime of the most recent observation.
+        t0 is used to specify the temporal location of each example.
+
+        Returns all t0 datetimes which identify valid, contiguous example timeseries.
+        In other words, this function returns all datetimes which come after at least
+        history_minutes of contiguous samples; and which have at least forecast_minutes of
+        contiguous data ahead.
+
+        Raises NotImplementedError if self.datetime_index() raises NotImplementedError,
+        which means that this DataSource doesn't have a concept of a list of datetimes.
+        """
+        all_datetimes = self.datetime_index()
+        return nd_time.get_t0_datetimes(
+            datetimes=all_datetimes,
+            total_seq_len=self._total_seq_len,
+            history_dur=self._history_dur,
+            max_gap=self.sample_period_dur,
+        )
+
     def _get_time_slice(self, t0_dt: pd.Timestamp):
         """Get a single timestep of data.  Must be overridden."""
         raise NotImplementedError()
diff --git a/nowcasting_dataset/dataset/datamodule.py b/nowcasting_dataset/dataset/datamodule.py
@@ -421,7 +421,7 @@ def _get_datetimes(
         t0_datetimes = nd_time.get_t0_datetimes(
             datetimes=dt_index,
             total_seq_len=self._total_seq_len_5_minutes,
-            history_len=self.history_len_5_minutes,
+            history_dur=self.history_len_5_minutes * nd_time.FIVE_MINUTES,
         )
 
         # only select datetimes for half hours, ignore 5 minute timestamps
diff --git a/nowcasting_dataset/time.py b/nowcasting_dataset/time.py
@@ -113,42 +113,33 @@ def get_start_datetimes(
 def get_t0_datetimes(
     datetimes: pd.DatetimeIndex,
     total_seq_len: int,
-    history_len: int,
-    minute_delta: int = 5,
+    history_dur: pd.Timedelta,
     max_gap: pd.Timedelta = FIVE_MINUTES,
 ) -> pd.DatetimeIndex:
     """
-    Get datetimes for ML learning batches. T0 refers to the time 'now'.
+    Get T0 datetimes for ML training batches. T0 refers to the time of the most recent observation.
 
     Args:
-        datetimes: list of datetimes when data is available
-        total_seq_len: total sequence length of data for ml model
-        history_len: the number of historic timestemps
-        minute_delta: the amount of minutes in one time step
-        max_gap: The maximum allowed gap in the datetimes for it to be valid
-
-    Returns: Datetimes that ml learning data can be built around.
-
+        datetimes: Datetimes of every valid timestep.
+        total_seq_len: Total sequence length (number of timesteps) of each example sequence.
+            total_seq_len = history_len + forecast_len + 1
+            (the plus 1 is because neither history_len nor forecast_len include t0).
+        history_dur: The duration of the history included in each example sequence.
+        max_gap: The maximum allowed gap in the datetimes for it to be valid.
+
+    Returns: T0 datetimes that identify valid, contiguous sequences at least total_seq_len long.
     """
     logger.debug("Getting t0 datetimes")
 
     start_datetimes = get_start_datetimes(
         datetimes=datetimes, total_seq_len=total_seq_len, max_gap=max_gap
     )
 
-    logger.debug("Adding history during to t0 datetimes")
-    history_dur = timesteps_to_duration(history_len, minute_delta=minute_delta)
+    logger.debug("Adding history duration to t0 datetimes")
     t0_datetimes = start_datetimes + history_dur
-
     return t0_datetimes
 
 
-def timesteps_to_duration(n_timesteps: int, minute_delta: int = 5) -> pd.Timedelta:
-    """Change timesteps to a time duration"""
-    assert n_timesteps >= 0
-    return pd.Timedelta(n_timesteps * minute_delta, unit="minutes")
-
-
 def datetime_features(index: pd.DatetimeIndex) -> pd.DataFrame:
     """
     Make datetime features, hour_of_day and day_of_year
diff --git a/tests/test_dataset.py b/tests/test_dataset.py
@@ -1,4 +1,5 @@
 import numpy as np
+import pandas as pd
 import pytest
 
 import nowcasting_dataset.time as nd_time
@@ -9,7 +10,9 @@
 @pytest.fixture
 def dataset(sat_data_source, general_data_source):
     all_datetimes = sat_data_source.datetime_index()
-    t0_datetimes = nd_time.get_t0_datetimes(datetimes=all_datetimes, total_seq_len=2, history_len=0)
+    t0_datetimes = nd_time.get_t0_datetimes(
+        datetimes=all_datetimes, total_seq_len=2, history_dur=pd.Timedelta(0)
+    )
     return NowcastingDataset(
         batch_size=8,
         n_batches_per_epoch_per_worker=64,
@@ -25,8 +28,7 @@ def dataset_gsp(gsp_data_source, general_data_source):
     t0_datetimes = nd_time.get_t0_datetimes(
         datetimes=all_datetimes,
         total_seq_len=2,
-        history_len=0,
-        minute_delta=30,
+        history_dur=pd.Timedelta(0),
         max_gap=nd_time.THIRTY_MINUTES,
     )
 
diff --git a/tests/test_time.py b/tests/test_time.py
@@ -57,12 +57,6 @@ def test_get_start_datetimes_2(total_seq_len):
     np.testing.assert_array_equal(start_datetimes, correct_start_datetimes)
 
 
-def test_timesteps_to_duration():
-    assert nd_time.timesteps_to_duration(0) == pd.Timedelta(0)
-    assert nd_time.timesteps_to_duration(1) == pd.Timedelta("5T")
-    assert nd_time.timesteps_to_duration(12) == pd.Timedelta("1H")
-
-
 def test_datetime_features_in_example():
     index = pd.date_range("2020-01-01", "2020-01-06 23:00", freq="h")
     example = nd_time.datetime_features_in_example(index)
@@ -79,13 +73,14 @@ def test_datetime_features_in_example():
 def test_get_t0_datetimes(history_length, forecast_length):
     index = pd.date_range("2020-01-01", "2020-01-06 23:00", freq="30T")
     total_seq_len = history_length + forecast_length + 1
+    sample_period_dur = THIRTY_MINUTES
+    history_dur = sample_period_dur * history_length
 
     t0_datetimes = nd_time.get_t0_datetimes(
         datetimes=index,
         total_seq_len=total_seq_len,
-        history_len=history_length,
+        history_dur=history_dur,
         max_gap=THIRTY_MINUTES,
-        minute_delta=30,
     )
 
     assert len(t0_datetimes) == len(index) - history_length - forecast_length
@@ -96,14 +91,16 @@ def test_get_t0_datetimes(history_length, forecast_length):
 def test_get_t0_datetimes_night():
     history_length = 6
     forecast_length = 12
-    index = pd.date_range("2020-06-15", "2020-06-15 22:15", freq="5T")
+    sample_period_dur = FIVE_MINUTES
+    index = pd.date_range("2020-06-15", "2020-06-15 22:15", freq=sample_period_dur)
     total_seq_len = history_length + forecast_length + 1
+    history_dur = history_length * sample_period_dur
 
     t0_datetimes = nd_time.get_t0_datetimes(
         datetimes=index,
         total_seq_len=total_seq_len,
-        history_len=history_length,
-        max_gap=FIVE_MINUTES,
+        history_dur=history_dur,
+        max_gap=sample_period_dur,
     )
 
     assert len(t0_datetimes) == len(index) - history_length - forecast_length

Original file line number	Diff line number	Diff line change
`@@ -421,7 +421,7 @@ def _get_datetimes(`
`421`	`421`	`t0_datetimes = nd_time.get_t0_datetimes(`
`422`	`422`	`datetimes=dt_index,`
`423`	`423`	`total_seq_len=self._total_seq_len_5_minutes,`
`424`		`- history_len=self.history_len_5_minutes,`
	`424`	`+ history_dur=self.history_len_5_minutes * nd_time.FIVE_MINUTES,`
`425`	`425`	`)`
`426`	`426`
`427`	`427`	`# only select datetimes for half hours, ignore 5 minute timestamps`