Skip to content
This repository was archived by the owner on Sep 11, 2023. It is now read-only.

Commit 0642371

Browse files
committed
Implement DataSource.get_t0_datetimes(). Simplify time.get_t0_datetimes(). Change README so it reflects the fact that we're no longer supporting on-the-fly loading. Tests pass. Still need to simplify DataModule._get_datetimes() and remove fill_30_minutes_timestamps_to_5_minutes(). #204 #213
1 parent 2d7d977 commit 0642371

File tree

7 files changed

+68
-137
lines changed

7 files changed

+68
-137
lines changed

README.md

+11-4
Original file line numberDiff line numberDiff line change
@@ -1,16 +1,20 @@
11
# nowcasting_dataset
2-
A multi-process data loader for PyTorch which aligns three separate datasets:
2+
Pre-prepare batches of data for use in machine learning training.
3+
4+
This code combines several data sources including:
35

46
* Satellite imagery (EUMETSAT SEVIRI RSS 5-minutely data of UK)
57
* Numerical Weather Predictions (NWPs. UK Met Office UKV model from CEDA)
68
* Solar PV power timeseries data (from PVOutput.org, downloaded using
79
our [pvoutput Python
810
code](https://github.com/openclimatefix/pvoutput).)
11+
* Topographic data.
12+
* The Sun's azimuth and angle.
913

14+
# History of nowcasting_dataset
1015
When we first started writing `nowcasting_dataset`, our intention was
1116
to load and align data from these three datasets on-the-fly during ML
12-
training. And `nowcasting_dataset` can still be used that way! But
13-
it just isn't quite fast enough to keep a modern GPU constantly fed
17+
training. But on-the-fly loading just isn't quite fast enough to keep a modern GPU constantly fed
1418
with data when loading multiple satellite channels and multiple NWP
1519
parameters. So, now, this code is used to pre-prepare thousands of
1620
batches, and save these batches to disk, each as a separate NetCDF
@@ -78,4 +82,7 @@ To test using the full dataset on Google Cloud, add the `--use_cloud_data` switc
7882

7983
# Documentation
8084

81-
Please see the [`Example` class](https://github.com/openclimatefix/nowcasting_dataset/blob/main/nowcasting_dataset/dataset/example.py) for documentation about the different data fields in each example / batch.
85+
Please see the `data_sources/<modality>/<modality>_model.py` files
86+
(where `<modality>` is one of {datetime, metadata, gsp, nwp, pv,
87+
satellite, sun, topographic}) for documentation about the different
88+
data fields in each example / batch.

notebooks/design.ipynb

-92
This file was deleted.

nowcasting_dataset/data_sources/data_source.py

+32-6
Original file line numberDiff line numberDiff line change
@@ -30,6 +30,9 @@ class DataSource:
3030
will consist of a single timestep at t0.
3131
convert_to_numpy: Whether or not to convert each example to numpy.
3232
sample_period_minutes: The time delta between each data point
33+
34+
Attributes ending in `_len` are sequence lengths represented as integer numbers of timesteps.
35+
Attributes ending in `_dur` are sequence durations represented as pd.Timedeltas.
3336
"""
3437

3538
history_minutes: int
@@ -39,7 +42,9 @@ class DataSource:
3942
def __post_init__(self):
4043
""" Post Init """
4144
self.sample_period_minutes = self._get_sample_period_minutes()
45+
self.sample_period_dur = pd.Timedelta(self.sample_period_minutes, unit="minutes")
4246

47+
# TODO: Do we still need all these different representations of sequence lengths? #219
4348
self.history_len = self.history_minutes // self.sample_period_minutes
4449
self.forecast_len = self.forecast_minutes // self.sample_period_minutes
4550

@@ -56,12 +61,11 @@ def __post_init__(self):
5661

5762
# Plus 1 because neither history_len nor forecast_len include t0.
5863
self._total_seq_len = self.history_len + self.forecast_len + 1
59-
self._history_dur = nd_time.timesteps_to_duration(
60-
self.history_len, self.sample_period_minutes
61-
)
62-
self._forecast_dur = nd_time.timesteps_to_duration(
63-
self.forecast_len, self.sample_period_minutes
64-
)
64+
65+
self._history_dur = pd.Timedelta(self.history_minutes, unit="minutes")
66+
self._forecast_dur = pd.Timedelta(self.forecast_minutes, unit="minutes")
67+
# Add sample_period_dur because neither history_dur nor forecast_dur includes t0.
68+
self._total_seq_dur = self._history_dur + self._forecast_dur + self.sample_period_dur
6569

6670
def _get_start_dt(self, t0_dt: pd.Timestamp) -> pd.Timestamp:
6771
return t0_dt - self._history_dur
@@ -132,6 +136,28 @@ def datetime_index(self) -> pd.DatetimeIndex:
132136
# of a list of datetimes (e.g. for DatetimeDataSource).
133137
raise NotImplementedError()
134138

139+
def get_t0_datetimes(self) -> pd.DatetimeIndex:
140+
"""Get all the valid t0 datetimes.
141+
142+
In each example timeseries, t0 is the datetime of the most recent observation.
143+
t0 is used to specify the temporal location of each example.
144+
145+
Returns all t0 datetimes which identify valid, contiguous example timeseries.
146+
In other words, this function returns all datetimes which come after at least
147+
history_minutes of contiguous samples; and which have at least forecast_minutes of
148+
contiguous data ahead.
149+
150+
Raises NotImplementedError if self.datetime_index() raises NotImplementedError,
151+
which means that this DataSource doesn't have a concept of a list of datetimes.
152+
"""
153+
all_datetimes = self.datetime_index()
154+
return nd_time.get_t0_datetimes(
155+
datetimes=all_datetimes,
156+
total_seq_len=self._total_seq_len,
157+
history_dur=self._history_dur,
158+
max_gap=self.sample_period_dur,
159+
)
160+
135161
def _get_time_slice(self, t0_dt: pd.Timestamp):
136162
"""Get a single timestep of data. Must be overridden."""
137163
raise NotImplementedError()

nowcasting_dataset/dataset/datamodule.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -421,7 +421,7 @@ def _get_datetimes(
421421
t0_datetimes = nd_time.get_t0_datetimes(
422422
datetimes=dt_index,
423423
total_seq_len=self._total_seq_len_5_minutes,
424-
history_len=self.history_len_5_minutes,
424+
history_dur=self.history_len_5_minutes * nd_time.FIVE_MINUTES,
425425
)
426426

427427
# only select datetimes for half hours, ignore 5 minute timestamps

nowcasting_dataset/time.py

+11-20
Original file line numberDiff line numberDiff line change
@@ -113,42 +113,33 @@ def get_start_datetimes(
113113
def get_t0_datetimes(
114114
datetimes: pd.DatetimeIndex,
115115
total_seq_len: int,
116-
history_len: int,
117-
minute_delta: int = 5,
116+
history_dur: pd.Timedelta,
118117
max_gap: pd.Timedelta = FIVE_MINUTES,
119118
) -> pd.DatetimeIndex:
120119
"""
121-
Get datetimes for ML learning batches. T0 refers to the time 'now'.
120+
Get T0 datetimes for machine learning batches. T0 refers to the time of the most recent observation.
122121
123122
Args:
124-
datetimes: list of datetimes when data is available
125-
total_seq_len: total sequence length of data for ml model
126-
history_len: the number of historic timestemps
127-
minute_delta: the amount of minutes in one time step
128-
max_gap: The maximum allowed gap in the datetimes for it to be valid
129-
130-
Returns: Datetimes that ml learning data can be built around.
131-
123+
datetimes: Datetimes of every valid timestep.
124+
total_seq_len: Total sequence length (number of timesteps) of each example sequence.
125+
total_seq_len = history_len + forecast_len + 1
126+
(the plus 1 is because neither history_len nor forecast_len include t0).
127+
history_dur: The duration of the history included in each example sequence.
128+
max_gap: The maximum allowed gap in the datetimes for it to be valid.
129+
130+
Returns: T0 datetimes that identify valid, contiguous sequences at least total_seq_len long.
132131
"""
133132
logger.debug("Getting t0 datetimes")
134133

135134
start_datetimes = get_start_datetimes(
136135
datetimes=datetimes, total_seq_len=total_seq_len, max_gap=max_gap
137136
)
138137

139-
logger.debug("Adding history during to t0 datetimes")
140-
history_dur = timesteps_to_duration(history_len, minute_delta=minute_delta)
138+
logger.debug("Adding history duration to t0 datetimes")
141139
t0_datetimes = start_datetimes + history_dur
142-
143140
return t0_datetimes
144141

145142

146-
def timesteps_to_duration(n_timesteps: int, minute_delta: int = 5) -> pd.Timedelta:
147-
"""Change timesteps to a time duration"""
148-
assert n_timesteps >= 0
149-
return pd.Timedelta(n_timesteps * minute_delta, unit="minutes")
150-
151-
152143
def datetime_features(index: pd.DatetimeIndex) -> pd.DataFrame:
153144
"""
154145
Make datetime features, hour_of_day and day_of_year

tests/test_dataset.py

+5-3
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,5 @@
11
import numpy as np
2+
import pandas as pd
23
import pytest
34

45
import nowcasting_dataset.time as nd_time
@@ -9,7 +10,9 @@
910
@pytest.fixture
1011
def dataset(sat_data_source, general_data_source):
1112
all_datetimes = sat_data_source.datetime_index()
12-
t0_datetimes = nd_time.get_t0_datetimes(datetimes=all_datetimes, total_seq_len=2, history_len=0)
13+
t0_datetimes = nd_time.get_t0_datetimes(
14+
datetimes=all_datetimes, total_seq_len=2, history_dur=pd.Timedelta(0)
15+
)
1316
return NowcastingDataset(
1417
batch_size=8,
1518
n_batches_per_epoch_per_worker=64,
@@ -25,8 +28,7 @@ def dataset_gsp(gsp_data_source, general_data_source):
2528
t0_datetimes = nd_time.get_t0_datetimes(
2629
datetimes=all_datetimes,
2730
total_seq_len=2,
28-
history_len=0,
29-
minute_delta=30,
31+
history_dur=pd.Timedelta(0),
3032
max_gap=nd_time.THIRTY_MINUTES,
3133
)
3234

tests/test_time.py

+8-11
Original file line numberDiff line numberDiff line change
@@ -57,12 +57,6 @@ def test_get_start_datetimes_2(total_seq_len):
5757
np.testing.assert_array_equal(start_datetimes, correct_start_datetimes)
5858

5959

60-
def test_timesteps_to_duration():
61-
assert nd_time.timesteps_to_duration(0) == pd.Timedelta(0)
62-
assert nd_time.timesteps_to_duration(1) == pd.Timedelta("5T")
63-
assert nd_time.timesteps_to_duration(12) == pd.Timedelta("1H")
64-
65-
6660
def test_datetime_features_in_example():
6761
index = pd.date_range("2020-01-01", "2020-01-06 23:00", freq="h")
6862
example = nd_time.datetime_features_in_example(index)
@@ -79,13 +73,14 @@ def test_datetime_features_in_example():
7973
def test_get_t0_datetimes(history_length, forecast_length):
8074
index = pd.date_range("2020-01-01", "2020-01-06 23:00", freq="30T")
8175
total_seq_len = history_length + forecast_length + 1
76+
sample_period_dur = THIRTY_MINUTES
77+
history_dur = sample_period_dur * history_length
8278

8379
t0_datetimes = nd_time.get_t0_datetimes(
8480
datetimes=index,
8581
total_seq_len=total_seq_len,
86-
history_len=history_length,
82+
history_dur=history_dur,
8783
max_gap=THIRTY_MINUTES,
88-
minute_delta=30,
8984
)
9085

9186
assert len(t0_datetimes) == len(index) - history_length - forecast_length
@@ -96,14 +91,16 @@ def test_get_t0_datetimes(history_length, forecast_length):
9691
def test_get_t0_datetimes_night():
9792
history_length = 6
9893
forecast_length = 12
99-
index = pd.date_range("2020-06-15", "2020-06-15 22:15", freq="5T")
94+
sample_period_dur = FIVE_MINUTES
95+
index = pd.date_range("2020-06-15", "2020-06-15 22:15", freq=sample_period_dur)
10096
total_seq_len = history_length + forecast_length + 1
97+
history_dur = history_length * sample_period_dur
10198

10299
t0_datetimes = nd_time.get_t0_datetimes(
103100
datetimes=index,
104101
total_seq_len=total_seq_len,
105-
history_len=history_length,
106-
max_gap=FIVE_MINUTES,
102+
history_dur=history_dur,
103+
max_gap=sample_period_dur,
107104
)
108105

109106
assert len(t0_datetimes) == len(index) - history_length - forecast_length

0 commit comments

Comments
 (0)