15
15
from typing import Optional
16
16
17
17
import git
18
+ import pandas as pd
18
19
from pathy import Pathy
19
20
from pydantic import BaseModel , Field , root_validator , validator
20
21
22
+ # nowcasting_dataset imports
21
23
from nowcasting_dataset .consts import (
22
24
DEFAULT_N_GSP_PER_EXAMPLE ,
23
25
DEFAULT_N_PV_SYSTEMS_PER_EXAMPLE ,
24
26
NWP_VARIABLE_NAMES ,
25
27
SAT_VARIABLE_NAMES ,
26
28
)
27
-
29
+ from nowcasting_dataset . dataset . split import split
28
30
29
31
# Module-level pydantic ``Field`` for an image-size setting: the default value
# is 64 and the description states it is a pixel count for the region of
# interest.
# NOTE(review): presumably reused by several data-source models; the usages
# are outside the visible part of this file, so confirm before changing it.
IMAGE_SIZE_PIXELS_FIELD = Field(
    default=64,
    description="The number of pixels of the region of interest.",
)
30
32
# Module-level pydantic ``Field`` for a spatial-resolution setting: the default
# value is 2000 and the description states it is expressed in meters per pixel.
# NOTE(review): presumably reused by several data-source models; the usages
# are outside the visible part of this file, so confirm before changing it.
METERS_PER_PIXEL_FIELD = Field(
    default=2000,
    description="The number of meters per pixel.",
)
@@ -102,7 +104,7 @@ class Satellite(DataSourceMixin):
102
104
"""Satellite configuration model"""
103
105
104
106
satellite_zarr_path : str = Field (
105
- "gs://solar-pv-nowcasting-data/satellite/EUMETSAT/SEVIRI_RSS/OSGB36/all_zarr_int16_single_timestep.zarr" ,
107
+ "gs://solar-pv-nowcasting-data/satellite/EUMETSAT/SEVIRI_RSS/OSGB36/all_zarr_int16_single_timestep.zarr" , # noqa: E501
106
108
description = "The path which holds the satellite zarr." ,
107
109
)
108
110
satellite_channels : tuple = Field (
@@ -116,7 +118,7 @@ class NWP(DataSourceMixin):
116
118
"""NWP configuration model"""
117
119
118
120
nwp_zarr_path : str = Field (
119
- "gs://solar-pv-nowcasting-data/NWP/UK_Met_Office/UKV__2018-01_to_2019-12__chunks__variable10__init_time1__step1__x548__y704__.zarr" ,
121
+ "gs://solar-pv-nowcasting-data/NWP/UK_Met_Office/UKV__2018-01_to_2019-12__chunks__variable10__init_time1__step1__x548__y704__.zarr" , # noqa: E501
120
122
description = "The path which holds the NWP zarr." ,
121
123
)
122
124
nwp_channels : tuple = Field (NWP_VARIABLE_NAMES , description = "the channels used in the nwp data" )
@@ -213,7 +215,8 @@ def set_forecast_and_history_minutes(cls, values):
213
215
Run through the different data sources and if the forecast or history minutes are not set,
214
216
then set them to the default values
215
217
"""
216
-
218
+ # It would be much better to use nowcasting_dataset.data_sources.ALL_DATA_SOURCE_NAMES,
219
+ # but that causes a circular import.
217
220
ALL_DATA_SOURCE_NAMES = ("pv" , "satellite" , "nwp" , "gsp" , "topographic" , "sun" )
218
221
enabled_data_sources = [
219
222
data_source_name
@@ -249,8 +252,8 @@ def set_all_to_defaults(cls):
249
252
class OutputData (BaseModel ):
250
253
"""Output data model"""
251
254
252
- filepath : str = Field (
253
- "gs://solar-pv-nowcasting-data/prepared_ML_training_data/v7/" ,
255
+ filepath : Pathy = Field (
256
+ Pathy ( "gs://solar-pv-nowcasting-data/prepared_ML_training_data/v7/" ) ,
254
257
description = (
255
258
"Where the data is saved to. If this is running on the cloud then should include"
256
259
" 'gs://' or 's3://'"
@@ -262,7 +265,29 @@ class Process(BaseModel):
262
265
"""Pydantic model of how the data is processed"""
263
266
264
267
seed : int = Field (1234 , description = "Random seed, so experiments can be repeatable" )
265
- batch_size : int = Field (32 , description = "the number of examples per batch" )
268
+ batch_size : int = Field (32 , description = "The number of examples per batch" )
269
+ t0_datetime_frequency : pd .Timedelta = Field (
270
+ pd .Timedelta ("5 minutes" ),
271
+ description = (
272
+ "The temporal frequency at which t0 datetimes will be sampled."
273
+ " Can be any string that `pandas.Timedelta()` understands."
274
+ " For example, if this is set to '5 minutes', then, for each example, the t0 datetime"
275
+ " could be at 0, 5, ..., 55 minutes past the hour. If there are DataSources with a"
276
+ " lower sample rate (e.g. half-hourly) then these lower-sample-rate DataSources will"
277
+ " still produce valid examples. For example, if a half-hourly DataSource is asked for"
278
+ " an example with t0=12:05, history_minutes=60, forecast_minutes=60, then it will"
279
+ " return data at 11:30, 12:00, 12:30, and 13:00."
280
+ ),
281
+ )
282
+ split_method : split .SplitMethod = Field (
283
+ split .SplitMethod .DAY ,
284
+ description = (
285
+ "The method used to split the t0 datetimes into train, validation and test sets."
286
+ ),
287
+ )
288
+ n_train_batches : int = 250
289
+ n_validation_batches : int = 10
290
+ n_test_batches : int = 10
266
291
upload_every_n_batches : int = Field (
267
292
16 ,
268
293
description = (
0 commit comments