This repository was archived by the owner on Sep 11, 2023. It is now read-only.

Commit ab3bdd3

Merge pull request #307 from openclimatefix/jack/big-new-design-2
Big new design Part 2 :)
2 parents 0dce7d6 + ce6e5ba commit ab3bdd3


56 files changed, +1146 -1749 lines

README.md (+3 -3)

@@ -113,13 +113,13 @@ There does not seem to be an automated way to do this selecting and downloading,
 ## Configure `nowcasting_dataset` to point to the downloaded data
 
 Copy and modify one of the config yaml files in
-[`nowcasting_dataset/config/`](https://github.com/openclimatefix/nowcasting_dataset/tree/main/nowcasting_dataset/config)
-and modify `prepare_ml_data.py` to use your config file.
+[`nowcasting_dataset/config/`](https://github.com/openclimatefix/nowcasting_dataset/tree/main/nowcasting_dataset/config).
 
 
 ## Prepare ML batches
 
-Run [`scripts/prepare_ml_data.py`](https://github.com/openclimatefix/nowcasting_dataset/blob/main/scripts/prepare_ml_data.py)
+Run [`scripts/prepare_ml_data.py --help`](https://github.com/openclimatefix/nowcasting_dataset/blob/main/scripts/prepare_ml_data.py)
+to learn how to run the `prepare_ml_data.py` script.
 
 
 ## What exactly is in each batch?

conftest.py (+8 -9)

@@ -22,7 +22,7 @@
 register_xr_data_set_to_tensor()
 
 
-def pytest_addoption(parser):
+def pytest_addoption(parser):  # noqa: D103
     parser.addoption(
         "--use_cloud_data",
         action="store_true",
@@ -32,12 +32,12 @@ def pytest_addoption(parser):
 
 
 @pytest.fixture
-def use_cloud_data(request):
+def use_cloud_data(request):  # noqa: D103
     return request.config.getoption("--use_cloud_data")
 
 
 @pytest.fixture
-def sat_filename(use_cloud_data: bool) -> Path:
+def sat_filename(use_cloud_data: bool) -> Path:  # noqa: D103
     if use_cloud_data:
         return consts.SAT_FILENAME
     else:
@@ -47,24 +47,23 @@ def sat_filename(use_cloud_data: bool) -> Path:
 
 
 @pytest.fixture
-def sat_data_source(sat_filename: Path):
+def sat_data_source(sat_filename: Path):  # noqa: D103
     return SatelliteDataSource(
         image_size_pixels=pytest.IMAGE_SIZE_PIXELS,
         zarr_path=sat_filename,
         history_minutes=0,
         forecast_minutes=5,
         channels=("HRV",),
-        n_timesteps_per_batch=2,
     )
 
 
 @pytest.fixture
-def general_data_source():
+def general_data_source():  # noqa: D103
     return MetadataDataSource(history_minutes=0, forecast_minutes=5, object_at_center="GSP")
 
 
 @pytest.fixture
-def gsp_data_source():
+def gsp_data_source():  # noqa: D103
     return GSPDataSource(
         image_size_pixels=16,
         meters_per_pixel=2000,
@@ -75,13 +74,13 @@ def gsp_data_source():
 
 
 @pytest.fixture
-def configuration():
+def configuration():  # noqa: D103
    filename = os.path.join(os.path.dirname(nowcasting_dataset.__file__), "config", "gcp.yaml")
     configuration = load_yaml_configuration(filename)
 
     return configuration
 
 
 @pytest.fixture
-def test_data_folder():
+def test_data_folder():  # noqa: D103
     return os.path.join(os.path.dirname(nowcasting_dataset.__file__), "../tests/data")
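
The `--use_cloud_data` option registered by `pytest_addoption` above is an ordinary pytest command-line flag. A minimal sketch of toggling it when invoking the suite programmatically; the `tests/` path is an assumption for illustration and is not part of this commit:

    # Sketch only: run the test suite with the custom --use_cloud_data flag.
    # Equivalent to `pytest --use_cloud_data tests/` on the command line.
    import sys

    import pytest

    if __name__ == "__main__":
        sys.exit(pytest.main(["--use_cloud_data", "tests/"]))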

environment.yml (-3)

@@ -28,7 +28,6 @@ dependencies:
 
   # Machine learning
   - pytorch::pytorch  # explicitly specify pytorch channel to prevent conda from using conda-forge for pytorch, and hence installing the CPU-only version.
-  - pytorch-lightning
 
   # PV & Geospatial
   - pvlib
@@ -45,6 +44,4 @@ dependencies:
   - pre-commit
 
   - pip:
-    - neptune-client[pytorch-lightning]
-    - tilemapbase
     - git+https://github.com/SheffieldSolar/PV_Live-API

notebooks/2021-09/2021-09-07/sat_data.py (+1 -1)

@@ -1,3 +1,4 @@
+"""Notebook"""
 from datetime import datetime
 
 from nowcasting_dataset.data_sources.satellite.satellite_data_source import SatelliteDataSource
@@ -9,7 +10,6 @@
     forecast_len=12,
     image_size_pixels=64,
     meters_per_pixel=2000,
-    n_timesteps_per_batch=32,
 )
 
 s.open()

nowcasting_dataset/config/__init__.py (+2)

@@ -1 +1,3 @@
 """ Configuration of the dataset """
+from nowcasting_dataset.config.load import load_yaml_configuration
+from nowcasting_dataset.config.model import Configuration, InputData, set_git_commit
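
These re-exports make the config helpers importable from the package root. A minimal sketch of using them, mirroring the `configuration` fixture in `conftest.py` above; it assumes the bundled `gcp.yaml` file and assumes `Configuration` exposes the `process` section seen in the yaml configs:

    # Sketch only: load a YAML config through the newly re-exported helpers.
    import os

    import nowcasting_dataset
    from nowcasting_dataset.config import Configuration, load_yaml_configuration

    filename = os.path.join(os.path.dirname(nowcasting_dataset.__file__), "config", "gcp.yaml")
    configuration: Configuration = load_yaml_configuration(filename)
    # Assumption: the `process` section maps to the Process model in model.py.
    print(configuration.process.batch_size)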

nowcasting_dataset/config/model.py (+32 -7)

@@ -15,16 +15,18 @@
 from typing import Optional
 
 import git
+import pandas as pd
 from pathy import Pathy
 from pydantic import BaseModel, Field, root_validator, validator
 
+# nowcasting_dataset imports
 from nowcasting_dataset.consts import (
     DEFAULT_N_GSP_PER_EXAMPLE,
     DEFAULT_N_PV_SYSTEMS_PER_EXAMPLE,
     NWP_VARIABLE_NAMES,
     SAT_VARIABLE_NAMES,
 )
-
+from nowcasting_dataset.dataset.split import split
 
 IMAGE_SIZE_PIXELS_FIELD = Field(64, description="The number of pixels of the region of interest.")
 METERS_PER_PIXEL_FIELD = Field(2000, description="The number of meters per pixel.")
@@ -102,7 +104,7 @@ class Satellite(DataSourceMixin):
     """Satellite configuration model"""
 
     satellite_zarr_path: str = Field(
-        "gs://solar-pv-nowcasting-data/satellite/EUMETSAT/SEVIRI_RSS/OSGB36/all_zarr_int16_single_timestep.zarr",
+        "gs://solar-pv-nowcasting-data/satellite/EUMETSAT/SEVIRI_RSS/OSGB36/all_zarr_int16_single_timestep.zarr",  # noqa: E501
         description="The path which holds the satellite zarr.",
     )
     satellite_channels: tuple = Field(
@@ -116,7 +118,7 @@ class NWP(DataSourceMixin):
     """NWP configuration model"""
 
     nwp_zarr_path: str = Field(
-        "gs://solar-pv-nowcasting-data/NWP/UK_Met_Office/UKV__2018-01_to_2019-12__chunks__variable10__init_time1__step1__x548__y704__.zarr",
+        "gs://solar-pv-nowcasting-data/NWP/UK_Met_Office/UKV__2018-01_to_2019-12__chunks__variable10__init_time1__step1__x548__y704__.zarr",  # noqa: E501
         description="The path which holds the NWP zarr.",
     )
     nwp_channels: tuple = Field(NWP_VARIABLE_NAMES, description="the channels used in the nwp data")
@@ -213,7 +215,8 @@ def set_forecast_and_history_minutes(cls, values):
         Run through the different data sources and if the forecast or history minutes are not set,
         then set them to the default values
         """
-
+        # It would be much better to use nowcasting_dataset.data_sources.ALL_DATA_SOURCE_NAMES,
+        # but that causes a circular import.
         ALL_DATA_SOURCE_NAMES = ("pv", "satellite", "nwp", "gsp", "topographic", "sun")
         enabled_data_sources = [
             data_source_name
@@ -249,8 +252,8 @@ def set_all_to_defaults(cls):
 class OutputData(BaseModel):
     """Output data model"""
 
-    filepath: str = Field(
-        "gs://solar-pv-nowcasting-data/prepared_ML_training_data/v7/",
+    filepath: Pathy = Field(
+        Pathy("gs://solar-pv-nowcasting-data/prepared_ML_training_data/v7/"),
         description=(
             "Where the data is saved to. If this is running on the cloud then should include"
             " 'gs://' or 's3://'"
@@ -262,7 +265,29 @@ class Process(BaseModel):
     """Pydantic model of how the data is processed"""
 
     seed: int = Field(1234, description="Random seed, so experiments can be repeatable")
-    batch_size: int = Field(32, description="the number of examples per batch")
+    batch_size: int = Field(32, description="The number of examples per batch")
+    t0_datetime_frequency: pd.Timedelta = Field(
+        pd.Timedelta("5 minutes"),
+        description=(
+            "The temporal frequency at which t0 datetimes will be sampled."
+            " Can be any string that `pandas.Timedelta()` understands."
+            " For example, if this is set to '5 minutes', then, for each example, the t0 datetime"
+            " could be at 0, 5, ..., 55 minutes past the hour. If there are DataSources with a"
+            " lower sample rate (e.g. half-hourly) then these lower-sample-rate DataSources will"
+            " still produce valid examples. For example, if a half-hourly DataSource is asked for"
+            " an example with t0=12:05, history_minutes=60, forecast_minutes=60, then it will"
+            " return data at 11:30, 12:00, 12:30, and 13:00."
+        ),
+    )
+    split_method: split.SplitMethod = Field(
+        split.SplitMethod.DAY,
+        description=(
+            "The method used to split the t0 datetimes into train, validation and test sets."
+        ),
+    )
+    n_train_batches: int = 250
+    n_validation_batches: int = 10
+    n_test_batches: int = 10
     upload_every_n_batches: int = Field(
         16,
         description=(
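
The new `Process` fields control how t0 datetimes are sampled, how they are split into train/validation/test sets, and how many batches of each split are produced. A minimal sketch of constructing the model directly with the defaults shown above; it assumes any `Process` fields outside this hunk also have defaults, and in the pipeline the model is normally populated from a YAML config rather than built by hand:

    # Sketch only: exercise the new Process fields added in this commit.
    import pandas as pd

    from nowcasting_dataset.config.model import Process
    from nowcasting_dataset.dataset.split import split

    process = Process(
        seed=1234,
        batch_size=32,
        t0_datetime_frequency=pd.Timedelta("5 minutes"),
        split_method=split.SplitMethod.DAY,
        n_train_batches=250,
        n_validation_batches=10,
        n_test_batches=10,
    )
    print(process.split_method, process.t0_datetime_frequency)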

nowcasting_dataset/config/on_premises.yaml (+1 -1)

@@ -56,7 +56,7 @@ input_data:
   topographic_filename: /mnt/storage_b/data/ocf/solar_pv_nowcasting/nowcasting_dataset_pipeline/Topographic/europe_dem_1km_osgb.tif
 
 output_data:
-  filepath: /mnt/storage_b/data/ocf/solar_pv_nowcasting/nowcasting_dataset_pipeline/prepared_ML_training_data/v8/
+  filepath: /mnt/storage_b/data/ocf/solar_pv_nowcasting/nowcasting_dataset_pipeline/prepared_ML_training_data/v_testing/
 process:
   batch_size: 32
   seed: 1234

nowcasting_dataset/consts.py (+6)

@@ -102,3 +102,9 @@
     TOPOGRAPHIC_X_COORDS,
 ] + list(DATETIME_FEATURE_NAMES)
 T0_DT = "t0_dt"
+
+
+SPATIAL_AND_TEMPORAL_LOCATIONS_OF_EACH_EXAMPLE_FILENAME = (
+    "spatial_and_temporal_locations_of_each_example.csv"
+)
+SPATIAL_AND_TEMPORAL_LOCATIONS_COLUMN_NAMES = ("t0_datetime_UTC", "x_center_OSGB", "y_center_OSGB")
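
The new constants name the CSV that records the t0 datetime and OSGB centre coordinates of each example. A minimal sketch of writing such a file with pandas, using only the constants above; the rows are invented for illustration, and in practice the file is produced by the batch-preparation pipeline:

    # Sketch only: build the locations CSV named by the new constants.
    import pandas as pd

    from nowcasting_dataset.consts import (
        SPATIAL_AND_TEMPORAL_LOCATIONS_COLUMN_NAMES,
        SPATIAL_AND_TEMPORAL_LOCATIONS_OF_EACH_EXAMPLE_FILENAME,
    )

    locations = pd.DataFrame(
        [
            ("2019-01-01 12:00", 123456.0, 654321.0),  # invented example row
            ("2019-01-01 12:05", 123456.0, 654321.0),  # invented example row
        ],
        columns=list(SPATIAL_AND_TEMPORAL_LOCATIONS_COLUMN_NAMES),
    )
    locations.to_csv(SPATIAL_AND_TEMPORAL_LOCATIONS_OF_EACH_EXAMPLE_FILENAME, index=False)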
nowcasting_dataset/data_sources/__init__.py (+15 -3)

@@ -1,11 +1,23 @@
 """ Various DataSources """
-from nowcasting_dataset.data_sources.data_source import DataSource
-from nowcasting_dataset.data_sources.datetime.datetime_data_source import DatetimeDataSource
+from nowcasting_dataset.data_sources.data_source import DataSource  # noqa: F401
+from nowcasting_dataset.data_sources.datetime.datetime_data_source import (  # noqa: F401
+    DatetimeDataSource,
+)
+from nowcasting_dataset.data_sources.gsp.gsp_data_source import GSPDataSource
 from nowcasting_dataset.data_sources.nwp.nwp_data_source import NWPDataSource
 from nowcasting_dataset.data_sources.pv.pv_data_source import PVDataSource
 from nowcasting_dataset.data_sources.satellite.satellite_data_source import SatelliteDataSource
-from nowcasting_dataset.data_sources.gsp.gsp_data_source import GSPDataSource
 from nowcasting_dataset.data_sources.sun.sun_data_source import SunDataSource
 from nowcasting_dataset.data_sources.topographic.topographic_data_source import (
     TopographicDataSource,
 )
+
+MAP_DATA_SOURCE_NAME_TO_CLASS = {
+    "pv": PVDataSource,
+    "satellite": SatelliteDataSource,
+    "nwp": NWPDataSource,
+    "gsp": GSPDataSource,
+    "topographic": TopographicDataSource,
+    "sun": SunDataSource,
+}
+ALL_DATA_SOURCE_NAMES = tuple(MAP_DATA_SOURCE_NAME_TO_CLASS.keys())
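
The new mapping lets callers resolve a data-source class from the name used in the config (`"pv"`, `"satellite"`, and so on). A minimal sketch that only looks classes up rather than instantiating them, since each class takes different constructor arguments:

    # Sketch only: resolve data-source classes by their config names.
    from nowcasting_dataset.data_sources import (
        ALL_DATA_SOURCE_NAMES,
        MAP_DATA_SOURCE_NAME_TO_CLASS,
    )

    for name in ALL_DATA_SOURCE_NAMES:
        data_source_class = MAP_DATA_SOURCE_NAME_TO_CLASS[name]
        print(f"{name} -> {data_source_class.__name__}")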
