Skip to content
This repository was archived by the owner on Sep 11, 2023. It is now read-only.

Issue/233 data validation #258

Merged
merged 9 commits into from
Oct 22, 2021
Merged
Show file tree
Hide file tree
Changes from 7 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 5 additions & 0 deletions nowcasting_dataset/data_sources/fake.py
Original file line number Diff line number Diff line change
Expand Up @@ -241,6 +241,8 @@ def create_gsp_pv_dataset(
data["x_coords"] = x_coords
data["y_coords"] = y_coords

data.__setitem__("data", data.data.clip(min=0))

return data


Expand Down Expand Up @@ -275,6 +277,9 @@ def create_sun_dataset(
sun = data.rename({"data": "elevation"})
sun["azimuth"] = data.data

sun.__setitem__("azimuth", sun.azimuth.clip(min=0, max=360))
sun.__setitem__("elevation", sun.elevation.clip(min=-90, max=90))

return sun


Expand Down
4 changes: 2 additions & 2 deletions nowcasting_dataset/data_sources/gsp/gsp_data_source.py
Original file line number Diff line number Diff line change
Expand Up @@ -234,9 +234,9 @@ def get_example(
gsp["x_coords"] = gsp_x_coords
gsp["y_coords"] = gsp_y_coords

# pad out so that there are always 32 gsp
# pad out so that there are always 32 gsp, fill with 0
pad_n = self.n_gsp_per_example - len(gsp.id_index)
gsp = gsp.pad(id_index=(0, pad_n), data=((0, 0), (0, pad_n)))
gsp = gsp.pad(id_index=(0, pad_n), data=((0, 0), (0, pad_n)), constant_values=0)

gsp.__setitem__("id_index", range(self.n_gsp_per_example))

Expand Down
10 changes: 9 additions & 1 deletion nowcasting_dataset/data_sources/gsp/gsp_model.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
""" Model for output of GSP data """
import logging
import numpy as np

from nowcasting_dataset.data_sources.datasource_output import (
DataSourceOutput,
Expand All @@ -15,4 +16,11 @@ class GSP(DataSourceOutput):
__slots__ = ()
_expected_dimensions = ("time", "id")

# todo add validation here - https://github.com/openclimatefix/nowcasting_dataset/issues/233
@classmethod
def model_validation(cls, v):
""" Check that all values are non NaNs """
assert (~np.isnan(v.data)).all(), f"Some gsp data values are NaNs"
assert (v.data != np.Inf).all(), f"Some gsp data values are Infinite"
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Possibly combine the two asserts with

Suggested change
assert (~np.isnan(v.data)).all(), f"Some gsp data values are NaNs"
assert (v.data != np.Inf).all(), f"Some gsp data values are Infinite"
assert (np.isfinite(v.data)).all(), f"Some gsp data values are NaNs or infinite"

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

See comment above, think I will keep them separate

assert (v.data >= 0).all(), f"Some gsp data values are below 0 {v.data.min()}"

return v
2 changes: 1 addition & 1 deletion nowcasting_dataset/data_sources/nwp/nwp_model.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,5 +24,5 @@ class NWP(DataSourceOutput):
@classmethod
def model_validation(cls, v):
""" Check that all values are not NaNs """
assert (v.data != np.nan).all(), "Some nwp data values are NaNs"
assert (~np.isnan(v.data)).all(), "Some nwp data values are NaNs"
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

What about checking for positive and negative infinity? with

Suggested change
assert (~np.isnan(v.data)).all(), "Some nwp data values are NaNs"
assert (~np.isfinite(v.data)).all(), "Some nwp data values are NaNs or infinite"

Copy link
Member

@JackKelly JackKelly Oct 22, 2021

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yeah, I agree about using np.isfinite to test for NaNs, +inf and -inf. But, if we use np.isfinite, then we'll also need to remove the ~ and tweak the string 🙂

Suggested change
assert (~np.isnan(v.data)).all(), "Some nwp data values are NaNs"
assert (np.isfinite(v.data)).all(), "Some nwp data values are NaNs or infinities"

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'm going to go with both isnan and isinf. I think this leads to slightly more verbose error messages

For example
assert (~isnan(v.data)).all(), f"Some pv data values are NaNs"
assert (~isinf(v.data)).all(), f"Some pv data values are Infinite"

return v
4 changes: 2 additions & 2 deletions nowcasting_dataset/data_sources/pv/pv_data_source.py
Original file line number Diff line number Diff line change
Expand Up @@ -268,9 +268,9 @@ def get_example(
pv["x_coords"] = x_coords
pv["y_coords"] = y_coords

# pad out so that there are always 32 gsp
# pad out so that there are always 32 gsp, pad with zeros
pad_n = self.n_pv_systems_per_example - len(pv.id_index)
pv = pv.pad(id_index=(0, pad_n), data=((0, 0), (0, pad_n)))
pv = pv.pad(id_index=(0, pad_n), data=((0, 0), (0, pad_n)), constant_values=0)

pv.__setitem__("id_index", range(self.n_pv_systems_per_example))

Expand Down
10 changes: 9 additions & 1 deletion nowcasting_dataset/data_sources/pv/pv_model.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,4 +29,12 @@ class PV(DataSourceOutput):
__slots__ = ()
_expected_dimensions = ("time", "id")

# todo add validation here - https://github.com/openclimatefix/nowcasting_dataset/issues/233
@classmethod
def model_validation(cls, v):
""" Check that all values are non NaNs """
assert (~np.isnan(v.data)).all(), f"Some pv data values are NaNs"
assert (v.data != np.Inf).all(), f"Some pv data values are Infinite"
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Again, possibly combining the two might be a bit cleaner

Suggested change
assert (~np.isnan(v.data)).all(), f"Some pv data values are NaNs"
assert (v.data != np.Inf).all(), f"Some pv data values are Infinite"
assert (np.isfinite(v.data)).all(), f"Some pv data values are NaNs or infinite"

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

See comment above, think I will keep them separate


assert (v.data >= 0).all(), f"Some pv data values are below 0"

return v
3 changes: 2 additions & 1 deletion nowcasting_dataset/data_sources/satellite/satellite_model.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,5 +27,6 @@ class Satellite(DataSourceOutput):
@classmethod
def model_validation(cls, v):
""" Check that all values are non negative """
assert (v.data != np.NaN).all(), f"Some satellite data values are NaNs"
assert (~np.isnan(v.data)).all(), f"Some satellite data values are NaNs"
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Same as for NWP

Suggested change
assert (~np.isnan(v.data)).all(), f"Some satellite data values are NaNs"
assert (~np.isfinite(v.data)).all(), f"Some satellite data values are NaNs or infinite"

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Same as above 🙂

Suggested change
assert (~np.isnan(v.data)).all(), f"Some satellite data values are NaNs"
assert (np.isfinite(v.data)).all(), "Some nwp data values are NaNs"

assert (v.data != -1).all(), f"Some satellite data values are -1's"
return v
23 changes: 22 additions & 1 deletion nowcasting_dataset/data_sources/sun/sun_model.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,4 +20,25 @@ class Sun(DataSourceOutput):
__slots__ = ()
_expected_dimensions = ("time",)

# todo add validation here - https://github.com/openclimatefix/nowcasting_dataset/issues/233
@classmethod
def model_validation(cls, v):
""" Check that all values are non NaNs """
assert (~np.isnan(v.elevation)).all(), f"Some elevation data values are NaNs"
assert (v.elevation != np.Inf).all(), f"Some elevation data values are Infinite"

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Possibly combine these two, or if we do want to keep them separate, changing it to

Suggested change
assert (~np.isnan(v.elevation)).all(), f"Some elevation data values are NaNs"
assert (v.elevation != np.Inf).all(), f"Some elevation data values are Infinite"
assert (~np.isnan(v.elevation)).all(), f"Some elevation data values are NaNs"
assert (~np.isinf(v.elevation)).all(), f"Some elevation data values are Infinite"

because I believe np.Inf only represents positive infinity, so comparing against it would not catch negative infinity

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

See comment above, think I will keep them separate

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Cool, good idea!

assert (~np.isnan(v.azimuth)).all(), f"Some azimuth data values are NaNs"
assert (v.azimuth != np.Inf).all(), f"Some azimuth data values are Infinite"
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Same as above


assert (0 <= v.azimuth).all(), f"Some azimuth data values are lower 0, {v.azimuth.min()}"
assert (
v.azimuth <= 360
).all(), f"Some azimuth data values are greater than 360, {v.azimuth.max()}"

assert (
-90 <= v.elevation
).all(), f"Some elevation data values are lower -90, {v.elevation.min()}"
assert (
v.elevation <= 90
).all(), f"Some elevation data values are greater than 90, {v.elevation.max()}"

return v
Original file line number Diff line number Diff line change
Expand Up @@ -21,5 +21,6 @@ class Topographic(DataSourceOutput):
@classmethod
def model_validation(cls, v):
""" Check that all values are non NaNs """
assert (v.data != np.NaN).all(), f"Some topological data values are NaNs"
assert (~np.isnan(v.data)).all(), f"Some topological data values are NaNs"
assert (v.data != np.Inf).all(), f"Some topological data values are Infinite"
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Possibly just combining the two asserts might be a bit simpler?

Suggested change
assert (v.data != np.Inf).all(), f"Some topological data values are Infinite"
assert (np.isfinite(v.data)).all(), f"Some topological data values are Infinite or NaNs"

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

See comment above, think I will keep them separate

return v
30 changes: 30 additions & 0 deletions tests/data_sources/gsp/test_gsp_model.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
import os
import tempfile
import pytest
import numpy as np

from nowcasting_dataset.data_sources.fake import gsp_fake
from nowcasting_dataset.data_sources.gsp.gsp_model import GSP


def test_gsp_init():
    """Smoke test: calling ``gsp_fake`` with small sizes must not raise."""
    gsp_fake(batch_size=4, seq_length_30=5, n_gsp_per_batch=6)


def test_gsp_validation():
    """Check that ``GSP.model_validation`` accepts fake data and rejects NaNs."""
    gsp = gsp_fake(batch_size=4, seq_length_30=5, n_gsp_per_batch=6)

    # Untouched fake data must pass validation without raising.
    GSP.model_validation(gsp)

    # Inject a NaN. The validator is written with ``assert`` statements, so
    # expect AssertionError specifically: catching a bare ``Exception`` would
    # also let unrelated failures (e.g. an AttributeError) pass the test.
    gsp.data[0, 0] = np.nan
    with pytest.raises(AssertionError):
        GSP.model_validation(gsp)


def test_gsp_save():
    """Save a fake GSP object as batch 0 and check the netCDF file appears."""
    gsp = gsp_fake(batch_size=4, seq_length_30=5, n_gsp_per_batch=6)

    # The temporary directory is removed automatically when the block exits.
    with tempfile.TemporaryDirectory() as dirpath:
        gsp.save_netcdf(path=dirpath, batch_i=0)

        expected_file = f"{dirpath}/gsp/0.nc"
        assert os.path.exists(expected_file)
15 changes: 14 additions & 1 deletion tests/data_sources/satellite/test_satellite_model.py
Original file line number Diff line number Diff line change
@@ -1,11 +1,24 @@
import os
import tempfile
import pytest
import numpy as np

from nowcasting_dataset.data_sources.fake import satellite_fake
from nowcasting_dataset.data_sources.satellite.satellite_model import Satellite


def test_satellite_init():
_ = satellite_fake
_ = satellite_fake()


def test_satellite_validation():
    """Check that ``Satellite.model_validation`` accepts fake data and rejects NaNs."""
    sat = satellite_fake()

    # Untouched fake data must pass validation without raising.
    Satellite.model_validation(sat)

    # Inject a NaN. The validator is written with ``assert`` statements, so
    # expect AssertionError specifically: catching a bare ``Exception`` would
    # also let unrelated failures pass the test.
    sat.data[0, 0] = np.nan
    with pytest.raises(AssertionError):
        Satellite.model_validation(sat)


def test_satellite_save():
Expand Down
4 changes: 0 additions & 4 deletions tests/data_sources/sun/test_sun_data_source.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,4 @@
from nowcasting_dataset.data_sources.sun.sun_data_source import SunDataSource
from datetime import datetime

# from nowcasting_dataset.dataset.example import Example
from nowcasting_dataset.consts import SUN_ELEVATION_ANGLE, SUN_AZIMUTH_ANGLE
import pandas as pd


Expand Down
50 changes: 50 additions & 0 deletions tests/data_sources/sun/test_sun_model.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,50 @@
import os
import tempfile
import pytest
import numpy as np

from nowcasting_dataset.data_sources.fake import sun_fake
from nowcasting_dataset.data_sources.sun.sun_model import Sun


def test_sun_init():
    """Smoke test: calling ``sun_fake`` with small sizes must not raise."""
    sun_fake(batch_size=4, seq_length_5=17)


def test_sun_validation():
    """Check that ``Sun.model_validation`` accepts fake data and rejects NaNs."""
    sun = sun_fake(batch_size=4, seq_length_5=17)

    # Untouched fake data must pass validation without raising.
    Sun.model_validation(sun)

    # Inject a NaN into elevation. The validator is written with ``assert``
    # statements, so expect AssertionError rather than any ``Exception``,
    # which would also hide unrelated failures.
    sun.elevation[0, 0] = np.nan
    with pytest.raises(AssertionError):
        Sun.model_validation(sun)


def test_sun_validation_elevation():
    """Check that ``Sun.model_validation`` rejects an out-of-range elevation."""
    sun = sun_fake(batch_size=4, seq_length_5=17)

    # Untouched fake data must pass validation without raising.
    Sun.model_validation(sun)

    # 1000 is above the upper bound of 90 that the validator asserts on
    # elevation, so the range check must fail with AssertionError.
    sun.elevation[0, 0] = 1000
    with pytest.raises(AssertionError):
        Sun.model_validation(sun)


def test_sun_validation_azimuth():
    """Check that ``Sun.model_validation`` rejects an out-of-range azimuth."""
    sun = sun_fake(batch_size=4, seq_length_5=17)

    # Untouched fake data must pass validation without raising.
    Sun.model_validation(sun)

    # 1000 is above the upper bound of 360 that the validator asserts on
    # azimuth, so the range check must fail with AssertionError.
    sun.azimuth[0, 0] = 1000
    with pytest.raises(AssertionError):
        Sun.model_validation(sun)


def test_sun_save():
    """Save a fake Sun object as batch 0 and check the netCDF file appears."""
    sun = sun_fake(batch_size=4, seq_length_5=17)

    # The temporary directory is removed automatically when the block exits.
    with tempfile.TemporaryDirectory() as dirpath:
        sun.save_netcdf(path=dirpath, batch_i=0)

        expected_file = f"{dirpath}/sun/0.nc"
        assert os.path.exists(expected_file)