diff --git a/doc/whats-new.rst b/doc/whats-new.rst index 87559ee9b02..59c7c048c3f 100644 --- a/doc/whats-new.rst +++ b/doc/whats-new.rst @@ -41,6 +41,9 @@ Bug fixes Documentation ~~~~~~~~~~~~~ +- Added examples to docstrings of :py:meth:`Dataset.tail`, :py:meth:`Dataset.head`, :py:meth:`Dataset.dropna`, + :py:meth:`Dataset.ffill`, :py:meth:`Dataset.bfill`, :py:meth:`Dataset.set_coords`, :py:meth:`Dataset.reset_coords` + (:issue:`6793`, :pull:`7936`). By `Harshitha <https://github.com/harshitha1201>`_. - Added page on wrapping chunked numpy-like arrays as alternatives to dask arrays. (:pull:`7951`) By `Tom Nicholas <https://github.com/TomNicholas>`_. - Expanded the page on wrapping numpy-like "duck" arrays. diff --git a/xarray/core/dataset.py b/xarray/core/dataset.py index 8898cff789a..7bd92ea32a0 100644 --- a/xarray/core/dataset.py +++ b/xarray/core/dataset.py @@ -1742,6 +1742,33 @@ def set_coords(self: T_Dataset, names: Hashable | Iterable[Hashable]) -> T_Datas names : hashable or iterable of hashable Name(s) of variables in this dataset to convert into coordinates. + Examples + -------- + >>> dataset = xr.Dataset( + ... { + ... "pressure": ("time", [1.013, 1.2, 3.5]), + ... "time": pd.date_range("2023-01-01", periods=3), + ... } + ... ) + >>> dataset + <xarray.Dataset> + Dimensions: (time: 3) + Coordinates: + * time (time) datetime64[ns] 2023-01-01 2023-01-02 2023-01-03 + Data variables: + pressure (time) float64 1.013 1.2 3.5 + + >>> dataset.set_coords("pressure") + <xarray.Dataset> + Dimensions: (time: 3) + Coordinates: + pressure (time) float64 1.013 1.2 3.5 + * time (time) datetime64[ns] 2023-01-01 2023-01-02 2023-01-03 + Data variables: + *empty* + + On calling ``set_coords``, these data variables are converted to coordinates, as shown in the final dataset. + Returns ------- Dataset @@ -1780,9 +1807,66 @@ def reset_coords( If True, remove coordinates instead of converting them into variables. + Examples + -------- + >>> dataset = xr.Dataset( + ... { + ... "temperature": ( + ... ["time", "lat", "lon"], + ... 
[[[25, 26], [27, 28]], [[29, 30], [31, 32]]], + ... ), + ... "precipitation": ( + ... ["time", "lat", "lon"], + ... [[[0.5, 0.8], [0.2, 0.4]], [[0.3, 0.6], [0.7, 0.9]]], + ... ), + ... }, + ... coords={ + ... "time": pd.date_range(start="2023-01-01", periods=2), + ... "lat": [40, 41], + ... "lon": [-80, -79], + ... "altitude": 1000, + ... }, + ... ) + + # Dataset before resetting coordinates + + >>> dataset + <xarray.Dataset> + Dimensions: (time: 2, lat: 2, lon: 2) + Coordinates: + * time (time) datetime64[ns] 2023-01-01 2023-01-02 + * lat (lat) int64 40 41 + * lon (lon) int64 -80 -79 + altitude int64 1000 + Data variables: + temperature (time, lat, lon) int64 25 26 27 28 29 30 31 32 + precipitation (time, lat, lon) float64 0.5 0.8 0.2 0.4 0.3 0.6 0.7 0.9 + + # Reset the 'altitude' coordinate + + >>> dataset_reset = dataset.reset_coords("altitude") + + # Dataset after resetting coordinates + + >>> dataset_reset + <xarray.Dataset> + Dimensions: (time: 2, lat: 2, lon: 2) + Coordinates: + * time (time) datetime64[ns] 2023-01-01 2023-01-02 + * lat (lat) int64 40 41 + * lon (lon) int64 -80 -79 + Data variables: + temperature (time, lat, lon) int64 25 26 27 28 29 30 31 32 + precipitation (time, lat, lon) float64 0.5 0.8 0.2 0.4 0.3 0.6 0.7 0.9 + altitude int64 1000 + Returns ------- Dataset + + See Also + -------- + Dataset.set_coords """ if names is None: names = self._coord_names - set(self._indexes) @@ -2742,6 +2826,50 @@ def head( The keyword arguments form of ``indexers``. One of indexers or indexers_kwargs must be provided. + Examples + -------- + >>> dates = pd.date_range(start="2023-01-01", periods=5) + >>> pageviews = [1200, 1500, 900, 1800, 2000] + >>> visitors = [800, 1000, 600, 1200, 1500] + >>> dataset = xr.Dataset( + ... { + ... "pageviews": (("date"), pageviews), + ... "visitors": (("date"), visitors), + ... }, + ... coords={"date": dates}, + ... 
) + >>> busiest_days = dataset.sortby("pageviews", ascending=False) + >>> busiest_days.head() + <xarray.Dataset> + Dimensions: (date: 5) + Coordinates: + * date (date) datetime64[ns] 2023-01-05 2023-01-04 ... 2023-01-03 + Data variables: + pageviews (date) int64 2000 1800 1500 1200 900 + visitors (date) int64 1500 1200 1000 800 600 + + # Retrieve the 3 busiest days in terms of pageviews + + >>> busiest_days.head(3) + <xarray.Dataset> + Dimensions: (date: 3) + Coordinates: + * date (date) datetime64[ns] 2023-01-05 2023-01-04 2023-01-02 + Data variables: + pageviews (date) int64 2000 1800 1500 + visitors (date) int64 1500 1200 1000 + + # Using a dictionary to specify the number of elements for specific dimensions + + >>> busiest_days.head({"date": 3}) + <xarray.Dataset> + Dimensions: (date: 3) + Coordinates: + * date (date) datetime64[ns] 2023-01-05 2023-01-04 2023-01-02 + Data variables: + pageviews (date) int64 2000 1800 1500 + visitors (date) int64 1500 1200 1000 + See Also -------- Dataset.tail @@ -2788,6 +2916,48 @@ def tail( The keyword arguments form of ``indexers``. One of indexers or indexers_kwargs must be provided. + Examples + -------- + >>> activity_names = ["Walking", "Running", "Cycling", "Swimming", "Yoga"] + >>> durations = [30, 45, 60, 45, 60] # in minutes + >>> energies = [150, 300, 250, 400, 100] # in calories + >>> dataset = xr.Dataset( + ... { + ... "duration": (["activity"], durations), + ... "energy_expenditure": (["activity"], energies), + ... }, + ... coords={"activity": activity_names}, + ... ) + >>> sorted_dataset = dataset.sortby("energy_expenditure", ascending=False) + >>> sorted_dataset + <xarray.Dataset> + Dimensions: (activity: 5) + Coordinates: + * activity (activity) <U8 'Swimming' 'Running' ... 'Walking' 'Yoga' + Data variables: + duration (activity) int64 45 45 60 30 60 + energy_expenditure (activity) int64 400 300 250 150 100 + + # Activities with the least energy expenditure using tail() + + >>> sorted_dataset.tail(3) + <xarray.Dataset> + Dimensions: (activity: 3) + Coordinates: + * activity (activity) <U8 'Cycling' 'Walking' 'Yoga' + Data variables: + duration (activity) int64 60 30 60 + energy_expenditure (activity) int64 250 150 100 + + # Using a dictionary to specify the number of elements + + >>> sorted_dataset.tail({"activity": 3}) + <xarray.Dataset> + Dimensions: (activity: 3) + Coordinates: + * activity (activity) <U8 'Cycling' 'Walking' 'Yoga' + Data variables: + duration (activity) int64 60 30 60 + energy_expenditure (activity) int64 250 150 100 + See Also -------- Dataset.head @@ ... @@ def dropna( + Examples + -------- + >>> dataset = xr.Dataset( + ... { + ... "temperature": ( + ... ["time", "location"], + ... 
[[23.4, 24.1], [np.nan, 22.1], [21.8, 24.2], [20.5, 25.3]], + ... ) + ... }, + ... coords={"time": [1, 2, 3, 4], "location": ["A", "B"]}, + ... ) + >>> dataset + <xarray.Dataset> + Dimensions: (time: 4, location: 2) + Coordinates: + * time (time) int64 1 2 3 4 + * location (location) <U1 'A' 'B' + Data variables: + temperature (time, location) float64 23.4 24.1 nan 22.1 21.8 24.2 20.5 25.3 + + # Drop NaN values from the dataset + + >>> dataset.dropna(dim="time") + <xarray.Dataset> + Dimensions: (time: 3, location: 2) + Coordinates: + * time (time) int64 1 3 4 + * location (location) <U1 'A' 'B' + Data variables: + temperature (time, location) float64 23.4 24.1 21.8 24.2 20.5 25.3 + + # Drop labels with any NaN values + + >>> dataset.dropna(dim="time", how="any") + <xarray.Dataset> + Dimensions: (time: 3, location: 2) + Coordinates: + * time (time) int64 1 3 4 + * location (location) <U1 'A' 'B' + Data variables: + temperature (time, location) float64 23.4 24.1 21.8 24.2 20.5 25.3 + + # Drop labels with all NaN values + + >>> dataset.dropna(dim="time", how="all") + <xarray.Dataset> + Dimensions: (time: 4, location: 2) + Coordinates: + * time (time) int64 1 2 3 4 + * location (location) <U1 'A' 'B' + Data variables: + temperature (time, location) float64 23.4 24.1 nan 22.1 21.8 24.2 20.5 25.3 + + # Drop labels with less than 2 non-NA values + + >>> dataset.dropna(dim="time", thresh=2) + <xarray.Dataset> + Dimensions: (time: 3, location: 2) + Coordinates: + * time (time) int64 1 3 4 + * location (location) <U1 'A' 'B' + Data variables: + temperature (time, location) float64 23.4 24.1 21.8 24.2 20.5 25.3 + + Returns + ------- + Dataset @@ ... @@ def ffill(self: T_Dataset, dim: Hashable, limit: int | None = None) -> T_Dataset Parameters ---------- dim : Hashable - Specifies the dimension along which to propagate values when - filling. + Specifies the dimension along which to propagate values when filling. limit : int or None, optional The maximum number of consecutive NaN values to forward fill. In other words, if there is a gap with more than this number of @@ -5886,9 +6119,48 @@ def ffill(self: T_Dataset, dim: Hashable, limit: int | None = None) -> T_Dataset than 0 or None for no limit. Must be None or greater than or equal to axis length if filling along chunked axes (dimensions). + Examples + -------- + >>> time = pd.date_range("2023-01-01", periods=10, freq="D") + >>> data = np.array( + ... [1, np.nan, np.nan, np.nan, 5, np.nan, np.nan, 8, np.nan, 10] + ... ) + >>> dataset = xr.Dataset({"data": (("time",), data)}, coords={"time": time}) + >>> dataset + <xarray.Dataset> + Dimensions: (time: 10) + Coordinates: + * time (time) datetime64[ns] 2023-01-01 2023-01-02 ... 
2023-01-10 + Data variables: + data (time) float64 1.0 nan nan nan 5.0 nan nan 8.0 nan 10.0 + + # Perform forward fill (ffill) on the dataset + + >>> dataset.ffill(dim="time") + <xarray.Dataset> + Dimensions: (time: 10) + Coordinates: + * time (time) datetime64[ns] 2023-01-01 2023-01-02 ... 2023-01-10 + Data variables: + data (time) float64 1.0 1.0 1.0 1.0 5.0 5.0 5.0 8.0 8.0 10.0 + + # Limit the forward filling to a maximum of 2 consecutive NaN values + + >>> dataset.ffill(dim="time", limit=2) + <xarray.Dataset> + Dimensions: (time: 10) + Coordinates: + * time (time) datetime64[ns] 2023-01-01 2023-01-02 ... 2023-01-10 + Data variables: + data (time) float64 1.0 1.0 1.0 nan 5.0 5.0 5.0 8.0 8.0 10.0 + Returns ------- Dataset + + See Also + -------- + Dataset.bfill """ from xarray.core.missing import _apply_over_vars_with_dim, ffill @@ -5912,9 +6184,48 @@ def bfill(self: T_Dataset, dim: Hashable, limit: int | None = None) -> T_Dataset than 0 or None for no limit. Must be None or greater than or equal to axis length if filling along chunked axes (dimensions). + Examples + -------- + >>> time = pd.date_range("2023-01-01", periods=10, freq="D") + >>> data = np.array( + ... [1, np.nan, np.nan, np.nan, 5, np.nan, np.nan, 8, np.nan, 10] + ... ) + >>> dataset = xr.Dataset({"data": (("time",), data)}, coords={"time": time}) + >>> dataset + <xarray.Dataset> + Dimensions: (time: 10) + Coordinates: + * time (time) datetime64[ns] 2023-01-01 2023-01-02 ... 2023-01-10 + Data variables: + data (time) float64 1.0 nan nan nan 5.0 nan nan 8.0 nan 10.0 + + # Fill NaN values by propagating values backward + + >>> dataset.bfill(dim="time") + <xarray.Dataset> + Dimensions: (time: 10) + Coordinates: + * time (time) datetime64[ns] 2023-01-01 2023-01-02 ... 
2023-01-10 + Data variables: + data (time) float64 1.0 5.0 5.0 5.0 5.0 8.0 8.0 8.0 10.0 10.0 + + # Limit the backward filling to a maximum of 2 consecutive NaN values + + >>> dataset.bfill(dim="time", limit=2) + <xarray.Dataset> + Dimensions: (time: 10) + Coordinates: + * time (time) datetime64[ns] 2023-01-01 2023-01-02 ... 2023-01-10 + Data variables: + data (time) float64 1.0 nan 5.0 5.0 5.0 8.0 8.0 8.0 10.0 10.0 + Returns ------- Dataset + + See Also + -------- + Dataset.ffill """ from xarray.core.missing import _apply_over_vars_with_dim, bfill