Merge pull request #3305 from plotly/extended_data

nicolaskruchten · web-flow · commit 612d3f922a66 · 2021-07-17T08:08:11.000-04:00
add some extra options to various demo datasets
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -4,6 +4,10 @@ This project adheres to [Semantic Versioning](http://semver.org/).
 
 ## UNRELEASED
 
+
+### Added
+  - Extra flags were added to the `gapminder` and `stocks` datasets to facilitate testing, documentation and demos [#3305](https://github.com/plotly/plotly.py/pull/3305)
+
 ### Fixed
   - Fixed regression introduced in version 5.0.0 where pandas/numpy arrays with `dtype` of Object were being converted to `list` values when added to a Figure ([#3292](https://github.com/plotly/plotly.py/issues/3292), [#3293](https://github.com/plotly/plotly.py/pull/3293))
 
diff --git a/packages/python/plotly/plotly/data/__init__.py b/packages/python/plotly/plotly/data/__init__.py
@@ -3,78 +3,83 @@
 """
 
 
-def gapminder():
+def gapminder(datetimes=False, centroids=False, year=None):
     """
-Each row represents a country on a given year.
+    Each row represents a country on a given year.
 
-https://www.gapminder.org/data/
+    https://www.gapminder.org/data/
 
-Returns:
-    A `pandas.DataFrame` with 1704 rows and the following columns:
-    `['country', 'continent', 'year', 'lifeExp', 'pop', 'gdpPercap',
-    'iso_alpha', 'iso_num']`.
-"""
-    return _get_dataset("gapminder")
+    Returns:
+        A `pandas.DataFrame` with 1704 rows and the following columns:
+        `['country', 'continent', 'year', 'lifeExp', 'pop', 'gdpPercap',
+        'iso_alpha', 'iso_num']`.
+        If `datetimes` is True, the 'year' column will be a datetime column
+        If `centroids` is True, two new columns are added: ['centroid_lat', 'centroid_lon']
+        If `year` is an integer, the dataset will be filtered for that year
+    """
+    df = _get_dataset("gapminder")
+    if datetimes:
+        df["year"] = (df["year"].astype(str) + "-01-01").astype("datetime64[ns]")
+    if not centroids:
+        df.drop(["centroid_lat", "centroid_lon"], axis=1, inplace=True)
+    if year:
+        df = df.query("year == %d" % year)
+    return df
 
 
 def tips():
     """
-Each row represents a restaurant bill.
+    Each row represents a restaurant bill.
 
-https://vincentarelbundock.github.io/Rdatasets/doc/reshape2/tips.html
+    https://vincentarelbundock.github.io/Rdatasets/doc/reshape2/tips.html
 
-Returns:
-    A `pandas.DataFrame` with 244 rows and the following columns:
-    `['total_bill', 'tip', 'sex', 'smoker', 'day', 'time', 'size']`.
-"""
+    Returns:
+        A `pandas.DataFrame` with 244 rows and the following columns:
+        `['total_bill', 'tip', 'sex', 'smoker', 'day', 'time', 'size']`."""
     return _get_dataset("tips")
 
 
 def iris():
     """
-Each row represents a flower.
+    Each row represents a flower.
 
-https://en.wikipedia.org/wiki/Iris_flower_data_set
+    https://en.wikipedia.org/wiki/Iris_flower_data_set
 
-Returns:
-    A `pandas.DataFrame` with 150 rows and the following columns:
-    `['sepal_length', 'sepal_width', 'petal_length', 'petal_width', 'species', 'species_id']`.
-"""
+    Returns:
+        A `pandas.DataFrame` with 150 rows and the following columns:
+        `['sepal_length', 'sepal_width', 'petal_length', 'petal_width', 'species', 'species_id']`."""
     return _get_dataset("iris")
 
 
 def wind():
     """
-Each row represents a level of wind intensity in a cardinal direction, and its frequency.
+    Each row represents a level of wind intensity in a cardinal direction, and its frequency.
 
-Returns:
-    A `pandas.DataFrame` with 128 rows and the following columns:
-    `['direction', 'strength', 'frequency']`.
-"""
+    Returns:
+        A `pandas.DataFrame` with 128 rows and the following columns:
+        `['direction', 'strength', 'frequency']`."""
     return _get_dataset("wind")
 
 
 def election():
     """
-Each row represents voting results for an electoral district in the 2013 Montreal
-mayoral election.
+    Each row represents voting results for an electoral district in the 2013 Montreal
+    mayoral election.
 
-Returns:
-    A `pandas.DataFrame` with 58 rows and the following columns:
-    `['district', 'Coderre', 'Bergeron', 'Joly', 'total', 'winner', 'result', 'district_id']`.
-"""
+    Returns:
+        A `pandas.DataFrame` with 58 rows and the following columns:
+        `['district', 'Coderre', 'Bergeron', 'Joly', 'total', 'winner', 'result', 'district_id']`."""
     return _get_dataset("election")
 
 
 def election_geojson():
     """
-Each feature represents an electoral district in the 2013 Montreal mayoral election.
+    Each feature represents an electoral district in the 2013 Montreal mayoral election.
 
-Returns:
-    A GeoJSON-formatted `dict` with 58 polygon or multi-polygon features whose `id`
-    is an electoral district numerical ID and whose `district` property is the ID and
-    district name.
-"""
+    Returns:
+        A GeoJSON-formatted `dict` with 58 polygon or multi-polygon features whose `id`
+        is an electoral district numerical ID and whose `district` property is the ID and
+        district name."""
     import gzip
     import json
     import os
@@ -92,27 +97,28 @@ def election_geojson():
 
 def carshare():
     """
-Each row represents the availability of car-sharing services near the centroid of a zone
-in Montreal over a month-long period.
+    Each row represents the availability of car-sharing services near the centroid of a zone
+    in Montreal over a month-long period.
 
-Returns:
-    A `pandas.DataFrame` with 249 rows and the following columns:
-    `['centroid_lat', 'centroid_lon', 'car_hours', 'peak_hour']`.
-"""
+    Returns:
+        A `pandas.DataFrame` with 249 rows and the following columns:
+        `['centroid_lat', 'centroid_lon', 'car_hours', 'peak_hour']`."""
     return _get_dataset("carshare")
 
 
-def stocks(indexed=False):
+def stocks(indexed=False, datetimes=False):
     """
-Each row in this wide dataset represents closing prices from 6 tech stocks in 2018/2019.
-
-Returns:
-    A `pandas.DataFrame` with 100 rows and the following columns:
-    `['date', 'GOOG', 'AAPL', 'AMZN', 'FB', 'NFLX', 'MSFT']`.
-    If `indexed` is True, the 'date' column is used as the index and the column index
-    is named 'company'
-"""
+    Each row in this wide dataset represents closing prices from 6 tech stocks in 2018/2019.
+
+    Returns:
+        A `pandas.DataFrame` with 100 rows and the following columns:
+        `['date', 'GOOG', 'AAPL', 'AMZN', 'FB', 'NFLX', 'MSFT']`.
+        If `indexed` is True, the 'date' column is used as the index and the column index
+        is named 'company'
+        If `datetimes` is True, the 'date' column will be a datetime column"""
     df = _get_dataset("stocks")
+    if datetimes:
+        df["date"] = df["date"].astype("datetime64[ns]")
     if indexed:
         df = df.set_index("date")
         df.columns.name = "company"
@@ -121,15 +127,14 @@ def stocks(indexed=False):
 
 def experiment(indexed=False):
     """
-Each row in this wide dataset represents the results of 100 simulated participants
-on three hypothetical experiments, along with their gender and control/treatment group.
+    Each row in this wide dataset represents the results of 100 simulated participants
+    on three hypothetical experiments, along with their gender and control/treatment group.
 
 
-Returns:
-    A `pandas.DataFrame` with 100 rows and the following columns:
-    `['experiment_1', 'experiment_2', 'experiment_3', 'gender', 'group']`.
-    If `indexed` is True, the data frame index is named "participant"
-"""
+    Returns:
+        A `pandas.DataFrame` with 100 rows and the following columns:
+        `['experiment_1', 'experiment_2', 'experiment_3', 'gender', 'group']`.
+        If `indexed` is True, the data frame index is named "participant" """
     df = _get_dataset("experiment")
     if indexed:
         df.index.name = "participant"
@@ -138,15 +143,14 @@ def experiment(indexed=False):
 
 def medals_wide(indexed=False):
     """
-This dataset represents the medal table for Olympic Short Track Speed Skating for the
-top three nations as of 2020.
-
-Returns:
-    A `pandas.DataFrame` with 3 rows and the following columns:
-    `['nation', 'gold', 'silver', 'bronze']`.
-    If `indexed` is True, the 'nation' column is used as the index and the column index
-    is named 'medal'
-"""
+    This dataset represents the medal table for Olympic Short Track Speed Skating for the
+    top three nations as of 2020.
+
+    Returns:
+        A `pandas.DataFrame` with 3 rows and the following columns:
+        `['nation', 'gold', 'silver', 'bronze']`.
+        If `indexed` is True, the 'nation' column is used as the index and the column index
+        is named 'medal'"""
     df = _get_dataset("medals")
     if indexed:
         df = df.set_index("nation")
@@ -156,14 +160,13 @@ def medals_wide(indexed=False):
 
 def medals_long(indexed=False):
     """
-This dataset represents the medal table for Olympic Short Track Speed Skating for the
-top three nations as of 2020.
+    This dataset represents the medal table for Olympic Short Track Speed Skating for the
+    top three nations as of 2020.
 
-Returns:
-    A `pandas.DataFrame` with 9 rows and the following columns:
-    `['nation', 'medal', 'count']`.
-    If `indexed` is True, the 'nation' column is used as the index.
-"""
+    Returns:
+        A `pandas.DataFrame` with 9 rows and the following columns:
+        `['nation', 'medal', 'count']`.
+        If `indexed` is True, the 'nation' column is used as the index."""
     df = _get_dataset("medals").melt(
         id_vars=["nation"], value_name="count", var_name="medal"
     )
diff --git a/packages/python/plotly/plotly/package_data/datasets/gapminder.csv.gz b/packages/python/plotly/plotly/package_data/datasets/gapminder.csv.gz