3
3
"""
4
4
5
5
6
- def gapminder ():
6
+ def gapminder (datetimes = False , centroids = False , year = None ):
7
7
"""
8
- Each row represents a country on a given year.
8
+ Each row represents a country on a given year.
9
9
10
- https://www.gapminder.org/data/
10
+ https://www.gapminder.org/data/
11
11
12
- Returns:
13
- A `pandas.DataFrame` with 1704 rows and the following columns:
14
- `['country', 'continent', 'year', 'lifeExp', 'pop', 'gdpPercap',
15
- 'iso_alpha', 'iso_num']`.
16
- """
17
- return _get_dataset ("gapminder" )
12
+ Returns:
13
+ A `pandas.DataFrame` with 1704 rows and the following columns:
14
+ `['country', 'continent', 'year', 'lifeExp', 'pop', 'gdpPercap',
15
+ 'iso_alpha', 'iso_num']`.
16
+ If `datetimes` is True, the 'year' column will be a datetime column.
17
+ If `centroids` is True, two new columns are added: ['centroid_lat', 'centroid_lon'].
18
+ If `year` is an integer, the dataset will be filtered for that year.
19
+ """
20
+ df = _get_dataset ("gapminder" )
21
+ if datetimes :
22
+ df ["year" ] = (df ["year" ].astype (str ) + "-01-01" ).astype ("datetime64[ns]" )
23
+ if not centroids :
24
+ df .drop (["centroid_lat" , "centroid_lon" ], axis = 1 , inplace = True )
25
+ if year :
26
+ df = df .query ("year == %d" % year )
27
+ return df
18
28
19
29
20
30
def tips ():
21
31
"""
22
- Each row represents a restaurant bill.
32
+ Each row represents a restaurant bill.
23
33
24
- https://vincentarelbundock.github.io/Rdatasets/doc/reshape2/tips.html
34
+ https://vincentarelbundock.github.io/Rdatasets/doc/reshape2/tips.html
25
35
26
- Returns:
27
- A `pandas.DataFrame` with 244 rows and the following columns:
28
- `['total_bill', 'tip', 'sex', 'smoker', 'day', 'time', 'size']`.
29
- """
36
+ Returns:
37
+ A `pandas.DataFrame` with 244 rows and the following columns:
38
+ `['total_bill', 'tip', 'sex', 'smoker', 'day', 'time', 'size']`."""
30
39
return _get_dataset ("tips" )
31
40
32
41
33
42
def iris ():
34
43
"""
35
- Each row represents a flower.
44
+ Each row represents a flower.
36
45
37
- https://en.wikipedia.org/wiki/Iris_flower_data_set
46
+ https://en.wikipedia.org/wiki/Iris_flower_data_set
38
47
39
- Returns:
40
- A `pandas.DataFrame` with 150 rows and the following columns:
41
- `['sepal_length', 'sepal_width', 'petal_length', 'petal_width', 'species', 'species_id']`.
42
- """
48
+ Returns:
49
+ A `pandas.DataFrame` with 150 rows and the following columns:
50
+ `['sepal_length', 'sepal_width', 'petal_length', 'petal_width', 'species', 'species_id']`."""
43
51
return _get_dataset ("iris" )
44
52
45
53
46
54
def wind ():
47
55
"""
48
- Each row represents a level of wind intensity in a cardinal direction, and its frequency.
56
+ Each row represents a level of wind intensity in a cardinal direction, and its frequency.
49
57
50
- Returns:
51
- A `pandas.DataFrame` with 128 rows and the following columns:
52
- `['direction', 'strength', 'frequency']`.
53
- """
58
+ Returns:
59
+ A `pandas.DataFrame` with 128 rows and the following columns:
60
+ `['direction', 'strength', 'frequency']`."""
54
61
return _get_dataset ("wind" )
55
62
56
63
57
64
def election ():
58
65
"""
59
- Each row represents voting results for an electoral district in the 2013 Montreal
60
- mayoral election.
66
+ Each row represents voting results for an electoral district in the 2013 Montreal
67
+ mayoral election.
61
68
62
- Returns:
63
- A `pandas.DataFrame` with 58 rows and the following columns:
64
- `['district', 'Coderre', 'Bergeron', 'Joly', 'total', 'winner', 'result', 'district_id']`.
65
- """
69
+ Returns:
70
+ A `pandas.DataFrame` with 58 rows and the following columns:
71
+ `['district', 'Coderre', 'Bergeron', 'Joly', 'total', 'winner', 'result', 'district_id']`."""
66
72
return _get_dataset ("election" )
67
73
68
74
69
75
def election_geojson ():
70
76
"""
71
- Each feature represents an electoral district in the 2013 Montreal mayoral election.
77
+ Each feature represents an electoral district in the 2013 Montreal mayoral election.
72
78
73
- Returns:
74
- A GeoJSON-formatted `dict` with 58 polygon or multi-polygon features whose `id`
75
- is an electoral district numerical ID and whose `district` property is the ID and
76
- district name.
77
- """
79
+ Returns:
80
+ A GeoJSON-formatted `dict` with 58 polygon or multi-polygon features whose `id`
81
+ is an electoral district numerical ID and whose `district` property is the ID and
82
+ district name."""
78
83
import gzip
79
84
import json
80
85
import os
@@ -92,27 +97,28 @@ def election_geojson():
92
97
93
98
def carshare ():
94
99
"""
95
- Each row represents the availability of car-sharing services near the centroid of a zone
96
- in Montreal over a month-long period.
100
+ Each row represents the availability of car-sharing services near the centroid of a zone
101
+ in Montreal over a month-long period.
97
102
98
- Returns:
99
- A `pandas.DataFrame` with 249 rows and the following columns:
100
- `['centroid_lat', 'centroid_lon', 'car_hours', 'peak_hour']`.
101
- """
103
+ Returns:
104
+ A `pandas.DataFrame` with 249 rows and the following columns:
105
+ `['centroid_lat', 'centroid_lon', 'car_hours', 'peak_hour']`."""
102
106
return _get_dataset ("carshare" )
103
107
104
108
105
- def stocks (indexed = False ):
109
+ def stocks (indexed = False , datetimes = False ):
106
110
"""
107
- Each row in this wide dataset represents closing prices from 6 tech stocks in 2018/2019.
108
-
109
- Returns:
110
- A `pandas.DataFrame` with 100 rows and the following columns:
111
- `['date', 'GOOG', 'AAPL', 'AMZN', 'FB', 'NFLX', 'MSFT']`.
112
- If `indexed` is True, the 'date' column is used as the index and the column index
113
- is named 'company'
114
- """
111
+ Each row in this wide dataset represents closing prices from 6 tech stocks in 2018/2019.
112
+
113
+ Returns:
114
+ A `pandas.DataFrame` with 100 rows and the following columns:
115
+ `['date', 'GOOG', 'AAPL', 'AMZN', 'FB', 'NFLX', 'MSFT']`.
116
+ If `indexed` is True, the 'date' column is used as the index and the column index
117
+ is named 'company'
118
+ If `datetimes` is True, the 'date' column will be a datetime column """
115
119
df = _get_dataset ("stocks" )
120
+ if datetimes :
121
+ df ["date" ] = df ["date" ].astype ("datetime64[ns]" )
116
122
if indexed :
117
123
df = df .set_index ("date" )
118
124
df .columns .name = "company"
@@ -121,15 +127,14 @@ def stocks(indexed=False):
121
127
122
128
def experiment (indexed = False ):
123
129
"""
124
- Each row in this wide dataset represents the results of 100 simulated participants
125
- on three hypothetical experiments, along with their gender and control/treatment group.
130
+ Each row in this wide dataset represents the results of 100 simulated participants
131
+ on three hypothetical experiments, along with their gender and control/treatment group.
126
132
127
133
128
- Returns:
129
- A `pandas.DataFrame` with 100 rows and the following columns:
130
- `['experiment_1', 'experiment_2', 'experiment_3', 'gender', 'group']`.
131
- If `indexed` is True, the data frame index is named "participant"
132
- """
134
+ Returns:
135
+ A `pandas.DataFrame` with 100 rows and the following columns:
136
+ `['experiment_1', 'experiment_2', 'experiment_3', 'gender', 'group']`.
137
+ If `indexed` is True, the data frame index is named "participant" """
133
138
df = _get_dataset ("experiment" )
134
139
if indexed :
135
140
df .index .name = "participant"
@@ -138,15 +143,14 @@ def experiment(indexed=False):
138
143
139
144
def medals_wide (indexed = False ):
140
145
"""
141
- This dataset represents the medal table for Olympic Short Track Speed Skating for the
142
- top three nations as of 2020.
143
-
144
- Returns:
145
- A `pandas.DataFrame` with 3 rows and the following columns:
146
- `['nation', 'gold', 'silver', 'bronze']`.
147
- If `indexed` is True, the 'nation' column is used as the index and the column index
148
- is named 'medal'
149
- """
146
+ This dataset represents the medal table for Olympic Short Track Speed Skating for the
147
+ top three nations as of 2020.
148
+
149
+ Returns:
150
+ A `pandas.DataFrame` with 3 rows and the following columns:
151
+ `['nation', 'gold', 'silver', 'bronze']`.
152
+ If `indexed` is True, the 'nation' column is used as the index and the column index
153
+ is named 'medal'"""
150
154
df = _get_dataset ("medals" )
151
155
if indexed :
152
156
df = df .set_index ("nation" )
@@ -156,14 +160,13 @@ def medals_wide(indexed=False):
156
160
157
161
def medals_long (indexed = False ):
158
162
"""
159
- This dataset represents the medal table for Olympic Short Track Speed Skating for the
160
- top three nations as of 2020.
163
+ This dataset represents the medal table for Olympic Short Track Speed Skating for the
164
+ top three nations as of 2020.
161
165
162
- Returns:
163
- A `pandas.DataFrame` with 9 rows and the following columns:
164
- `['nation', 'medal', 'count']`.
165
- If `indexed` is True, the 'nation' column is used as the index.
166
- """
166
+ Returns:
167
+ A `pandas.DataFrame` with 9 rows and the following columns:
168
+ `['nation', 'medal', 'count']`.
169
+ If `indexed` is True, the 'nation' column is used as the index."""
167
170
df = _get_dataset ("medals" ).melt (
168
171
id_vars = ["nation" ], value_name = "count" , var_name = "medal"
169
172
)
0 commit comments