Skip to content

Commit abeb576

Browse files
committed
boxplot: handle non-numeric cols & duplicate rows
1 parent ec3e75d commit abeb576

File tree

5 files changed

+74
-29
lines changed

5 files changed

+74
-29
lines changed

rest_pandas/serializers.py

+31 -17
Original file line number | Diff line number | Diff line change
@@ -1,7 +1,9 @@
11
from rest_framework import serializers
22
from pandas import DataFrame
3+
from pandas.api.types import is_numeric_dtype
34
from django.core.exceptions import ImproperlyConfigured
45
import datetime
6+
from collections import OrderedDict
57

68

79
class PandasSerializer(serializers.ListSerializer):
@@ -222,12 +224,15 @@ def get_index(self, dataframe):
222224
group_field = self.get_group_field()
223225
date_field = self.get_date_field()
224226
header_fields = self.get_header_fields()
227+
extra_index_fields = self.get_extra_index_fields()
225228

229+
index = []
226230
if date_field:
227-
group_fields = [date_field, group_field]
228-
else:
229-
group_fields = [group_field]
230-
return group_fields + header_fields
231+
index.append(date_field)
232+
index += extra_index_fields
233+
index.append(group_field)
234+
index += header_fields
235+
return index
231236

232237
def transform_dataframe(self, dataframe):
233238
"""
@@ -255,35 +260,30 @@ def transform_dataframe(self, dataframe):
255260
interval = None
256261

257262
# Compute stats for each column, potentially grouped by year
258-
all_stats = []
263+
series_infos = OrderedDict()
259264
for header, series in groups.items():
260265
if interval:
261266
series_stats = self.boxplots_for_interval(series, interval)
262267
else:
263-
interval = None
264268
series_stats = [self.compute_boxplot(series)]
265269

266-
series_infos = []
267270
for series_stat in series_stats:
268-
series_info = {}
269271
if isinstance(header, tuple):
270272
value_name = header[0]
271273
col_values = header[1:]
272274
else:
273275
value_name = header
274276
col_values = []
275-
col_names = zip(dataframe.columns.names[1:], col_values)
276-
for col_name, value in col_names:
277-
series_info[col_name] = value
277+
col_names = tuple(zip(dataframe.columns.names[1:], col_values))
278+
if interval in series_stat:
279+
col_names += ((interval, series_stat[interval]),)
280+
series_infos.setdefault(col_names, dict(col_names))
281+
series_info = series_infos[col_names]
278282
for stat_name, val in series_stat.items():
279-
if stat_name == interval:
280-
series_info[stat_name] = val
281-
else:
283+
if stat_name != interval:
282284
series_info[value_name + '-' + stat_name] = val
283-
series_infos.append(series_info)
284-
all_stats += series_infos
285285

286-
dataframe = DataFrame(all_stats)
286+
dataframe = DataFrame(list(series_infos.values()))
287287
if 'series' in grouping:
288288
index = header_fields + [group_field]
289289
unstack = len(header_fields)
@@ -336,11 +336,19 @@ def compute_boxplot(self, series):
336336
series = series[series.notnull()]
337337
if len(series.values) == 0:
338338
return {}
339+
elif not is_numeric_dtype(series):
340+
return self.non_numeric_stats(series)
339341
stats = boxplot_stats(list(series.values))[0]
340342
stats['count'] = len(series.values)
341343
stats['fliers'] = "|".join(map(str, stats['fliers']))
342344
return stats
343345

346+
def non_numeric_stats(self, series):
347+
return {
348+
'count': len(series),
349+
'mode': series.mode()[0],
350+
}
351+
344352
def get_group_field(self):
345353
"""
346354
Categorical field to group datasets by.
@@ -359,6 +367,12 @@ def get_header_fields(self):
359367
"""
360368
return self.get_meta_option('boxplot_header', [])
361369

370+
def get_extra_index_fields(self):
371+
"""
372+
Fields that identify each row but don't need to be considered for plot
373+
"""
374+
return self.get_meta_option('boxplot_extra_index', [])
375+
362376

363377
class SimpleSerializer(serializers.Serializer):
364378
"""

tests/test_complex.py

+23 -10
Original file line number | Diff line number | Diff line change
@@ -30,16 +30,19 @@ def setUp(self):
3030
('site2', 'flow', 'cfs', '2015-01-04', 'routine', 0.3, None),
3131
('site2', 'flow', 'cfs', '2015-01-05', 'routine', 0.8, None),
3232
)
33-
for site, parameter, units, date, type, value, flag in data:
34-
ComplexTimeSeries.objects.create(
35-
site=site,
36-
parameter=parameter,
37-
units=units,
38-
date=date,
39-
type=type,
40-
value=value,
41-
flag=flag,
42-
)
33+
for row in data:
34+
self.create_row(*row)
35+
36+
def create_row(self, site, parameter, units, date, type, value, flag):
37+
ComplexTimeSeries.objects.create(
38+
site=site,
39+
parameter=parameter,
40+
units=units,
41+
date=date,
42+
type=type,
43+
value=value,
44+
flag=flag,
45+
)
4346

4447
def test_complex_series(self):
4548
response = self.client.get("/complextimeseries.csv")
@@ -285,5 +288,15 @@ def test_complex_boxplot_year(self):
285288
self.assertEqual(round(stats['value-mean'], 5), 0.56111)
286289
self.assertEqual(stats['value-whishi'], 1.5)
287290

291+
@unittest.skipUnless(HAS_MATPLOTLIB, "requires matplotlib")
292+
def test_complex_boxplot_extra(self):
293+
self.create_row(
294+
'site1', 'flow', 'cfs', '2015-01-01', 'routine', 0.3, None
295+
)
296+
with self.assertRaises(ValueError):
297+
response = self.client.get("/complexboxplot.csv")
298+
response = self.client.get("/complexboxplotextra.csv")
299+
self.assertEqual(1, len(response.data))
300+
288301
def parse_csv(self, response):
289302
return parse_csv(response.content.decode('utf-8'))

tests/testapp/serializers.py

+10
Original file line number | Diff line number | Diff line change
@@ -57,6 +57,16 @@ class Meta(ComplexTimeSeriesSerializer.Meta):
5757
pandas_boxplot_header = ['units', 'parameter']
5858

5959

60+
class ComplexBoxplotExtraSerializer(ComplexTimeSeriesSerializer):
61+
class Meta:
62+
model = ComplexTimeSeries
63+
fields = '__all__'
64+
pandas_boxplot_group = 'site'
65+
pandas_boxplot_date = 'date'
66+
pandas_boxplot_extra_index = ['id'] # Ensure row uniqueness
67+
pandas_boxplot_header = ['units', 'parameter']
68+
69+
6070
class NotUnstackableSerializer(ModelSerializer):
6171
class Meta:
6272
model = MultiTimeSeries

tests/testapp/urls.py

+3 -1
Original file line number | Diff line number | Diff line change
@@ -7,7 +7,8 @@
77
TimeSeriesMixedRendererView, TimeSeriesMixinView, TimeSeriesNoMixinView,
88
DjangoPandasView, TimeSeriesViewSet,
99
MultiTimeSeriesView, MultiScatterView, MultiBoxplotView,
10-
ComplexTimeSeriesView, ComplexScatterView, ComplexBoxplotView,
10+
ComplexTimeSeriesView, ComplexScatterView,
11+
ComplexBoxplotView, ComplexBoxplotExtraView,
1112
CustomIndexSeriesView,
1213
)
1314

@@ -29,6 +30,7 @@
2930
url(r'^complextimeseries$', ComplexTimeSeriesView.as_view()),
3031
url(r'^complexscatter$', ComplexScatterView.as_view()),
3132
url(r'^complexboxplot$', ComplexBoxplotView.as_view()),
33+
url(r'^complexboxplotextra$', ComplexBoxplotExtraView.as_view()),
3234
url(r'^customindex$', CustomIndexSeriesView.as_view()),
3335
]
3436
urlpatterns = format_suffix_patterns(urlpatterns)

tests/testapp/views.py

+7 -1
Original file line number | Diff line number | Diff line change
@@ -12,7 +12,7 @@
1212
TimeSeriesSerializer, TimeSeriesNoIdSerializer,
1313
MultiTimeSeriesSerializer,
1414
ComplexTimeSeriesSerializer, ComplexScatterSerializer,
15-
ComplexBoxplotSerializer,
15+
ComplexBoxplotSerializer, ComplexBoxplotExtraSerializer,
1616
CustomIndexSeriesSerializer,
1717
)
1818
import pandas as pd
@@ -127,6 +127,12 @@ class ComplexBoxplotView(PandasView):
127127
pandas_serializer_class = PandasBoxplotSerializer
128128

129129

130+
class ComplexBoxplotExtraView(PandasView):
131+
queryset = ComplexTimeSeries.objects.all()
132+
serializer_class = ComplexBoxplotExtraSerializer
133+
pandas_serializer_class = PandasBoxplotSerializer
134+
135+
130136
class CustomIndexSeriesView(PandasView):
131137
queryset = CustomIndexSeries.objects.all()
132138
serializer_class = CustomIndexSeriesSerializer

0 commit comments

Comments
 (0)