Skip to content

Commit 68f6268

Browse files
committed
PERF: Improve performance of CustomBusinessDay
1 parent fccd7fe commit 68f6268

File tree

5 files changed

+145
-52
lines changed

5 files changed

+145
-52
lines changed

doc/source/v0.15.0.txt

+1
Original file line numberDiff line numberDiff line change
@@ -833,6 +833,7 @@ Performance
833833
- Performance improvements in groupby ``.agg`` and ``.apply`` where builtins max/min were not mapped to numpy/cythonized versions (:issue:`7722`)
834834
- Performance improvement in writing to sql (``to_sql``) of up to 50% (:issue:`8208`).
835835
- Performance benchmarking of groupby for large values of ngroups (:issue:`6787`)
836+
- Performance improvement in ``CustomBusinessDay``, ``CustomBusinessMonthBegin`` and ``CustomBusinessMonthEnd`` (:issue:`8236`)
836837

837838

838839

pandas/tseries/offsets.py

+74-43
Original file line numberDiff line numberDiff line change
@@ -225,12 +225,12 @@ def _should_cache(self):
225225
return self.isAnchored() and self._cacheable
226226

227227
def _params(self):
228-
attrs = [(k, v) for k, v in compat.iteritems(vars(self))
229-
if (k not in ['kwds', 'name', 'normalize',
230-
'busdaycalendar']) and (k[0] != '_')]
231-
attrs.extend(list(self.kwds.items()))
228+
all_paras = dict(list(vars(self).items()) + list(self.kwds.items()))
229+
if 'holidays' in all_paras and not all_paras['holidays']:
230+
all_paras.pop('holidays')
231+
exclude = ['kwds', 'name','normalize', 'calendar']
232+
attrs = [(k, v) for k, v in all_paras.items() if (k not in exclude ) and (k[0] != '_')]
232233
attrs = sorted(set(attrs))
233-
234234
params = tuple([str(self.__class__)] + attrs)
235235
return params
236236

@@ -547,38 +547,57 @@ class CustomBusinessDay(BusinessDay):
547547
holidays : list
548548
list/array of dates to exclude from the set of valid business days,
549549
passed to ``numpy.busdaycalendar``
550-
calendar : HolidayCalendar instance
551-
instance of AbstractHolidayCalendar that provide the list of holidays
550+
calendar : pd.HolidayCalendar or np.busdaycalendar
552551
"""
553-
554552
_cacheable = False
555553
_prefix = 'C'
556554

557-
def __init__(self, n=1, normalize=False, **kwds):
555+
def __init__(self, n=1, normalize=False, weekmask='Mon Tue Wed Thu Fri',
556+
holidays=None, calendar=None, **kwds):
558557
self.n = int(n)
559558
self.normalize = normalize
560559
self.kwds = kwds
561560
self.offset = kwds.get('offset', timedelta(0))
562-
self.weekmask = kwds.get('weekmask', 'Mon Tue Wed Thu Fri')
563-
564-
if 'calendar' in kwds:
565-
holidays = kwds['calendar'].holidays()
566-
else:
567-
holidays = kwds.get('holidays', [])
561+
calendar, holidays = self.get_calendar(weekmask=weekmask,
562+
holidays=holidays,
563+
calendar=calendar)
564+
# CustomBusinessDay instances are identified by the
565+
# following two attributes. See DateOffset._params()
566+
# holidays, weekmask
567+
568+
self.kwds['weekmask'] = self.weekmask = weekmask
569+
self.kwds['holidays'] = self.holidays = holidays
570+
self.kwds['calendar'] = self.calendar = calendar
571+
572+
def get_calendar(self, weekmask, holidays, calendar):
573+
'''Generate busdaycalendar'''
574+
if isinstance(calendar, np.busdaycalendar):
575+
if not holidays:
576+
holidays = tuple(calendar.holidays)
577+
elif not isinstance(holidays, tuple):
578+
holidays = tuple(holidays)
579+
else:
580+
# trust that calendar.holidays and holidays are
581+
# consistent
582+
pass
583+
return calendar, holidays
584+
585+
if holidays is None:
586+
holidays = []
587+
try:
588+
holidays = holidays + calendar.holidays().tolist()
589+
except AttributeError:
590+
pass
568591
holidays = [self._to_dt64(dt, dtype='datetime64[D]') for dt in
569592
holidays]
570-
self.holidays = tuple(sorted(holidays))
571-
self.kwds['holidays'] = self.holidays
593+
holidays = tuple(sorted(holidays))
572594

573-
self._set_busdaycalendar()
595+
kwargs = {'weekmask': weekmask}
596+
if holidays:
597+
kwargs['holidays'] = holidays
574598

575-
def _set_busdaycalendar(self):
576-
if self.holidays:
577-
kwargs = {'weekmask':self.weekmask,'holidays':self.holidays}
578-
else:
579-
kwargs = {'weekmask':self.weekmask}
580599
try:
581-
self.busdaycalendar = np.busdaycalendar(**kwargs)
600+
busdaycalendar = np.busdaycalendar(**kwargs)
582601
except:
583602
# Check we have the required numpy version
584603
from distutils.version import LooseVersion
@@ -589,17 +608,23 @@ def _set_busdaycalendar(self):
589608
np.__version__)
590609
else:
591610
raise
611+
return busdaycalendar, holidays
592612

593613
def __getstate__(self):
594614
"""Return a pickleable state"""
595615
state = self.__dict__.copy()
596-
del state['busdaycalendar']
616+
del state['calendar']
597617
return state
598618

599619
def __setstate__(self, state):
600620
"""Reconstruct an instance from a pickled state"""
601621
self.__dict__ = state
602-
self._set_busdaycalendar()
622+
calendar, holidays = self.get_calendar(weekmask=self.weekmask,
623+
holidays=self.holidays,
624+
calendar=None)
625+
self.kwds['calendar'] = self.calendar = calendar
626+
self.kwds['holidays'] = self.holidays = holidays
627+
self.kwds['weekmask'] = state['weekmask']
603628

604629
@apply_wraps
605630
def apply(self, other):
@@ -613,7 +638,7 @@ def apply(self, other):
613638
np_dt = np.datetime64(date_in.date())
614639

615640
np_incr_dt = np.busday_offset(np_dt, self.n, roll=roll,
616-
busdaycal=self.busdaycalendar)
641+
busdaycal=self.calendar)
617642

618643
dt_date = np_incr_dt.astype(datetime)
619644
result = datetime.combine(dt_date, date_in.time())
@@ -635,7 +660,6 @@ def _to_dt64(dt, dtype='datetime64'):
635660
# > np.datetime64(dt.datetime(2013,5,1),dtype='datetime64[D]')
636661
# numpy.datetime64('2013-05-01T02:00:00.000000+0200')
637662
# Thus astype is needed to cast datetime to datetime64[D]
638-
639663
if getattr(dt, 'tzinfo', None) is not None:
640664
i8 = tslib.pydt_to_i8(dt)
641665
dt = tslib.tz_convert_single(i8, 'UTC', dt.tzinfo)
@@ -649,7 +673,7 @@ def onOffset(self, dt):
649673
if self.normalize and not _is_normalized(dt):
650674
return False
651675
day64 = self._to_dt64(dt,'datetime64[D]')
652-
return np.is_busday(day64, busdaycal=self.busdaycalendar)
676+
return np.is_busday(day64, busdaycal=self.calendar)
653677

654678

655679
class MonthOffset(SingleConstructorOffset):
@@ -767,7 +791,6 @@ def onOffset(self, dt):
767791
_prefix = 'BMS'
768792

769793

770-
771794
class CustomBusinessMonthEnd(BusinessMixin, MonthOffset):
772795
"""
773796
**EXPERIMENTAL** DateOffset of one custom business month
@@ -788,18 +811,22 @@ class CustomBusinessMonthEnd(BusinessMixin, MonthOffset):
788811
holidays : list
789812
list/array of dates to exclude from the set of valid business days,
790813
passed to ``numpy.busdaycalendar``
814+
calendar : pd.HolidayCalendar or np.busdaycalendar
791815
"""
792816

793817
_cacheable = False
794818
_prefix = 'CBM'
795-
def __init__(self, n=1, normalize=False, **kwds):
819+
def __init__(self, n=1, normalize=False, weekmask='Mon Tue Wed Thu Fri',
820+
holidays=None, calendar=None, **kwds):
796821
self.n = int(n)
797822
self.normalize = normalize
798823
self.kwds = kwds
799824
self.offset = kwds.get('offset', timedelta(0))
800-
self.weekmask = kwds.get('weekmask', 'Mon Tue Wed Thu Fri')
801-
self.cbday = CustomBusinessDay(n=self.n, **kwds)
802-
self.m_offset = MonthEnd()
825+
self.cbday = CustomBusinessDay(n=self.n, normalize=normalize,
826+
weekmask=weekmask, holidays=holidays,
827+
calendar=calendar, **kwds)
828+
self.m_offset = MonthEnd(n=1, normalize=normalize, **kwds)
829+
self.kwds['calendar'] = self.cbday.calendar # cache numpy calendar
803830

804831
@apply_wraps
805832
def apply(self,other):
@@ -817,11 +844,11 @@ def apply(self,other):
817844
n -= 1
818845
elif other > cur_cmend and n <= -1:
819846
n += 1
820-
821-
new = cur_mend + n * MonthEnd()
847+
848+
new = cur_mend + n * self.m_offset
822849
result = self.cbday.rollback(new)
823850
return result
824-
851+
825852
class CustomBusinessMonthBegin(BusinessMixin, MonthOffset):
826853
"""
827854
**EXPERIMENTAL** DateOffset of one custom business month
@@ -842,18 +869,22 @@ class CustomBusinessMonthBegin(BusinessMixin, MonthOffset):
842869
holidays : list
843870
list/array of dates to exclude from the set of valid business days,
844871
passed to ``numpy.busdaycalendar``
872+
calendar : pd.HolidayCalendar or np.busdaycalendar
845873
"""
846874

847875
_cacheable = False
848876
_prefix = 'CBMS'
849-
def __init__(self, n=1, normalize=False, **kwds):
877+
def __init__(self, n=1, normalize=False, weekmask='Mon Tue Wed Thu Fri',
878+
holidays=None, calendar=None, **kwds):
850879
self.n = int(n)
851880
self.normalize = normalize
852881
self.kwds = kwds
853882
self.offset = kwds.get('offset', timedelta(0))
854-
self.weekmask = kwds.get('weekmask', 'Mon Tue Wed Thu Fri')
855-
self.cbday = CustomBusinessDay(n=self.n, normalize=normalize, **kwds)
856-
self.m_offset = MonthBegin(normalize=normalize)
883+
self.cbday = CustomBusinessDay(n=self.n, normalize=normalize,
884+
weekmask=weekmask, holidays=holidays,
885+
calendar=calendar, **kwds)
886+
self.m_offset = MonthBegin(n=1, normalize=normalize, **kwds)
887+
self.kwds['calendar'] = self.cbday.calendar # cache numpy calendar
857888

858889
@apply_wraps
859890
def apply(self,other):
@@ -872,8 +903,8 @@ def apply(self,other):
872903
n += 1
873904
elif dt_in < cur_cmbegin and n >= 1:
874905
n -= 1
875-
876-
new = cur_mbegin + n * MonthBegin()
906+
907+
new = cur_mbegin + n * self.m_offset
877908
result = self.cbday.rollforward(new)
878909
return result
879910

492 Bytes
Binary file not shown.

pandas/tseries/tests/test_offsets.py

+40-4
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,4 @@
1+
import os
12
from datetime import date, datetime, timedelta
23
from dateutil.relativedelta import relativedelta
34
from pandas.compat import range
@@ -22,6 +23,7 @@
2223
from pandas.tseries.tools import parse_time_string
2324
import pandas.tseries.offsets as offsets
2425

26+
from pandas.io.pickle import read_pickle
2527
from pandas.tslib import NaT, Timestamp
2628
import pandas.tslib as tslib
2729
from pandas.util.testing import assertRaisesRegexp
@@ -848,6 +850,24 @@ def test_calendar(self):
848850
dt = datetime(2014, 1, 17)
849851
assertEq(CDay(calendar=calendar), dt, datetime(2014, 1, 21))
850852

853+
def test_roundtrip_pickle(self):
854+
def _check_roundtrip(obj):
855+
unpickled = self.round_trip_pickle(obj)
856+
self.assertEqual(unpickled, obj)
857+
_check_roundtrip(self.offset)
858+
_check_roundtrip(self.offset2)
859+
_check_roundtrip(self.offset*2)
860+
861+
def test_pickle_compat_0_14_1(self):
862+
hdays = [datetime(2013,1,1) for ele in range(4)]
863+
864+
pth = tm.get_data_path()
865+
866+
cday0_14_1 = read_pickle(os.path.join(pth, 'cday-0.14.1.pickle'))
867+
cday = CDay(holidays=hdays)
868+
self.assertEqual(cday, cday0_14_1)
869+
870+
851871
class CustomBusinessMonthBase(object):
852872
_multiprocess_can_split_ = True
853873

@@ -894,6 +914,15 @@ def test_offsets_compare_equal(self):
894914
offset2 = self._object()
895915
self.assertFalse(offset1 != offset2)
896916

917+
def test_roundtrip_pickle(self):
918+
def _check_roundtrip(obj):
919+
unpickled = self.round_trip_pickle(obj)
920+
self.assertEqual(unpickled, obj)
921+
_check_roundtrip(self._object())
922+
_check_roundtrip(self._object(2))
923+
_check_roundtrip(self._object()*2)
924+
925+
897926
class TestCustomBusinessMonthEnd(CustomBusinessMonthBase, Base):
898927
_object = CBMonthEnd
899928

@@ -1006,8 +1035,12 @@ def test_holidays(self):
10061035

10071036
def test_datetimeindex(self):
10081037
from pandas.tseries.holiday import USFederalHolidayCalendar
1009-
self.assertEqual(DatetimeIndex(start='20120101',end='20130101',freq=CBMonthEnd(calendar=USFederalHolidayCalendar())).tolist()[0],
1010-
datetime(2012,1,31))
1038+
hcal = USFederalHolidayCalendar()
1039+
freq = CBMonthEnd(calendar=hcal)
1040+
1041+
self.assertEqual(DatetimeIndex(start='20120101',end='20130101',
1042+
freq=freq).tolist()[0],
1043+
datetime(2012,1,31))
10111044

10121045
class TestCustomBusinessMonthBegin(CustomBusinessMonthBase, Base):
10131046
_object = CBMonthBegin
@@ -1120,8 +1153,11 @@ def test_holidays(self):
11201153
self.assertEqual(dt + 2*bm_offset,datetime(2012,2,3))
11211154

11221155
def test_datetimeindex(self):
1123-
self.assertEqual(DatetimeIndex(start='20120101',end='20130101',freq=CBMonthBegin(calendar=USFederalHolidayCalendar())).tolist()[0],
1124-
datetime(2012,1,3))
1156+
hcal = USFederalHolidayCalendar()
1157+
cbmb = CBMonthBegin(calendar=hcal)
1158+
self.assertEqual(DatetimeIndex(start='20120101', end='20130101',
1159+
freq=cbmb).tolist()[0],
1160+
datetime(2012,1,3))
11251161

11261162

11271163
def assertOnOffset(offset, date, expected):

vb_suite/timeseries.py

+30-5
Original file line numberDiff line numberDiff line change
@@ -285,15 +285,20 @@ def date_range(start=None, end=None, periods=None, freq=None):
285285
setup = common_setup + """
286286
import datetime as dt
287287
import pandas as pd
288+
import pandas.tseries.holiday
288289
import numpy as np
289290
290291
date = dt.datetime(2011,1,1)
291292
dt64 = np.datetime64('2011-01-01 09:00Z')
293+
hcal = pd.tseries.holiday.USFederalHolidayCalendar()
292294
293295
day = pd.offsets.Day()
294296
year = pd.offsets.YearBegin()
295297
cday = pd.offsets.CustomBusinessDay()
296-
cme = pd.offsets.CustomBusinessMonthEnd()
298+
cmb = pd.offsets.CustomBusinessMonthBegin(calendar=hcal)
299+
cme = pd.offsets.CustomBusinessMonthEnd(calendar=hcal)
300+
301+
cdayh = pd.offsets.CustomBusinessDay(calendar=hcal)
297302
"""
298303
timeseries_day_incr = Benchmark("date + day",setup)
299304

@@ -306,15 +311,26 @@ def date_range(start=None, end=None, periods=None, freq=None):
306311
timeseries_custom_bday_incr = \
307312
Benchmark("date + cday",setup)
308313

314+
timeseries_custom_bday_decr = \
315+
Benchmark("date - cday",setup)
316+
309317
timeseries_custom_bday_apply = \
310318
Benchmark("cday.apply(date)",setup)
311319

312320
timeseries_custom_bday_apply_dt64 = \
313321
Benchmark("cday.apply(dt64)",setup)
314322

315-
# Increment by n
316-
timeseries_custom_bday_incr_n = \
317-
Benchmark("date + 10 * cday",setup)
323+
timeseries_custom_bday_cal_incr = \
324+
Benchmark("date + 1 * cdayh",setup)
325+
326+
timeseries_custom_bday_cal_decr = \
327+
Benchmark("date - 1 * cdayh",setup)
328+
329+
timeseries_custom_bday_cal_incr_n = \
330+
Benchmark("date + 10 * cdayh",setup)
331+
332+
timeseries_custom_bday_cal_incr_neg_n = \
333+
Benchmark("date - 10 * cdayh",setup)
318334

319335
# Increment custom business month
320336
timeseries_custom_bmonthend_incr = \
@@ -323,6 +339,16 @@ def date_range(start=None, end=None, periods=None, freq=None):
323339
timeseries_custom_bmonthend_incr_n = \
324340
Benchmark("date + 10 * cme",setup)
325341

342+
timeseries_custom_bmonthend_decr_n = \
343+
Benchmark("date - 10 * cme",setup)
344+
345+
timeseries_custom_bmonthbegin_incr_n = \
346+
Benchmark("date + 10 * cmb",setup)
347+
348+
timeseries_custom_bmonthbegin_decr_n = \
349+
Benchmark("date - 10 * cmb",setup)
350+
351+
326352
#----------------------------------------------------------------------
327353
# month/quarter/year start/end accessors
328354

@@ -357,4 +383,3 @@ def iter_n(iterable, n=None):
357383
timeseries_iter_datetimeindex_preexit = Benchmark('iter_n(idx1, M)', setup)
358384

359385
timeseries_iter_periodindex_preexit = Benchmark('iter_n(idx2, M)', setup)
360-

0 commit comments

Comments
 (0)