Skip to content

Commit a32674b

Browse files
sinhrks authored and jreback committed
CLN: Move boxing logic to BlockManager
CLN: clean up quantile & move to BlockManager closes pandas-dev#12741 closes pandas-dev#12772 closes pandas-dev#12469 closes pandas-dev#12752
1 parent 8776596 commit a32674b

File tree

13 files changed

+942
-627
lines changed

13 files changed

+942
-627
lines changed

doc/source/whatsnew/v0.18.1.txt

+6-2
Original file line numberDiff line numberDiff line change
@@ -222,11 +222,15 @@ Bug Fixes
222222
- Bug in ``concat`` raises ``AttributeError`` when input data contains tz-aware datetime and timedelta (:issue:`12620`)
223223

224224

225-
226-
227225
- Bug in ``pivot_table`` when ``margins=True`` and ``dropna=True`` where nulls still contributed to margin count (:issue:`12577`)
228226

229227
- Bug in ``Series.name`` when ``name`` attribute can be a hashable type (:issue:`12610`)
230228
- Bug in ``.describe()`` resets categorical columns information (:issue:`11558`)
231229
- Bug where ``loffset`` argument was not applied when calling ``resample().count()`` on a timeseries (:issue:`12725`)
232230
- ``pd.read_excel()`` now accepts path objects (e.g. ``pathlib.Path``, ``py.path.local``) for the file path, in line with other ``read_*`` functions (:issue:`12655`)
231+
232+
233+
234+
235+
- Bug in ``.quantile`` with interpolation may coerce to ``float`` unexpectedly (:issue:`12772`)
236+
- Bug in ``.quantile`` with empty Series may return scalar rather than empty Series (:issue:`12772`)

pandas/core/common.py

-10
Original file line numberDiff line numberDiff line change
@@ -2377,16 +2377,6 @@ def needs_i8_conversion(arr_or_dtype):
23772377
is_datetime64tz_dtype(arr_or_dtype))
23782378

23792379

2380-
def i8_boxer(arr_or_dtype):
2381-
""" return the scalar boxer for the dtype """
2382-
if (is_datetime64_dtype(arr_or_dtype) or
2383-
is_datetime64tz_dtype(arr_or_dtype)):
2384-
return lib.Timestamp
2385-
elif is_timedelta64_dtype(arr_or_dtype):
2386-
return lambda x: lib.Timedelta(x, unit='ns')
2387-
raise ValueError("cannot find a scalar boxer for {0}".format(arr_or_dtype))
2388-
2389-
23902380
def is_numeric_dtype(arr_or_dtype):
23912381
tipo = _get_dtype_type(arr_or_dtype)
23922382
return (issubclass(tipo, (np.number, np.bool_)) and

pandas/core/frame.py

+9-43
Original file line numberDiff line numberDiff line change
@@ -40,7 +40,6 @@
4040
from pandas.core.categorical import Categorical
4141
import pandas.computation.expressions as expressions
4242
from pandas.computation.eval import eval as _eval
43-
from numpy import percentile as _quantile
4443
from pandas.compat import (range, map, zip, lrange, lmap, lzip, StringIO, u,
4544
OrderedDict, raise_with_traceback)
4645
from pandas import compat
@@ -63,7 +62,6 @@
6362
import pandas.algos as _algos
6463

6564
from pandas.core.config import get_option
66-
from pandas import _np_version_under1p9
6765

6866
# ---------------------------------------------------------------------
6967
# Docstring templates
@@ -4227,10 +4225,7 @@ def applymap(self, func):
42274225

42284226
# if we have a dtype == 'M8[ns]', provide boxed values
42294227
def infer(x):
4230-
if com.needs_i8_conversion(x):
4231-
f = com.i8_boxer(x)
4232-
x = lib.map_infer(_values_from_object(x), f)
4233-
return lib.map_infer(_values_from_object(x), func)
4228+
return lib.map_infer(x.asobject, func)
42344229

42354230
return self.apply(infer)
42364231

@@ -4974,55 +4969,26 @@ def quantile(self, q=0.5, axis=0, numeric_only=True,
49744969
0.1 1.3 3.7
49754970
0.5 2.5 55.0
49764971
"""
4977-
49784972
self._check_percentile(q)
4979-
per = np.asarray(q) * 100
4980-
4981-
if not com.is_list_like(per):
4982-
per = [per]
4973+
if not com.is_list_like(q):
49834974
q = [q]
49844975
squeeze = True
49854976
else:
49864977
squeeze = False
49874978

4988-
if _np_version_under1p9:
4989-
if interpolation != 'linear':
4990-
raise ValueError("Interpolation methods other than linear "
4991-
"are not supported in numpy < 1.9")
4992-
4993-
def f(arr, per, interpolation):
4994-
if arr._is_datelike_mixed_type:
4995-
values = _values_from_object(arr).view('i8')
4996-
else:
4997-
values = arr.astype(float)
4998-
values = values[notnull(values)]
4999-
if len(values) == 0:
5000-
return NA
5001-
else:
5002-
if _np_version_under1p9:
5003-
return _quantile(values, per)
5004-
else:
5005-
return _quantile(values, per, interpolation=interpolation)
5006-
50074979
data = self._get_numeric_data() if numeric_only else self
5008-
50094980
axis = self._get_axis_number(axis)
50104981

4982+
def _quantile(series):
4983+
res = series.quantile(q, interpolation=interpolation)
4984+
return series.name, res
4985+
50114986
if axis == 1:
50124987
data = data.T
50134988

5014-
# need to know which cols are timestamp going in so that we can
5015-
# map timestamp over them after getting the quantile.
5016-
is_dt_col = data.dtypes.map(com.is_datetime64_dtype)
5017-
is_dt_col = is_dt_col[is_dt_col].index
5018-
5019-
quantiles = [[f(vals, x, interpolation) for x in per]
5020-
for (_, vals) in data.iteritems()]
5021-
5022-
result = self._constructor(quantiles, index=data._info_axis,
5023-
columns=q).T
5024-
if len(is_dt_col) > 0:
5025-
result[is_dt_col] = result[is_dt_col].applymap(lib.Timestamp)
4989+
# unable to use DataFrame.apply, because data may be empty
4990+
result = dict(_quantile(s) for (_, s) in data.iteritems())
4991+
result = self._constructor(result, columns=data.columns)
50264992
if squeeze:
50274993
if result.shape == (1, 1):
50284994
result = result.T.iloc[:, 0] # don't want scalar

pandas/core/internals.py

+86-35
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,8 @@
66
from collections import defaultdict
77

88
import numpy as np
9+
from numpy import percentile as _quantile
10+
911
from pandas.core.base import PandasObject
1012

1113
from pandas.core.common import (_possibly_downcast_to_dtype, isnull, _NS_DTYPE,
@@ -131,6 +133,8 @@ def get_values(self, dtype=None):
131133
return an internal format, currently just the ndarray
132134
this is often overriden to handle to_dense like operations
133135
"""
136+
if com.is_object_dtype(dtype):
137+
return self.values.astype(object)
134138
return self.values
135139

136140
def to_dense(self):
@@ -141,6 +145,10 @@ def to_object_block(self, mgr):
141145
values = self.get_values(dtype=object)
142146
return self.make_block(values, klass=ObjectBlock)
143147

148+
@property
149+
def _na_value(self):
150+
return np.nan
151+
144152
@property
145153
def fill_value(self):
146154
return np.nan
@@ -1247,6 +1255,19 @@ def equals(self, other):
12471255
return False
12481256
return array_equivalent(self.values, other.values)
12491257

1258+
def quantile(self, values, qs, **kwargs):
1259+
if len(values) == 0:
1260+
if com.is_list_like(qs):
1261+
return np.array([self.fill_value])
1262+
else:
1263+
return self._na_value
1264+
1265+
if com.is_list_like(qs):
1266+
values = [_quantile(values, x * 100, **kwargs) for x in qs]
1267+
return np.array(values)
1268+
else:
1269+
return _quantile(values, qs * 100, **kwargs)
1270+
12501271

12511272
class NonConsolidatableMixIn(object):
12521273
""" hold methods for the nonconsolidatable blocks """
@@ -1455,15 +1476,55 @@ def should_store(self, value):
14551476
return com.is_integer_dtype(value) and value.dtype == self.dtype
14561477

14571478

1458-
class TimeDeltaBlock(IntBlock):
1479+
class DatetimeLikeBlockMixin(object):
1480+
1481+
@property
1482+
def _na_value(self):
1483+
return tslib.NaT
1484+
1485+
@property
1486+
def fill_value(self):
1487+
return tslib.iNaT
1488+
1489+
def _try_operate(self, values):
1490+
""" return a version to operate on """
1491+
return values.view('i8')
1492+
1493+
def get_values(self, dtype=None):
1494+
"""
1495+
return object dtype as boxed values, such as Timestamps/Timedelta
1496+
"""
1497+
if com.is_object_dtype(dtype):
1498+
return lib.map_infer(self.values.ravel(),
1499+
self._box_func).reshape(self.values.shape)
1500+
return self.values
1501+
1502+
def quantile(self, values, qs, **kwargs):
1503+
values = values.view('i8')
1504+
mask = values == self.fill_value
1505+
if mask.any():
1506+
values = values[~mask]
1507+
result = Block.quantile(self, values, qs, **kwargs)
1508+
1509+
if com.is_datetime64tz_dtype(self):
1510+
# ToDo: Temp logic to avoid GH 12619 and GH 12772
1511+
# which affects DatetimeTZBlock._try_coerce_result for np.ndarray
1512+
if isinstance(result, np.ndarray) and values.ndim > 0:
1513+
result = self._holder(result, tz='UTC')
1514+
result = result.tz_convert(self.values.tz)
1515+
return result
1516+
return self._try_coerce_result(result)
1517+
1518+
1519+
class TimeDeltaBlock(DatetimeLikeBlockMixin, IntBlock):
14591520
__slots__ = ()
14601521
is_timedelta = True
14611522
_can_hold_na = True
14621523
is_numeric = False
14631524

14641525
@property
1465-
def fill_value(self):
1466-
return tslib.iNaT
1526+
def _box_func(self):
1527+
return lambda x: tslib.Timedelta(x, unit='ns')
14671528

14681529
def fillna(self, value, **kwargs):
14691530

@@ -1516,19 +1577,15 @@ def _try_coerce_args(self, values, other):
15161577

15171578
return values, values_mask, other, other_mask
15181579

1519-
def _try_operate(self, values):
1520-
""" return a version to operate on """
1521-
return values.view('i8')
1522-
15231580
def _try_coerce_result(self, result):
15241581
""" reverse of try_coerce_args / try_operate """
15251582
if isinstance(result, np.ndarray):
15261583
mask = isnull(result)
15271584
if result.dtype.kind in ['i', 'f', 'O']:
15281585
result = result.astype('m8[ns]')
15291586
result[mask] = tslib.iNaT
1530-
elif isinstance(result, np.integer):
1531-
result = lib.Timedelta(result)
1587+
elif isinstance(result, (np.integer, np.float)):
1588+
result = self._box_func(result)
15321589
return result
15331590

15341591
def should_store(self, value):
@@ -1558,13 +1615,6 @@ def to_native_types(self, slicer=None, na_rep=None, quoting=None,
15581615
dtype=object)
15591616
return rvalues
15601617

1561-
def get_values(self, dtype=None):
1562-
# return object dtypes as Timedelta
1563-
if dtype == object:
1564-
return lib.map_infer(self.values.ravel(),
1565-
lib.Timedelta).reshape(self.values.shape)
1566-
return self.values
1567-
15681618

15691619
class BoolBlock(NumericBlock):
15701620
__slots__ = ()
@@ -1965,7 +2015,7 @@ def to_native_types(self, slicer=None, na_rep='', quoting=None, **kwargs):
19652015
return values.reshape(1, len(values))
19662016

19672017

1968-
class DatetimeBlock(Block):
2018+
class DatetimeBlock(DatetimeLikeBlockMixin, Block):
19692019
__slots__ = ()
19702020
is_datetime = True
19712021
_can_hold_na = True
@@ -2009,10 +2059,6 @@ def _try_cast(self, element):
20092059
except:
20102060
return element
20112061

2012-
def _try_operate(self, values):
2013-
""" return a version to operate on """
2014-
return values.view('i8')
2015-
20162062
def _try_coerce_args(self, values, other):
20172063
"""
20182064
Coerce values and other to dtype 'i8'. NaN and NaT convert to
@@ -2040,7 +2086,7 @@ def _try_coerce_args(self, values, other):
20402086
other = tslib.iNaT
20412087
other_mask = True
20422088
elif isinstance(other, (datetime, np.datetime64, date)):
2043-
other = lib.Timestamp(other)
2089+
other = self._box_func(other)
20442090
if getattr(other, 'tz') is not None:
20452091
raise TypeError("cannot coerce a Timestamp with a tz on a "
20462092
"naive Block")
@@ -2067,13 +2113,13 @@ def _try_coerce_result(self, result):
20672113
if isinstance(result, np.ndarray):
20682114
if result.dtype.kind in ['i', 'f', 'O']:
20692115
result = result.astype('M8[ns]')
2070-
elif isinstance(result, (np.integer, np.datetime64)):
2071-
result = lib.Timestamp(result)
2116+
elif isinstance(result, (np.integer, np.float, np.datetime64)):
2117+
result = self._box_func(result)
20722118
return result
20732119

20742120
@property
2075-
def fill_value(self):
2076-
return tslib.iNaT
2121+
def _box_func(self):
2122+
return tslib.Timestamp
20772123

20782124
def to_native_types(self, slicer=None, na_rep=None, date_format=None,
20792125
quoting=None, **kwargs):
@@ -2109,13 +2155,6 @@ def set(self, locs, values, check=False):
21092155

21102156
self.values[locs] = values
21112157

2112-
def get_values(self, dtype=None):
2113-
# return object dtype as Timestamps
2114-
if dtype == object:
2115-
return lib.map_infer(
2116-
self.values.ravel(), lib.Timestamp).reshape(self.values.shape)
2117-
return self.values
2118-
21192158

21202159
class DatetimeTZBlock(NonConsolidatableMixIn, DatetimeBlock):
21212160
""" implement a datetime64 block with a tz attribute """
@@ -2156,7 +2195,7 @@ def external_values(self):
21562195

21572196
def get_values(self, dtype=None):
21582197
# return object dtype as Timestamps with the zones
2159-
if dtype == object:
2198+
if com.is_object_dtype(dtype):
21602199
f = lambda x: lib.Timestamp(x, tz=self.values.tz)
21612200
return lib.map_infer(
21622201
self.values.ravel(), f).reshape(self.values.shape)
@@ -2239,10 +2278,14 @@ def _try_coerce_result(self, result):
22392278

22402279
if isinstance(result, np.ndarray):
22412280
result = self._holder(result, tz=self.values.tz)
2242-
elif isinstance(result, (np.integer, np.datetime64)):
2281+
elif isinstance(result, (np.integer, np.float, np.datetime64)):
22432282
result = lib.Timestamp(result, tz=self.values.tz)
22442283
return result
22452284

2285+
@property
2286+
def _box_func(self):
2287+
return lambda x: tslib.Timestamp(x, tz=self.dtype.tz)
2288+
22462289
def shift(self, periods, axis=0, mgr=None):
22472290
""" shift the block by periods """
22482291

@@ -3863,6 +3906,14 @@ def get_values(self):
38633906
""" return a dense type view """
38643907
return np.array(self._block.to_dense(), copy=False)
38653908

3909+
@property
3910+
def asobject(self):
3911+
"""
3912+
return an object dtype array. datetime/timedelta like values are boxed
3913+
to Timestamp/Timedelta instances.
3914+
"""
3915+
return self._block.get_values(dtype=object)
3916+
38663917
@property
38673918
def itemsize(self):
38683919
return self._block.values.itemsize

0 commit comments

Comments
 (0)