Skip to content

Commit 027689e

Browse files
committed
BUG: Series.map may raise TypeError in Categorical or DatetimeTz
1 parent 3bed097 commit 027689e

File tree

13 files changed

+238
-30
lines changed

13 files changed

+238
-30
lines changed

Diff for: doc/source/whatsnew/v0.18.1.txt

+5
Original file line numberDiff line numberDiff line change
@@ -129,6 +129,7 @@ API changes
129129

130130
- ``Period`` and ``PeriodIndex`` now raise an ``IncompatibleFrequency`` error, which inherits from ``ValueError``, rather than a raw ``ValueError`` (:issue:`12615`)
131131

132+
- ``Series.apply`` for category dtype now applies the passed function to each of the ``.categories`` (not the ``.codes``), and returns a "category" dtype if possible (:issue:`12473`)
132133

133134

134135
- The default for ``.query()/.eval()`` is now ``engine=None``, which will use ``numexpr`` if it's installed; otherwise it will fall back to the ``python`` engine. This mimics the pre-0.18.1 behavior if ``numexpr`` is installed (previously, if ``numexpr`` was not installed, ``.query()/.eval()`` would raise). (:issue:`12749`)
@@ -324,4 +325,8 @@ Bug Fixes
324325
- ``pd.read_excel()`` now accepts path objects (e.g. ``pathlib.Path``, ``py.path.local``) for the file path, in line with other ``read_*`` functions (:issue:`12655`)
325326
- ``pd.read_excel()`` now accepts column names associated with keyword argument ``names`` (:issue:`12870`)
326327

328+
327329
- Bug where ``fill_value`` is ignored if the argument to a binary operator is a constant (:issue:`12723`)
330+
331+
332+
- Bug in ``Series.map`` raising ``TypeError`` if its dtype is ``category`` or tz-aware ``datetime`` (:issue:`12473`)

Diff for: pandas/core/categorical.py

+24
Original file line numberDiff line numberDiff line change
@@ -883,6 +883,30 @@ def remove_unused_categories(self, inplace=False):
883883
if not inplace:
884884
return cat
885885

886+
def map(self, mapper):
887+
"""
888+
Apply mapper function to its categories (not codes).
889+
890+
Parameters
891+
----------
892+
mapper : callable
893+
Function to be applied. When all categories are mapped
894+
to different categories, the result will be Categorical which has
895+
the same order property as the original. Otherwise, the result will
896+
be np.ndarray.
897+
898+
Returns
899+
-------
900+
applied : Categorical or np.ndarray.
901+
"""
902+
new_categories = self.categories.map(mapper)
903+
try:
904+
return Categorical.from_codes(self._codes.copy(),
905+
categories=new_categories,
906+
ordered=self.ordered)
907+
except ValueError:
908+
return np.take(new_categories, self._codes)
909+
886910
__eq__ = _cat_compare_op('__eq__')
887911
__ne__ = _cat_compare_op('__ne__')
888912
__lt__ = _cat_compare_op('__lt__')

Diff for: pandas/core/common.py

+2-2
Original file line numberDiff line numberDiff line change
@@ -705,7 +705,7 @@ def _maybe_upcast(values, fill_value=np.nan, dtype=None, copy=False):
705705
copy : if True always make a copy even if no upcast is required
706706
"""
707707

708-
if is_internal_type(values):
708+
if is_extension_type(values):
709709
if copy:
710710
values = values.copy()
711711
else:
@@ -1714,7 +1714,7 @@ def is_datetimetz(array):
17141714
is_datetime64tz_dtype(array))
17151715

17161716

1717-
def is_internal_type(value):
1717+
def is_extension_type(value):
17181718
"""
17191719
if we are a klass that is preserved by the internals
17201720
these are internal klasses that we represent (and don't use a np.array)

Diff for: pandas/core/frame.py

+3-3
Original file line numberDiff line numberDiff line change
@@ -27,7 +27,7 @@
2727
isnull, notnull, PandasError, _try_sort, _default_index, _maybe_upcast,
2828
is_sequence, _infer_dtype_from_scalar, _values_from_object, is_list_like,
2929
_maybe_box_datetimelike, is_categorical_dtype, is_object_dtype,
30-
is_internal_type, is_datetimetz, _possibly_infer_to_datetimelike,
30+
is_extension_type, is_datetimetz, _possibly_infer_to_datetimelike,
3131
_dict_compat)
3232
from pandas.core.generic import NDFrame, _shared_docs
3333
from pandas.core.index import Index, MultiIndex, _ensure_index
@@ -2594,7 +2594,7 @@ def reindexer(value):
25942594
value = com._possibly_cast_to_datetime(value, dtype)
25952595

25962596
# return internal types directly
2597-
if is_internal_type(value):
2597+
if is_extension_type(value):
25982598
return value
25992599

26002600
# broadcast across multiple columns if necessary
@@ -4094,7 +4094,7 @@ def _apply_standard(self, func, axis, ignore_failures=False, reduce=True):
40944094

40954095
# we cannot reduce using non-numpy dtypes,
40964096
# as demonstrated in gh-12244
4097-
if not is_internal_type(values):
4097+
if not is_extension_type(values):
40984098
# Create a dummy Series from an empty array
40994099
index = self._get_axis(axis)
41004100
empty_arr = np.empty(len(index), dtype=values.dtype)

Diff for: pandas/core/internals.py

+5-5
Original file line numberDiff line numberDiff line change
@@ -20,7 +20,7 @@
2020
_maybe_convert_string_to_object,
2121
_maybe_convert_scalar,
2222
is_categorical, is_datetimelike_v_numeric,
23-
is_numeric_v_string_like, is_internal_type)
23+
is_numeric_v_string_like, is_extension_type)
2424
import pandas.core.algorithms as algos
2525
from pandas.types.api import DatetimeTZDtype
2626

@@ -1765,7 +1765,7 @@ def should_store(self, value):
17651765
return not (issubclass(value.dtype.type,
17661766
(np.integer, np.floating, np.complexfloating,
17671767
np.datetime64, np.bool_)) or
1768-
is_internal_type(value))
1768+
is_extension_type(value))
17691769

17701770
def replace(self, to_replace, value, inplace=False, filter=None,
17711771
regex=False, convert=True, mgr=None):
@@ -3388,10 +3388,10 @@ def set(self, item, value, check=False):
33883388
# FIXME: refactor, clearly separate broadcasting & zip-like assignment
33893389
# can prob also fix the various if tests for sparse/categorical
33903390

3391-
value_is_internal_type = is_internal_type(value)
3391+
value_is_extension_type = is_extension_type(value)
33923392

33933393
# categorical/sparse/datetimetz
3394-
if value_is_internal_type:
3394+
if value_is_extension_type:
33953395

33963396
def value_getitem(placement):
33973397
return value
@@ -3463,7 +3463,7 @@ def value_getitem(placement):
34633463
unfit_count = len(unfit_mgr_locs)
34643464

34653465
new_blocks = []
3466-
if value_is_internal_type:
3466+
if value_is_extension_type:
34673467
# This code (ab-)uses the fact that sparse blocks contain only
34683468
# one item.
34693469
new_blocks.extend(

Diff for: pandas/core/series.py

+25-15
Original file line numberDiff line numberDiff line change
@@ -20,7 +20,7 @@
2020
is_categorical_dtype,
2121
_possibly_cast_to_datetime,
2222
_possibly_castable, _possibly_convert_platform,
23-
_try_sort, is_internal_type, is_datetimetz,
23+
_try_sort, is_extension_type, is_datetimetz,
2424
_maybe_match_name, ABCSparseArray,
2525
_coerce_to_dtype, SettingWithCopyError,
2626
_maybe_box_datetimelike, ABCDataFrame,
@@ -2063,28 +2063,33 @@ def map(self, arg, na_action=None):
20632063
y : Series
20642064
same index as caller
20652065
"""
2066-
values = self.asobject
20672066

2068-
if na_action == 'ignore':
2069-
mask = isnull(values)
2070-
2071-
def map_f(values, f):
2072-
return lib.map_infer_mask(values, f, mask.view(np.uint8))
2067+
if is_extension_type(self.dtype):
2068+
values = self._values
2069+
if na_action is not None:
2070+
raise NotImplementedError
2071+
map_f = lambda values, f: values.map(f)
20732072
else:
2074-
map_f = lib.map_infer
2073+
values = self.asobject
2074+
2075+
if na_action == 'ignore':
2076+
def map_f(values, f):
2077+
return lib.map_infer_mask(values, f,
2078+
isnull(values).view(np.uint8))
2079+
else:
2080+
map_f = lib.map_infer
20752081

20762082
if isinstance(arg, (dict, Series)):
20772083
if isinstance(arg, dict):
20782084
arg = self._constructor(arg, index=arg.keys())
20792085

20802086
indexer = arg.index.get_indexer(values)
20812087
new_values = algos.take_1d(arg._values, indexer)
2082-
return self._constructor(new_values,
2083-
index=self.index).__finalize__(self)
20842088
else:
2085-
mapped = map_f(values, arg)
2086-
return self._constructor(mapped,
2087-
index=self.index).__finalize__(self)
2089+
new_values = map_f(values, arg)
2090+
2091+
return self._constructor(new_values,
2092+
index=self.index).__finalize__(self)
20882093

20892094
def apply(self, func, convert_dtype=True, args=(), **kwds):
20902095
"""
@@ -2193,7 +2198,12 @@ def apply(self, func, convert_dtype=True, args=(), **kwds):
21932198
if isinstance(f, np.ufunc):
21942199
return f(self)
21952200

2196-
mapped = lib.map_infer(self.asobject, f, convert=convert_dtype)
2201+
if is_extension_type(self.dtype):
2202+
mapped = self._values.map(f)
2203+
else:
2204+
values = self.asobject
2205+
mapped = lib.map_infer(values, f, convert=convert_dtype)
2206+
21972207
if len(mapped) and isinstance(mapped[0], Series):
21982208
from pandas.core.frame import DataFrame
21992209
return DataFrame(mapped.tolist(), index=self.index)
@@ -2779,7 +2789,7 @@ def _try_cast(arr, take_fast_path):
27792789

27802790
try:
27812791
subarr = _possibly_cast_to_datetime(arr, dtype)
2782-
if not is_internal_type(subarr):
2792+
if not is_extension_type(subarr):
27832793
subarr = np.array(subarr, dtype=dtype, copy=copy)
27842794
except (ValueError, TypeError):
27852795
if is_categorical_dtype(dtype):

Diff for: pandas/indexes/base.py

+12-1
Original file line numberDiff line numberDiff line change
@@ -2194,11 +2194,22 @@ def groupby(self, to_groupby):
21942194
-------
21952195
groups : dict
21962196
{group name -> group labels}
2197-
21982197
"""
21992198
return self._groupby(self.values, _values_from_object(to_groupby))
22002199

22012200
def map(self, mapper):
2201+
"""
2202+
Apply mapper function to its values.
2203+
2204+
Parameters
2205+
----------
2206+
mapper : callable
2207+
Function to be applied.
2208+
2209+
Returns
2210+
-------
2211+
applied : array
2212+
"""
22022213
return self._arrmap(self.values, mapper)
22032214

22042215
def isin(self, values, level=None):

Diff for: pandas/indexes/category.py

+18
Original file line numberDiff line numberDiff line change
@@ -468,6 +468,24 @@ def take(self, indices, axis=0, allow_fill=True, fill_value=None):
468468
na_value=-1)
469469
return self._create_from_codes(taken)
470470

471+
def map(self, mapper):
472+
"""
473+
Apply mapper function to its categories (not codes).
474+
475+
Parameters
476+
----------
477+
mapper : callable
478+
Function to be applied. When all categories are mapped
479+
to different categories, the result will be Categorical which has
480+
the same order property as the original. Otherwise, the result will
481+
be np.ndarray.
482+
483+
Returns
484+
-------
485+
applied : Categorical or np.ndarray.
486+
"""
487+
return self.values.map(mapper)
488+
471489
def delete(self, loc):
472490
"""
473491
Make new Index with passed location(-s) deleted

Diff for: pandas/tests/indexes/test_category.py

+27
Original file line numberDiff line numberDiff line change
@@ -201,6 +201,33 @@ def test_min_max(self):
201201
self.assertEqual(ci.min(), 'c')
202202
self.assertEqual(ci.max(), 'b')
203203

204+
def test_map(self):
205+
ci = pd.CategoricalIndex(list('ABABC'), categories=list('CBA'),
206+
ordered=True)
207+
result = ci.map(lambda x: x.lower())
208+
exp = pd.Categorical(list('ababc'), categories=list('cba'),
209+
ordered=True)
210+
tm.assert_categorical_equal(result, exp)
211+
212+
ci = pd.CategoricalIndex(list('ABABC'), categories=list('BAC'),
213+
ordered=False, name='XXX')
214+
result = ci.map(lambda x: x.lower())
215+
exp = pd.Categorical(list('ababc'), categories=list('bac'),
216+
ordered=False)
217+
tm.assert_categorical_equal(result, exp)
218+
219+
tm.assert_numpy_array_equal(ci.map(lambda x: 1), np.array([1] * 5))
220+
221+
# change categories dtype
222+
ci = pd.CategoricalIndex(list('ABABC'), categories=list('BAC'),
223+
ordered=False)
224+
def f(x):
225+
return {'A': 10, 'B': 20, 'C': 30}.get(x)
226+
result = ci.map(f)
227+
exp = pd.Categorical([10, 20, 10, 20, 30], categories=[20, 10, 30],
228+
ordered=False)
229+
tm.assert_categorical_equal(result, exp)
230+
204231
def test_append(self):
205232

206233
ci = self.create_index()

Diff for: pandas/tests/series/test_analytics.py

+19
Original file line numberDiff line numberDiff line change
@@ -1567,6 +1567,25 @@ def test_sortlevel(self):
15671567
res = s.sortlevel(['A', 'B'], sort_remaining=False)
15681568
assert_series_equal(s, res)
15691569

1570+
def test_apply_categorical(self):
1571+
values = pd.Categorical(list('ABBABCD'), categories=list('DCBA'),
1572+
ordered=True)
1573+
s = pd.Series(values, name='XX', index=list('abcdefg'))
1574+
result = s.apply(lambda x: x.lower())
1575+
1576+
# should be categorical dtype when the number of categories is
1577+
# the same
1578+
values = pd.Categorical(list('abbabcd'), categories=list('dcba'),
1579+
ordered=True)
1580+
exp = pd.Series(values, name='XX', index=list('abcdefg'))
1581+
tm.assert_series_equal(result, exp)
1582+
tm.assert_categorical_equal(result.values, exp.values)
1583+
1584+
result = s.apply(lambda x: 'A')
1585+
exp = pd.Series(['A'] * 7, name='XX', index=list('abcdefg'))
1586+
tm.assert_series_equal(result, exp)
1587+
self.assertEqual(result.dtype, np.object)
1588+
15701589
def test_shift_int(self):
15711590
ts = self.ts.astype(int)
15721591
shifted = ts.shift(1)

0 commit comments

Comments
 (0)