def map(self, mapper):
    """
    Apply mapper function to its categories (not codes).

    Parameters
    ----------
    mapper : callable
        Function to be applied to each category. When the mapped
        categories can be used as the categories of a new Categorical
        (``Categorical.from_codes`` accepts them), the result is a
        Categorical which has the same codes and the same order property
        as the original. Otherwise, the result is an np.ndarray holding
        the mapped value of every element.

    Returns
    -------
    applied : Categorical or np.ndarray
        In the np.ndarray case, missing entries (code ``-1``) are NaN.
    """
    new_categories = self.categories.map(mapper)
    try:
        return Categorical.from_codes(self._codes.copy(),
                                      categories=new_categories,
                                      ordered=self.ordered)
    except ValueError:
        # from_codes rejected the mapped categories; fall back to a plain
        # array of mapped values below.
        pass

    codes = self._codes
    mapped = np.asarray(new_categories)
    missing = codes == -1
    if not missing.any():
        return np.take(mapped, codes)

    # Missing values are stored as code -1. np.take treats -1 as "last
    # element", which would silently turn NaN into the last mapped
    # category, so make the array able to hold NaN and overwrite those
    # slots explicitly.
    if mapped.dtype.kind in 'iu':
        mapped = mapped.astype(np.float64)
    elif mapped.dtype.kind not in 'fcO':
        mapped = mapped.astype(object)
    result = np.take(mapped, codes)
    result[missing] = np.nan
    return result
com._possibly_cast_to_datetime(value, dtype) # return internal types directly - if is_internal_type(value): + if is_extension_type(value): return value # broadcast across multiple columns if necessary @@ -4094,7 +4094,7 @@ def _apply_standard(self, func, axis, ignore_failures=False, reduce=True): # we cannot reduce using non-numpy dtypes, # as demonstrated in gh-12244 - if not is_internal_type(values): + if not is_extension_type(values): # Create a dummy Series from an empty array index = self._get_axis(axis) empty_arr = np.empty(len(index), dtype=values.dtype) diff --git a/pandas/core/internals.py b/pandas/core/internals.py index 463a2da529b5d..a74d2fb45cdbc 100644 --- a/pandas/core/internals.py +++ b/pandas/core/internals.py @@ -20,7 +20,7 @@ _maybe_convert_string_to_object, _maybe_convert_scalar, is_categorical, is_datetimelike_v_numeric, - is_numeric_v_string_like, is_internal_type) + is_numeric_v_string_like, is_extension_type) import pandas.core.algorithms as algos from pandas.types.api import DatetimeTZDtype @@ -1765,7 +1765,7 @@ def should_store(self, value): return not (issubclass(value.dtype.type, (np.integer, np.floating, np.complexfloating, np.datetime64, np.bool_)) or - is_internal_type(value)) + is_extension_type(value)) def replace(self, to_replace, value, inplace=False, filter=None, regex=False, convert=True, mgr=None): @@ -3388,10 +3388,10 @@ def set(self, item, value, check=False): # FIXME: refactor, clearly separate broadcasting & zip-like assignment # can prob also fix the various if tests for sparse/categorical - value_is_internal_type = is_internal_type(value) + value_is_extension_type = is_extension_type(value) # categorical/spares/datetimetz - if value_is_internal_type: + if value_is_extension_type: def value_getitem(placement): return value @@ -3463,7 +3463,7 @@ def value_getitem(placement): unfit_count = len(unfit_mgr_locs) new_blocks = [] - if value_is_internal_type: + if value_is_extension_type: # This code (ab-)uses the fact that sparse 
blocks contain only # one item. new_blocks.extend( diff --git a/pandas/core/series.py b/pandas/core/series.py index bf20c5d740133..9fc1bc0dbe969 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -20,7 +20,7 @@ is_categorical_dtype, _possibly_cast_to_datetime, _possibly_castable, _possibly_convert_platform, - _try_sort, is_internal_type, is_datetimetz, + _try_sort, is_extension_type, is_datetimetz, _maybe_match_name, ABCSparseArray, _coerce_to_dtype, SettingWithCopyError, _maybe_box_datetimelike, ABCDataFrame, @@ -2063,15 +2063,21 @@ def map(self, arg, na_action=None): y : Series same index as caller """ - values = self.asobject - if na_action == 'ignore': - mask = isnull(values) - - def map_f(values, f): - return lib.map_infer_mask(values, f, mask.view(np.uint8)) + if is_extension_type(self.dtype): + values = self._values + if na_action is not None: + raise NotImplementedError + map_f = lambda values, f: values.map(f) else: - map_f = lib.map_infer + values = self.asobject + + if na_action == 'ignore': + def map_f(values, f): + return lib.map_infer_mask(values, f, + isnull(values).view(np.uint8)) + else: + map_f = lib.map_infer if isinstance(arg, (dict, Series)): if isinstance(arg, dict): @@ -2079,12 +2085,11 @@ def map_f(values, f): indexer = arg.index.get_indexer(values) new_values = algos.take_1d(arg._values, indexer) - return self._constructor(new_values, - index=self.index).__finalize__(self) else: - mapped = map_f(values, arg) - return self._constructor(mapped, - index=self.index).__finalize__(self) + new_values = map_f(values, arg) + + return self._constructor(new_values, + index=self.index).__finalize__(self) def apply(self, func, convert_dtype=True, args=(), **kwds): """ @@ -2193,7 +2198,12 @@ def apply(self, func, convert_dtype=True, args=(), **kwds): if isinstance(f, np.ufunc): return f(self) - mapped = lib.map_infer(self.asobject, f, convert=convert_dtype) + if is_extension_type(self.dtype): + mapped = self._values.map(f) + else: + values 
def map(self, mapper):
    """
    Apply mapper function to its values.

    Parameters
    ----------
    mapper : callable
        Function to be applied.

    Returns
    -------
    applied : array
        Whatever ``self._arrmap`` produces for ``self.values`` and
        ``mapper``.
    """
    values = self.values
    applied = self._arrmap(values, mapper)
    return applied
def test_map(self):
    """Exercise CategoricalIndex.map with three kinds of mapper."""

    def lower(x):
        return x.lower()

    def to_number(x):
        return {'A': 10, 'B': 20, 'C': 30}.get(x)

    letters = list('ABABC')
    lowered = list('ababc')

    # ordered index, one-to-one mapper
    ordered_ci = pd.CategoricalIndex(letters, categories=list('CBA'),
                                     ordered=True)
    expected = pd.Categorical(lowered, categories=list('cba'),
                              ordered=True)
    tm.assert_categorical_equal(ordered_ci.map(lower), expected)

    # unordered, named index, one-to-one mapper
    named_ci = pd.CategoricalIndex(letters, categories=list('BAC'),
                                   ordered=False, name='XXX')
    expected = pd.Categorical(lowered, categories=list('bac'),
                              ordered=False)
    tm.assert_categorical_equal(named_ci.map(lower), expected)

    # every category maps to the same value
    tm.assert_numpy_array_equal(named_ci.map(lambda x: 1),
                                np.array([1] * 5))

    # change categories dtype
    plain_ci = pd.CategoricalIndex(letters, categories=list('BAC'),
                                   ordered=False)
    expected = pd.Categorical([10, 20, 10, 20, 30],
                              categories=[20, 10, 30], ordered=False)
    tm.assert_categorical_equal(plain_ci.map(to_number), expected)
def test_map_categorical(self):
    """Check Series.map on a category-dtype Series."""
    values = pd.Categorical(list('ABBABCD'), categories=list('DCBA'),
                            ordered=True)
    s = pd.Series(values, name='XX', index=list('abcdefg'))

    # each category maps to a distinct value
    result = s.map(lambda x: x.lower())
    exp_values = pd.Categorical(list('abbabcd'), categories=list('dcba'),
                                ordered=True)
    exp = pd.Series(exp_values, name='XX', index=list('abcdefg'))
    tm.assert_series_equal(result, exp)
    tm.assert_categorical_equal(result.values, exp_values)

    # every category maps to the same value
    result = s.map(lambda x: 'A')
    exp = pd.Series(['A'] * 7, name='XX', index=list('abcdefg'))
    tm.assert_series_equal(result, exp)
    # builtin ``object`` is what the ``np.object`` alias pointed to; the
    # alias itself is deprecated/removed in newer NumPy releases.
    self.assertEqual(result.dtype, object)

    # na_action is expected to be rejected for category dtype
    with tm.assertRaises(NotImplementedError):
        s.map(lambda x: x, na_action='ignore')
pd.Categorical(list('ababc'), categories=list('cba'), + ordered=True) + tm.assert_categorical_equal(result, exp) + + c = pd.Categorical(list('ABABC'), categories=list('ABC'), + ordered=False) + result = c.map(lambda x: x.lower()) + exp = pd.Categorical(list('ababc'), categories=list('abc'), + ordered=False) + tm.assert_categorical_equal(result, exp) + + result = c.map(lambda x: 1) + tm.assert_numpy_array_equal(result, np.array([1] * 5)) + class TestCategoricalAsBlock(tm.TestCase): _multiprocess_can_split_ = True diff --git a/pandas/util/testing.py b/pandas/util/testing.py index 89200ef79dac9..7a604d0e7341b 100644 --- a/pandas/util/testing.py +++ b/pandas/util/testing.py @@ -888,11 +888,11 @@ def assertNotIsInstance(obj, cls, msg=''): def assert_categorical_equal(res, exp): + assertIsInstance(res, pd.Categorical, '[Categorical] ') + assertIsInstance(exp, pd.Categorical, '[Categorical] ') + + assert_index_equal(res.categories, exp.categories) - if not array_equivalent(res.categories, exp.categories): - raise AssertionError( - 'categories not equivalent: {0} vs {1}.'.format(res.categories, - exp.categories)) if not array_equivalent(res.codes, exp.codes): raise AssertionError( 'codes not equivalent: {0} vs {1}.'.format(res.codes, exp.codes))