Skip to content

Commit 52c1bc8

Browse files
committed
fixup! BUG: Fix .groupby(categorical, sort=False) failing
1 parent c813146 commit 52c1bc8

File tree

4 files changed

+70
-24
lines changed

4 files changed

+70
-24
lines changed

doc/source/whatsnew/v0.20.0.txt

Lines changed: 36 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -158,8 +158,43 @@ Other enhancements
158158

159159
.. _ISO 8601 duration: https://en.wikipedia.org/wiki/ISO_8601#Durations
160160

161-
.. _whatsnew_0200.api_breaking:
161+
.. _whatsnew_0200.enhancements.groupy_categorical:
162+
163+
GroupBy on Categoricals
164+
^^^^^^^^^^^^^^^^^^^^^^^
165+
166+
In previous versions, ``.groupby(..., sort=False)`` would fail with a ``ValueError`` when grouping on a categorical series with some categories not appearing in the data. (:issue:`13179`)
167+
168+
Now, it works.
169+
170+
.. ipython:: python
171+
172+
chromosomes = np.r_[np.arange(1, 23).astype(str), ['X', 'Y']]
173+
df = pd.DataFrame({
174+
'A': np.random.randint(100),
175+
'B': np.random.randint(100),
176+
'C': np.random.randint(100),
177+
'chromosomes': pd.Categorical(np.random.choice(chromosomes, 100),
178+
categories=chromosomes,
179+
ordered=True)})
180+
181+
Previous Behavior:
162182

183+
.. code-block:: ipython
184+
185+
In [3]: df[df.chromosomes != '1'].groupby('chromosomes', sort=False).sum()
186+
---------------------------------------------------------------------------
187+
ValueError Traceback (most recent call last)
188+
...
189+
ValueError: items in new_categories are not the same as in old categories
190+
191+
New Behavior:
192+
193+
.. code-block:: ipython
194+
195+
df[df.chromosomes != '1'].groupby('chromosomes', sort=False).sum()
196+
197+
.. _whatsnew_0200.api_breaking:
163198

164199
Backwards incompatible API changes
165200
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
@@ -539,7 +574,6 @@ Bug Fixes
539574

540575
- Bug in ``resample``, where a non-string ``loffset`` argument would not be applied when resampling a timeseries (:issue:`13218`)
541576

542-
- Bug in ``.groupby`` where ```.groupby(categorical, sort=False)`` would raise ``ValueError`` due to non-matching categories (:issue:`13179`)
543577

544578

545579

pandas/core/categorical.py

Lines changed: 29 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -602,6 +602,35 @@ def _get_categories(self):
602602
categories = property(fget=_get_categories, fset=_set_categories,
603603
doc=_categories_doc)
604604

605+
def _codes_for_groupby(self, sort):
606+
"""
607+
Return a Categorical adjusted for groupby
608+
609+
Parameters
610+
----------
611+
sort : boolean
612+
The value of the sort parameter groupby was called with.
613+
614+
Returns
615+
-------
616+
Categorical
617+
In case of sort=True, self is returned with original categories
618+
preserved. In case of sort=False, the new categories are set
619+
to the order of appearance in codes (unless ordered=True),
620+
followed by any unrepresented categories in original order.
621+
"""
622+
cat = self
623+
# sort=False should order groups in as-encountered order (GH-8868)
624+
if not sort:
625+
cat = self.unique()
626+
# But all categories should be present, including those missing
627+
# from the data (GH-13179), which .unique() dropped
628+
cat.add_categories(self.categories[
629+
~self.categories.isin(cat.categories)],
630+
inplace=True)
631+
cat = self.reorder_categories(cat.categories)
632+
return cat
633+
605634
_ordered = None
606635

607636
def set_ordered(self, value, inplace=False):

pandas/core/groupby.py

Lines changed: 1 addition & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -2300,28 +2300,7 @@ def __init__(self, index, grouper=None, obj=None, name=None, level=None,
23002300
# a passed Categorical
23012301
elif is_categorical_dtype(self.grouper):
23022302

2303-
# must have an ordered categorical
2304-
if self.sort:
2305-
if not self.grouper.ordered:
2306-
2307-
# technically we cannot group on an unordered
2308-
# Categorical
2309-
# but this a user convenience to do so; the ordering
2310-
# is preserved and if it's a reduction it doesn't make
2311-
# any difference
2312-
pass
2313-
2314-
# fix bug #GH8868 sort=False being ignored in categorical
2315-
# groupby
2316-
else:
2317-
cat = self.grouper.unique()
2318-
all_categories = self.grouper.categories
2319-
cat.add_categories(
2320-
all_categories[
2321-
~all_categories.isin(cat.categories)],
2322-
inplace=True) # GH-13179
2323-
self.grouper = self.grouper.reorder_categories(
2324-
cat.categories)
2303+
self.grouper = self.grouper._codes_for_groupby(self.sort)
23252304

23262305
# we make a CategoricalIndex out of the cat grouper
23272306
# preserving the categories / ordered attributes

pandas/indexes/category.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -550,6 +550,10 @@ def _append_same_dtype(self, to_concat, name):
550550
result.name = name
551551
return result
552552

553+
def _codes_for_groupby(self, sort):
554+
""" Return a Categorical adjusted for groupby """
555+
return self.values._codes_for_groupby(sort)
556+
553557
@classmethod
554558
def _add_comparison_methods(cls):
555559
""" add in comparison methods """

0 commit comments

Comments
 (0)