Skip to content

Add type hints to dtypes/dtypes.py (CategoricalDtype) #26327

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 5 commits into from
Jun 28, 2019
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion pandas/core/arrays/categorical.py
Original file line number Diff line number Diff line change
Expand Up @@ -435,7 +435,7 @@ def ordered(self):
return self.dtype.ordered

@property
def dtype(self):
def dtype(self) -> CategoricalDtype:
"""
The :class:`~pandas.api.types.CategoricalDtype` for this instance
"""
Expand Down
72 changes: 45 additions & 27 deletions pandas/core/dtypes/dtypes.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
""" define extension dtypes """
import re
from typing import Any, Dict, Optional, Tuple, Type
from typing import Any, Dict, List, Optional, Tuple, Type, Union
import warnings

import numpy as np
Expand All @@ -18,7 +18,8 @@
str_type = str


def register_extension_dtype(cls):
def register_extension_dtype(cls: Type[ExtensionDtype],
) -> Type[ExtensionDtype]:
"""
Register an ExtensionType with pandas as class decorator.

Expand Down Expand Up @@ -60,9 +61,9 @@ class Registry:
These are tried in order.
"""
def __init__(self):
self.dtypes = []
self.dtypes = [] # type: List[Type[ExtensionDtype]]

def register(self, dtype):
def register(self, dtype: Type[ExtensionDtype]) -> None:
"""
Parameters
----------
Expand All @@ -73,11 +74,13 @@ def register(self, dtype):

self.dtypes.append(dtype)

def find(self, dtype):
def find(self,
dtype: Union[Type[ExtensionDtype], str],
) -> Optional[Type[ExtensionDtype]]:
"""
Parameters
----------
dtype : PandasExtensionDtype or string
dtype : Type[ExtensionDtype] or string

Returns
-------
Expand Down Expand Up @@ -126,28 +129,28 @@ class PandasExtensionDtype(ExtensionDtype):
isnative = 0
_cache = {} # type: Dict[str_type, 'PandasExtensionDtype']

def __str__(self):
def __str__(self) -> str_type:
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@WillAyd on your how-to-type-annotate can you add when / why to use str_type

"""
Return a string representation for a particular Object
"""
return self.name

def __repr__(self):
def __repr__(self) -> str_type:
"""
Return a string representation for a particular object.
"""
return str(self)

def __hash__(self):
def __hash__(self) -> int:
raise NotImplementedError("sub-classes should implement an __hash__ "
"method")

def __getstate__(self):
def __getstate__(self) -> Dict[str_type, Any]:
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Would like to minimize the use of Any as much as possible. Can't the value here not be str as well? May need to add type to self._metadata to really make use of this

Copy link
Contributor

@TomAugspurger TomAugspurger May 10, 2019

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think Dict[str, Union[Optional[Index], bool]] will do it.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

__getstate__ is also used by subclasses, so the strings in _metadata that I find are needed are (with associated guess for types for attributes):

  • "categories": Optional[Index]
  • "ordered": Optional[bool]
  • "unit": str (+other?)
  • "tz": str, I think (+ others?)
  • "freq": Not sure ATM
  • "subtype": Not sure ATM (_typing.Dtype?)

If the type hints are important to be specific to the ExtensionsDtype, maybe subclass __getstate__ in order to set the correct type hints? (in different PR IMO)?

# pickle support; we don't want to pickle the cache
return {k: getattr(self, k, None) for k in self._metadata}

@classmethod
def reset_cache(cls):
def reset_cache(cls) -> None:
""" clear the cache """
cls._cache = {}

Expand Down Expand Up @@ -211,17 +214,24 @@ class CategoricalDtype(PandasExtensionDtype, ExtensionDtype):
_metadata = ('categories', 'ordered')
_cache = {} # type: Dict[str_type, PandasExtensionDtype]

def __init__(self, categories=None, ordered=None):
def __init__(self, categories=None, ordered: bool = None):
self._finalize(categories, ordered, fastpath=False)

@classmethod
def _from_fastpath(cls, categories=None, ordered=None):
def _from_fastpath(cls,
categories=None,
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Can you annotate categories?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Only imprecisely. The most similar would be a plain Iterable, but that would accept e.g. "categories='abc'", which would ideally flag as a wrong type in mypy, but that's not possible ATM.

I'm a bit apprehensive about accepting wrong types, but maybe that would be the correct compromise.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

categories should be ListLike which is ArrayLike + list + tuple + Iterable (define it in _typing)

ordered: bool = None
) -> 'CategoricalDtype':
self = cls.__new__(cls)
self._finalize(categories, ordered, fastpath=True)
return self

@classmethod
def _from_categorical_dtype(cls, dtype, categories=None, ordered=None):
def _from_categorical_dtype(cls,
dtype: 'CategoricalDtype',
categories=None,
ordered: bool = None,
) -> 'CategoricalDtype':
if categories is ordered is None:
return dtype
if categories is None:
Expand All @@ -231,8 +241,12 @@ def _from_categorical_dtype(cls, dtype, categories=None, ordered=None):
return cls(categories, ordered)

@classmethod
def _from_values_or_dtype(cls, values=None, categories=None, ordered=None,
dtype=None):
def _from_values_or_dtype(cls,
values=None,
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Can you annotate values / categories?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

See comment above. The most precise here would again be Iterable, but same problems as above.

categories=None,
ordered: bool = None,
dtype: 'CategoricalDtype' = None,
) -> 'CategoricalDtype':
"""
Construct dtype from the input parameters used in :class:`Categorical`.

Expand Down Expand Up @@ -314,7 +328,11 @@ def _from_values_or_dtype(cls, values=None, categories=None, ordered=None,

return dtype

def _finalize(self, categories, ordered, fastpath=False):
def _finalize(self,
categories,
ordered: Optional[bool],
fastpath: bool = False,
) -> None:

if ordered is not None:
self.validate_ordered(ordered)
Expand All @@ -326,14 +344,14 @@ def _finalize(self, categories, ordered, fastpath=False):
self._categories = categories
self._ordered = ordered

def __setstate__(self, state):
def __setstate__(self, state: Dict[str_type, Any]) -> None:
# for pickle compat. __get_state__ is defined in the
# PandasExtensionDtype superclass and uses the public properties to
# pickle -> need to set the settable private ones here (see GH26067)
self._categories = state.pop('categories', None)
self._ordered = state.pop('ordered', False)

def __hash__(self):
def __hash__(self) -> int:
# _hash_categories returns a uint64, so use the negative
# space for when we have unknown categories to avoid a conflict
if self.categories is None:
Expand All @@ -344,7 +362,7 @@ def __hash__(self):
# We *do* want to include the real self.ordered here
return int(self._hash_categories(self.categories, self.ordered))

def __eq__(self, other):
def __eq__(self, other: Any) -> bool:
"""
Rules for CDT equality:
1) Any CDT is equal to the string 'category'
Expand Down Expand Up @@ -391,7 +409,7 @@ def __repr__(self):
return tpl.format(data, self.ordered)

@staticmethod
def _hash_categories(categories, ordered=True):
def _hash_categories(categories, ordered: Optional[bool] = True) -> int:
from pandas.core.util.hashing import (
hash_array, _combine_hash_arrays, hash_tuples
)
Expand Down Expand Up @@ -441,7 +459,7 @@ def construct_array_type(cls):
return Categorical

@staticmethod
def validate_ordered(ordered):
def validate_ordered(ordered: bool) -> None:
"""
Validates that we have a valid ordered parameter. If
it is not a boolean, a TypeError will be raised.
Expand All @@ -461,7 +479,7 @@ def validate_ordered(ordered):
raise TypeError("'ordered' must either be 'True' or 'False'")

@staticmethod
def validate_categories(categories, fastpath=False):
def validate_categories(categories, fastpath: bool = False):
"""
Validates that we have good categories

Expand All @@ -475,7 +493,7 @@ def validate_categories(categories, fastpath=False):
-------
categories : Index
"""
from pandas import Index
from pandas.core.indexes.base import Index

if not fastpath and not is_list_like(categories):
msg = "Parameter 'categories' must be list-like, was {!r}"
Expand All @@ -496,7 +514,7 @@ def validate_categories(categories, fastpath=False):

return categories

def update_dtype(self, dtype):
def update_dtype(self, dtype: 'CategoricalDtype') -> 'CategoricalDtype':
"""
Returns a CategoricalDtype with categories and ordered taken from dtype
if specified, otherwise falling back to self if unspecified
Expand Down Expand Up @@ -538,14 +556,14 @@ def categories(self):
return self._categories

@property
def ordered(self):
def ordered(self) -> Optional[bool]:
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Is this optional? I thought ordered was always sure or values (categories are optional).

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes:

In [1]: pd.CategoricalDtype()
Out[1]: CategoricalDtype(categories=None, ordered=None)

I think this has somwthing to to with updating dtypes (don't update None, but do update False).

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

grr, this isn't needed, and isn't how we document things. #26336

"""
Whether the categories have an ordered relationship.
"""
return self._ordered

@property
def _is_boolean(self):
def _is_boolean(self) -> bool:
from pandas.core.dtypes.common import is_bool_dtype

return is_bool_dtype(self.categories)
Expand Down