Refactor casting of label indexers to the coordinate dtype

benbovy · benbovy · commit cc2d9c9c90c9 · 2021-08-31T13:59:09.000+02:00
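Make the fix from pydata#3153 (casting label indexers to the coordinate dtype when that dtype is floating-point) specific to pandas indexes, i.e., do not apply it to other, custom indexes. The cast moves out of `group_indexers_by_index` and into `normalize_label`, which now receives the coordinate dtype stored on `PandasIndex` (`coord_dtype`) and `PandasMultiIndex` (`level_coords_dtype`). See pydata#5697 for details. This should also fix pydata#5700, although no test has been added yet (`set_index` needs to be refactored first).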
diff --git a/xarray/core/indexes.py b/xarray/core/indexes.py
@@ -129,9 +129,12 @@ def _is_nested_tuple(possible_tuple):
     )
 
 
-def normalize_label(value, extract_scalar=False):
+def normalize_label(value, extract_scalar=False, dtype=None):
     if getattr(value, "ndim", 1) <= 1:
         value = _asarray_tuplesafe(value)
+    if dtype is not None and dtype.kind == "f":
+        # see https://github.com/pydata/xarray/pull/3153 for details
+        value = np.asarray(value, dtype=dtype)
     if extract_scalar:
         # see https://github.com/pydata/xarray/pull/4292 for details
         value = value[()] if value.dtype.kind in "mM" else value.item()
@@ -151,12 +154,16 @@ def get_indexer_nd(index, labels, method=None, tolerance=None):
 class PandasIndex(Index):
     """Wrap a pandas.Index as an xarray compatible index."""
 
-    __slots__ = ("index", "dim")
+    __slots__ = ("index", "dim", "coord_dtype")
 
-    def __init__(self, array: Any, dim: Hashable):
+    def __init__(self, array: Any, dim: Hashable, coord_dtype: Any = None):
         self.index = utils.safe_cast_to_index(array)
         self.dim = dim
 
+        if coord_dtype is None:
+            coord_dtype = self.index.dtype
+        self.coord_dtype = coord_dtype
+
     @classmethod
     def from_variables(cls, variables: Mapping[Hashable, "Variable"]):
         from .variable import IndexVariable
@@ -176,7 +183,7 @@ def from_variables(cls, variables: Mapping[Hashable, "Variable"]):
 
         dim = var.dims[0]
 
-        obj = cls(var.data, dim)
+        obj = cls(var.data, dim, coord_dtype=var.dtype)
 
         data = PandasIndexingAdapter(obj.index, dtype=var.dtype)
         index_var = IndexVariable(
@@ -219,7 +226,7 @@ def query(self, labels, method=None, tolerance=None):
                 "a dimension that does not have a MultiIndex"
             )
         else:
-            label = normalize_label(label)
+            label = normalize_label(label, dtype=self.coord_dtype)
             if label.ndim == 0:
                 label_value = normalize_label(label, extract_scalar=True)
                 if isinstance(self.index, pd.CategoricalIndex):
@@ -289,6 +296,16 @@ def _create_variables_from_multiindex(index, dim, level_meta=None):
 
 
 class PandasMultiIndex(PandasIndex):
+
+    __slots__ = ("index", "dim", "coord_dtype", "level_coords_dtype")
+
+    def __init__(self, array: Any, dim: Hashable, level_coords_dtype: Any = None):
+        super().__init__(array, dim)
+
+        if level_coords_dtype is None:
+            level_coords_dtype = {idx.name: idx.dtype for idx in self.index.levels}
+        self.level_coords_dtype = level_coords_dtype
+
     @classmethod
     def from_variables(cls, variables: Mapping[Hashable, "Variable"]):
         if any([var.ndim != 1 for var in variables.values()]):
@@ -305,7 +322,8 @@ def from_variables(cls, variables: Mapping[Hashable, "Variable"]):
         index = pd.MultiIndex.from_arrays(
             [var.values for var in variables.values()], names=variables.keys()
         )
-        obj = cls(index, dim)
+        level_coords_dtype = {name: var.dtype for name, var in variables.items()}
+        obj = cls(index, dim, level_coords_dtype=level_coords_dtype)
 
         level_meta = {
             name: {"dtype": var.dtype, "attrs": var.attrs, "encoding": var.encoding}
@@ -346,7 +364,10 @@ def query(self, labels, method=None, tolerance=None):
         if all([lbl in self.index.names for lbl in labels]):
             is_nested_vals = _is_nested_tuple(tuple(labels.values()))
             labels = {
-                k: normalize_label(v, extract_scalar=True) for k, v in labels.items()
+                k: normalize_label(
+                    v, extract_scalar=True, dtype=self.level_coords_dtype[k]
+                )
+                for k, v in labels.items()
             }
 
             if len(labels) == self.index.nlevels and not is_nested_vals:
diff --git a/xarray/core/indexing.py b/xarray/core/indexing.py
@@ -32,7 +32,6 @@
     is_duck_dask_array,
     sparse_array_type,
 )
-from .utils import maybe_cast_to_coords_dtype
 
 if TYPE_CHECKING:
     from .dataarray import DataArray
@@ -185,12 +184,10 @@ def group_indexers_by_index(
 
     for key, label in indexers.items():
         index = obj.xindexes.get(key, None)
-        coord = obj.coords.get(key, None)
 
         if index is not None:
             index_id = id(index)
             unique_indexes[index_id] = index
-            label = maybe_cast_to_coords_dtype(label, coord.dtype)  # type: ignore
             grouped_indexers[index_id][key] = label
         elif key in obj.coords:
             raise KeyError(f"no index found for coordinate {key}")
diff --git a/xarray/core/utils.py b/xarray/core/utils.py
@@ -72,12 +72,6 @@ def _maybe_cast_to_cftimeindex(index: pd.Index) -> pd.Index:
         return index
 
 
-def maybe_cast_to_coords_dtype(label, coords_dtype):
-    if coords_dtype.kind == "f" and not isinstance(label, slice):
-        label = np.asarray(label, dtype=coords_dtype)
-    return label
-
-
 def maybe_coerce_to_str(index, original_coords):
     """maybe coerce a pandas Index back to a nunpy array of type str