From 22004e4287f0992d838794a5815e0776f73b0353 Mon Sep 17 00:00:00 2001 From: Thomas Nicholas Date: Wed, 22 Sep 2021 17:04:21 -0400 Subject: [PATCH 01/43] sketching out changes needed to integrate variables into DataTree --- datatree/datatree.py | 273 ++++++++++++++++++++++++++++++++++--------- 1 file changed, 216 insertions(+), 57 deletions(-) diff --git a/datatree/datatree.py b/datatree/datatree.py index fbcec02b..63d17591 100644 --- a/datatree/datatree.py +++ b/datatree/datatree.py @@ -1,11 +1,25 @@ from __future__ import annotations import textwrap -from typing import Any, Callable, Dict, Hashable, Iterable, List, Mapping, Union +from typing import ( + Any, + Callable, + Dict, + Hashable, + Iterable, + Iterator, + List, + Mapping, + Optional, + Set, + Union, +) import anytree from xarray import DataArray, Dataset, merge from xarray.core import dtypes, utils +from xarray.core.indexes import Index +from xarray.core.utils import Default, Frozen, _default from xarray.core.variable import Variable from .mapping import map_over_subtree @@ -14,7 +28,7 @@ MappedDatasetMethodsMixin, MappedDataWithCoords, ) -from .treenode import PathType, TreeNode, _init_single_treenode +from .treenode import PathType, TreeNode """ DEVELOPERS' NOTE @@ -30,6 +44,13 @@ """ +def _check_for_name_collisions(children, variables): + for child in children: + for var in variables: + if var == child.name: + raise KeyError + + class DataTree( TreeNode, MappedDatasetMethodsMixin, @@ -59,8 +80,6 @@ class DataTree( DataNode : Shortcut to create a DataTree with only a single node. """ - # TODO should this instead be a subclass of Dataset? - # TODO attribute-like access for both vars and child nodes (by inheriting from xarray.core.common.AttrsAccessMixin?) # TODO ipython autocomplete for child nodes @@ -71,90 +90,94 @@ class DataTree( # TODO do we need a watch out for if methods intended only for root nodes are called on non-root nodes? 
- # TODO currently allows self.ds = None, should we instead always store at least an empty Dataset? - # TODO dataset methods which should not or cannot act over the whole tree, such as .to_array # TODO del and delitem methods # TODO .loc, __contains__, __iter__, __array__, __len__ + _attrs: Optional[Dict[Hashable, Any]] + _cache: Dict[str, Any] + _coord_names: Set[Hashable] + _dims: Dict[Hashable, int] + _encoding: Optional[Dict[Hashable, Any]] + _close: Optional[Callable[[], None]] + _indexes: Optional[Dict[Hashable, Index]] + _variables: Dict[Hashable, Variable] + + # TODO a lot of properties like .variables could be defined in a DataMapping class which both Dataset and DataTree inherit from + + # TODO __slots__ + + # TODO all groupby classes + def __init__( self, + name: Hashable, + parent: DataTree = None, + children: Iterable[DataTree] = None, + data_vars: Mapping[Any, Any] = None, + coords: Mapping[Any, Any] = None, + attrs: Mapping[Any, Any] = None, + ): + _check_for_name_collisions(children, data_vars) + + super().__init__(name, parent, children=children) + Dataset.__init__(self, data_vars=data_vars, coords=coords, attrs=attrs) + + @classmethod + def from_dict( + cls, data_objects: Dict[PathType, Union[Dataset, DataArray, None]] = None, name: Hashable = "root", ): + # First create the root node - super().__init__(name=name, parent=None, children=None) if data_objects: root_data = data_objects.pop(name, None) else: root_data = None - self._ds = root_data + tree = cls.from_data(name, root_data) if data_objects: # Populate tree with children determined from data_objects mapping for path, data in data_objects.items(): # Determine name of new node - path = self._tuple_or_path_to_path(path) - if self.separator in path: - node_path, node_name = path.rsplit(self.separator, maxsplit=1) + path = tree._tuple_or_path_to_path(path) + if tree.separator in path: + node_path, node_name = path.rsplit(tree.separator, maxsplit=1) else: node_path, node_name = "/", path - 
relative_path = node_path.replace(self.name, "") + relative_path = node_path.replace(tree.name, "") # Create and set new node - new_node = DataNode(name=node_name, data=data) - self.set_node( + new_node = cls.from_data(name=node_name, data=data) + tree.set_node( relative_path, new_node, allow_overwrite=False, new_nodes_along_path=True, ) - - @property - def ds(self) -> Dataset: - return self._ds - - @ds.setter - def ds(self, data: Union[Dataset, DataArray] = None): - if not isinstance(data, (Dataset, DataArray)) and data is not None: - raise TypeError( - f"{type(data)} object is not an xarray Dataset, DataArray, or None" - ) - if isinstance(data, DataArray): - data = data.to_dataset() - if data is not None: - for var in list(data.variables): - if var in list(c.name for c in self.children): - raise KeyError( - f"Cannot add variable named {var}: node already has a child named {var}" - ) - self._ds = data - - @property - def has_data(self): - return self.ds is not None + return tree @classmethod - def _init_single_datatree_node( + def from_data( cls, name: Hashable, - data: Union[Dataset, DataArray] = None, + data: Union[Dataset, DataArray], parent: TreeNode = None, children: List[TreeNode] = None, ): """ - Create a single node of a DataTree, which optionally contains data in the form of an xarray.Dataset. + Create a single node of a DataTree from an xarray Dataset or DataArray. Parameters ---------- name : Hashable Name for the root node of the tree. Default is "root" - data : Dataset, DataArray, Variable or None, optional - Data to store under the .ds attribute of this node. DataArrays and Variables will be promoted to Datasets. - Default is None. + data : Dataset or DataArray + xarray data object whose data to store in this node. parent : TreeNode, optional Parent node to this node. Default is None. 
children : Sequence[TreeNode], optional @@ -164,12 +187,24 @@ def _init_single_datatree_node( ------- node : DataTree """ + if isinstance(data, DataArray): + dataset = data.to_dataset() + elif isinstance(data, Dataset): + dataset = data + else: + raise TypeError + + _check_for_name_collisions(children, dataset.variables) - # This approach was inspired by xarray.Dataset._construct_direct() obj = object.__new__(cls) - obj._ds = None - obj = _init_single_treenode(obj, name=name, parent=parent, children=children) - obj.ds = data + obj.__init__( + name=name, + parent=parent, + children=children, + data_vars=dataset.data_vars, + coords=dataset.coords, + attrs=dataset.attrs, + ) return obj def _pre_attach(self, parent: TreeNode) -> None: @@ -178,10 +213,8 @@ def _pre_attach(self, parent: TreeNode) -> None: children with duplicate names (or a data variable with the same name as a child). """ super()._pre_attach(parent) - if parent.has_data and self.name in list(parent.ds.variables): - raise KeyError( - f"parent {parent.name} already contains a data variable named {self.name}" - ) + if parent.has_data: + _check_for_name_collisions([self.name], parent.variables) def add_child(self, child: TreeNode) -> None: """ @@ -191,11 +224,140 @@ def add_child(self, child: TreeNode) -> None: """ if child.name in list(c.name for c in self.children): raise KeyError(f"Node already has a child named {child.name}") - elif self.has_data and child.name in list(self.ds.variables): + elif self.has_data and child.name in list(self.variables): raise KeyError(f"Node already contains a data variable named {child.name}") else: child.parent = self + @property + def variables(self) -> Mapping[Hashable, Variable]: + """Low level interface to Dataset contents as dict of Variable objects. + + This ordered dictionary is frozen to prevent mutation that could + violate Dataset invariants. It contains all variable objects + constituting the Dataset, including both data variables and + coordinates. 
+ """ + return Frozen(self._variables) + + @property + def attrs(self) -> Dict[Hashable, Any]: + """Dictionary of global attributes on this dataset""" + if self._attrs is None: + self._attrs = {} + return self._attrs + + @attrs.setter + def attrs(self, value: Mapping[Any, Any]) -> None: + self._attrs = dict(value) + + @property + def encoding(self) -> Dict: + """Dictionary of global encoding attributes on this dataset""" + if self._encoding is None: + self._encoding = {} + return self._encoding + + @encoding.setter + def encoding(self, value: Mapping) -> None: + self._encoding = dict(value) + + @property + def dims(self) -> Mapping[Hashable, int]: + """Mapping from dimension names to lengths. + + Cannot be modified directly, but is updated when adding new variables. + + Note that type of this object differs from `DataArray.dims`. + See `Dataset.sizes` and `DataArray.sizes` for consistently named + properties. + """ + return Frozen(self._dims) + + @property + def sizes(self) -> Mapping[Hashable, int]: + """Mapping from dimension names to lengths. + + Cannot be modified directly, but is updated when adding new variables. + + This is an alias for `Dataset.dims` provided for the benefit of + consistency with `DataArray.sizes`. + + See Also + -------- + DataArray.sizes + """ + return self.dims + + @property + def has_data(self): + return len(self._variables) > 0 + + def __contains__(self, key: object) -> bool: + """The 'in' operator will return true or false depending on whether + 'key' is either an array stored in the datatree or a child node, or neither. 
+ """ + return key in self._variables or key in self.children + + def __len__(self) -> int: + return len(self.data_vars) + + def __bool__(self) -> bool: + return bool(self.data_vars) + + def __iter__(self) -> Iterator[Hashable]: + return iter(self.data_vars) + + @classmethod + def _construct_direct( + cls, + variables, + coord_names, + dims=None, + attrs=None, + indexes=None, + encoding=None, + close=None, + ): + """Shortcut around __init__ for internal use when we want to skip + costly validation + """ + return NotImplementedError + + def _replace( + self, + variables: Dict[Hashable, Variable] = None, + coord_names: Set[Hashable] = None, + dims: Dict[Any, int] = None, + attrs: Union[Dict[Hashable, Any], None, Default] = _default, + indexes: Union[Dict[Any, Index], None, Default] = _default, + encoding: Union[dict, None, Default] = _default, + inplace: bool = False, + ) -> "Dataset": + """Fastpath constructor for internal use. + + Returns an object with optionally with replaced attributes. + + Explicitly passed arguments are *not* copied when placed on the new + dataset. It is up to the caller to ensure that they have the right type + and are not used elsewhere. 
+ """ + _check_for_name_collisions(self.children, variables) + + # TODO I don't really know the best way to do this without inheritance + + ds = Dataset._replace( + self, + variables=variables, + coord_names=coord_names, + dims=dims, + attrs=attrs, + indexes=indexes, + encoding=encoding, + inplace=inplace, + ) + return self.from_data(name=self.name, data=ds) + def __str__(self): """A printable representation of the structure of this entire subtree.""" renderer = anytree.RenderTree(self) @@ -387,7 +549,7 @@ def __setitem__( else: # if nothing there then make new node based on type of object if isinstance(value, (Dataset, DataArray, Variable)) or value is None: - new_node = DataNode(name=last_tag, data=value) + new_node = self.from_data(name=last_tag, data=value) self.set_node(path=path_tags, node=new_node) elif isinstance(value, TreeNode): self.set_node(path=path, node=value) @@ -602,6 +764,3 @@ def to_zarr(self, store, mode: str = "w", encoding=None, **kwargs): def plot(self): raise NotImplementedError - - -DataNode = DataTree._init_single_datatree_node From 3a4f87480000b0da36cca398969dfe936b0d7daa Mon Sep 17 00:00:00 2001 From: Thomas Nicholas Date: Fri, 29 Apr 2022 13:58:22 -0400 Subject: [PATCH 02/43] fixed some other basic conflicts --- datatree/datatree.py | 55 +++++++++++++++++++++++++------------------- 1 file changed, 31 insertions(+), 24 deletions(-) diff --git a/datatree/datatree.py b/datatree/datatree.py index d9df33ae..5678eeca 100644 --- a/datatree/datatree.py +++ b/datatree/datatree.py @@ -1,32 +1,27 @@ from __future__ import annotations - -from typing import ( - Dict, - Hashable, - Iterator, - Iterable, - Tuple, - List, - Set, -) - -from xarray.core.indexes import Index -from xarray.core.utils import Default, Frozen, _default from collections import OrderedDict from typing import ( TYPE_CHECKING, Any, Callable, + Dict, Generic, + Hashable, + Iterable, + Iterator, Mapping, MutableMapping, Optional, + Set, + Tuple, Union, ) from xarray import 
DataArray, Dataset from xarray.core import utils +from xarray.core.indexes import Index +from xarray.core.utils import Default, Frozen, _default from xarray.core.variable import Variable from .formatting import tree_repr @@ -151,11 +146,18 @@ def __init__( elif data is None: ds = Dataset() else: - raise TypeError(f"{type(data)} object is not an xarray Dataset, DataArray, or None") + raise TypeError( + f"{type(data)} object is not an xarray Dataset, DataArray, or None" + ) + + _check_for_name_collisions(children, ds.variables) - _check_for_name_collisions(children, data.variables) + # set tree attributes + super().__init__(children=children) + self.name = name + self.parent = parent - super().__init__(name, parent, children=children) + # set data attributes self._close = ds._close self._encoding = ds._encoding self._variables = ds._variables @@ -209,17 +211,26 @@ def ds(self, data: Union[Dataset, DataArray] = None) -> None: self._ds = data def to_dataset(self) -> Dataset: - return Dataset._construct_direct(self._variables, self._coord_names, ...) 
+ """Return the data in this node as a new xarray Dataset object.""" + return Dataset._construct_direct( + self._variables, + self._coord_names, + self._dims, + self._attrs, + self._indexes, + self._encoding, + self._close, + ) @property - def has_data(self) -> bool: + def has_data(self): """Whether or not there are any data variables in this node.""" - return len(self.ds.variables) > 0 + return len(self._variables) > 0 @property def has_attrs(self) -> bool: """Whether or not there are any metadata attributes in this node.""" - return len(self.ds.attrs.keys()) > 0 + return len(self.attrs.keys()) > 0 @property def is_empty(self) -> bool: @@ -298,10 +309,6 @@ def sizes(self) -> Mapping[Hashable, int]: """ return self.dims - @property - def has_data(self): - return len(self._variables) > 0 - def __contains__(self, key: object) -> bool: """The 'in' operator will return true or false depending on whether 'key' is either an array stored in the datatree or a child node, or neither. From 8c6a68a7be4fff79e3c99232e9d7c30c8d8f8fff Mon Sep 17 00:00:00 2001 From: Thomas Nicholas Date: Fri, 29 Apr 2022 14:14:48 -0400 Subject: [PATCH 03/43] fix mypy errors --- datatree/datatree.py | 130 ++++++++++--------------------------------- 1 file changed, 30 insertions(+), 100 deletions(-) diff --git a/datatree/datatree.py b/datatree/datatree.py index 5678eeca..3240ab2c 100644 --- a/datatree/datatree.py +++ b/datatree/datatree.py @@ -21,7 +21,7 @@ from xarray import DataArray, Dataset from xarray.core import utils from xarray.core.indexes import Index -from xarray.core.utils import Default, Frozen, _default +from xarray.core.utils import Frozen from xarray.core.variable import Variable from .formatting import tree_repr @@ -54,11 +54,14 @@ T_Path = Union[str, NodePath] -def _check_for_name_collisions(children, variables): - for child in children: - for var in variables: - if var == child.name: - raise KeyError +def _check_for_name_collisions( + children: Iterable[str], variables: 
Iterable[Hashable] +) -> None: + colliding_names = set(children).intersection(set(variables)) + if colliding_names: + raise KeyError( + f"Some names would collide between variables and children: {list(colliding_names)}" + ) class DataTree( @@ -139,31 +142,13 @@ def __init__( DataTree.from_dict """ - if isinstance(data, DataArray): - ds = data.to_dataset() - elif isinstance(data, Dataset): - ds = data - elif data is None: - ds = Dataset() - else: - raise TypeError( - f"{type(data)} object is not an xarray Dataset, DataArray, or None" - ) - - _check_for_name_collisions(children, ds.variables) - # set tree attributes super().__init__(children=children) self.name = name self.parent = parent # set data attributes - self._close = ds._close - self._encoding = ds._encoding - self._variables = ds._variables - self._coord_names = ds._coord_names - self._dims = ds._dims - self._indexes = ds._indexes + self.ds = data @property def name(self) -> str | None: @@ -188,27 +173,31 @@ def parent(self: DataTree, new_parent: DataTree) -> None: @property def ds(self) -> Dataset: """The data in this node, returned as a Dataset.""" - return self._ds + # TODO change this to return only an immutable view onto this node's data (see GH #80) + return self.to_dataset() @ds.setter def ds(self, data: Union[Dataset, DataArray] = None) -> None: - if not isinstance(data, (Dataset, DataArray)) and data is not None: - raise TypeError( - f"{type(data)} object is not an xarray Dataset, DataArray, or None" - ) if isinstance(data, DataArray): - data = data.to_dataset() + ds = data.to_dataset() + elif isinstance(data, Dataset): + ds = data elif data is None: - data = Dataset() + ds = Dataset() + else: + raise TypeError( + f"{type(data)} object is not an xarray Dataset, DataArray, or None" + ) - for var in list(data.variables): - if var in self.children: - raise KeyError( - f"Cannot add variable named {var}: node already has a child named {var}" - ) + _check_for_name_collisions(self.children, ds.variables) 
- self._ds = data + self._close = ds._close + self._encoding = ds._encoding + self._variables = ds._variables + self._coord_names = ds._coord_names + self._dims = ds._dims + self._indexes = ds._indexes def to_dataset(self) -> Dataset: """Return the data in this node as a new xarray Dataset object.""" @@ -237,15 +226,6 @@ def is_empty(self) -> bool: """False if node contains any data or attrs. Does not look at children.""" return not (self.has_data or self.has_attrs) - def _pre_attach(self: DataTree, parent: DataTree) -> None: - """ - Method which superclass calls before setting parent, here used to prevent having two - children with duplicate names (or a data variable with the same name as a child). - """ - super()._pre_attach(parent) - if parent.has_data: - _check_for_name_collisions([self.name], parent.variables) - def __repr__(self): return tree_repr(self) @@ -316,63 +296,13 @@ def __contains__(self, key: object) -> bool: return key in self._variables or key in self.children def __len__(self) -> int: - return len(self.data_vars) + return len(self.ds.data_vars) def __bool__(self) -> bool: - return bool(self.data_vars) + return bool(self.ds.data_vars) def __iter__(self) -> Iterator[Hashable]: - return iter(self.data_vars) - - @classmethod - def _construct_direct( - cls, - variables, - coord_names, - dims=None, - attrs=None, - indexes=None, - encoding=None, - close=None, - ): - """Shortcut around __init__ for internal use when we want to skip - costly validation - """ - return NotImplementedError - - def _replace( - self, - variables: Dict[Hashable, Variable] = None, - coord_names: Set[Hashable] = None, - dims: Dict[Any, int] = None, - attrs: Union[Dict[Hashable, Any], None, Default] = _default, - indexes: Union[Dict[Any, Index], None, Default] = _default, - encoding: Union[dict, None, Default] = _default, - inplace: bool = False, - ) -> "Dataset": - """Fastpath constructor for internal use. - - Returns an object with optionally with replaced attributes. 
- - Explicitly passed arguments are *not* copied when placed on the new - dataset. It is up to the caller to ensure that they have the right type - and are not used elsewhere. - """ - _check_for_name_collisions(self.children, variables) - - # TODO I don't really know the best way to do this without inheritance - - ds = Dataset._replace( - self, - variables=variables, - coord_names=coord_names, - dims=dims, - attrs=attrs, - indexes=indexes, - encoding=encoding, - inplace=inplace, - ) - return self.from_data(name=self.name, data=ds) + return iter(self.ds.data_vars) def __str__(self): return tree_repr(self) From b503b06d64c27688e197818c057ace99c37dbfa2 Mon Sep 17 00:00:00 2001 From: Thomas Nicholas Date: Fri, 29 Apr 2022 14:25:53 -0400 Subject: [PATCH 04/43] can create basic datatree node objects again --- datatree/datatree.py | 6 ++++-- datatree/tests/test_datatree.py | 4 ++-- datatree/treenode.py | 4 +++- 3 files changed, 9 insertions(+), 5 deletions(-) diff --git a/datatree/datatree.py b/datatree/datatree.py index 3240ab2c..a68c4c6b 100644 --- a/datatree/datatree.py +++ b/datatree/datatree.py @@ -192,12 +192,14 @@ def ds(self, data: Union[Dataset, DataArray] = None) -> None: _check_for_name_collisions(self.children, ds.variables) - self._close = ds._close - self._encoding = ds._encoding + # TODO this should probably be changed to use .replace, and this explicit setting of attributes reserved for constructors self._variables = ds._variables self._coord_names = ds._coord_names self._dims = ds._dims self._indexes = ds._indexes + self._attrs = ds._attrs + self._close = ds._close + self._encoding = ds._encoding def to_dataset(self) -> Dataset: """Return the data in this node as a new xarray Dataset object.""" diff --git a/datatree/tests/test_datatree.py b/datatree/tests/test_datatree.py index 3dc3c09b..5b613118 100644 --- a/datatree/tests/test_datatree.py +++ b/datatree/tests/test_datatree.py @@ -77,7 +77,7 @@ class TestStoreDatasets: def test_create_with_data(self): 
dat = xr.Dataset({"a": 0}) john = DataTree(name="john", data=dat) - assert john.ds is dat + xrt.assert_identical(john.ds, dat) with pytest.raises(TypeError): DataTree(name="mary", parent=john, data="junk") # noqa @@ -86,7 +86,7 @@ def test_set_data(self): john = DataTree(name="john") dat = xr.Dataset({"a": 0}) john.ds = dat - assert john.ds is dat + xrt.assert_identical(john.ds, dat) with pytest.raises(TypeError): john.ds = "junk" diff --git a/datatree/treenode.py b/datatree/treenode.py index f4c0c77f..36b24574 100644 --- a/datatree/treenode.py +++ b/datatree/treenode.py @@ -134,7 +134,9 @@ def _detach(self, parent: Tree | None) -> None: def _attach(self, parent: Tree | None, child_name: str = None) -> None: if parent is not None: if child_name is None: - raise ValueError() + raise ValueError( + "To directly set parent, child needs a name, but child is unnamed" + ) self._pre_attach(parent) parentchildren = parent._children From 1efd7f2c1ec8f12d91c645ca6f127abf9b4e1ec3 Mon Sep 17 00:00:00 2001 From: Thomas Nicholas Date: Fri, 29 Apr 2022 14:32:30 -0400 Subject: [PATCH 05/43] child-variable name collisions dectected correctly --- datatree/datatree.py | 11 +++++++++++ datatree/tests/test_datatree.py | 6 +++--- 2 files changed, 14 insertions(+), 3 deletions(-) diff --git a/datatree/datatree.py b/datatree/datatree.py index a68c4c6b..a4a83427 100644 --- a/datatree/datatree.py +++ b/datatree/datatree.py @@ -201,6 +201,17 @@ def ds(self, data: Union[Dataset, DataArray] = None) -> None: self._close = ds._close self._encoding = ds._encoding + def _pre_attach(self: DataTree, parent: DataTree) -> None: + """ + Method which superclass calls before setting parent, here used to prevent having two + children with duplicate names (or a data variable with the same name as a child). 
+ """ + super()._pre_attach(parent) + if parent.has_data and self.name in list(parent.ds.variables): + raise KeyError( + f"parent {parent.name} already contains a data variable named {self.name}" + ) + def to_dataset(self) -> Dataset: """Return the data in this node as a new xarray Dataset object.""" return Dataset._construct_direct( diff --git a/datatree/tests/test_datatree.py b/datatree/tests/test_datatree.py index 5b613118..0f9222f7 100644 --- a/datatree/tests/test_datatree.py +++ b/datatree/tests/test_datatree.py @@ -107,11 +107,11 @@ def test_parent_already_has_variable_with_childs_name(self): def test_assign_when_already_child_with_variables_name(self): dt = DataTree(data=None) DataTree(name="a", data=None, parent=dt) - with pytest.raises(KeyError, match="already has a child named a"): + with pytest.raises(KeyError, match="names would collide"): dt.ds = xr.Dataset({"a": 0}) dt.ds = xr.Dataset() - with pytest.raises(KeyError, match="already has a child named a"): + with pytest.raises(KeyError, match="names would collide"): dt.ds = dt.ds.assign(a=xr.DataArray(0)) @pytest.mark.xfail @@ -119,7 +119,7 @@ def test_update_when_already_child_with_variables_name(self): # See issue #38 dt = DataTree(name="root", data=None) DataTree(name="a", data=None, parent=dt) - with pytest.raises(KeyError, match="already has a child named a"): + with pytest.raises(KeyError, match="names would collide"): dt.ds["a"] = xr.DataArray(0) From 438d73a5fa3e9ef34b8865383f3144843ca0e5c3 Mon Sep 17 00:00:00 2001 From: Thomas Nicholas Date: Wed, 4 May 2022 16:39:34 -0400 Subject: [PATCH 06/43] in-progres --- datatree/datatree.py | 66 ++++++++++++++++++++++++++++++++- datatree/tests/test_datatree.py | 7 ++-- 2 files changed, 68 insertions(+), 5 deletions(-) diff --git a/datatree/datatree.py b/datatree/datatree.py index a4a83427..bd61e6f1 100644 --- a/datatree/datatree.py +++ b/datatree/datatree.py @@ -1,5 +1,6 @@ from __future__ import annotations +import copy from collections import 
OrderedDict from typing import ( TYPE_CHECKING, @@ -21,7 +22,7 @@ from xarray import DataArray, Dataset from xarray.core import utils from xarray.core.indexes import Index -from xarray.core.utils import Frozen +from xarray.core.utils import Default, Frozen, _default from xarray.core.variable import Variable from .formatting import tree_repr @@ -320,6 +321,67 @@ def __iter__(self) -> Iterator[Hashable]: def __str__(self): return tree_repr(self) + def _replace( + self, + variables: dict[Hashable, Variable] = None, + coord_names: set[Hashable] = None, + dims: dict[Any, int] = None, + attrs: dict[Hashable, Any] | None | Default = _default, + indexes: dict[Hashable, Index] = None, + encoding: dict | None | Default = _default, + parent: DataTree | None = None, + children: OrderedDict[str, Tree] = None, + inplace: bool = False, + ) -> DataTree: + """ + Fastpath constructor for internal use. + + Returns an object with optionally replaced attributes. + + Explicitly passed arguments are *not* copied when placed on the new + datatree. It is up to the caller to ensure that they have the right type + and are not used elsewhere. 
+ """ + if inplace: + if variables is not None: + self._variables = variables + if coord_names is not None: + self._coord_names = coord_names + if dims is not None: + self._dims = dims + if attrs is not _default: + self._attrs = attrs + if indexes is not None: + self._indexes = indexes + if encoding is not _default: + self._encoding = encoding + if parent is not _default: + self._parent = parent + if children is not None: + self._children = children + obj = self + else: + if variables is None: + variables = self._variables.copy() + if coord_names is None: + coord_names = self._coord_names.copy() + if dims is None: + dims = self._dims.copy() + if attrs is _default: + attrs = copy.copy(self._attrs) + if indexes is None: + indexes = self._indexes.copy() + if encoding is _default: + encoding = copy.copy(self._encoding) + if parent is not _default: + self._parent = parent.copy() + if children is not None: + self._children = children.copy() + obj = self._construct_direct( + variables, coord_names, dims, attrs, indexes, encoding + ) + return obj + def get( self: DataTree, key: str, default: Optional[DataTree | DataArray] = None ) -> Optional[DataTree | DataArray]: @@ -381,7 +443,7 @@ def _set(self, key: str, val: DataTree | CoercibleValue) -> None: val.parent = self elif isinstance(val, (DataArray, Variable)): # TODO this should also accomodate other types that can be coerced into Variables - self.ds[key] = val + self.update({key: val}) else: raise TypeError(f"Type {type(val)} cannot be assigned to a DataTree") diff --git a/datatree/tests/test_datatree.py b/datatree/tests/test_datatree.py index 0f9222f7..3b693c38 100644 --- a/datatree/tests/test_datatree.py +++ b/datatree/tests/test_datatree.py @@ -228,26 +228,27 @@ def test_setitem_dataset_on_this_node(self): data = xr.Dataset({"temp": [0, 50]}) results = DataTree(name="results") results["."] = data - assert results.ds is data + xrt.assert_identical(results.ds, data) @pytest.mark.xfail(reason="assigning Datasets doesn't 
yet create new nodes") def test_setitem_dataset_as_new_node(self): data = xr.Dataset({"temp": [0, 50]}) folder1 = DataTree(name="folder1") folder1["results"] = data - assert folder1["results"].ds is data + xrt.assert_identical(folder1["results"].ds, data) @pytest.mark.xfail(reason="assigning Datasets doesn't yet create new nodes") def test_setitem_dataset_as_new_node_requiring_intermediate_nodes(self): data = xr.Dataset({"temp": [0, 50]}) folder1 = DataTree(name="folder1") folder1["results/highres"] = data - assert folder1["results/highres"].ds is data + xrt.assert_identical(folder1["results/highres"].ds, data) def test_setitem_named_dataarray(self): data = xr.DataArray(name="temp", data=[0, 50]) folder1 = DataTree(name="folder1") folder1["results"] = data + print(folder1) expected = data.rename("results") xrt.assert_equal(folder1["results"], expected) From 2ca1c1a14bae192b6b64380aab73542aa1cd5425 Mon Sep 17 00:00:00 2001 From: Thomas Nicholas Date: Fri, 20 May 2022 20:29:23 -0400 Subject: [PATCH 07/43] add _replace method --- datatree/datatree.py | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/datatree/datatree.py b/datatree/datatree.py index 47871c80..ccb84a2f 100644 --- a/datatree/datatree.py +++ b/datatree/datatree.py @@ -23,8 +23,9 @@ from xarray import DataArray, Dataset from xarray.core import utils from xarray.core.indexes import Index -from xarray.core.utils import Default, Frozen, _default +from xarray.core.merge import dataset_update_method from xarray.core.options import OPTIONS as XR_OPTS +from xarray.core.utils import Default, Frozen, _default from xarray.core.variable import Variable from . 
import formatting, formatting_html @@ -308,9 +309,6 @@ def __contains__(self, key: object) -> bool: """ return key in self._variables or key in self.children - def __len__(self) -> int: - return len(self.ds.data_vars) - def __bool__(self) -> bool: return bool(self.ds.data_vars) @@ -497,8 +495,12 @@ def update(self, other: Dataset | Mapping[str, DataTree | DataArray]) -> None: else: raise TypeError(f"Type {type(v)} cannot be assigned to a DataTree") - super().update(new_children) - self.ds.update(new_variables) + vars_merge_result = dataset_update_method(self.to_dataset(), new_variables) + # TODO are there any subtleties with preserving order of children like this? + merged_children = OrderedDict(**self.children, **new_children) + self._replace( + inplace=True, children=merged_children, **vars_merge_result._asdict() + ) @classmethod def from_dict( From 547d1ac8b7e5c949d9c5829ecf42f0b804b50a6b Mon Sep 17 00:00:00 2001 From: Thomas Nicholas Date: Fri, 20 May 2022 20:35:38 -0400 Subject: [PATCH 08/43] updated tests to assert identical instead of check .ds is expected_ds --- datatree/tests/test_datatree.py | 25 +++++++++++++++---------- 1 file changed, 15 insertions(+), 10 deletions(-) diff --git a/datatree/tests/test_datatree.py b/datatree/tests/test_datatree.py index 304fb953..40ef1fe0 100644 --- a/datatree/tests/test_datatree.py +++ b/datatree/tests/test_datatree.py @@ -175,7 +175,12 @@ def test_getitem_dict_like_selection_access_to_dataset(self): class TestUpdate: - ... 
+ def test_update_new_named_dataarray(self): + da = xr.DataArray(name="temp", data=[0, 50]) + folder1 = DataTree(name="folder1") + folder1.update({"results": da}) + expected = da.rename("results") + xrt.assert_equal(folder1["results"], expected) class TestSetItem: @@ -243,11 +248,11 @@ def test_setitem_dataset_as_new_node_requiring_intermediate_nodes(self): xrt.assert_identical(folder1["results/highres"].ds, data) def test_setitem_named_dataarray(self): - data = xr.DataArray(name="temp", data=[0, 50]) + da = xr.DataArray(name="temp", data=[0, 50]) folder1 = DataTree(name="folder1") - folder1["results"] = data - print(folder1) - expected = data.rename("results") + folder1["results"] = da + print(folder1._variables) + expected = da.rename("results") xrt.assert_equal(folder1["results"], expected) def test_setitem_unnamed_dataarray(self): @@ -290,16 +295,16 @@ def test_data_in_root(self): assert dt.name is None assert dt.parent is None assert dt.children == {} - assert dt.ds is dat + xrt.assert_identical(dt.ds, dat) def test_one_layer(self): dat1, dat2 = xr.Dataset({"a": 1}), xr.Dataset({"b": 2}) dt = DataTree.from_dict({"run1": dat1, "run2": dat2}) xrt.assert_identical(dt.ds, xr.Dataset()) assert dt.name is None - assert dt["run1"].ds is dat1 + xrt.assert_identical(dt["run1"].ds, dat1) assert dt["run1"].children == {} - assert dt["run2"].ds is dat2 + xrt.assert_identical(dt["run2"].ds, dat2) assert dt["run2"].children == {} def test_two_layers(self): @@ -308,13 +313,13 @@ def test_two_layers(self): assert "highres" in dt.children assert "lowres" in dt.children highres_run = dt["highres/run"] - assert highres_run.ds is dat1 + xrt.assert_identical(highres_run.ds, dat1) def test_nones(self): dt = DataTree.from_dict({"d": None, "d/e": None}) assert [node.name for node in dt.subtree] == [None, "d", "e"] assert [node.path for node in dt.subtree] == ["/", "/d", "/d/e"] - xrt.assert_equal(dt["d/e"].ds, xr.Dataset()) + xrt.assert_identical(dt["d/e"].ds, xr.Dataset()) def 
test_full(self): dt = create_test_datatree() From 6f78fcd0de890547cfc3df53d0965860434f8c90 Mon Sep 17 00:00:00 2001 From: Thomas Nicholas Date: Fri, 20 May 2022 22:12:50 -0400 Subject: [PATCH 09/43] refactor .ds setter to use _replace --- datatree/datatree.py | 25 +++++++++++++------------ 1 file changed, 13 insertions(+), 12 deletions(-) diff --git a/datatree/datatree.py b/datatree/datatree.py index ccb84a2f..dd4f4d88 100644 --- a/datatree/datatree.py +++ b/datatree/datatree.py @@ -26,7 +26,7 @@ from xarray.core.merge import dataset_update_method from xarray.core.options import OPTIONS as XR_OPTS from xarray.core.utils import Default, Frozen, _default -from xarray.core.variable import Variable +from xarray.core.variable import Variable, calculate_dimensions from . import formatting, formatting_html from .mapping import TreeIsomorphismError, check_isomorphic, map_over_subtree @@ -196,14 +196,15 @@ def ds(self, data: Union[Dataset, DataArray] = None) -> None: _check_for_name_collisions(self.children, ds.variables) - # TODO this should probably be changed to use .replace, and this explicit setting of attributes reserved for constructors - self._variables = ds._variables - self._coord_names = ds._coord_names - self._dims = ds._dims - self._indexes = ds._indexes - self._attrs = ds._attrs - self._close = ds._close - self._encoding = ds._encoding + self._replace( + inplace=True, + variables=ds._variables, + coord_names=ds._coord_names, + dims=ds._dims, + indexes=ds._indexes, + attrs=ds._attrs, + encoding=ds._encoding, + ) def _pre_attach(self: DataTree, parent: DataTree) -> None: """ @@ -217,7 +218,7 @@ def _pre_attach(self: DataTree, parent: DataTree) -> None: ) def to_dataset(self) -> Dataset: - """Return the data in this node as a new xarray Dataset object.""" + """Return the data in this node as a new xarray.Dataset object.""" return Dataset._construct_direct( self._variables, self._coord_names, @@ -328,7 +329,7 @@ def _repr_html_(self): return 
formatting_html.datatree_repr(self) def _replace( - self, + self: DataTree, variables: dict[Hashable, Variable] = None, coord_names: set[Hashable] = None, dims: dict[Any, int] = None, @@ -336,7 +337,7 @@ def _replace( indexes: dict[Hashable, Index] = None, encoding: dict | None | Default = _default, parent: DataTree | None = None, - children: OrderedDict[str, Tree] = None, + children: OrderedDict[str, DataTree] = None, inplace: bool = False, ) -> DataTree: """ From 715ce49ff90bb69196ba7180e604790bae0fc1c8 Mon Sep 17 00:00:00 2001 From: Thomas Nicholas Date: Sat, 21 May 2022 11:17:52 -0400 Subject: [PATCH 10/43] refactor init to use _replace --- datatree/datatree.py | 44 +++++++++++++++++++++++++++++++------------- 1 file changed, 31 insertions(+), 13 deletions(-) diff --git a/datatree/datatree.py b/datatree/datatree.py index dd4f4d88..96fa5f9b 100644 --- a/datatree/datatree.py +++ b/datatree/datatree.py @@ -26,7 +26,7 @@ from xarray.core.merge import dataset_update_method from xarray.core.options import OPTIONS as XR_OPTS from xarray.core.utils import Default, Frozen, _default -from xarray.core.variable import Variable, calculate_dimensions +from xarray.core.variable import Variable from . 
import formatting, formatting_html from .mapping import TreeIsomorphismError, check_isomorphic, map_over_subtree @@ -58,6 +58,20 @@ T_Path = Union[str, NodePath] +def _coerce_to_dataset(data: DataArray | Dataset | None) -> Dataset: + if isinstance(data, DataArray): + ds = data.to_dataset() + elif isinstance(data, Dataset): + ds = data + elif data is None: + ds = Dataset() + else: + raise TypeError( + f"{type(data)} object is not an xarray Dataset, DataArray, or None" + ) + return ds + + def _check_for_name_collisions( children: Iterable[str], variables: Iterable[Hashable] ) -> None: @@ -152,7 +166,20 @@ def __init__( self.parent = parent # set data attributes - self.ds = data + ds = _coerce_to_dataset(data) + + _check_for_name_collisions(self.children, ds.variables) + + self._replace( + inplace=True, + variables=ds._variables, + coord_names=ds._coord_names, + dims=ds._dims, + indexes=ds._indexes, + attrs=ds._attrs, + encoding=ds._encoding, + ) + self._close = ds._close @property def name(self) -> str | None: @@ -183,16 +210,7 @@ def ds(self) -> Dataset: @ds.setter def ds(self, data: Union[Dataset, DataArray] = None) -> None: - if isinstance(data, DataArray): - ds = data.to_dataset() - elif isinstance(data, Dataset): - ds = data - elif data is None: - ds = Dataset() - else: - raise TypeError( - f"{type(data)} object is not an xarray Dataset, DataArray, or None" - ) + ds = _coerce_to_dataset(data) _check_for_name_collisions(self.children, ds.variables) @@ -336,7 +354,7 @@ def _replace( attrs: dict[Hashable, Any] | None | Default = _default, indexes: dict[Hashable, Index] = None, encoding: dict | None | Default = _default, - parent: DataTree | None = None, + parent: DataTree | None = _default, children: OrderedDict[str, DataTree] = None, inplace: bool = False, ) -> DataTree: From edd2f674e1de6e3b4ddb8068651499324a10e629 Mon Sep 17 00:00:00 2001 From: Thomas Nicholas Date: Sat, 21 May 2022 11:23:09 -0400 Subject: [PATCH 11/43] refactor test tree to avoid init --- 
datatree/datatree.py | 6 +++-- datatree/tests/test_datatree.py | 45 +++++++++++++++++++++++++-------- 2 files changed, 39 insertions(+), 12 deletions(-) diff --git a/datatree/datatree.py b/datatree/datatree.py index 96fa5f9b..1f691e67 100644 --- a/datatree/datatree.py +++ b/datatree/datatree.py @@ -58,7 +58,7 @@ T_Path = Union[str, NodePath] -def _coerce_to_dataset(data: DataArray | Dataset | None) -> Dataset: +def _coerce_to_dataset(data: Dataset | DataArray | None) -> Dataset: if isinstance(data, DataArray): ds = data.to_dataset() elif isinstance(data, Dataset): @@ -67,7 +67,7 @@ def _coerce_to_dataset(data: DataArray | Dataset | None) -> Dataset: ds = Dataset() else: raise TypeError( - f"{type(data)} object is not an xarray Dataset, DataArray, or None" + f"data object is not an xarray Dataset, DataArray, or None, it is of type {type(data)}" ) return ds @@ -188,6 +188,8 @@ def name(self) -> str | None: @name.setter def name(self, name: str | None) -> None: + if not isinstance(name, str) and name is not None: + raise TypeError("name must either be a string or None") self._name = name @property diff --git a/datatree/tests/test_datatree.py b/datatree/tests/test_datatree.py index 40ef1fe0..3dbbbd22 100644 --- a/datatree/tests/test_datatree.py +++ b/datatree/tests/test_datatree.py @@ -40,15 +40,16 @@ def create_test_datatree(modify=lambda ds: ds): root_data = modify(xr.Dataset({"a": ("y", [6, 7, 8]), "set0": ("x", [9, 10])})) # Avoid using __init__ so we can independently test it - root = DataTree(data=root_data) - set1 = DataTree(name="set1", parent=root, data=set1_data) - DataTree(name="set1", parent=set1) - DataTree(name="set2", parent=set1) - set2 = DataTree(name="set2", parent=root, data=set2_data) - DataTree(name="set1", parent=set2) - DataTree(name="set3", parent=root) - - return root + d = { + "/": root_data, + "/set1": set1_data, + "/set1/set1": None, + "/set1/set2": None, + "/set2": set2_data, + "/set2/set1": None, + "/set3": None, + } + return 
DataTree.from_dict(d) class TestTreeCreation: @@ -70,6 +71,31 @@ def test_setparent_unnamed_child_node_fails(self): with pytest.raises(ValueError, match="unnamed"): DataTree(parent=john) + def test_create_two_children(self): + root_data = xr.Dataset({"a": ("y", [6, 7, 8]), "set0": ("x", [9, 10])}) + set1_data = xr.Dataset({"a": 0, "b": 1}) + + root = DataTree(data=root_data) + set1 = DataTree(name="set1", parent=root, data=set1_data) + DataTree(name="set1", parent=root) + DataTree(name="set2", parent=set1) + + def test_create_full_tree(self): + root_data = xr.Dataset({"a": ("y", [6, 7, 8]), "set0": ("x", [9, 10])}) + set1_data = xr.Dataset({"a": 0, "b": 1}) + set2_data = xr.Dataset({"a": ("x", [2, 3]), "b": ("x", [0.1, 0.2])}) + + root = DataTree(data=root_data) + set1 = DataTree(name="set1", parent=root, data=set1_data) + DataTree(name="set1", parent=set1) + DataTree(name="set2", parent=set1) + set2 = DataTree(name="set2", parent=root, data=set2_data) + DataTree(name="set1", parent=set2) + DataTree(name="set3", parent=root) + + expected = create_test_datatree() + assert root.identical(expected) + class TestStoreDatasets: def test_create_with_data(self): @@ -251,7 +277,6 @@ def test_setitem_named_dataarray(self): da = xr.DataArray(name="temp", data=[0, 50]) folder1 = DataTree(name="folder1") folder1["results"] = da - print(folder1._variables) expected = da.rename("results") xrt.assert_equal(folder1["results"], expected) From b2c51aac5df4ce45bae516df10bbf9b082914743 Mon Sep 17 00:00:00 2001 From: Thomas Nicholas Date: Sun, 22 May 2022 16:12:33 -0400 Subject: [PATCH 12/43] attempt at copy methods --- datatree/datatree.py | 133 ++++++++++++++++++++++++++++---- datatree/tests/test_datatree.py | 75 ++++++++++++++++++ 2 files changed, 195 insertions(+), 13 deletions(-) diff --git a/datatree/datatree.py b/datatree/datatree.py index 1f691e67..e5974402 100644 --- a/datatree/datatree.py +++ b/datatree/datatree.py @@ -26,7 +26,7 @@ from xarray.core.merge import 
dataset_update_method from xarray.core.options import OPTIONS as XR_OPTS from xarray.core.utils import Default, Frozen, _default -from xarray.core.variable import Variable +from xarray.core.variable import Variable, calculate_dimensions from . import formatting, formatting_html from .mapping import TreeIsomorphismError, check_isomorphic, map_over_subtree @@ -118,15 +118,15 @@ class DataTree( # TODO all groupby classes _name: Optional[str] - _parent: Optional[Tree] - _children: OrderedDict[str, Tree] + _parent: Optional[DataTree] + _children: OrderedDict[str, DataTree] _attrs: Optional[Dict[Hashable, Any]] _cache: Dict[str, Any] _coord_names: Set[Hashable] _dims: Dict[Hashable, int] _encoding: Optional[Dict[Hashable, Any]] _close: Optional[Callable[[], None]] - _indexes: Optional[Dict[Hashable, Index]] + _indexes: Dict[Hashable, Index] _variables: Dict[Hashable, Variable] def __init__( @@ -160,16 +160,18 @@ def __init__( DataTree.from_dict """ + # validate input + if children is None: + children = {} + ds = _coerce_to_dataset(data) + _check_for_name_collisions(children, ds.variables) + # set tree attributes super().__init__(children=children) self.name = name self.parent = parent # set data attributes - ds = _coerce_to_dataset(data) - - _check_for_name_collisions(self.children, ds.variables) - self._replace( inplace=True, variables=ds._variables, @@ -225,6 +227,7 @@ def ds(self, data: Union[Dataset, DataArray] = None) -> None: attrs=ds._attrs, encoding=ds._encoding, ) + self._close = ds._close def _pre_attach(self: DataTree, parent: DataTree) -> None: """ @@ -348,6 +351,48 @@ def _repr_html_(self): return f"
{escape(repr(self))}
" return formatting_html.datatree_repr(self) + @classmethod + def _construct_direct( + cls, + variables: dict[Any, Variable], + coord_names: set[Hashable], + dims: dict[Any, int] = None, + attrs: dict = None, + indexes: dict[Any, Index] = None, + encoding: dict = None, + name: str | None = None, + parent: DataTree | None = None, + children: OrderedDict[str, DataTree] = None, + close: Callable[[], None] = None, + ) -> DataTree: + """Shortcut around __init__ for internal use when we want to skip + costly validation + """ + + # data attributes + if dims is None: + dims = calculate_dimensions(variables) + if indexes is None: + indexes = {} + if children is None: + children = OrderedDict() + + obj: DataTree = object.__new__(cls) + obj._variables = variables + obj._coord_names = coord_names + obj._dims = dims + obj._indexes = indexes + obj._attrs = attrs + obj._close = close + obj._encoding = encoding + + # tree attributes + obj._name = name + obj._children = children + obj._parent = parent + + return obj + def _replace( self: DataTree, variables: dict[Hashable, Variable] = None, @@ -356,6 +401,7 @@ def _replace( attrs: dict[Hashable, Any] | None | Default = _default, indexes: dict[Hashable, Index] = None, encoding: dict | None | Default = _default, + name: str | None | Default = _default, parent: DataTree | None = _default, children: OrderedDict[str, DataTree] = None, inplace: bool = False, @@ -382,6 +428,8 @@ def _replace( self._indexes = indexes if encoding is not _default: self._encoding = encoding + if name is not _default: + self._name = name if parent is not _default: self._parent = parent if children is not None: @@ -400,15 +448,74 @@ def _replace( indexes = self._indexes.copy() if encoding is _default: encoding = copy.copy(self._encoding) - if parent is not _default: - self._parent = parent.copy() - if children is not None: - self._children = children.copy() + if name is _default: + name = self._name # no need to copy str objects or None + if parent is _default: 
+ parent = copy.copy(self._parent) + if children is _default: + children = copy.copy(self._children) obj = self._construct_direct( - variables, coord_names, dims, attrs, indexes, encoding + variables, + coord_names, + dims, + attrs, + indexes, + encoding, + name, + parent, + children, ) return obj + def copy(self: DataTree, deep: bool = False, data: Mapping = None) -> DataTree: + """Returns a copy of this DataTree, including parents and children. + + If `deep=True`, a deep copy is made of each of the component variables in each node. + Otherwise, a shallow copy of each of the component variable is made, so + that the underlying memory region of the new dataset is the same as in + the original dataset. + + Use `data` to create a new tree object with the same structure as + original but entirely new data on this node. + + Parameters + ---------- + deep : bool, optional + Whether each component variable is loaded into memory and copied onto + the new object. Default is False. + data : dict-like, optional + Data to use in the new object. Each item in `data` must have same + shape as corresponding data variable in original. When `data` is + used, `deep` is ignored for the data variables and only used for + coords. + + Returns + ------- + object : DataTree + New object with dimensions, attributes, coordinates, name, encoding, + and optionally data copied from original. + """ + ds = self.to_dataset().copy(deep=deep, data=data) + parent = copy.deepcopy(self._parent) if deep else copy.copy(self._parent) + children = copy.deepcopy(self._children) if deep else copy.copy(self._children) + + # TODO should we have a "replace_data" method? 
+ return self._replace( + ds._variables, + attrs=ds._attrs, + indexes=ds._indexes, + parent=parent, + children=children, + ) + + def __copy__(self): + return self.copy(deep=False) + + def __deepcopy__(self, memo=None) -> DataTree: + # memo does nothing but is required for compatibility with + # copy.deepcopy + return self.copy(deep=True) + def get( self: DataTree, key: str, default: Optional[DataTree | DataArray] = None ) -> Optional[DataTree | DataArray]: diff --git a/datatree/tests/test_datatree.py b/datatree/tests/test_datatree.py index 3dbbbd22..6415f844 100644 --- a/datatree/tests/test_datatree.py +++ b/datatree/tests/test_datatree.py @@ -1,7 +1,12 @@ +from copy import copy, deepcopy + +import numpy as np import pytest import xarray as xr import xarray.testing as xrt +from xarray.tests import source_ndarray +import datatree.testing as dtt from datatree import DataTree @@ -209,6 +214,76 @@ def test_update_new_named_dataarray(self): xrt.assert_equal(folder1["results"], expected) +class TestCopy: + @pytest.mark.xfail(reason="bug when assigning dataarray to node") + def test_copy(self): + dt = create_test_datatree() + dt.attrs["Test"] = [1, 2, 3] + + for copied in [dt.copy(deep=False), copy(dt)]: + dtt.assert_identical(dt, copied) + + for node, copied_node in zip(dt.root.subtree, copied.root.subtree): + assert node.encoding == copied_node.encoding + # Note: IndexVariable objects with string dtype are always + # copied because of xarray.core.util.safe_cast_to_index. + # Limiting the test to data variables. 
+ # TODO use .data_vars once that property is available + data_vars = [v for v in node.variables if v not in node._coord_names] + for k in data_vars: + v0 = node.variables[k] + v1 = copied_node.variables[k] + assert source_ndarray(v0.data) is source_ndarray(v1.data) + copied_node["foo"] = xr.DataArray(data=np.arange(5), dims="z") + assert "foo" not in node + + copied_node.attrs["foo"] = "bar" + assert "foo" not in node.attrs + assert node.attrs["Test"] is copied_node.attrs["Test"] + + @pytest.mark.xfail(reason="bug causing recursion error") + def test_deepcopy(self): + dt = create_test_datatree() + dt.attrs["Test"] = [1, 2, 3] + + for copied in [dt.copy(deep=True), deepcopy(dt)]: + dtt.assert_identical(dt, copied) + + for node, copied_node in zip(dt.root.subtree, copied.root.subtree): + assert node.encoding == copied_node.encoding + # Note: IndexVariable objects with string dtype are always + # copied because of xarray.core.util.safe_cast_to_index. + # Limiting the test to data variables. + # TODO use .data_vars once that property is available + data_vars = [v for v in node.variables if v not in node._coord_names] + for k in data_vars: + v0 = node.variables[k] + v1 = copied_node.variables[k] + assert source_ndarray(v0.data) is source_ndarray(v1.data) + copied_node["foo"] = xr.DataArray(data=np.arange(5), dims="z") + assert "foo" not in node + + copied_node.attrs["foo"] = "bar" + assert "foo" not in node.attrs + assert node.attrs["Test"] is copied_node.attrs["Test"] + + def test_copy_with_data(self): + orig = create_test_datatree() + # TODO use .data_vars once that property is available + data_vars = { + k: v for k, v in orig.variables.items() if k not in orig._coord_names + } + new_data = {k: np.random.randn(*v.shape) for k, v in data_vars.items()} + actual = orig.copy(data=new_data) + + expected = orig.copy() + for k, v in new_data.items(): + expected[k].data = v + dtt.assert_identical(expected, actual) + + # TODO test parents and children? 
+ + class TestSetItem: def test_setitem_new_child_node(self): john = DataTree(name="john") From a20e85ffc378c7eb16fd72040a51ec1fce2329da Mon Sep 17 00:00:00 2001 From: Thomas Nicholas Date: Mon, 23 May 2022 17:00:33 -0400 Subject: [PATCH 13/43] rewrote implementation of .copy method --- datatree/datatree.py | 39 +++++++++++++-------------------- datatree/tests/test_datatree.py | 12 ++++++---- 2 files changed, 23 insertions(+), 28 deletions(-) diff --git a/datatree/datatree.py b/datatree/datatree.py index e5974402..c92b647c 100644 --- a/datatree/datatree.py +++ b/datatree/datatree.py @@ -467,46 +467,37 @@ def _replace( ) return obj - def copy(self: DataTree, deep: bool = False, data: Mapping = None) -> DataTree: - """Returns a copy of this DataTree, including parents and children. + def copy(self: DataTree, deep: bool = False) -> DataTree: + """Returns a copy of this DataTree. + + Copies all nodes in the tree, from the root down to all children in the subtree. If `deep=True`, a deep copy is made of each of the component variables in each node. Otherwise, a shallow copy of each of the component variable is made, so - that the underlying memory region of the new dataset is the same as in - the original dataset. - - Use `data` to create a new tree object with the same structure as - original but entirely new data on this node. + that the underlying memory region of the new datasets is the same as in + the original datasets. Parameters ---------- deep : bool, optional Whether each component variable is loaded into memory and copied onto the new object. Default is False. - data : dict-like, optional - Data to use in the new object. Each item in `data` must have same - shape as corresponding data variable in original. When `data` is - used, `deep` is ignored for the data variables and only used for - coords. Returns ------- object : DataTree New object with dimensions, attributes, coordinates, name, encoding, - and optionally data copied from original. 
+ and data copied from original. """ - ds = self.to_dataset().copy(deep=deep, data=data) - parent = copy.deepcopy(self._parent) if deep else copy.copy(self._parent) - children = copy.deepcopy(self._children) if deep else copy.copy(self._children) - # TODO should we have a "replace_data" method? - return self._replace( - ds._variables, - attrs=ds._attrs, - indexes=ds._indexes, - parent=parent, - children=children, - ) + # TODO add a "data" argument like Dataset.copy has? + # TODO should "data" be a dict of paths to datasets? + + copied_from_root = {} + for node in self.root.subtree: + copied_from_root[node.path] = node.to_dataset().copy(deep=deep) + + return DataTree.from_dict(copied_from_root, name=self.root.name) def __copy__(self): return self.copy(deep=False) diff --git a/datatree/tests/test_datatree.py b/datatree/tests/test_datatree.py index 6415f844..cdbc0708 100644 --- a/datatree/tests/test_datatree.py +++ b/datatree/tests/test_datatree.py @@ -215,15 +215,17 @@ def test_update_new_named_dataarray(self): class TestCopy: - @pytest.mark.xfail(reason="bug when assigning dataarray to node") def test_copy(self): dt = create_test_datatree() - dt.attrs["Test"] = [1, 2, 3] + + for node in dt.root.subtree: + node.attrs["Test"] = [1, 2, 3] for copied in [dt.copy(deep=False), copy(dt)]: dtt.assert_identical(dt, copied) for node, copied_node in zip(dt.root.subtree, copied.root.subtree): + assert node.encoding == copied_node.encoding # Note: IndexVariable objects with string dtype are always # copied because of xarray.core.util.safe_cast_to_index. 
@@ -241,10 +243,11 @@ def test_copy(self): assert "foo" not in node.attrs assert node.attrs["Test"] is copied_node.attrs["Test"] - @pytest.mark.xfail(reason="bug causing recursion error") def test_deepcopy(self): dt = create_test_datatree() - dt.attrs["Test"] = [1, 2, 3] + + for node in dt.root.subtree: + node.attrs["Test"] = [1, 2, 3] for copied in [dt.copy(deep=True), deepcopy(dt)]: dtt.assert_identical(dt, copied) @@ -267,6 +270,7 @@ def test_deepcopy(self): assert "foo" not in node.attrs assert node.attrs["Test"] is copied_node.attrs["Test"] + @pytest.mark.xfail(reason="data argument not yet implemented") def test_copy_with_data(self): orig = create_test_datatree() # TODO use .data_vars once that property is available From 8387a1cdb76ad2e2fa2af5448efdf641454b1dd5 Mon Sep 17 00:00:00 2001 From: Thomas Nicholas Date: Mon, 23 May 2022 17:06:24 -0400 Subject: [PATCH 14/43] xfailing test for deepcopying --- datatree/tests/test_datatree.py | 1 + 1 file changed, 1 insertion(+) diff --git a/datatree/tests/test_datatree.py b/datatree/tests/test_datatree.py index cdbc0708..acb05ec7 100644 --- a/datatree/tests/test_datatree.py +++ b/datatree/tests/test_datatree.py @@ -243,6 +243,7 @@ def test_copy(self): assert "foo" not in node.attrs assert node.attrs["Test"] is copied_node.attrs["Test"] + @pytest.mark.xfail(reason="unresolved bug with deepcopying") def test_deepcopy(self): dt = create_test_datatree() From 52ef23baaa4b6892cad2d69c61b43db831044630 Mon Sep 17 00:00:00 2001 From: Thomas Nicholas Date: Tue, 24 May 2022 15:19:47 -0400 Subject: [PATCH 15/43] pseudocode implementation of DatasetView --- datatree/datatree.py | 58 ++++++++++++++++++++++++++++++++++++++++---- 1 file changed, 53 insertions(+), 5 deletions(-) diff --git a/datatree/datatree.py b/datatree/datatree.py index c92b647c..c952c277 100644 --- a/datatree/datatree.py +++ b/datatree/datatree.py @@ -82,6 +82,50 @@ def _check_for_name_collisions( ) +class DatasetView(Dataset): + _wrapping_node: DataTree + + 
__slots__ = ["_wrapping_node"] + + @classmethod + def _from_node( + cls, + wrapping_node, + ) -> DatasetView: + """Constructor, using dataset attributes from wrapping node""" + + obj: DatasetView = object.__new__(cls) + obj._wrapping_node = wrapping_node + obj._variables = wrapping_node._variables + obj._coord_names = wrapping_node._coord_names + obj._dims = wrapping_node._dims + obj._indexes = wrapping_node._indexes + obj._attrs = wrapping_node._attrs + obj._close = wrapping_node._close + obj._encoding = wrapping_node._encoding + + return obj + + def __setitem__(self, key, val) -> None: + raise AttributeError( + "Mutation of the DatasetView is not allowed, please use __setitem__ on the wrapping DataTree node, " + "or use `DataTree.to_dataset()` if you want a mutable dataset" + ) + + def __getitem__(self, key) -> DataArray: + # calling the `_get_item` method of DataTree allows path-like access to contents of other nodes + obj = self._wrapping_node[key] + if isinstance(obj, DataArray): + return obj + else: + raise KeyError( + "DatasetView is only allowed to return variables, not entire DataTree nodes" + ) + + # all API that doesn't modify state in-place can just be inherited from Dataset + ... + + class DataTree( TreeNode, MappedDatasetMethodsMixin, @@ -201,15 +245,19 @@ def parent(self: DataTree) -> DataTree | None: @parent.setter def parent(self: DataTree, new_parent: DataTree) -> None: - if new_parent and self.name is None: + if new_parent is not None and self.name is None: raise ValueError("Cannot set an unnamed node as a child of another node") self._set_parent(new_parent, self.name) @property - def ds(self) -> Dataset: - """The data in this node, returned as a Dataset.""" - # TODO change this to return only an immutable view onto this node's data (see GH #80) - return self.to_dataset() + def ds(self) -> DatasetView: + """ + An immutable Dataset-like view onto the data in this node. 
+ + If you want a mutable Dataset containing the same data as in this node, + use `.to_dataset()` instead. + """ + return DatasetView._from_node(self) @ds.setter def ds(self, data: Union[Dataset, DataArray] = None) -> None: From 4a5317eb1e96971556651553937ce9fad8c49ea5 Mon Sep 17 00:00:00 2001 From: Thomas Nicholas Date: Tue, 24 May 2022 15:27:32 -0400 Subject: [PATCH 16/43] Revert "pseudocode implementation of DatasetView" This reverts commit 52ef23baaa4b6892cad2d69c61b43db831044630. --- datatree/datatree.py | 58 ++++---------------------------------------- 1 file changed, 5 insertions(+), 53 deletions(-) diff --git a/datatree/datatree.py b/datatree/datatree.py index c952c277..c92b647c 100644 --- a/datatree/datatree.py +++ b/datatree/datatree.py @@ -82,50 +82,6 @@ def _check_for_name_collisions( ) -class DatasetView(Dataset): - _wrapping_node: DataTree - - __slots__ = ["_wrapping_node"] - - @classmethod - def _from_node( - cls, - wrapping_node, - ) -> DatasetView: - """Constructor, using dataset attributes from wrapping node""" - - obj: DatasetView = object.__new__(cls) - obj._wrapping_node = wrapping_node - obj._variables = wrapping_node._variables - obj._coord_names = wrapping_node._coord_names - obj._dims = wrapping_node._dims - obj._indexes = wrapping_node._indexes - obj._attrs = wrapping_node._attrs - obj._close = wrapping_node._close - obj._encoding = wrapping_node._encoding - - return obj - - def __setitem__(self, key, val) -> None: - raise AttributeError( - "Mutation of the DatasetView is not allowed, please use __setitem__ on the wrapping DataTree node, " - "or use `DataTree.to_dataset()` if you want a mutable dataset" - ) - - def __getitem__(self, key) -> DataArray: - # calling the `_get_item` method of DataTree allows path-like access to contents of other nodes - obj = self._wrapping_node[key] - if isinstance(obj, DataArray): - return obj - else: - raise KeyError( - "DatasetView is only allowed to return variables, not entire DataTree nodes" - ) - - # 
all API that doesn't modify state in-place can just be inherited from Dataset - ... - - class DataTree( TreeNode, MappedDatasetMethodsMixin, @@ -245,19 +201,15 @@ def parent(self: DataTree) -> DataTree | None: @parent.setter def parent(self: DataTree, new_parent: DataTree) -> None: - if new_parent is not None and self.name is None: + if new_parent and self.name is None: raise ValueError("Cannot set an unnamed node as a child of another node") self._set_parent(new_parent, self.name) @property - def ds(self) -> DatasetView: - """ - An immutable Dataset-like view onto the data in this node. - - If you want a mutable Dataset containing the same data as in this node, - use `.to_dataset()` instead. - """ - return DatasetView._from_node(self) + def ds(self) -> Dataset: + """The data in this node, returned as a Dataset.""" + # TODO change this to return only an immutable view onto this node's data (see GH #80) + return self.to_dataset() @ds.setter def ds(self, data: Union[Dataset, DataArray] = None) -> None: From b60a4af319b62662db0251784c0b382faa1d28a6 Mon Sep 17 00:00:00 2001 From: Thomas Nicholas Date: Wed, 25 May 2022 17:13:45 -0400 Subject: [PATCH 17/43] removed duplicated implementation of copy --- datatree/datatree.py | 40 ---------------------------------------- datatree/ops.py | 3 +-- 2 files changed, 1 insertion(+), 42 deletions(-) diff --git a/datatree/datatree.py b/datatree/datatree.py index c92b647c..92a86300 100644 --- a/datatree/datatree.py +++ b/datatree/datatree.py @@ -467,46 +467,6 @@ def _replace( ) return obj - def copy(self: DataTree, deep: bool = False) -> DataTree: - """Returns a copy of this DataTree. - - Copies all nodes in the tree, from the root down to all children in the subtree. - - If `deep=True`, a deep copy is made of each of the component variables in each node. - Otherwise, a shallow copy of each of the component variable is made, so - that the underlying memory region of the new datasets is the same as in - the original datasets. 
- - Parameters - ---------- - deep : bool, optional - Whether each component variable is loaded into memory and copied onto - the new object. Default is False. - - Returns - ------- - object : DataTree - New object with dimensions, attributes, coordinates, name, encoding, - and data copied from original. - """ - - # TODO add a "data" argument like Dataset.copy has? - # TODO should "data" be a dict of paths to datasets? - - copied_from_root = {} - for node in self.root.subtree: - copied_from_root[node.path] = node.to_dataset().copy(deep=deep) - - return DataTree.from_dict(copied_from_root, name=self.root.name) - - def __copy__(self): - return self.copy(deep=False) - - def __deepcopy__(self, memo=None) -> DataTree: - # memo does nothing but is required for compatibility with - # copy.deepcopy - return self.copy(deep=True) - def get( self: DataTree, key: str, default: Optional[DataTree | DataArray] = None ) -> Optional[DataTree | DataArray]: diff --git a/datatree/ops.py b/datatree/ops.py index ee55ccfe..e4fa211a 100644 --- a/datatree/ops.py +++ b/datatree/ops.py @@ -30,8 +30,8 @@ "map_blocks", ] _DATASET_METHODS_TO_MAP = [ - "copy", "as_numpy", + "copy", "__copy__", "__deepcopy__", "set_coords", @@ -245,7 +245,6 @@ class MappedDataWithCoords: """ # TODO add mapped versions of groupby, weighted, rolling, rolling_exp, coarsen, resample - # TODO re-implement AttrsAccessMixin stuff so that it includes access to child nodes _wrap_then_attach_to_cls( target_cls_dict=vars(), source_cls=Dataset, From 3077bf739fc2d41b8d7dd0fcf4165ee11d574cfd Mon Sep 17 00:00:00 2001 From: Thomas Nicholas Date: Wed, 25 May 2022 17:15:02 -0400 Subject: [PATCH 18/43] reorganise API docs --- datatree/__init__.py | 2 +- docs/source/api.rst | 280 ++++++++++++++++++++++++++++--------------- 2 files changed, 187 insertions(+), 95 deletions(-) diff --git a/datatree/__init__.py b/datatree/__init__.py index d799dc02..58b65aec 100644 --- a/datatree/__init__.py +++ b/datatree/__init__.py @@ -6,7 +6,7 @@ # 
import public API from .datatree import DataTree from .io import open_datatree -from .mapping import map_over_subtree +from .mapping import TreeIsomorphismError, map_over_subtree try: __version__ = get_distribution(__name__).version diff --git a/docs/source/api.rst b/docs/source/api.rst index 5cd16466..3033df48 100644 --- a/docs/source/api.rst +++ b/docs/source/api.rst @@ -18,6 +18,8 @@ Creating a DataTree Tree Attributes --------------- +Attributes relating to the recursive tree-like structure of a ``DataTree``. + .. autosummary:: :toctree: generated/ @@ -34,34 +36,40 @@ Tree Attributes DataTree.ancestors DataTree.groups -Data Attributes ---------------- +Data Contents +------------- + +Interface to the data objects (optionally) stored inside a single ``DataTree`` node. +This interface echoes that of ``xarray.Dataset``. .. autosummary:: :toctree: generated/ DataTree.dims - DataTree.variables - DataTree.encoding DataTree.sizes + DataTree.data_vars + DataTree.coords DataTree.attrs + DataTree.encoding DataTree.indexes - DataTree.xindexes - DataTree.coords DataTree.chunks + DataTree.nbytes DataTree.ds + DataTree.to_dataset DataTree.has_data DataTree.has_attrs DataTree.is_empty .. - Missing - DataTree.chunksizes + Missing: + ``DataTree.chunksizes`` Dictionary interface -------------------- +``DataTree`` objects also have a dict-like interface mapping keys to either ``xarray.DataArray``s or to child ``DataTree`` nodes. + .. autosummary:: :toctree: generated/ @@ -73,13 +81,15 @@ Dictionary interface .. - Missing - DataTree.items - DataTree.keys - DataTree.values + Missing: + ``DataTree.items`` + ``DataTree.keys`` + ``DataTree.values`` + +Tree Manipulation +----------------- -Tree Manipulation Methods -------------------------- +For manipulating, traversing, navigating, or mapping over the tree structure. .. 
autosummary:: :toctree: generated/ @@ -89,123 +99,173 @@ Tree Manipulation Methods DataTree.relative_to DataTree.iter_lineage DataTree.find_common_ancestor + map_over_subtree -Tree Manipulation Utilities ---------------------------- +DataTree Contents +----------------- + +Manipulate the contents of a single DataTree node. .. autosummary:: :toctree: generated/ - map_over_subtree + DataTree.copy + DataTree.assign + DataTree.assign_coords + DataTree.merge + DataTree.rename + DataTree.rename_vars + DataTree.rename_dims + DataTree.swap_dims + DataTree.expand_dims + DataTree.drop_vars + DataTree.drop_duplicates + DataTree.drop_dims + DataTree.set_coords + DataTree.reset_coords + DataTree.convert_calendar + DataTree.interp_calendar -Methods -------- -.. +Comparisons +=========== - TODO divide these up into "Dataset contents", "Indexing", "Computation" etc. +Compare one ``DataTree`` object to another. + +.. autosummary:: + :toctree: generated/ + + DataTree.isomorphic + DataTree.equals + DataTree.identical + +Indexing +======== + +Index into each and every node of a tree. .. 
autosummary:: :toctree: generated/ - DataTree.load - DataTree.compute - DataTree.persist - DataTree.unify_chunks - DataTree.chunk - DataTree.map_blocks - DataTree.copy - DataTree.as_numpy - DataTree.__copy__ - DataTree.__deepcopy__ - DataTree.set_coords - DataTree.reset_coords - DataTree.info DataTree.isel DataTree.sel + DataTree.drop_sel + DataTree.drop_isel DataTree.head DataTree.tail DataTree.thin - DataTree.broadcast_like - DataTree.reindex_like - DataTree.reindex + DataTree.squeeze DataTree.interp DataTree.interp_like - DataTree.rename - DataTree.rename_dims - DataTree.rename_vars - DataTree.swap_dims - DataTree.expand_dims + DataTree.reindex + DataTree.reindex_like DataTree.set_index DataTree.reset_index DataTree.reorder_levels - DataTree.stack - DataTree.unstack - DataTree.update - DataTree.merge - DataTree.drop_vars - DataTree.drop_sel - DataTree.drop_isel - DataTree.drop_dims - DataTree.isomorphic - DataTree.equals - DataTree.identical - DataTree.transpose + DataTree.query + +.. + + Missing: + ``DataTree.loc`` + + +Missing Value Handling +====================== + +.. 
autosummary:: + :toctree: generated/ + + DataTree.isnull + DataTree.notnull + DataTree.combine_first DataTree.dropna DataTree.fillna - DataTree.interpolate_na DataTree.ffill DataTree.bfill - DataTree.combine_first - DataTree.reduce - DataTree.map - DataTree.assign - DataTree.diff - DataTree.shift - DataTree.roll - DataTree.sortby - DataTree.quantile - DataTree.rank - DataTree.differentiate - DataTree.integrate - DataTree.cumulative_integrate - DataTree.filter_by_attrs - DataTree.polyfit - DataTree.pad - DataTree.idxmin - DataTree.idxmax - DataTree.argmin - DataTree.argmax - DataTree.query - DataTree.curvefit - DataTree.squeeze - DataTree.clip - DataTree.assign_coords + DataTree.interpolate_na DataTree.where - DataTree.close - DataTree.isnull - DataTree.notnull DataTree.isin - DataTree.astype -Comparisons +Computation +=========== + +Apply a computation to the data in each and every node of a tree. + +.. autosummary:: + :toctree: generated/ + + Dataset.map + Dataset.reduce + Dataset.diff + Dataset.quantile + Dataset.differentiate + Dataset.integrate + Dataset.map_blocks + Dataset.polyfit + Dataset.curvefit + +Aggregation =========== +Aggregate data in each and every node, creating a new tree. + .. autosummary:: :toctree: generated/ - testing.assert_isomorphic - testing.assert_equal - testing.assert_identical + Dataset.all + Dataset.any + Dataset.argmax + Dataset.argmin + Dataset.idxmax + Dataset.idxmin + Dataset.max + Dataset.min + Dataset.mean + Dataset.median + Dataset.prod + Dataset.sum + Dataset.std + Dataset.var + Dataset.cumsum + Dataset.cumprod ndarray methods ---------------- +=============== + +Methods copied from `np.ndarray` objects, here applying to the data in each and every node of the tree. .. 
autosummary:: :toctree: generated/ - DataTree.nbytes - DataTree.real + DataTree.argsort + DataTree.astype + DataTree.clip + DataTree.conj + DataTree.conjugate DataTree.imag + DataTree.round + DataTree.real + DataTree.rank + +Reshaping and reorganising +========================== + +Reshape or reorganise the data in each and every node of a tree. + +.. autosummary:: + :toctree: generated/ + + DataTree.transpose + DataTree.stack + DataTree.unstack + DataTree.shift + DataTree.roll + DataTree.pad + DataTree.sortby + DataTree.broadcast_like + +Plotting +======== I/O === @@ -221,14 +281,46 @@ I/O .. - Missing - open_mfdatatree + Missing: + ``open_mfdatatree`` + +Tutorial +======== + +Testing +======= + +Test that two DataTree objects are similar. + +.. autosummary:: + :toctree: generated/ + + testing.assert_isomorphic + testing.assert_equal + testing.assert_identical Exceptions ========== +Exceptions raised when manipulating trees. + .. autosummary:: :toctree: generated/ - TreeError TreeIsomorphismError + +Advanced API +============ + +Relatively advanced API for users or developers looking to understand the internals, or extend functionality. + +.. autosummary:: + :toctree: generated/ + + DataTree.variables + +.. + + Missing: + ``DataTree.set_close`` + ``register_datatree_accessor`` From 5368f8b561fe43e0d431897a3c453d4a5210f71b Mon Sep 17 00:00:00 2001 From: Thomas Nicholas Date: Thu, 26 May 2022 12:30:49 -0400 Subject: [PATCH 19/43] expose data_vars, coords etc. 
properties --- datatree/datatree.py | 41 ++++++++++++++++++++++++++++++++++------- 1 file changed, 34 insertions(+), 7 deletions(-) diff --git a/datatree/datatree.py b/datatree/datatree.py index 92a86300..669668c3 100644 --- a/datatree/datatree.py +++ b/datatree/datatree.py @@ -20,9 +20,12 @@ Union, ) -from xarray import DataArray, Dataset +import pandas as pd from xarray.core import utils -from xarray.core.indexes import Index +from xarray.core.coordinates import DatasetCoordinates +from xarray.core.dataarray import DataArray +from xarray.core.dataset import Dataset, DataVariables +from xarray.core.indexes import Index, Indexes from xarray.core.merge import dataset_update_method from xarray.core.options import OPTIONS as XR_OPTS from xarray.core.utils import Default, Frozen, _default @@ -645,11 +648,35 @@ def nbytes(self) -> int: return sum(node.ds.nbytes if node.has_data else 0 for node in self.subtree) def __len__(self) -> int: - if self.children: - n_children = len(self.children) - else: - n_children = 0 - return n_children + len(self.ds) + return len(self.children) + len(self.ds) + + @property + def indexes(self) -> Indexes[pd.Index]: + """Mapping of pandas.Index objects used for label based indexing. + Raises an error if this DataTree node has indexes that cannot be coerced + to pandas.Index objects. 
+ See Also / + -------- + DataTree.xindexes + """ + return self.xindexes.to_pandas_indexes() + + @property + def xindexes(self) -> Indexes[Index]: + """Mapping of xarray Index objects used for label based indexing.""" + return Indexes(self._indexes, {k: self._variables[k] for k in self._indexes}) + + @property + def coords(self) -> DatasetCoordinates: + """Dictionary of xarray.DataArray objects corresponding to coordinate + variables + """ + return DatasetCoordinates(self.to_dataset()) + + @property + def data_vars(self) -> DataVariables: + """Dictionary of DataArray objects corresponding to data variables""" + return DataVariables(self.to_dataset()) def isomorphic( self, From cae0a4efc70d6c68d45d4c74a5569a1fcfbb52fb Mon Sep 17 00:00:00 2001 From: Thomas Nicholas Date: Thu, 26 May 2022 15:00:54 -0400 Subject: [PATCH 20/43] try except with calculate_dimensions private import --- datatree/datatree.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/datatree/datatree.py b/datatree/datatree.py index 669668c3..0fa80afa 100644 --- a/datatree/datatree.py +++ b/datatree/datatree.py @@ -29,7 +29,7 @@ from xarray.core.merge import dataset_update_method from xarray.core.options import OPTIONS as XR_OPTS from xarray.core.utils import Default, Frozen, _default -from xarray.core.variable import Variable, calculate_dimensions +from xarray.core.variable import Variable from . 
import formatting, formatting_html from .mapping import TreeIsomorphismError, check_isomorphic, map_over_subtree @@ -41,6 +41,12 @@ from .render import RenderTree from .treenode import NodePath, Tree, TreeNode +try: + from xarray.core.variable import calculate_dimensions +except ImportError: + # for xarray versions 2022.03.0 and earlier + from xarray.core.dataset import calculate_dimensions + if TYPE_CHECKING: from xarray.core.merge import CoercibleValue From 72af61cfc4d7b5d387841c0c16a1e6ce8ca1fe56 Mon Sep 17 00:00:00 2001 From: Thomas Nicholas Date: Fri, 27 May 2022 11:52:37 -0400 Subject: [PATCH 21/43] add keys/values/items methods --- datatree/datatree.py | 10 ++++++---- datatree/ops.py | 1 - docs/source/api.rst | 32 ++++++++++++++++---------------- 3 files changed, 22 insertions(+), 21 deletions(-) diff --git a/datatree/datatree.py b/datatree/datatree.py index 0fa80afa..a234f06a 100644 --- a/datatree/datatree.py +++ b/datatree/datatree.py @@ -1,6 +1,7 @@ from __future__ import annotations import copy +import itertools from collections import OrderedDict from html import escape from typing import ( @@ -97,6 +98,7 @@ class DataTree( MappedDataWithCoords, DataTreeArithmeticMixin, Generic[Tree], + Mapping, ): """ A tree-like hierarchical collection of xarray objects. @@ -340,13 +342,13 @@ def __contains__(self, key: object) -> bool: """The 'in' operator will return true or false depending on whether 'key' is either an array stored in the datatree or a child node, or neither. 
""" - return key in self._variables or key in self.children + return key in self.variables or key in self.children def __bool__(self) -> bool: - return bool(self.ds.data_vars) + return bool(self.ds.data_vars) or bool(self.children) def __iter__(self) -> Iterator[Hashable]: - return iter(self.ds.data_vars) + return itertools.chain(self.ds.data_vars, self.children) def __repr__(self) -> str: return formatting.datatree_repr(self) @@ -654,7 +656,7 @@ def nbytes(self) -> int: return sum(node.ds.nbytes if node.has_data else 0 for node in self.subtree) def __len__(self) -> int: - return len(self.children) + len(self.ds) + return len(self.children) + len(self.data_vars) @property def indexes(self) -> Indexes[pd.Index]: diff --git a/datatree/ops.py b/datatree/ops.py index e4fa211a..bdc931c9 100644 --- a/datatree/ops.py +++ b/datatree/ops.py @@ -57,7 +57,6 @@ "reorder_levels", "stack", "unstack", - "update", "merge", "drop_vars", "drop_sel", diff --git a/docs/source/api.rst b/docs/source/api.rst index 3033df48..5cc45679 100644 --- a/docs/source/api.rst +++ b/docs/source/api.rst @@ -78,13 +78,9 @@ Dictionary interface DataTree.__delitem__ DataTree.update DataTree.get - -.. - - Missing: - ``DataTree.items`` - ``DataTree.keys`` - ``DataTree.values`` + DataTree.items + DataTree.keys + DataTree.values Tree Manipulation ----------------- @@ -104,7 +100,7 @@ For manipulating, traversing, navigating, or mapping over the tree structure. DataTree Contents ----------------- -Manipulate the contents of a single DataTree node. +Manipulate the contents of all nodes in a tree simultaneously. .. autosummary:: :toctree: generated/ @@ -119,12 +115,14 @@ Manipulate the contents of a single DataTree node. 
DataTree.swap_dims DataTree.expand_dims DataTree.drop_vars - DataTree.drop_duplicates DataTree.drop_dims DataTree.set_coords DataTree.reset_coords - DataTree.convert_calendar - DataTree.interp_calendar + +DataTree Node Contents +---------------------- + +Manipulate the contents of a single DataTree node. Comparisons =========== @@ -141,7 +139,7 @@ Compare one ``DataTree`` object to another. Indexing ======== -Index into each and every node of a tree. +Index into all nodes in the subtree simultaneously. .. autosummary:: :toctree: generated/ @@ -189,7 +187,7 @@ Missing Value Handling Computation =========== -Apply a computation to the data in each and every node of a tree. +Apply a computation to the data in all nodes in the subtree simultaneously. .. autosummary:: :toctree: generated/ @@ -207,7 +205,7 @@ Apply a computation to the data in each and every node of a tree. Aggregation =========== -Aggregate data in each and every node, creating a new tree. +Aggregate data in all nodes in the subtree simultaneously. .. autosummary:: :toctree: generated/ @@ -232,7 +230,7 @@ Aggregate data in each and every node, creating a new tree. ndarray methods =============== -Methods copied from `np.ndarray` objects, here applying to the data in each and every node of the tree. +Methods copied from `np.ndarray` objects, here applying to the data in all nodes in the subtree. .. autosummary:: :toctree: generated/ @@ -250,7 +248,7 @@ Methods copied from `np.ndarray` objects, here applying to the data in each and Reshaping and reorganising ========================== -Reshape or reorganise the data in each and every node of a tree. +Reshape or reorganise the data in all nodes in the subtree. .. autosummary:: :toctree: generated/ @@ -270,6 +268,8 @@ Plotting I/O === +Create or + .. 
autosummary:: :toctree: generated/ From ec110721839bbd4175e47d57dc78b53ec03c4657 Mon Sep 17 00:00:00 2001 From: Thomas Nicholas Date: Fri, 27 May 2022 11:58:11 -0400 Subject: [PATCH 22/43] don't use has_data when .variables would do --- datatree/datatree.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/datatree/datatree.py b/datatree/datatree.py index a234f06a..8dbf69f7 100644 --- a/datatree/datatree.py +++ b/datatree/datatree.py @@ -246,7 +246,7 @@ def _pre_attach(self: DataTree, parent: DataTree) -> None: children with duplicate names (or a data variable with the same name as a child). """ super()._pre_attach(parent) - if parent.has_data and self.name in list(parent.ds.variables): + if self.name in list(parent.ds.variables): raise KeyError( f"parent {parent.name} already contains a data variable named {self.name}" ) @@ -653,7 +653,7 @@ def to_dict(self) -> Dict[str, Any]: @property def nbytes(self) -> int: - return sum(node.ds.nbytes if node.has_data else 0 for node in self.subtree) + return sum(node.to_dataset().nbytes for node in self.subtree) def __len__(self) -> int: return len(self.children) + len(self.data_vars) From 7c2c4f882be8b53fc7f961e9e7a369edfde711a2 Mon Sep 17 00:00:00 2001 From: Thomas Nicholas Date: Tue, 31 May 2022 16:16:05 -0400 Subject: [PATCH 23/43] explanation of basic properties --- docs/source/data-structures.rst | 78 +++++++++++++++++++++++++++++++++ 1 file changed, 78 insertions(+) create mode 100644 docs/source/data-structures.rst diff --git a/docs/source/data-structures.rst b/docs/source/data-structures.rst new file mode 100644 index 00000000..c3ed33b6 --- /dev/null +++ b/docs/source/data-structures.rst @@ -0,0 +1,78 @@ +.. _data structures: + +Data Structures +=============== + +.. ipython:: python + :suppress: + + import numpy as np + import pandas as pd + import xarray as xr + import datatree + + np.random.seed(123456) + np.set_printoptions(threshold=10) + +.. 
note:: + + This page builds on the information given in xarray's main page on + `data structures `_, so it is suggested that you + are familiar with those first. + +DataTree +-------- + +:py:class:``DataTree`` is xarray's highest-level data structure, able to organise heterogeneous data which +could not be stored inside a single ``Dataset`` object. This includes representing the recursive structure of multiple +`groups`_ within a netCDF file or `Zarr Store`_. + +.. _groups: https://www.unidata.ucar.edu/software/netcdf/workshops/2011/groups-types/GroupsIntro.html +.. _Zarr Store: https://zarr.readthedocs.io/en/stable/tutorial.html#groups + +Each ``DataTree`` object (or "node") contains the same data that a single ``xarray.Dataset`` would (i.e. ``DataArray`` objects +stored under hashable keys), and so has the same key properties: + +- ``dims``: a dictionary mapping of dimension names to lengths, for the variables in this node, +- ``data_vars``: a dict-like container of DataArrays corresponding to variables in this node, +- ``coords``: another dict-like container of DataArrays, corresponding to coordinate variables in this node, +- ``attrs``: dict to hold arbitary metadata relevant to data in this node. + +A single ``DataTree`` object acts much like a single ``Dataset`` object, and has a similar set of dict-like methods +defined upon it. However, ``DataTree``'s can also contain other ``DataTree`` objects, so they can be thought of as nested dict-like +containers of both ``xarray.DataArray``'s and ``DataTree``'s. + +A single datatree object is known as a "node", and its position relative to other nodes is defined by two more key +properties: + +- ``children``: An ordered dictionary mapping from names to other ``DataTree`` objects, known as its' "child nodes". +- ``parent``: The single ``DataTree`` object whose children this datatree is a member of, known as its' "parent node". 
+ +Each child automatically knows about its parent node, and a node without a parent is known as a "root" node +(represented by the ``parent`` attribute pointing to ``None``). +Nodes can have multiple children, but as each child node has at most one parent, there can only ever be one root node in a given tree. + +The overall structure is technically a `connected acyclic undirected graph`, otherwise known as a +`"Tree" `_. + +.. note:: + + Technically a ``DataTree`` with more than one child node forms an `"Ordered Tree" `_, + because the children are stored in an Ordered Dictionary. However, this distinction only really matters for a few + edge cases involving operations on multiple trees simultaneously, and can safely be ignored by most users. + + +``DataTree`` objects can also optionally have a ``name`` as well as ``attrs``, just like a ``DataArray``. +Again these are not used unless explicitly accessed by the user. + + +Creating a DataTree +~~~~~~~~~~~~~~~~~~~ + +Navigating the Tree +~~~~~~~~~~~~~~~~~~~ + +Root, ancestors, parent, children, leaves, file-like access + +Mapping Operations Over the Tree +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ From 66b7adfd169e63be6792f90b67221463e104b2ac Mon Sep 17 00:00:00 2001 From: Thomas Nicholas Date: Tue, 31 May 2022 16:16:19 -0400 Subject: [PATCH 24/43] add data structures page to index --- docs/source/index.rst | 1 + 1 file changed, 1 insertion(+) diff --git a/docs/source/index.rst b/docs/source/index.rst index 76ed72be..f3e12e09 100644 --- a/docs/source/index.rst +++ b/docs/source/index.rst @@ -11,6 +11,7 @@ Datatree Installation Quick Overview Tutorial + Data Model API Reference How do I ... 
Contributing Guide From b61e94038dc22d10dc3523420d58ad67c9bb1e81 Mon Sep 17 00:00:00 2001 From: Thomas Nicholas Date: Tue, 31 May 2022 16:19:10 -0400 Subject: [PATCH 25/43] revert adding documentation in favour of that going in a different PR --- docs/source/data-structures.rst | 78 --------------------------------- docs/source/index.rst | 1 - 2 files changed, 79 deletions(-) delete mode 100644 docs/source/data-structures.rst diff --git a/docs/source/data-structures.rst b/docs/source/data-structures.rst deleted file mode 100644 index c3ed33b6..00000000 --- a/docs/source/data-structures.rst +++ /dev/null @@ -1,78 +0,0 @@ -.. _data structures: - -Data Structures -=============== - -.. ipython:: python - :suppress: - - import numpy as np - import pandas as pd - import xarray as xr - import datatree - - np.random.seed(123456) - np.set_printoptions(threshold=10) - -.. note:: - - This page builds on the information given in xarray's main page on - `data structures `_, so it is suggested that you - are familiar with those first. - -DataTree --------- - -:py:class:``DataTree`` is xarray's highest-level data structure, able to organise heterogeneous data which -could not be stored inside a single ``Dataset`` object. This includes representing the recursive structure of multiple -`groups`_ within a netCDF file or `Zarr Store`_. - -.. _groups: https://www.unidata.ucar.edu/software/netcdf/workshops/2011/groups-types/GroupsIntro.html -.. _Zarr Store: https://zarr.readthedocs.io/en/stable/tutorial.html#groups - -Each ``DataTree`` object (or "node") contains the same data that a single ``xarray.Dataset`` would (i.e. 
``DataArray`` objects -stored under hashable keys), and so has the same key properties: - -- ``dims``: a dictionary mapping of dimension names to lengths, for the variables in this node, -- ``data_vars``: a dict-like container of DataArrays corresponding to variables in this node, -- ``coords``: another dict-like container of DataArrays, corresponding to coordinate variables in this node, -- ``attrs``: dict to hold arbitary metadata relevant to data in this node. - -A single ``DataTree`` object acts much like a single ``Dataset`` object, and has a similar set of dict-like methods -defined upon it. However, ``DataTree``'s can also contain other ``DataTree`` objects, so they can be thought of as nested dict-like -containers of both ``xarray.DataArray``'s and ``DataTree``'s. - -A single datatree object is known as a "node", and its position relative to other nodes is defined by two more key -properties: - -- ``children``: An ordered dictionary mapping from names to other ``DataTree`` objects, known as its' "child nodes". -- ``parent``: The single ``DataTree`` object whose children this datatree is a member of, known as its' "parent node". - -Each child automatically knows about its parent node, and a node without a parent is known as a "root" node -(represented by the ``parent`` attribute pointing to ``None``). -Nodes can have multiple children, but as each child node has at most one parent, there can only ever be one root node in a given tree. - -The overall structure is technically a `connected acyclic undirected graph`, otherwise known as a -`"Tree" `_. - -.. note:: - - Technically a ``DataTree`` with more than one child node forms an `"Ordered Tree" `_, - because the children are stored in an Ordered Dictionary. However, this distinction only really matters for a few - edge cases involving operations on multiple trees simultaneously, and can safely be ignored by most users. 
- - -``DataTree`` objects can also optionally have a ``name`` as well as ``attrs``, just like a ``DataArray``. -Again these are not used unless explicitly accessed by the user. - - -Creating a DataTree -~~~~~~~~~~~~~~~~~~~ - -Navigating the Tree -~~~~~~~~~~~~~~~~~~~ - -Root, ancestors, parent, children, leaves, file-like access - -Mapping Operations Over the Tree -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ diff --git a/docs/source/index.rst b/docs/source/index.rst index f3e12e09..76ed72be 100644 --- a/docs/source/index.rst +++ b/docs/source/index.rst @@ -11,7 +11,6 @@ Datatree Installation Quick Overview Tutorial - Data Model API Reference How do I ... Contributing Guide From 163e54d68913921b169bee0028d3e857c60bb5e5 Mon Sep 17 00:00:00 2001 From: Thomas Nicholas Date: Tue, 31 May 2022 16:16:05 -0400 Subject: [PATCH 26/43] explanation of basic properties --- docs/source/data-structures.rst | 78 +++++++++++++++++++++++++++++++++ 1 file changed, 78 insertions(+) create mode 100644 docs/source/data-structures.rst diff --git a/docs/source/data-structures.rst b/docs/source/data-structures.rst new file mode 100644 index 00000000..c3ed33b6 --- /dev/null +++ b/docs/source/data-structures.rst @@ -0,0 +1,78 @@ +.. _data structures: + +Data Structures +=============== + +.. ipython:: python + :suppress: + + import numpy as np + import pandas as pd + import xarray as xr + import datatree + + np.random.seed(123456) + np.set_printoptions(threshold=10) + +.. note:: + + This page builds on the information given in xarray's main page on + `data structures `_, so it is suggested that you + are familiar with those first. + +DataTree +-------- + +:py:class:``DataTree`` is xarray's highest-level data structure, able to organise heterogeneous data which +could not be stored inside a single ``Dataset`` object. This includes representing the recursive structure of multiple +`groups`_ within a netCDF file or `Zarr Store`_. + +.. 
_groups: https://www.unidata.ucar.edu/software/netcdf/workshops/2011/groups-types/GroupsIntro.html +.. _Zarr Store: https://zarr.readthedocs.io/en/stable/tutorial.html#groups + +Each ``DataTree`` object (or "node") contains the same data that a single ``xarray.Dataset`` would (i.e. ``DataArray`` objects +stored under hashable keys), and so has the same key properties: + +- ``dims``: a dictionary mapping of dimension names to lengths, for the variables in this node, +- ``data_vars``: a dict-like container of DataArrays corresponding to variables in this node, +- ``coords``: another dict-like container of DataArrays, corresponding to coordinate variables in this node, +- ``attrs``: dict to hold arbitary metadata relevant to data in this node. + +A single ``DataTree`` object acts much like a single ``Dataset`` object, and has a similar set of dict-like methods +defined upon it. However, ``DataTree``'s can also contain other ``DataTree`` objects, so they can be thought of as nested dict-like +containers of both ``xarray.DataArray``'s and ``DataTree``'s. + +A single datatree object is known as a "node", and its position relative to other nodes is defined by two more key +properties: + +- ``children``: An ordered dictionary mapping from names to other ``DataTree`` objects, known as its' "child nodes". +- ``parent``: The single ``DataTree`` object whose children this datatree is a member of, known as its' "parent node". + +Each child automatically knows about its parent node, and a node without a parent is known as a "root" node +(represented by the ``parent`` attribute pointing to ``None``). +Nodes can have multiple children, but as each child node has at most one parent, there can only ever be one root node in a given tree. + +The overall structure is technically a `connected acyclic undirected graph`, otherwise known as a +`"Tree" `_. + +.. 
note:: + + Technically a ``DataTree`` with more than one child node forms an `"Ordered Tree" `_, + because the children are stored in an Ordered Dictionary. However, this distinction only really matters for a few + edge cases involving operations on multiple trees simultaneously, and can safely be ignored by most users. + + +``DataTree`` objects can also optionally have a ``name`` as well as ``attrs``, just like a ``DataArray``. +Again these are not used unless explicitly accessed by the user. + + +Creating a DataTree +~~~~~~~~~~~~~~~~~~~ + +Navigating the Tree +~~~~~~~~~~~~~~~~~~~ + +Root, ancestors, parent, children, leaves, file-like access + +Mapping Operations Over the Tree +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ From ab0dfe14e672b06b67d678de2820cda877f8f221 Mon Sep 17 00:00:00 2001 From: Thomas Nicholas Date: Tue, 31 May 2022 16:16:19 -0400 Subject: [PATCH 27/43] add data structures page to index --- docs/source/index.rst | 1 + 1 file changed, 1 insertion(+) diff --git a/docs/source/index.rst b/docs/source/index.rst index 76ed72be..f3e12e09 100644 --- a/docs/source/index.rst +++ b/docs/source/index.rst @@ -11,6 +11,7 @@ Datatree Installation Quick Overview Tutorial + Data Model API Reference How do I ... Contributing Guide From 5c36b18aa7b56811f12d904a4f5840c9ba90c49e Mon Sep 17 00:00:00 2001 From: Thomas Nicholas Date: Tue, 31 May 2022 20:25:25 -0400 Subject: [PATCH 28/43] create tree node-by-node --- docs/source/data-structures.rst | 82 ++++++++++++++++++++++++++++++++- 1 file changed, 80 insertions(+), 2 deletions(-) diff --git a/docs/source/data-structures.rst b/docs/source/data-structures.rst index c3ed33b6..133f87f3 100644 --- a/docs/source/data-structures.rst +++ b/docs/source/data-structures.rst @@ -52,7 +52,7 @@ Each child automatically knows about its parent node, and a node without a paren (represented by the ``parent`` attribute pointing to ``None``). 
Nodes can have multiple children, but as each child node has at most one parent, there can only ever be one root node in a given tree. -The overall structure is technically a `connected acyclic undirected graph`, otherwise known as a +The overall structure is technically a `connected acyclic undirected rooted graph`, otherwise known as a `"Tree" `_. .. note:: @@ -63,12 +63,90 @@ The overall structure is technically a `connected acyclic undirected graph`, oth ``DataTree`` objects can also optionally have a ``name`` as well as ``attrs``, just like a ``DataArray``. -Again these are not used unless explicitly accessed by the user. +Again these are not normally used unless explicitly accessed by the user. Creating a DataTree ~~~~~~~~~~~~~~~~~~~ +There are two ways to create a ``DataTree`` from scratch. The first is to create each node individually, +specifying the nodes' relationship to one another as you create each one. + +The ``DataTree`` constructor takes: + +- ``data``: The data that will be stored in this node, represented by a single ``xarray.Dataset``, or a named ``xarray.DataArray``. +- ``parent``: The parent node (if there is one), given as a ``DataTree`` object. +- ``children``: The various child nodes (if there are any), given as a mapping from string keys to ``DataTree`` objects. +- ``name``: A string to use as the name of this node. + +Let's make a datatree node without anything in it: + +.. ipython:: python + + from datatree import DataTree + + # create root node + node1 = DataTree(name="Oak") + + node1 + +At this point our node is also the root node, as every tree has a root node. + +We can add a second node to this tree either by referring to the first node in the constructor of the second: + +.. ipython:: python + + # add a child by referring to the parent node + node2 = DataTree(name="Little Bonsai", parent=node1) + +or by dynamically updating the attributes of one node to refer to another: + +.. 
ipython:: python + + # add a grandparent by updating the .parent property of an existing node + node0 = DataTree(name="General Sherman") + node1.parent = node0 + +Our tree now has a new root: + +.. ipython:: python + + node2.root + +Is is at tree construction time that consistency checks are enforced. For instance, if we try to create a `cycle` the constructor will raise an error: + +.. ipython:: python + :okexcept: + + node0.parent = node2 + +The second way is to build the tree from a dictionary of filesystem-like paths and corresponding ``xarray.Dataset` objects. +This relies on + + + + +If you have a file containing data on disk (such as a netCDF file or a Zarr Store), you can also create a datatree by opening the +file using ``:py:func::~datatree.open_datatree``. + + +DataTree Contents +~~~~~~~~~~~~~~~~~ + +Now let's add some data to our tree. + +.. ipython:: python + + # create some data + ds1 = xr.Dataset({"a": ("x", [1, 2, 3])}) + + ds1 + + +You can see that the data in the node is displayed in the same way that the contents of the xarray Dataset we added would have been. + + + Navigating the Tree ~~~~~~~~~~~~~~~~~~~ From c75fb0b2f5765e5b206f2c861363fd75d07871a7 Mon Sep 17 00:00:00 2001 From: Thomas Nicholas Date: Tue, 31 May 2022 20:51:20 -0400 Subject: [PATCH 29/43] create tree from dict --- docs/source/data-structures.rst | 29 +++++++++++++++++++++++------ 1 file changed, 23 insertions(+), 6 deletions(-) diff --git a/docs/source/data-structures.rst b/docs/source/data-structures.rst index 133f87f3..6260adf5 100644 --- a/docs/source/data-structures.rst +++ b/docs/source/data-structures.rst @@ -97,7 +97,7 @@ We can add a second node to this tree either by referring to the first node in t .. 
ipython:: python # add a child by referring to the parent node - node2 = DataTree(name="Little Bonsai", parent=node1) + node2 = DataTree(name="Bonsai", parent=node1) or by dynamically updating the attributes of one node to refer to another: @@ -107,11 +107,11 @@ or by dynamically updating the attributes of one node to refer to another: node0 = DataTree(name="General Sherman") node1.parent = node0 -Our tree now has a new root: +Our tree now has three nodes within it, and one of the two new nodes has become the new root: .. ipython:: python - node2.root + node0 Is is at tree construction time that consistency checks are enforced. For instance, if we try to create a `cycle` the constructor will raise an error: @@ -120,13 +120,29 @@ Is is at tree construction time that consistency checks are enforced. For instan node0.parent = node2 -The second way is to build the tree from a dictionary of filesystem-like paths and corresponding ``xarray.Dataset` objects. -This relies on +The second way is to build the tree from a dictionary of filesystem-like paths and corresponding ``xarray.Dataset`` objects. +This relies on a syntax inspired by unix-like filesystems, where the "path" to a node is specified by the keys of each intermediate node in sequence, +separated by forward slashes. The root node is referred to by ``"/"``, so the path from our current root node to its grand-child would be ``"/Oak/Bonsai"``. +If we have a dictionary where each key is a valid path, and each value is either valid data or ``None``, +we can construct a complex tree quickly using the alternative constructor ``:py:func::DataTree.from_dict``: +.. 
ipython:: python + + d = { + "/": None, + "/a": xr.Dataset({"foo": 0}), + "/a/b": xr.Dataset({"bar": ("y", [0, 1, 2])}), + "a/c/d": None, + } + dt = DataTree.from_dict(d) + dt -If you have a file containing data on disk (such as a netCDF file or a Zarr Store), you can also create a datatree by opening the +Notice that this method will also create any intermediate empty node necessary to reach the end of the specified path +(i.e. the node labelled `"c"` in this case.) + +Finally if you have a file containing data on disk (such as a netCDF file or a Zarr Store), you can also create a datatree by opening the file using ``:py:func::~datatree.open_datatree``. @@ -139,6 +155,7 @@ Now let's add some data to our tree. # create some data ds1 = xr.Dataset({"a": ("x", [1, 2, 3])}) + ds1 ds1 From a59ff547a946c7a6e97588dc723444c207d9f64f Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Wed, 1 Jun 2022 00:52:08 +0000 Subject: [PATCH 30/43] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- docs/source/data-structures.rst | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/docs/source/data-structures.rst b/docs/source/data-structures.rst index 6260adf5..4523b966 100644 --- a/docs/source/data-structures.rst +++ b/docs/source/data-structures.rst @@ -130,14 +130,14 @@ we can construct a complex tree quickly using the alternative constructor ``:py: .. ipython:: python - d = { - "/": None, - "/a": xr.Dataset({"foo": 0}), - "/a/b": xr.Dataset({"bar": ("y", [0, 1, 2])}), - "a/c/d": None, - } - dt = DataTree.from_dict(d) - dt + d = { + "/": None, + "/a": xr.Dataset({"foo": 0}), + "/a/b": xr.Dataset({"bar": ("y", [0, 1, 2])}), + "a/c/d": None, + } + dt = DataTree.from_dict(d) + dt Notice that this method will also create any intermediate empty node necessary to reach the end of the specified path (i.e. the node labelled `"c"` in this case.) 
From 91c7afdb967749def4ca315de38fb4571aa85b1c Mon Sep 17 00:00:00 2001 From: Thomas Nicholas Date: Wed, 1 Jun 2022 09:50:46 -0400 Subject: [PATCH 31/43] dict-like interface --- docs/source/data-structures.rst | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/docs/source/data-structures.rst b/docs/source/data-structures.rst index 6260adf5..e1492f23 100644 --- a/docs/source/data-structures.rst +++ b/docs/source/data-structures.rst @@ -149,6 +149,15 @@ file using ``:py:func::~datatree.open_datatree``. DataTree Contents ~~~~~~~~~~~~~~~~~ +Like ``xarray.Dataset``, ``DataTree`` implements the python mapping interface, but with values given by either ``xarray.DataArray`` objects or other ``DataTree`` objects. + +.. ipython:: python + + dt['a'] + + +Iterating over keys will iterate over both the names of variables and child nodes. + Now let's add some data to our tree. .. ipython:: python @@ -162,7 +171,13 @@ Now let's add some data to our tree. You can see that the data in the node is displayed in the same way that the contents of the xarray Dataset we added would have been. +Dictionary-like methods +~~~~~~~~~~~~~~~~~~~~~~~ + +We can update the contents of the tree in-place using a dictionary-like syntax. +If you copy a ``DataTree`` using the ``:py:func::copy`` method it will copy the entire tree, including all parents and children. +Like for ``Dataset``, this copy is shallow by default. 
Navigating the Tree ~~~~~~~~~~~~~~~~~~~ From 0e9b38408846aa32249779eb7bb0b4d14558be96 Mon Sep 17 00:00:00 2001 From: Thomas Nicholas Date: Wed, 1 Jun 2022 10:03:38 -0400 Subject: [PATCH 32/43] correct deepcopy tests --- datatree/tests/test_datatree.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/datatree/tests/test_datatree.py b/datatree/tests/test_datatree.py index acb05ec7..c44f9348 100644 --- a/datatree/tests/test_datatree.py +++ b/datatree/tests/test_datatree.py @@ -243,7 +243,6 @@ def test_copy(self): assert "foo" not in node.attrs assert node.attrs["Test"] is copied_node.attrs["Test"] - @pytest.mark.xfail(reason="unresolved bug with deepcopying") def test_deepcopy(self): dt = create_test_datatree() @@ -263,13 +262,13 @@ def test_deepcopy(self): for k in data_vars: v0 = node.variables[k] v1 = copied_node.variables[k] - assert source_ndarray(v0.data) is source_ndarray(v1.data) + assert source_ndarray(v0.data) is not source_ndarray(v1.data) copied_node["foo"] = xr.DataArray(data=np.arange(5), dims="z") assert "foo" not in node copied_node.attrs["foo"] = "bar" assert "foo" not in node.attrs - assert node.attrs["Test"] is copied_node.attrs["Test"] + assert node.attrs["Test"] is not copied_node.attrs["Test"] @pytest.mark.xfail(reason="data argument not yet implemented") def test_copy_with_data(self): From d56f89bfadc91ec5514b64fb9fae53e863ab1b56 Mon Sep 17 00:00:00 2001 From: Thomas Nicholas Date: Wed, 1 Jun 2022 10:18:13 -0400 Subject: [PATCH 33/43] use .data_vars in copy tests --- datatree/tests/test_datatree.py | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/datatree/tests/test_datatree.py b/datatree/tests/test_datatree.py index c44f9348..7da984c2 100644 --- a/datatree/tests/test_datatree.py +++ b/datatree/tests/test_datatree.py @@ -230,9 +230,7 @@ def test_copy(self): # Note: IndexVariable objects with string dtype are always # copied because of xarray.core.util.safe_cast_to_index. 
# Limiting the test to data variables. - # TODO use .data_vars once that property is available - data_vars = [v for v in node.variables if v not in node._coord_names] - for k in data_vars: + for k in node.data_vars: v0 = node.variables[k] v1 = copied_node.variables[k] assert source_ndarray(v0.data) is source_ndarray(v1.data) @@ -257,9 +255,7 @@ def test_deepcopy(self): # Note: IndexVariable objects with string dtype are always # copied because of xarray.core.util.safe_cast_to_index. # Limiting the test to data variables. - # TODO use .data_vars once that property is available - data_vars = [v for v in node.variables if v not in node._coord_names] - for k in data_vars: + for k in node.data_vars: v0 = node.variables[k] v1 = copied_node.variables[k] assert source_ndarray(v0.data) is not source_ndarray(v1.data) From e1916607605f28ce9bed8693bbf0f8a2e7d3e49c Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Thu, 2 Jun 2022 19:05:18 +0000 Subject: [PATCH 34/43] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- docs/source/data-structures.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/source/data-structures.rst b/docs/source/data-structures.rst index 4a7b8431..86658610 100644 --- a/docs/source/data-structures.rst +++ b/docs/source/data-structures.rst @@ -153,7 +153,7 @@ Like ``xarray.Dataset``, ``DataTree`` implements the python mapping interface, b .. ipython:: python - dt['a'] + dt["a"] Iterating over keys will iterate over both the names of variables and child nodes. 
From 0c1dd29e81a7e82a16eb3386a6d5e0cbd7156b13 Mon Sep 17 00:00:00 2001 From: Thomas Nicholas Date: Thu, 2 Jun 2022 15:05:52 -0400 Subject: [PATCH 35/43] black --- docs/source/data-structures.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/source/data-structures.rst b/docs/source/data-structures.rst index 4a7b8431..86658610 100644 --- a/docs/source/data-structures.rst +++ b/docs/source/data-structures.rst @@ -153,7 +153,7 @@ Like ``xarray.Dataset``, ``DataTree`` implements the python mapping interface, b .. ipython:: python - dt['a'] + dt["a"] Iterating over keys will iterate over both the names of variables and child nodes. From 3ce120b3e4039ccd4a2be79a39e0b24f1f98d5df Mon Sep 17 00:00:00 2001 From: Thomas Nicholas Date: Thu, 16 Jun 2022 13:15:11 -0400 Subject: [PATCH 36/43] whatsnew --- docs/source/whats-new.rst | 3 +++ 1 file changed, 3 insertions(+) diff --git a/docs/source/whats-new.rst b/docs/source/whats-new.rst index 8a31940f..160d45bf 100644 --- a/docs/source/whats-new.rst +++ b/docs/source/whats-new.rst @@ -46,6 +46,9 @@ Bug fixes Documentation ~~~~~~~~~~~~~ +- Added ``Data Structures`` page describing the internal structure of a ``DataTree`` object, and its relation to + ``xarray.Dataset`` objects. (:pull:`103`) + By `Tom Nicholas `_. - API page updated with all the methods that are copied from ``xarray.Dataset``. (:pull:`41`) By `Tom Nicholas `_. From f87ef2fb82be2e69d9453fcdb158aafd7bb2a659 Mon Sep 17 00:00:00 2001 From: Thomas Nicholas Date: Thu, 16 Jun 2022 17:41:16 -0400 Subject: [PATCH 37/43] data contents --- docs/source/data-structures.rst | 37 +++++++++++++++++---------------- 1 file changed, 19 insertions(+), 18 deletions(-) diff --git a/docs/source/data-structures.rst b/docs/source/data-structures.rst index 86658610..608ae87b 100644 --- a/docs/source/data-structures.rst +++ b/docs/source/data-structures.rst @@ -131,9 +131,9 @@ we can construct a complex tree quickly using the alternative constructor ``:py: .. 
ipython:: python d = { - "/": None, - "/a": xr.Dataset({"foo": 0}), - "/a/b": xr.Dataset({"bar": ("y", [0, 1, 2])}), + "/": xr.Dataset({"foo": "orange"}), + "/a": xr.Dataset({"bar": 0}, coords={"y": ("y", [0, 1, 2])}), + "/a/b": xr.Dataset({"zed": np.NaN}), "a/c/d": None, } dt = DataTree.from_dict(d) @@ -154,22 +154,31 @@ Like ``xarray.Dataset``, ``DataTree`` implements the python mapping interface, b .. ipython:: python dt["a"] - + dt["foo"] Iterating over keys will iterate over both the names of variables and child nodes. -Now let's add some data to our tree. +We can also access all the data in a single node through a dataset-like view + +.. ipython:: python + + dt["a"].ds + +This demonstrates the fact that the data in any one node is equivalent to the contents of a single ``xarray.Dataset`` object. +The ``DataTree.ds`` property returns an immutable view, but we can instead extract the node's data contents as a new (and mutable) +``xarray.Dataset`` object via ``.to_dataset()``: .. ipython:: python - # create some data - ds1 = xr.Dataset({"a": ("x", [1, 2, 3])}) - ds1 + dt["a"].to_dataset() - ds1 +Like with ``Dataset``, you can access the data and coordinate variables of a node separately via the ``data_vars`` and ``coords`` attributes: +.. ipython:: python + + dt["a"].data_vars + dt["a"].coords -You can see that the data in the node is displayed in the same way that the contents of the xarray Dataset we added would have been. Dictionary-like methods ~~~~~~~~~~~~~~~~~~~~~~~ @@ -178,11 +187,3 @@ We can update the contents of the tree in-place using a dictionary-like syntax. If you copy a ``DataTree`` using the ``:py:func::copy`` method it will copy the entire tree, including all parents and children. Like for ``Dataset``, this copy is shallow by default. 
- -Navigating the Tree -~~~~~~~~~~~~~~~~~~~ - -Root, ancestors, parent, children, leaves, file-like access - -Mapping Operations Over the Tree -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ From 44b8db579d9467518251da8b93ed3e37bedd9240 Mon Sep 17 00:00:00 2001 From: Thomas Nicholas Date: Thu, 16 Jun 2022 18:13:31 -0400 Subject: [PATCH 38/43] dictionary-like access --- docs/source/data-structures.rst | 25 +++++++++++++++++++++++-- 1 file changed, 23 insertions(+), 2 deletions(-) diff --git a/docs/source/data-structures.rst b/docs/source/data-structures.rst index 608ae87b..56526798 100644 --- a/docs/source/data-structures.rst +++ b/docs/source/data-structures.rst @@ -185,5 +185,26 @@ Dictionary-like methods We can update the contents of the tree in-place using a dictionary-like syntax. -If you copy a ``DataTree`` using the ``:py:func::copy`` method it will copy the entire tree, including all parents and children. -Like for ``Dataset``, this copy is shallow by default. +We can update a datatree in-place using Python's standard dictionary syntax, similar to how we can for Dataset objects. +For example, to create this example datatree from scratch, we could have written: + +# TODO update this example using ``.coords`` and ``.data_vars`` as setters, and allowing non-dataarray values to ``__setitem__`` + +.. ipython:: python + + dt = DataTree() + dt["foo"] = xr.DataArray("orange") + dt["a"] = DataTree(data=xr.Dataset({"bar": 0}, coords={"y": ("y", [0, 1, 2])})) + dt["a/b/zed"] = xr.DataArray(np.NaN) + dt["a/c/d"] = DataTree() + dt + +To change the variables in a node of a ``DataTree``, you can use all the standard dictionary +methods, including ``values``, ``items``, ``__delitem__``, ``get`` and +:py:meth:`~xarray.DataTree.update`. +Note that assigning a ``DataArray`` object to a ``DataTree`` variable using ``__setitem__`` or ``update`` will +:ref:`automatically align` the array(s) to the original node's indexes. 
+ +If you copy a ``DataTree`` using the :py:func:`copy` function or the :py:meth:`~xarray.DataTree.copy` method, it will copy the entire tree, +including all parents and children. +Like for ``Dataset``, this copy is shallow by default, but you can copy all the data by calling ``dt.copy(deep=True)``. From 02f63a275074c6e2d4a66757a2dbf8e3dc36ecc4 Mon Sep 17 00:00:00 2001 From: Thomas Nicholas Date: Thu, 16 Jun 2022 18:41:25 -0400 Subject: [PATCH 39/43] TODOs --- docs/source/data-structures.rst | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/docs/source/data-structures.rst b/docs/source/data-structures.rst index 56526798..fac4bfe0 100644 --- a/docs/source/data-structures.rst +++ b/docs/source/data-structures.rst @@ -188,7 +188,8 @@ We can update the contents of the tree in-place using a dictionary-like syntax. We can update a datatree in-place using Python's standard dictionary syntax, similar to how we can for Dataset objects. For example, to create this example datatree from scratch, we could have written: -# TODO update this example using ``.coords`` and ``.data_vars`` as setters, and allowing non-dataarray values to ``__setitem__`` +# TODO update this example allowing non-dataarray values to ``__setitem__`` +# TODO update this example using ``.coords`` and ``.data_vars`` as setters, ..
ipython:: python From b74b94feee672d7b71f54c552ebb0e579e61cf7c Mon Sep 17 00:00:00 2001 From: Thomas Nicholas Date: Fri, 17 Jun 2022 13:32:37 -0400 Subject: [PATCH 40/43] test assigning int --- datatree/tests/test_datatree.py | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/datatree/tests/test_datatree.py b/datatree/tests/test_datatree.py index 2de85311..d571d78b 100644 --- a/datatree/tests/test_datatree.py +++ b/datatree/tests/test_datatree.py @@ -314,6 +314,17 @@ def test_setitem_unnamed_dataarray(self): folder1["results"] = data xrt.assert_equal(folder1["results"], data) + def test_setitem_variable(self): + var = xr.Variable(data=[0, 50], dims="x") + folder1 = DataTree(name="folder1") + folder1["results"] = var + xrt.assert_equal(folder1["results"], xr.DataArray(var)) + + def test_setitem_coerce_to_dataarray(self): + folder1 = DataTree(name="folder1") + folder1["results"] = 0 + xrt.assert_equal(folder1["results"], xr.DataArray(0)) + def test_setitem_add_new_variable_to_empty_node(self): results = DataTree(name="results") results["pressure"] = xr.DataArray(data=[2, 3]) From 86f218b4093b783d472e2dc1cb9aaf87added8f4 Mon Sep 17 00:00:00 2001 From: Thomas Nicholas Date: Fri, 17 Jun 2022 13:32:57 -0400 Subject: [PATCH 41/43] allow assigning coercible values --- datatree/datatree.py | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/datatree/datatree.py b/datatree/datatree.py index ebeab5a1..8e62f1c8 100644 --- a/datatree/datatree.py +++ b/datatree/datatree.py @@ -699,14 +699,17 @@ def _set(self, key: str, val: DataTree | CoercibleValue) -> None: if isinstance(val, DataTree): val.name = key val.parent = self - elif isinstance(val, (DataArray, Variable)): - # TODO this should also accomodate other types that can be coerced into Variables - self.update({key: val}) else: - raise TypeError(f"Type {type(val)} cannot be assigned to a DataTree") + if not isinstance(val, (DataArray, Variable)): + # accommodate other types that can be 
coerced into Variables + val = DataArray(val) + + self.update({key: val}) def __setitem__( - self, key: str, value: DataTree | Dataset | DataArray | Variable + self, + key: str, + value: Any, ) -> None: """ Add either a child node or an array to the tree, at any position. From 657d5c9b179dd3f7ec67cf07f048ec627a087f93 Mon Sep 17 00:00:00 2001 From: Thomas Nicholas Date: Fri, 17 Jun 2022 16:47:17 -0400 Subject: [PATCH 42/43] simplify example using #115 --- docs/source/data-structures.rst | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/docs/source/data-structures.rst b/docs/source/data-structures.rst index fac4bfe0..cb3227da 100644 --- a/docs/source/data-structures.rst +++ b/docs/source/data-structures.rst @@ -188,15 +188,14 @@ We can update the contents of the tree in-place using a dictionary-like syntax. We can update a datatree in-place using Python's standard dictionary syntax, similar to how we can for Dataset objects. For example, to create this example datatree from scratch, we could have written: -# TODO update this example allowing non-dataarray values to ``__setitem__`` # TODO update this example using ``.coords`` and ``.data_vars`` as setters, .. 
ipython:: python dt = DataTree() - dt["foo"] = xr.DataArray("orange") + dt["foo"] = "orange" dt["a"] = DataTree(data=xr.Dataset({"bar": 0}, coords={"y": ("y", [0, 1, 2])})) - dt["a/b/zed"] = xr.DataArray(np.NaN) + dt["a/b/zed"] = np.NaN dt["a/c/d"] = DataTree() dt From 566ca1a34e6589668be6151c9b398c8a28db8e5e Mon Sep 17 00:00:00 2001 From: Thomas Nicholas Date: Sun, 26 Jun 2022 13:30:34 +0100 Subject: [PATCH 43/43] add note about fully qualified names --- docs/source/data-structures.rst | 2 ++ 1 file changed, 2 insertions(+) diff --git a/docs/source/data-structures.rst b/docs/source/data-structures.rst index cb3227da..93d5b9ab 100644 --- a/docs/source/data-structures.rst +++ b/docs/source/data-structures.rst @@ -124,6 +124,8 @@ The second way is to build the tree from a dictionary of filesystem-like paths a This relies on a syntax inspired by unix-like filesystems, where the "path" to a node is specified by the keys of each intermediate node in sequence, separated by forward slashes. The root node is referred to by ``"/"``, so the path from our current root node to its grand-child would be ``"/Oak/Bonsai"``. +A path specified from the root (as opposed to being specified relative to an arbitrary node in the tree) is sometimes also referred to as a +`"fully qualified name" `_. If we have a dictionary where each key is a valid path, and each value is either valid data or ``None``, we can construct a complex tree quickly using the alternative constructor ``:py:func::DataTree.from_dict``: