Skip to content

Commit 3f5757a

Browse files
skshetryefiop
authored andcommitted
dvc.objects: move hashing functions to dvc.objects
This commit moves `HashedStreamReader`/`get_hash_file`/`file_md5` into `dvc.objects.hash`. The `get_hash_file` has been renamed to `hash_file`. This also copies `relpath` and `is_exec` to the `dvc.fs.utils`.
1 parent 7a795f7 commit 3f5757a

25 files changed

+257
-167
lines changed

dvc/data/stage.py

Lines changed: 5 additions & 54 deletions
Original file line numberDiff line numberDiff line change
@@ -9,10 +9,10 @@
99
from dvc.exceptions import DvcIgnoreInCollectedDirError
1010
from dvc.ignore import DvcIgnore
1111
from dvc.objects.file import HashFile
12+
from dvc.objects.hash import hash_file
1213
from dvc.objects.hash_info import HashInfo
1314
from dvc.objects.meta import Meta
1415
from dvc.progress import Tqdm
15-
from dvc.utils import file_md5, is_exec
1616

1717
from .db.reference import ReferenceObjectDB
1818

@@ -32,7 +32,8 @@
3232
def _upload_file(from_fs_path, fs, odb, upload_odb, callback=None):
3333
from dvc.fs._callback import FsspecCallback
3434
from dvc.utils import tmp_fname
35-
from dvc.utils.stream import HashedStreamReader
35+
36+
from .stream import HashedStreamReader
3637

3738
fs_path = upload_odb.fs.path
3839
tmp_info = fs_path.join(upload_odb.fs_path, tmp_fname())
@@ -52,59 +53,9 @@ def _upload_file(from_fs_path, fs, odb, upload_odb, callback=None):
5253
return from_fs_path, meta, odb.get(stream.hash_info)
5354

5455

55-
def _adapt_info(info, scheme):
56-
if scheme == "s3" and "ETag" in info:
57-
info["etag"] = info["ETag"].strip('"')
58-
elif scheme == "gs" and "etag" in info:
59-
import base64
60-
61-
info["etag"] = base64.b64decode(info["etag"]).hex()
62-
elif scheme.startswith("http") and (
63-
"ETag" in info or "Content-MD5" in info
64-
):
65-
info["checksum"] = info.get("ETag") or info.get("Content-MD5")
66-
return info
67-
68-
69-
def _get_file_hash(fs_path, fs, name):
70-
info = _adapt_info(fs.info(fs_path), fs.scheme)
71-
72-
if name in info:
73-
assert not info[name].endswith(".dir")
74-
hash_value = info[name]
75-
elif hasattr(fs, name):
76-
func = getattr(fs, name)
77-
hash_value = func(fs_path)
78-
elif name == "md5":
79-
hash_value = file_md5(fs_path, fs)
80-
else:
81-
raise NotImplementedError
82-
83-
meta = Meta(size=info["size"], isexec=is_exec(info.get("mode", 0)))
84-
hash_info = HashInfo(name, hash_value)
85-
return meta, hash_info
86-
87-
88-
def get_file_hash(fs_path, fs, name, state=None):
89-
if state:
90-
meta, hash_info = state.get( # pylint: disable=assignment-from-none
91-
fs_path, fs
92-
)
93-
if hash_info:
94-
return meta, hash_info
95-
96-
meta, hash_info = _get_file_hash(fs_path, fs, name)
97-
98-
if state:
99-
assert ".dir" not in hash_info.value
100-
state.save(fs_path, fs, hash_info)
101-
102-
return meta, hash_info
103-
104-
10556
def _stage_file(fs_path, fs, name, odb=None, upload_odb=None, dry_run=False):
10657
state = odb.state if odb else None
107-
meta, hash_info = get_file_hash(fs_path, fs, name, state=state)
58+
meta, hash_info = hash_file(fs_path, fs, name, state=state)
10859
if upload_odb and not dry_run:
10960
assert odb and name == "md5"
11061
return _upload_file(fs_path, fs, odb, upload_odb)
@@ -346,7 +297,7 @@ def _stage_external_tree_info(odb, tree, name):
346297

347298
odb.add(tree.fs_path, tree.fs, tree.hash_info)
348299
raw = odb.get(tree.hash_info)
349-
_, hash_info = get_file_hash(raw.fs_path, raw.fs, name, state=odb.state)
300+
_, hash_info = hash_file(raw.fs_path, raw.fs, name, state=odb.state)
350301
tree.fs_path = raw.fs_path
351302
tree.fs = raw.fs
352303
tree.hash_info.name = hash_info.name

dvc/utils/stream.py renamed to dvc/data/stream.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3,9 +3,9 @@
33

44
from funcy import cached_property
55

6+
from dvc.objects.hash import dos2unix
67
from dvc.objects.hash_info import HashInfo
78
from dvc.objects.istextfile import DEFAULT_CHUNK_SIZE, istextblock
8-
from dvc.utils import dos2unix
99

1010

1111
class HashedStreamReader(io.IOBase):

dvc/data/tree.py

Lines changed: 2 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -7,8 +7,7 @@
77

88
from dvc.objects.errors import ObjectFormatError
99
from dvc.objects.file import HashFile
10-
11-
from .stage import get_file_hash
10+
from dvc.objects.hash import hash_file
1211

1312
if TYPE_CHECKING:
1413
from dvc.objects.db import ObjectDB
@@ -67,7 +66,7 @@ def digest(self, hash_info: Optional["HashInfo"] = None):
6766
if hash_info:
6867
self.hash_info = hash_info
6968
else:
70-
_, self.hash_info = get_file_hash(fs_path, memfs, "md5")
69+
_, self.hash_info = hash_file(fs_path, memfs, "md5")
7170
assert self.hash_info.value
7271
self.hash_info.value += ".dir"
7372

dvc/fs/_callback.py

Lines changed: 14 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,12 +1,12 @@
11
from contextlib import ExitStack
22
from functools import wraps
3-
from typing import IO, TYPE_CHECKING, Any, Dict, Optional, TypeVar, cast
3+
from typing import TYPE_CHECKING, Any, Dict, Optional, TypeVar, overload
44

55
import fsspec
66
from funcy import cached_property
77

88
if TYPE_CHECKING:
9-
from typing import Callable
9+
from typing import BinaryIO, Callable, TextIO, Union
1010

1111
from typing_extensions import ParamSpec
1212

@@ -20,11 +20,21 @@
2020
class FsspecCallback(fsspec.Callback):
2121
"""FsspecCallback usable as a context manager, and a few helper methods."""
2222

23-
def wrap_attr(self, fobj: IO, method: str = "read") -> IO:
23+
@overload
24+
def wrap_attr(self, fobj: "BinaryIO", method: str = "read") -> "BinaryIO":
25+
...
26+
27+
@overload
28+
def wrap_attr(self, fobj: "TextIO", method: str = "read") -> "TextIO":
29+
...
30+
31+
def wrap_attr(
32+
self, fobj: "Union[TextIO, BinaryIO]", method: str = "read"
33+
) -> "Union[TextIO, BinaryIO]":
2434
from tqdm.utils import CallbackIOWrapper
2535

2636
wrapped = CallbackIOWrapper(self.relative_update, fobj, method)
27-
return cast(IO, wrapped)
37+
return wrapped
2838

2939
def wrap_fn(self, fn: "Callable[_P, _R]") -> "Callable[_P, _R]":
3040
@wraps(fn)

dvc/fs/base.py

Lines changed: 24 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,8 @@
2525
from ._callback import DEFAULT_CALLBACK, FsspecCallback
2626

2727
if TYPE_CHECKING:
28+
from typing import BinaryIO, TextIO
29+
2830
from fsspec.spec import AbstractFileSystem
2931
from typing_extensions import Literal
3032

@@ -179,12 +181,30 @@ def is_empty(self, path: AnyFSPath) -> bool:
179181
return not self.fs.ls(path)
180182
return entry["size"] == 0
181183

184+
@overload
185+
def open(
186+
self,
187+
path: AnyFSPath,
188+
mode: "Literal['rb', 'br', 'wb']",
189+
**kwargs: Any,
190+
) -> "BinaryIO": # pylint: disable=arguments-differ
191+
return self.open(path, mode, **kwargs)
192+
193+
@overload
194+
def open(
195+
self,
196+
path: AnyFSPath,
197+
mode: "Literal['r', 'rt', 'w']",
198+
**kwargs: Any,
199+
) -> "TextIO": # pylint: disable=arguments-differ
200+
...
201+
182202
def open(
183203
self,
184204
path: AnyFSPath,
185205
mode: str = "r",
186-
**kwargs,
187-
) -> "IO": # pylint: disable=arguments-differ
206+
**kwargs: Any,
207+
) -> "IO[Any]": # pylint: disable=arguments-differ
188208
if "b" in mode:
189209
kwargs.pop("encoding", None)
190210
return self.fs.open(path, mode=mode, **kwargs)
@@ -354,7 +374,7 @@ def makedirs(self, path: AnyFSPath, **kwargs: Any) -> None:
354374

355375
def put_file(
356376
self,
357-
from_file: Union[AnyFSPath, IO],
377+
from_file: Union[AnyFSPath, "BinaryIO"],
358378
to_info: AnyFSPath,
359379
callback: FsspecCallback = DEFAULT_CALLBACK,
360380
size: int = None,
@@ -363,7 +383,7 @@ def put_file(
363383
if size:
364384
callback.set_size(size)
365385
if hasattr(from_file, "read"):
366-
stream = callback.wrap_attr(cast("IO", from_file))
386+
stream = callback.wrap_attr(cast("BinaryIO", from_file))
367387
self.upload_fobj(stream, to_info, size=size)
368388
else:
369389
assert isinstance(from_file, str)

dvc/fs/dvc.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -41,7 +41,7 @@ def _ls(fs, path):
4141

4242

4343
def _merge_info(repo, fs_info, dvc_info):
44-
from dvc.utils import is_exec
44+
from dvc.fs.utils import is_exec
4545

4646
ret = {"repo": repo}
4747

dvc/fs/http.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
import threading
2-
from typing import IO, Union
2+
from typing import BinaryIO, Union
33

44
from funcy import cached_property, memoize, wrap_with
55

@@ -141,7 +141,7 @@ def unstrip_protocol(self, path: str) -> str:
141141

142142
def put_file(
143143
self,
144-
from_file: Union[AnyFSPath, IO],
144+
from_file: Union[AnyFSPath, BinaryIO],
145145
to_info: AnyFSPath,
146146
callback: FsspecCallback = DEFAULT_CALLBACK,
147147
size: int = None,

dvc/fs/utils.py

Lines changed: 31 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,31 @@
1+
import os
2+
import stat
3+
from typing import TYPE_CHECKING
4+
5+
if TYPE_CHECKING:
6+
from .base import AnyFSPath
7+
8+
9+
def is_exec(mode: int) -> bool:
10+
return bool(mode & (stat.S_IXUSR | stat.S_IXGRP | stat.S_IXOTH))
11+
12+
13+
def relpath(path: "AnyFSPath", start: "AnyFSPath" = os.curdir) -> "AnyFSPath":
14+
path = os.fspath(path)
15+
start = os.path.abspath(os.fspath(start))
16+
17+
# Windows path on different drive than curdir doesn't have relpath
18+
if os.name == "nt":
19+
# Since python 3.8 os.realpath resolves network shares to their UNC
20+
# path. So, to be certain that relative paths correctly captured,
21+
# we need to resolve to UNC path first. We resolve only the drive
22+
# name so that we don't follow any 'real' symlinks on the path
23+
def resolve_network_drive_windows(path_to_resolve):
24+
drive, tail = os.path.splitdrive(path_to_resolve)
25+
return os.path.join(os.path.realpath(drive), tail)
26+
27+
path = resolve_network_drive_windows(os.path.abspath(path))
28+
start = resolve_network_drive_windows(start)
29+
if not os.path.commonprefix([start, path]):
30+
return path
31+
return os.path.relpath(path, start)

dvc/objects/cache.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -57,5 +57,5 @@ def __init__(
5757
) -> None:
5858
settings.setdefault("disk_pickle_protocol", 4)
5959
super().__init__(
60-
directory=directory, timeout=timeout, disk=Disk, **settings
60+
directory=directory, timeout=timeout, disk=disk, **settings
6161
)

dvc/objects/file.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -65,10 +65,10 @@ def check(self, odb: "ObjectDB", check_hash: bool = True):
6565
self._check_hash(odb)
6666

6767
def _check_hash(self, odb):
68-
from dvc.data.stage import get_file_hash
69-
from dvc.objects.errors import ObjectFormatError
68+
from .errors import ObjectFormatError
69+
from .hash import hash_file
7070

71-
_, actual = get_file_hash(
71+
_, actual = hash_file(
7272
self.fs_path, self.fs, self.hash_info.name, odb.state
7373
)
7474

0 commit comments

Comments
 (0)