-
Notifications
You must be signed in to change notification settings - Fork 7.1k
Closing streams to avoid testing issues #6128
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from 5 commits
b6ae4df
05caddc
f87e97d
a830b5d
ab5e918
7b0613c
d3cfb2a
8fc5551
6a72184
f63d6e2
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change | ||||
---|---|---|---|---|---|---|
|
@@ -107,7 +107,9 @@ def _prepare_sample( | |||||
ann_path, ann_buffer = ann_data | ||||||
|
||||||
image = EncodedImage.from_file(image_buffer) | ||||||
image_buffer.close() | ||||||
ann = read_mat(ann_buffer) | ||||||
ann_buffer.close() | ||||||
Comment on lines
+105
to
+107
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Instead of doing that in every dataset individually, can't we just do it in
and
? I think so far we don't have a case where we need to read from the same file handle twice. Plus, that would only work if the stream is seekable, which I don't know if we can guarantee. |
||||||
|
||||||
return dict( | ||||||
label=Label.from_category(category, categories=self._categories), | ||||||
|
@@ -186,10 +188,11 @@ def _is_not_rogue_file(self, data: Tuple[str, Any]) -> bool: | |||||
|
||||||
def _prepare_sample(self, data: Tuple[str, BinaryIO]) -> Dict[str, Any]: | ||||||
path, buffer = data | ||||||
|
||||||
image = EncodedImage.from_file(buffer) | ||||||
buffer.close() | ||||||
return dict( | ||||||
path=path, | ||||||
image=EncodedImage.from_file(buffer), | ||||||
image=image, | ||||||
label=Label(int(pathlib.Path(path).parent.name.split(".", 1)[0]) - 1, categories=self._categories), | ||||||
) | ||||||
|
||||||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -39,8 +39,8 @@ def __init__( | |
self.fieldnames = fieldnames | ||
|
||
def __iter__(self) -> Iterator[Tuple[str, Dict[str, str]]]: | ||
for _, file in self.datapipe: | ||
file = (line.decode() for line in file) | ||
for _, fh in self.datapipe: | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I'm ok with the closing here, but why the rename? Can you revert that? |
||
file = (line.decode() for line in fh) | ||
|
||
if self.fieldnames: | ||
fieldnames = self.fieldnames | ||
|
@@ -58,6 +58,8 @@ def __iter__(self) -> Iterator[Tuple[str, Dict[str, str]]]: | |
for line in csv.DictReader(file, fieldnames=fieldnames, dialect="celeba"): | ||
yield line.pop("image_id"), line | ||
|
||
fh.close() | ||
|
||
|
||
NAME = "celeba" | ||
|
||
|
@@ -142,6 +144,7 @@ def _prepare_sample( | |
path, buffer = image_data | ||
|
||
image = EncodedImage.from_file(buffer) | ||
buffer.close() | ||
(_, identity), (_, attributes), (_, bounding_box), (_, landmarks) = ann_data | ||
|
||
return dict( | ||
|
Original file line number | Diff line number | Diff line change | ||||||||||||||||
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
|
@@ -15,6 +15,8 @@ | |||||||||||||||||
|
||||||||||||||||||
from .._api import register_dataset, register_info | ||||||||||||||||||
|
||||||||||||||||||
from torchdata import janitor | ||||||||||||||||||
|
||||||||||||||||||
NAME = "clevr" | ||||||||||||||||||
|
||||||||||||||||||
|
||||||||||||||||||
|
@@ -62,10 +64,12 @@ def _add_empty_anns(self, data: Tuple[str, BinaryIO]) -> Tuple[Tuple[str, Binary | |||||||||||||||||
def _prepare_sample(self, data: Tuple[Tuple[str, BinaryIO], Optional[Dict[str, Any]]]) -> Dict[str, Any]: | ||||||||||||||||||
image_data, scenes_data = data | ||||||||||||||||||
path, buffer = image_data | ||||||||||||||||||
image = EncodedImage.from_file(buffer) | ||||||||||||||||||
buffer.close() | ||||||||||||||||||
|
||||||||||||||||||
return dict( | ||||||||||||||||||
path=path, | ||||||||||||||||||
image=EncodedImage.from_file(buffer), | ||||||||||||||||||
image=image, | ||||||||||||||||||
label=Label(len(scenes_data["objects"])) if scenes_data else None, | ||||||||||||||||||
) | ||||||||||||||||||
|
||||||||||||||||||
|
@@ -97,6 +101,8 @@ def _datapipe(self, resource_dps: List[IterDataPipe]) -> IterDataPipe[Dict[str, | |||||||||||||||||
buffer_size=INFINITE_BUFFER_SIZE, | ||||||||||||||||||
) | ||||||||||||||||||
else: | ||||||||||||||||||
for i in scenes_dp: | ||||||||||||||||||
janitor(i) | ||||||||||||||||||
Comment on lines
+103
to
+104
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
Suggested change
Plus, do we even need to use
Suggested change
|
||||||||||||||||||
dp = Mapper(images_dp, self._add_empty_anns) | ||||||||||||||||||
|
||||||||||||||||||
return Mapper(dp, self._prepare_sample) | ||||||||||||||||||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -25,6 +25,7 @@ | |
INFINITE_BUFFER_SIZE, | ||
getitem, | ||
read_categories_file, | ||
close_buffer, | ||
path_accessor, | ||
hint_sharding, | ||
hint_shuffling, | ||
|
@@ -174,9 +175,10 @@ def _classify_meta(self, data: Tuple[str, Any]) -> Optional[int]: | |
|
||
def _prepare_image(self, data: Tuple[str, BinaryIO]) -> Dict[str, Any]: | ||
path, buffer = data | ||
image = close_buffer(EncodedImage.from_file, buffer) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. If |
||
return dict( | ||
path=path, | ||
image=EncodedImage.from_file(buffer), | ||
image=image, | ||
) | ||
|
||
def _prepare_sample( | ||
|
@@ -187,9 +189,11 @@ def _prepare_sample( | |
anns, image_meta = ann_data | ||
|
||
sample = self._prepare_image(image_data) | ||
|
||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Could you revert the formatting changes? |
||
# this method is only called if we have annotations | ||
annotations = cast(str, self._annotations) | ||
sample.update(self._ANN_DECODERS[annotations](self, anns, image_meta)) | ||
|
||
return sample | ||
|
||
def _datapipe(self, resource_dps: List[IterDataPipe]) -> IterDataPipe[Dict[str, Any]]: | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -109,10 +109,11 @@ def _prepare_test_data(self, data: Tuple[str, BinaryIO]) -> Tuple[None, Tuple[st | |
return None, data | ||
|
||
def _classifiy_devkit(self, data: Tuple[str, BinaryIO]) -> Optional[int]: | ||
name, binary_io = data | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Could you revert this, since |
||
return { | ||
"meta.mat": ImageNetDemux.META, | ||
"ILSVRC2012_validation_ground_truth.txt": ImageNetDemux.LABEL, | ||
}.get(pathlib.Path(data[0]).name) | ||
}.get(pathlib.Path(name).name) | ||
|
||
# Although the WordNet IDs (wnids) are unique, the corresponding categories are not. For example, both n02012849 | ||
# and n03126707 are labeled 'crane' while the first means the bird and the latter means the construction equipment | ||
|
@@ -123,12 +124,14 @@ def _classifiy_devkit(self, data: Tuple[str, BinaryIO]) -> Optional[int]: | |
|
||
def _extract_categories_and_wnids(self, data: Tuple[str, BinaryIO]) -> List[Tuple[str, str]]: | ||
synsets = read_mat(data[1], squeeze_me=True)["synsets"] | ||
return [ | ||
results = [ | ||
(self._WNID_MAP.get(wnid, category.split(",", 1)[0]), wnid) | ||
for _, wnid, category, _, num_children, *_ in synsets | ||
# if num_children > 0, we are looking at a superclass that has no direct instance | ||
if num_children == 0 | ||
] | ||
data[1].close() | ||
return results | ||
|
||
def _imagenet_label_to_wnid(self, imagenet_label: str, *, wnids: Tuple[str, ...]) -> str: | ||
return wnids[int(imagenet_label) - 1] | ||
|
@@ -151,11 +154,13 @@ def _prepare_sample( | |
data: Tuple[Optional[Tuple[Label, str]], Tuple[str, BinaryIO]], | ||
) -> Dict[str, Any]: | ||
label_data, (path, buffer) = data | ||
image = EncodedImage.from_file(buffer) | ||
buffer.close() | ||
|
||
return dict( | ||
dict(zip(("label", "wnid"), label_data if label_data else (None, None))), | ||
path=path, | ||
image=EncodedImage.from_file(buffer), | ||
image=image, | ||
) | ||
|
||
def _datapipe(self, resource_dps: List[IterDataPipe]) -> IterDataPipe[Dict[str, Any]]: | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
The errors we have seen in our test suite have never been with these files, but only with archives.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Tests complain that the archive stream is not closed. This is because the child (unpacked file stream) also remains open and references the parent. In pytorch/pytorch#78952 and pytorch/data#560 we introduced a mechanism to close parent streams when every child is closed.