Skip to content

Commit e19cf8e

Browse files
authored
Merge pull request #128 from kurusugawa-computer/add-simple-annotation-parser
アノテーションzipをタスクごとにパースするメソッドを追加
2 parents 553864e + 236cf68 commit e19cf8e

File tree

3 files changed

+119
-2
lines changed

3 files changed

+119
-2
lines changed

annofabapi/__version__.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -1 +1 @@
1-
__version__ = '0.27.4'
1+
__version__ = '0.27.5'

annofabapi/parser.py

+90-1
Original file line numberDiff line numberDiff line change
@@ -1,10 +1,11 @@
11
import abc
22
import json
33
import os
4+
import re
45
import warnings
56
import zipfile
67
from pathlib import Path
7-
from typing import Any, Iterator, List, Optional
8+
from typing import Any, Generic, Iterator, List, Optional, TypeVar
89

910
from annofabapi.dataclass.annotation import FullAnnotation, SimpleAnnotation
1011
from annofabapi.exceptions import AnnotationOuterFileNotFoundError
@@ -292,6 +293,32 @@ def open_outer_file(self, data_uri: str):
292293
raise AnnotationOuterFileNotFoundError(str(outer_file_path))
293294

294295

296+
S = TypeVar("S", bound=SimpleAnnotationParser)


class SimpleAnnotationParserGroupByTask(Generic[S]):
    """
    Bundle of Simple Annotation parsers that belong to one task.

    The class is generic over the concrete parser type ``S`` so that callers
    keep the precise parser type of the elements in ``parser_list``.

    Args:
        task_id: Task ID.
        parser_list: List of parsers for the JSON files under the task.

    """

    def __init__(self, task_id: str, parser_list: List[S]):
        # Stored under name-mangled attributes; exposed read-only via the properties below.
        self.__task_id, self.__parser_list = task_id, parser_list

    @property
    def task_id(self) -> str:
        """Task ID given to the constructor."""
        return self.__task_id

    @property
    def parser_list(self) -> List[S]:
        """The parser list given to the constructor (the same list object, not a copy)."""
        return self.__parser_list
320+
321+
295322
def __parse_annotation_dir(annotaion_dir_path: Path, clazz) -> Iterator[Any]:
296323
for task_dir in annotaion_dir_path.iterdir():
297324
if not task_dir.is_dir():
@@ -333,6 +360,68 @@ def lazy_parse_full_annotation_dir(annotaion_dir_path: Path) -> Iterator[SimpleA
333360
return __parse_annotation_dir(annotaion_dir_path, FullAnnotationDirParser)
334361

335362

363+
def lazy_parse_simple_annotation_zip_by_task(
        zip_file_path: Path) -> Iterator[SimpleAnnotationParserGroupByTask[SimpleAnnotationZipParser]]:
    """
    Walk a Simple Annotation zip file and yield, task by task, the parsers for its annotation JSON files.

    The first path component of a member is treated as the task ID. A member
    belongs to a task when its path has exactly two components and the second
    one ends with ``.json``.

    Tasks are discovered both from explicit first-level directory entries and
    from the JSON members themselves, because a zip archive is not required to
    contain directory entries. Tasks that have a directory entry are yielded
    first, in archive order (a task with no JSON member yields an empty parser
    list); tasks known only through their JSON members follow.

    Args:
        zip_file_path: Path to the Simple Annotation zip file downloaded from annofab.

    Yields:
        One ``SimpleAnnotationParserGroupByTask`` per task, holding the parsers
        for that task's JSON members.

    NOTE(review): the zip file is closed when this generator finishes, and each
    parser is handed the open ``ZipFile`` object. Presumably parsers that read
    from the archive must be used while iterating - confirm against
    ``SimpleAnnotationZipParser``.
    """
    with zipfile.ZipFile(zip_file_path, mode="r") as file:
        info_list: List[zipfile.ZipInfo] = file.infolist()

        # task_id -> parsers of the task. Insertion order is the yield order
        # (dicts preserve insertion order; guaranteed since Python 3.7).
        parsers_by_task = {}

        # Pass 1: a first-level directory entry ("<task_id>/") declares a task,
        # even when the task turns out to contain no JSON file.
        for info in info_list:
            if info.is_dir() and info.filename.count("/") == 1:
                parsers_by_task.setdefault(info.filename.split("/")[0], [])

        # Pass 2: attach every "<task_id>/<name>.json" member to its task in a
        # single scan, creating the task if the archive has no directory entry for it.
        for info in info_list:
            paths = [p for p in info.filename.split("/") if len(p) != 0]
            if len(paths) == 2 and paths[1].endswith(".json"):
                parsers_by_task.setdefault(paths[0], []).append(SimpleAnnotationZipParser(file, info.filename))

        for task_id, parser_list in parsers_by_task.items():
            yield SimpleAnnotationParserGroupByTask(task_id, parser_list)
402+
403+
404+
def lazy_parse_simple_annotation_dir_by_task(
        annotaion_dir_path: Path) -> Iterator[SimpleAnnotationParserGroupByTask[SimpleAnnotationDirParser]]:
    """
    Walk an extracted Simple Annotation directory and yield, task by task, the parsers for its annotation JSON files.

    Every immediate sub-directory is treated as a task, with the directory
    name as the task ID. Every regular file with the ``.json`` suffix directly
    inside it gets one parser.

    Args:
        annotaion_dir_path: Directory obtained by extracting the Simple Annotation zip file downloaded from annofab.

    Yields:
        One ``SimpleAnnotationParserGroupByTask`` per task directory, holding
        the parsers for that task's JSON files.
    """
    for entry in annotaion_dir_path.iterdir():
        if entry.is_dir():
            parsers = []
            for candidate in entry.iterdir():
                # Only regular "*.json" files directly under the task directory count.
                if not candidate.is_file():
                    continue
                if candidate.suffix != ".json":
                    continue
                parsers.append(SimpleAnnotationDirParser(candidate))

            yield SimpleAnnotationParserGroupByTask(entry.name, parsers)
423+
424+
336425
def __parse_annotation_zip(zip_file_path: Path, clazz) -> Iterator[Any]:
337426
def lazy_parser(zip_file: zipfile.ZipFile, info: zipfile.ZipInfo) -> Optional[Any]:
338427
paths = [p for p in info.filename.split("/") if len(p) != 0]

tests/test_local_parser.py

+28
Original file line numberDiff line numberDiff line change
@@ -99,6 +99,34 @@ def test_simple_annotation_dir(self):
9999
with pytest.raises(AnnotationOuterFileNotFoundError):
100100
parser.open_outer_file("foo")
101101

102+
def test_lazy_parse_simple_annotation_zip_by_task(self):
    """The zip fixture yields exactly the two sample tasks, and sample_1 holds its two input data parsers."""
    archive_path = Path(test_dir / "simple-annotation-v2.zip")
    groups = list(annofabapi.parser.lazy_parse_simple_annotation_zip_by_task(archive_path))

    # Each expected task appears exactly once.
    assert len(groups) == 2
    task_ids = [group.task_id for group in groups]
    assert task_ids.count("sample_1") == 1
    assert task_ids.count("sample_0") == 1

    # Each expected input data appears exactly once under sample_1.
    sample_1 = next(group for group in groups if group.task_id == "sample_1")
    parsers = sample_1.parser_list
    assert len(parsers) == 2
    input_data_ids = [parser.input_data_id for parser in parsers]
    assert input_data_ids.count("c6e1c2ec-6c7c-41c6-9639-4244c2ed2839") == 1
    assert input_data_ids.count("c86205d1-bdd4-4110-ae46-194e661d622b") == 1
115+
116+
def test_lazy_parse_simple_annotation_dir_by_task(self):
    """The directory fixture yields exactly the two sample tasks, and sample_1 holds its two input data parsers."""
    annotation_dir = Path(test_dir / "simple-annotation-v2")
    groups = list(annofabapi.parser.lazy_parse_simple_annotation_dir_by_task(annotation_dir))

    # Each expected task appears exactly once.
    assert len(groups) == 2
    task_ids = [group.task_id for group in groups]
    assert task_ids.count("sample_1") == 1
    assert task_ids.count("sample_0") == 1

    # Each expected input data appears exactly once under sample_1.
    sample_1 = next(group for group in groups if group.task_id == "sample_1")
    parsers = sample_1.parser_list
    assert len(parsers) == 2
    input_data_ids = [parser.input_data_id for parser in parsers]
    assert input_data_ids.count("c6e1c2ec-6c7c-41c6-9639-4244c2ed2839") == 1
    assert input_data_ids.count("c86205d1-bdd4-4110-ae46-194e661d622b") == 1
129+
102130

103131
class TestFullAnnotation:
104132
def test_full_annotation_zip(self):

0 commit comments

Comments
 (0)