Skip to content

Commit a3a52d0

Browse files
Add new dataset (#1227)
1 parent ab51afd commit a3a52d0

File tree

2 files changed

+81
-7
lines changed

2 files changed

+81
-7
lines changed

swift/llm/utils/dataset.py

+76-1
Original file line numberDiff line numberDiff line change
@@ -21,7 +21,7 @@
2121

2222
from swift.utils import get_logger, get_seed, is_dist, is_local_master, read_from_jsonl, transform_jsonl_to_df
2323
from swift.utils.torch_utils import _find_local_mac
24-
from .media import MediaCache
24+
from .media import MediaCache, MediaTag
2525
from .preprocess import (AlpacaPreprocessor, ClsPreprocessor, ComposePreprocessor, ConversationsPreprocessor,
2626
ListPreprocessor, PreprocessFunc, RenameColumnsPreprocessor, SmartPreprocessor,
2727
TextGenerationPreprocessor, preprocess_sharegpt)
@@ -162,6 +162,8 @@ class DatasetName:
162162
midefics = 'midefics'
163163
gqa = 'gqa'
164164
text_caps = 'text-caps'
165+
refcoco_unofficial_caption = 'refcoco-unofficial-caption'
166+
refcoco_unofficial_grounding = 'refcoco-unofficial-grounding'
165167
a_okvqa = 'a-okvqa'
166168
okvqa = 'okvqa'
167169
ocr_vqa = 'ocr-vqa'
@@ -1112,6 +1114,79 @@ def preprocess(row):
11121114
load_from_cache_file=False).filter(lambda row: row.get('response')).rename_columns({'image': 'images'})
11131115

11141116

1117+
def preprocess_refcoco_unofficial_caption(dataset):
1118+
1119+
cache_dir = MediaCache.download(
1120+
'https://www.modelscope.cn/api/v1/datasets/we_dont_produce_water/'
1121+
'coco_res/repo?Revision=master&FilePath=coco_2014.zip', 'coco2014')
1122+
1123+
def preprocess(row):
1124+
caption = row['captions'][0]
1125+
bbox = row['bbox']
1126+
image_path = os.path.join(cache_dir, row['image_path'].replace('coco/train2014', 'train2014'))
1127+
media_tag = MediaTag(media_type='image', task_type='grounding_caption')
1128+
for i in range(len(bbox)):
1129+
bbox[i] = round(float(bbox[i]))
1130+
res = {}
1131+
1132+
objects = [[caption, bbox]]
1133+
media_tag(res, [image_path])
1134+
res['images'] = [image_path]
1135+
res['objects'] = json.dumps(objects)
1136+
if not os.path.exists(image_path):
1137+
res['response'] = ''
1138+
return res
1139+
1140+
return dataset.map(preprocess, load_from_cache_file=False).filter(lambda row: row.get('response'))
1141+
1142+
1143+
register_dataset(
1144+
DatasetName.refcoco_unofficial_caption,
1145+
'swift/refcoco', [],
1146+
preprocess_func=preprocess_refcoco_unofficial_caption,
1147+
get_function=get_dataset_from_repo,
1148+
split=['train', 'validation'],
1149+
hf_dataset_id='jxu124/refcoco',
1150+
huge_dataset=True,
1151+
tags=['multi-modal', 'en', 'caption'])
1152+
1153+
1154+
def preprocess_refcoco_unofficial_grounding(dataset):
1155+
1156+
cache_dir = MediaCache.download(
1157+
'https://www.modelscope.cn/api/v1/datasets/we_dont_produce_water/'
1158+
'coco_res/repo?Revision=master&FilePath=coco_2014.zip', 'coco2014')
1159+
1160+
def preprocess(row):
1161+
caption = row['captions'][0]
1162+
bbox = row['bbox']
1163+
image_path = os.path.join(cache_dir, row['image_path'].replace('coco/train2014', 'train2014'))
1164+
media_tag = MediaTag(media_type='image', task_type='ref_grounding')
1165+
for i in range(len(bbox)):
1166+
bbox[i] = round(float(bbox[i]))
1167+
res = {}
1168+
1169+
objects = [[caption, bbox]]
1170+
media_tag(res, [image_path])
1171+
res['images'] = [image_path]
1172+
res['objects'] = json.dumps(objects)
1173+
if not os.path.exists(image_path):
1174+
res['response'] = ''
1175+
return res
1176+
1177+
return dataset.map(preprocess, load_from_cache_file=False).filter(lambda row: row.get('response'))
1178+
1179+
1180+
register_dataset(
1181+
DatasetName.refcoco_unofficial_grounding,
1182+
'swift/refcoco', [],
1183+
preprocess_func=preprocess_refcoco_unofficial_grounding,
1184+
get_function=get_dataset_from_repo,
1185+
split=['train', 'validation'],
1186+
hf_dataset_id='jxu124/refcoco',
1187+
huge_dataset=True,
1188+
tags=['multi-modal', 'en', 'grounding'])
1189+
11151190
register_dataset(
11161191
DatasetName.text_caps,
11171192
'swift/TextCaps', [],

swift/llm/utils/media.py

+5-6
Original file line numberDiff line numberDiff line change
@@ -24,7 +24,7 @@ class MediaTag:
2424
('<bbox>', '<ref-object>'),
2525
('The object at position <bbox>', '<ref-object>'),
2626
('This <bbox> is', '<ref-object>'),
27-
('What is the thing at <bbox>', '<ref-object>'),
27+
('What is the object at <bbox>', '<ref-object>'),
2828
('Describe <bbox>', '<ref-object>'),
2929
('<bbox> is', '<ref-object>'),
3030
('The bounding box coordinate <bbox> contains', '<ref-object>'),
@@ -62,14 +62,13 @@ def __init__(self,
6262
self.task_type = task_type
6363
self.media_tag = media_tag or '<unused_tag>'
6464

65-
def __call__(self, d: Dict[str, Any], medias: Union[tuple, list], objects: List = None) -> None:
65+
def __call__(self, d: Dict[str, Any], medias: Union[tuple, list]) -> None:
6666
"""Format the query/response/history with medias
6767
6868
Args:
6969
d: A dict contains history/query/response
7070
medias: A list of medias(one round, multiple medias),
7171
a single media(one round, one media), or a tuple of media list(multiple rounds)
72-
objects: A list of object-bbox pairs(one round), or a tuple of object-bbox lists(multiple rounds)
7372
"""
7473
if not self.media_type:
7574
return
@@ -83,7 +82,8 @@ def __call__(self, d: Dict[str, Any], medias: Union[tuple, list], objects: List
8382
pass
8483
elif self.task_type in ('ref_grounding', 'grounding_caption'):
8584
lang = np.random.choice(['en', 'zh'], p=[0.8, 0.2])
86-
query, response = np.random.choice(self.task_prompts[self.task_type][lang])
85+
prompts = self.task_prompts[self.task_type][lang]
86+
query, response = prompts[np.random.choice(range(len(prompts)))]
8787
elif self.task_type == 'ocr':
8888
raise NotImplementedError
8989
else:
@@ -101,8 +101,7 @@ def __call__(self, d: Dict[str, Any], medias: Union[tuple, list], objects: List
101101
if 'history' in d:
102102
d['history'] = history
103103
d['query'] = query
104-
if 'response' in d:
105-
d['response'] = response
104+
d['response'] = response
106105

107106

108107
class MediaCache:

0 commit comments

Comments
 (0)