diff --git "a/docs/source/LLM/\346\224\257\346\214\201\347\232\204\346\250\241\345\236\213\345\222\214\346\225\260\346\215\256\351\233\206.md" "b/docs/source/LLM/\346\224\257\346\214\201\347\232\204\346\250\241\345\236\213\345\222\214\346\225\260\346\215\256\351\233\206.md" index 5a1c032579..b9cc49db51 100644 --- "a/docs/source/LLM/\346\224\257\346\214\201\347\232\204\346\250\241\345\236\213\345\222\214\346\225\260\346\215\256\351\233\206.md" +++ "b/docs/source/LLM/\346\224\257\346\214\201\347\232\204\346\250\241\345\236\213\345\222\214\346\225\260\346\215\256\351\233\206.md" @@ -204,8 +204,8 @@ |deepseek-coder-6_7b-instruct|[deepseek-ai/deepseek-coder-6.7b-instruct](https://modelscope.cn/models/deepseek-ai/deepseek-coder-6.7b-instruct/summary)|q_proj, k_proj, v_proj|deepseek-coder|✔|✔||coding|[deepseek-ai/deepseek-coder-6.7b-instruct](https://huggingface.co/deepseek-ai/deepseek-coder-6.7b-instruct)| |deepseek-coder-33b|[deepseek-ai/deepseek-coder-33b-base](https://modelscope.cn/models/deepseek-ai/deepseek-coder-33b-base/summary)|q_proj, k_proj, v_proj|default-generation|✔|✔||coding|[deepseek-ai/deepseek-coder-33b-base](https://huggingface.co/deepseek-ai/deepseek-coder-33b-base)| |deepseek-coder-33b-instruct|[deepseek-ai/deepseek-coder-33b-instruct](https://modelscope.cn/models/deepseek-ai/deepseek-coder-33b-instruct/summary)|q_proj, k_proj, v_proj|deepseek-coder|✔|✔||coding|[deepseek-ai/deepseek-coder-33b-instruct](https://huggingface.co/deepseek-ai/deepseek-coder-33b-instruct)| -|deepseek-coder-v2-instruct|[deepseek-ai/DeepSeek-Coder-V2-Instruct](https://modelscope.cn/models/deepseek-ai/DeepSeek-Coder-V2-Instruct/summary)|q_a_proj, q_b_proj, kv_a_proj_with_mqa, kv_b_proj, o_proj|deepseek2|✔|✔||coding|[deepseek-ai/DeepSeek-Coder-V2-Instruct](https://huggingface.co/deepseek-ai/DeepSeek-Coder-V2-Instruct)| -|deepseek-coder-v2-lite-instruct|[deepseek-ai/DeepSeek-Coder-V2-Lite-Instruct](https://modelscope.cn/models/deepseek-ai/DeepSeek-Coder-V2-Lite-Instruct/summary)|q_a_proj, q_b_proj, kv_a_proj_with_mqa, kv_b_proj, o_proj|deepseek2|✔|✔||coding|[deepseek-ai/DeepSeek-Coder-V2-Lite-Instruct](https://huggingface.co/deepseek-ai/DeepSeek-Coder-V2-Lite-Instruct)| +|deepseek-coder-v2-instruct|[deepseek-ai/DeepSeek-Coder-V2-Instruct](https://modelscope.cn/models/deepseek-ai/DeepSeek-Coder-V2-Instruct/summary)|q_a_proj, q_b_proj, kv_a_proj_with_mqa, kv_b_proj, o_proj|deepseek2|✔|✔|transformers>=4.39.3|coding|[deepseek-ai/DeepSeek-Coder-V2-Instruct](https://huggingface.co/deepseek-ai/DeepSeek-Coder-V2-Instruct)| +|deepseek-coder-v2-lite-instruct|[deepseek-ai/DeepSeek-Coder-V2-Lite-Instruct](https://modelscope.cn/models/deepseek-ai/DeepSeek-Coder-V2-Lite-Instruct/summary)|q_a_proj, q_b_proj, kv_a_proj_with_mqa, kv_b_proj, o_proj|deepseek2|✔|✔|transformers>=4.39.3|coding|[deepseek-ai/DeepSeek-Coder-V2-Lite-Instruct](https://huggingface.co/deepseek-ai/DeepSeek-Coder-V2-Lite-Instruct)| |deepseek-math-7b|[deepseek-ai/deepseek-math-7b-base](https://modelscope.cn/models/deepseek-ai/deepseek-math-7b-base/summary)|q_proj, k_proj, v_proj|default-generation|✔|✔||math|[deepseek-ai/deepseek-math-7b-base](https://huggingface.co/deepseek-ai/deepseek-math-7b-base)| |deepseek-math-7b-instruct|[deepseek-ai/deepseek-math-7b-instruct](https://modelscope.cn/models/deepseek-ai/deepseek-math-7b-instruct/summary)|q_proj, k_proj, v_proj|deepseek|✔|✔||math|[deepseek-ai/deepseek-math-7b-instruct](https://huggingface.co/deepseek-ai/deepseek-math-7b-instruct)| 
|deepseek-math-7b-chat|[deepseek-ai/deepseek-math-7b-rl](https://modelscope.cn/models/deepseek-ai/deepseek-math-7b-rl/summary)|q_proj, k_proj, v_proj|deepseek|✔|✔||math|[deepseek-ai/deepseek-math-7b-rl](https://huggingface.co/deepseek-ai/deepseek-math-7b-rl)| @@ -316,6 +316,7 @@ |qwen-audio|[qwen/Qwen-Audio](https://modelscope.cn/models/qwen/Qwen-Audio/summary)|c_attn|qwen-audio-generation|✔|✘||audio|[Qwen/Qwen-Audio](https://huggingface.co/Qwen/Qwen-Audio)| |qwen-audio-chat|[qwen/Qwen-Audio-Chat](https://modelscope.cn/models/qwen/Qwen-Audio-Chat/summary)|c_attn|qwen-audio|✔|✘||audio|[Qwen/Qwen-Audio-Chat](https://huggingface.co/Qwen/Qwen-Audio-Chat)| |glm4v-9b-chat|[ZhipuAI/glm-4v-9b](https://modelscope.cn/models/ZhipuAI/glm-4v-9b/summary)|self_attention.query_key_value|glm4v|✘|✘||vision|[THUDM/glm-4v-9b](https://huggingface.co/THUDM/glm-4v-9b)| +|llava1_5-7b-chat|[huangjintao/llava-1.5-7b-hf](https://modelscope.cn/models/huangjintao/llava-1.5-7b-hf/summary)|q_proj, k_proj, v_proj|llava1_5|✔|✘|transformers>=4.36|vision|[llava-hf/llava-1.5-7b-hf](https://huggingface.co/llava-hf/llava-1.5-7b-hf)| |llava1_6-mistral-7b-instruct|[AI-ModelScope/llava-v1.6-mistral-7b](https://modelscope.cn/models/AI-ModelScope/llava-v1.6-mistral-7b/summary)|q_proj, k_proj, v_proj|llava-mistral-instruct|✔|✘|transformers>=4.34|vision|[liuhaotian/llava-v1.6-mistral-7b](https://huggingface.co/liuhaotian/llava-v1.6-mistral-7b)| |llava1_6-yi-34b-instruct|[AI-ModelScope/llava-v1.6-34b](https://modelscope.cn/models/AI-ModelScope/llava-v1.6-34b/summary)|q_proj, k_proj, v_proj|llava-yi-instruct|✔|✘||vision|[liuhaotian/llava-v1.6-34b](https://huggingface.co/liuhaotian/llava-v1.6-34b)| |llama3-llava-next-8b|[AI-Modelscope/llama3-llava-next-8b](https://modelscope.cn/models/AI-Modelscope/llama3-llava-next-8b/summary)|q_proj, k_proj, v_proj|llama-llava-next|✔|✘||vision|[lmms-lab/llama3-llava-next-8b](https://huggingface.co/lmms-lab/llama3-llava-next-8b)| diff --git a/docs/source_en/LLM/Supported-models-datasets.md b/docs/source_en/LLM/Supported-models-datasets.md index 41fe33d343..6ffb583417 100644 --- a/docs/source_en/LLM/Supported-models-datasets.md +++ b/docs/source_en/LLM/Supported-models-datasets.md @@ -204,8 +204,8 @@ The table below introcudes all models supported by SWIFT: |deepseek-coder-6_7b-instruct|[deepseek-ai/deepseek-coder-6.7b-instruct](https://modelscope.cn/models/deepseek-ai/deepseek-coder-6.7b-instruct/summary)|q_proj, k_proj, v_proj|deepseek-coder|✔|✔||coding|[deepseek-ai/deepseek-coder-6.7b-instruct](https://huggingface.co/deepseek-ai/deepseek-coder-6.7b-instruct)| |deepseek-coder-33b|[deepseek-ai/deepseek-coder-33b-base](https://modelscope.cn/models/deepseek-ai/deepseek-coder-33b-base/summary)|q_proj, k_proj, v_proj|default-generation|✔|✔||coding|[deepseek-ai/deepseek-coder-33b-base](https://huggingface.co/deepseek-ai/deepseek-coder-33b-base)| |deepseek-coder-33b-instruct|[deepseek-ai/deepseek-coder-33b-instruct](https://modelscope.cn/models/deepseek-ai/deepseek-coder-33b-instruct/summary)|q_proj, k_proj, v_proj|deepseek-coder|✔|✔||coding|[deepseek-ai/deepseek-coder-33b-instruct](https://huggingface.co/deepseek-ai/deepseek-coder-33b-instruct)| -|deepseek-coder-v2-instruct|[deepseek-ai/DeepSeek-Coder-V2-Instruct](https://modelscope.cn/models/deepseek-ai/DeepSeek-Coder-V2-Instruct/summary)|q_a_proj, q_b_proj, kv_a_proj_with_mqa, kv_b_proj, o_proj|deepseek2|✔|✔||coding|[deepseek-ai/DeepSeek-Coder-V2-Instruct](https://huggingface.co/deepseek-ai/DeepSeek-Coder-V2-Instruct)| 
-|deepseek-coder-v2-lite-instruct|[deepseek-ai/DeepSeek-Coder-V2-Lite-Instruct](https://modelscope.cn/models/deepseek-ai/DeepSeek-Coder-V2-Lite-Instruct/summary)|q_a_proj, q_b_proj, kv_a_proj_with_mqa, kv_b_proj, o_proj|deepseek2|✔|✔||coding|[deepseek-ai/DeepSeek-Coder-V2-Lite-Instruct](https://huggingface.co/deepseek-ai/DeepSeek-Coder-V2-Lite-Instruct)| +|deepseek-coder-v2-instruct|[deepseek-ai/DeepSeek-Coder-V2-Instruct](https://modelscope.cn/models/deepseek-ai/DeepSeek-Coder-V2-Instruct/summary)|q_a_proj, q_b_proj, kv_a_proj_with_mqa, kv_b_proj, o_proj|deepseek2|✔|✔|transformers>=4.39.3|coding|[deepseek-ai/DeepSeek-Coder-V2-Instruct](https://huggingface.co/deepseek-ai/DeepSeek-Coder-V2-Instruct)| +|deepseek-coder-v2-lite-instruct|[deepseek-ai/DeepSeek-Coder-V2-Lite-Instruct](https://modelscope.cn/models/deepseek-ai/DeepSeek-Coder-V2-Lite-Instruct/summary)|q_a_proj, q_b_proj, kv_a_proj_with_mqa, kv_b_proj, o_proj|deepseek2|✔|✔|transformers>=4.39.3|coding|[deepseek-ai/DeepSeek-Coder-V2-Lite-Instruct](https://huggingface.co/deepseek-ai/DeepSeek-Coder-V2-Lite-Instruct)| |deepseek-math-7b|[deepseek-ai/deepseek-math-7b-base](https://modelscope.cn/models/deepseek-ai/deepseek-math-7b-base/summary)|q_proj, k_proj, v_proj|default-generation|✔|✔||math|[deepseek-ai/deepseek-math-7b-base](https://huggingface.co/deepseek-ai/deepseek-math-7b-base)| |deepseek-math-7b-instruct|[deepseek-ai/deepseek-math-7b-instruct](https://modelscope.cn/models/deepseek-ai/deepseek-math-7b-instruct/summary)|q_proj, k_proj, v_proj|deepseek|✔|✔||math|[deepseek-ai/deepseek-math-7b-instruct](https://huggingface.co/deepseek-ai/deepseek-math-7b-instruct)| |deepseek-math-7b-chat|[deepseek-ai/deepseek-math-7b-rl](https://modelscope.cn/models/deepseek-ai/deepseek-math-7b-rl/summary)|q_proj, k_proj, v_proj|deepseek|✔|✔||math|[deepseek-ai/deepseek-math-7b-rl](https://huggingface.co/deepseek-ai/deepseek-math-7b-rl)| @@ -316,6 +316,7 @@ The table below introcudes all models supported by SWIFT: |qwen-audio|[qwen/Qwen-Audio](https://modelscope.cn/models/qwen/Qwen-Audio/summary)|c_attn|qwen-audio-generation|✔|✘||audio|[Qwen/Qwen-Audio](https://huggingface.co/Qwen/Qwen-Audio)| |qwen-audio-chat|[qwen/Qwen-Audio-Chat](https://modelscope.cn/models/qwen/Qwen-Audio-Chat/summary)|c_attn|qwen-audio|✔|✘||audio|[Qwen/Qwen-Audio-Chat](https://huggingface.co/Qwen/Qwen-Audio-Chat)| |glm4v-9b-chat|[ZhipuAI/glm-4v-9b](https://modelscope.cn/models/ZhipuAI/glm-4v-9b/summary)|self_attention.query_key_value|glm4v|✘|✘||vision|[THUDM/glm-4v-9b](https://huggingface.co/THUDM/glm-4v-9b)| +|llava1_5-7b-chat|[huangjintao/llava-1.5-7b-hf](https://modelscope.cn/models/huangjintao/llava-1.5-7b-hf/summary)|q_proj, k_proj, v_proj|llava1_5|✔|✘|transformers>=4.36|vision|[llava-hf/llava-1.5-7b-hf](https://huggingface.co/llava-hf/llava-1.5-7b-hf)| |llava1_6-mistral-7b-instruct|[AI-ModelScope/llava-v1.6-mistral-7b](https://modelscope.cn/models/AI-ModelScope/llava-v1.6-mistral-7b/summary)|q_proj, k_proj, v_proj|llava-mistral-instruct|✔|✘|transformers>=4.34|vision|[liuhaotian/llava-v1.6-mistral-7b](https://huggingface.co/liuhaotian/llava-v1.6-mistral-7b)| |llava1_6-yi-34b-instruct|[AI-ModelScope/llava-v1.6-34b](https://modelscope.cn/models/AI-ModelScope/llava-v1.6-34b/summary)|q_proj, k_proj, v_proj|llava-yi-instruct|✔|✘||vision|[liuhaotian/llava-v1.6-34b](https://huggingface.co/liuhaotian/llava-v1.6-34b)| |llama3-llava-next-8b|[AI-Modelscope/llama3-llava-next-8b](https://modelscope.cn/models/AI-Modelscope/llama3-llava-next-8b/summary)|q_proj, k_proj, 
v_proj|llama-llava-next|✔|✘||vision|[lmms-lab/llama3-llava-next-8b](https://huggingface.co/lmms-lab/llama3-llava-next-8b)| diff --git a/swift/llm/deploy.py b/swift/llm/deploy.py index 5901971847..bc7ed6902d 100644 --- a/swift/llm/deploy.py +++ b/swift/llm/deploy.py @@ -510,6 +510,8 @@ def _generate_stream(): async def create_chat_completion(request: ChatCompletionRequest, raw_request: Request) -> ChatCompletionResponse: global _args assert _args is not None + if request.stop is None: + request.stop = [] if _args.infer_backend == 'vllm': return await inference_vllm_async(request, raw_request) else: @@ -520,6 +522,8 @@ async def create_chat_completion(request: ChatCompletionRequest, raw_request: Re async def create_completion(request: CompletionRequest, raw_request: Request) -> CompletionResponse: global _args assert _args is not None + if request.stop is None: + request.stop = [] if _args.infer_backend == 'vllm': return await inference_vllm_async(request, raw_request) else: diff --git a/swift/llm/export.py b/swift/llm/export.py index 5abbeadfb5..5a4a901e37 100644 --- a/swift/llm/export.py +++ b/swift/llm/export.py @@ -94,13 +94,13 @@ def llm_export(args: ExportArguments) -> None: logger.info(f'args: {args}') seed_everything(args.seed) if args.to_peft_format: - assert args.sft_type == 'lora' + assert args.sft_type == 'lora', f'args.sft_type: {args.sft_type}' args.ckpt_dir = swift_to_peft_format(args.ckpt_dir) if args.merge_lora: merge_lora(args, device_map=args.merge_device_map) if args.quant_bits > 0: _args = args - assert args.quantization_bit == 0 + assert args.quantization_bit == 0, f'args.quantization_bit: {args.quantization_bit}' assert args.sft_type == 'full', 'you need to merge lora' if args.quant_method == 'awq': from awq import AutoAWQForCausalLM @@ -108,11 +108,13 @@ def llm_export(args: ExportArguments) -> None: args, device_map=args.quant_device_map, verbose=False, automodel_class=AutoAWQForCausalLM) awq_model_quantize(model, template.tokenizer) model.save_quantized(args.quant_output_dir) - else: # gptq + elif args.quant_method == 'gptq': model, template = prepare_model_template(args, device_map=args.quant_device_map, verbose=False) gptq_quantizer = gptq_model_quantize(model, template.tokenizer) model.config.quantization_config.pop('dataset', None) gptq_quantizer.save(model, args.quant_output_dir) + else: + raise ValueError(f'args.quant_method: {args.quant_method}') logger.info(get_model_info(model)) show_layers(model) diff --git a/swift/llm/utils/dataset.py b/swift/llm/utils/dataset.py index b1cf65eb96..b54e432c68 100644 --- a/swift/llm/utils/dataset.py +++ b/swift/llm/utils/dataset.py @@ -2256,7 +2256,7 @@ def get_dataset( assert model_name is not None and model_author is not None dataset = _preprocess_self_cognition_dataset(dataset, model_name, model_author) - def _reduce_column(row): + def _reduce_column(row: Dict[str, Any]) -> Dict[str, Any]: res = {} if 'query' in row and isinstance(row['query'], (list, tuple)): res['query'] = np.random.choice(row['query']) diff --git a/swift/llm/utils/media.py b/swift/llm/utils/media.py index d30b147463..5b26da1074 100644 --- a/swift/llm/utils/media.py +++ b/swift/llm/utils/media.py @@ -1,6 +1,5 @@ import os import shutil -import time from typing import Any, Dict, List, Literal, Optional, Union import numpy as np diff --git a/swift/llm/utils/model.py b/swift/llm/utils/model.py index e7a67e2600..10d7de8709 100644 --- a/swift/llm/utils/model.py +++ b/swift/llm/utils/model.py @@ -189,6 +189,7 @@ class ModelType: atom_7b = 'atom-7b' 
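The two `request.stop` guards added to `create_chat_completion` and `create_completion` in the `deploy.py` hunk above pair with `XRequestConfig.stop` switching from `field(default_factory=list)` to `Optional[List[str]] = None` later in this diff. A minimal sketch of the resulting pattern, using a hypothetical `_Request` stand-in rather than the real `ChatCompletionRequest`/`CompletionRequest` classes:

```python
from dataclasses import dataclass
from typing import List, Optional


@dataclass
class _Request:
    # Hypothetical stand-in: `stop` may now be omitted by the client and arrive as None.
    stop: Optional[List[str]] = None


def normalize_stop(request: _Request) -> _Request:
    # The server entry points coerce None to [] so downstream code
    # can iterate over request.stop unconditionally.
    if request.stop is None:
        request.stop = []
    return request


assert normalize_stop(_Request()).stop == []
assert normalize_stop(_Request(stop=['Observation:'])).stop == ['Observation:']
```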
atom_7b_chat = 'atom-7b-chat' # llava + llava1_5_7b_chat = 'llava1_5-7b-chat' llava1_6_mistral_7b_instruct = 'llava1_6-mistral-7b-instruct' llava1_6_yi_34b_instruct = 'llava1_6-yi-34b-instruct' llama3_llava_next_8b = 'llama3-llava-next-8b' @@ -4645,6 +4646,25 @@ def _new_generate(inputs=None, *args, **kwargs): model.generate = _new_generate +@register_model( + ModelType.llava1_5_7b_chat, + 'huangjintao/llava-1.5-7b-hf', + LoRATM.llama, + TemplateType.llava1_5, + eos_token='', + support_flash_attn=True, + requires=['transformers>=4.36'], + tags=['multi-modal', 'vision'], + hf_model_id='llava-hf/llava-1.5-7b-hf') +def get_model_tokenizer_llava1_5(model_dir: str, *args, **kwargs): + from transformers import AutoProcessor, LlavaForConditionalGeneration + processor = AutoProcessor.from_pretrained(model_dir) + model, tokenizer = get_model_tokenizer_with_flash_attn( + model_dir, *args, automodel_class=LlavaForConditionalGeneration, **kwargs) + tokenizer.processor = processor + return model, tokenizer + + @register_model( ModelType.llava1_6_yi_34b_instruct, 'AI-ModelScope/llava-v1.6-34b', diff --git a/swift/llm/utils/protocol.py b/swift/llm/utils/protocol.py index 5e5aaef5bc..1ba24e734a 100644 --- a/swift/llm/utils/protocol.py +++ b/swift/llm/utils/protocol.py @@ -42,7 +42,7 @@ class XRequestConfig: n: int = 1 seed: Optional[int] = None - stop: List[str] = field(default_factory=list) + stop: Optional[List[str]] = None stream: bool = False best_of: Optional[int] = None diff --git a/swift/llm/utils/template.py b/swift/llm/utils/template.py index 7d1a988f9e..d3f5f89812 100644 --- a/swift/llm/utils/template.py +++ b/swift/llm/utils/template.py @@ -40,6 +40,7 @@ class TemplateType: chatglm3 = 'chatglm3' llama = 'llama' # llama2 llama3 = 'llama3' + llava1_5 = 'llava1_5' llava_mistral_instruct = 'llava-mistral-instruct' llava_yi_instruct = 'llava-yi-instruct' llava_llama_instruct = 'llava-llama-instruct' @@ -639,6 +640,11 @@ def data_collator(self, batch: List[Dict[str, Any]], padding_to: Optional[int] = res['inputs_embeds'] = inputs_embeds else: res['input_ids'] = input_ids + # multimodal + pixel_values = [b['pixel_values'] for b in batch if b.get('pixel_values') is not None] + if len(pixel_values) > 0: + res['pixel_values'] = torch.concat(pixel_values) + if loss_scale is not None: res['loss_scale'] = loss_scale return res @@ -726,7 +732,7 @@ def register_template(template_type: str, template: Template, *, exist_ok: bool register_template( TemplateType.default, - Template([], ['### Human:\n', '{{QUERY}}\n\n', '### Assistant:\n'], ['\n\n'], [['eos_token_id']], DEFAULT_SYSTEM, + Template([], ['### Human:\n{{QUERY}}\n\n### Assistant:\n'], ['\n\n'], [['eos_token_id']], DEFAULT_SYSTEM, ['{{SYSTEM}}\n\n'])) @@ -930,7 +936,7 @@ def _init_template(self, tokenizer: PreTrainedTokenizerBase, *args, **kwargs) -> class GLM4VTemplate(GLMTemplate): def __init__(self): - super().__init__([], ['<|user|>\n', '{{QUERY}}<|assistant|>'], [], ['<|endoftext|>'], None, + super().__init__([], ['<|user|>\n{{QUERY}}<|assistant|>'], [], ['<|endoftext|>'], None, ['<|system|>\n{{SYSTEM}}']) def check_example(self, example): @@ -982,7 +988,7 @@ def data_collator(self, batch: List[Dict[str, Any]], padding_to: Optional[int] = register_template( TemplateType.yi_vl, - YiVLTemplate([], ['### Human: ', '{{QUERY}}\n### Assistant:'], ['\n'], ['\n###'], yi_vl_default_system, + YiVLTemplate([], ['### Human: {{QUERY}}\n### Assistant:'], ['\n'], ['\n###'], yi_vl_default_system, ['{{SYSTEM}}\n\n']), use_model=True, infer_media_type='round', @@ 
-1202,8 +1208,8 @@ class InternvlTemplate(Template): num_image_token = 256 def __init__(self): - super().__init__([''], ['<|im_start|>user\n', '{{QUERY}}<|im_end|><|im_start|>assistant\n'], ['<|im_end|>'], - ['<|im_end|>'], self.system, ['<|im_start|>system\n{{SYSTEM}}']) + super().__init__([''], ['<|im_start|>user\n{{QUERY}}<|im_end|><|im_start|>assistant\n'], ['<|im_end|>'], + ['<|im_end|>'], self.system, ['<|im_start|>system\n{{SYSTEM}}<|im_end|>']) def check_example(self, example): images = example.get('images') or [] @@ -1250,10 +1256,7 @@ def encode(self, example: Dict[str, Any]) -> Tuple[Dict[str, Any], Dict[str, Any def data_collator(self, batch: List[Dict[str, Any]], padding_to: Optional[int] = None) -> Dict[str, Any]: res = super().data_collator(batch, padding_to) assert all('pixel_values' in b for b in batch), 'Temporarily, Interval only supports data with images' - pixel_values = [b['pixel_values'] for b in batch if 'pixel_values' in b] image_flags = [b['image_flags'] for b in batch if 'image_flags' in b] - if pixel_values: - res['pixel_values'] = torch.concat(pixel_values) if image_flags: res['image_flags'] = torch.concat(image_flags) return res @@ -1267,8 +1270,8 @@ class InternvlPhi3Template(InternvlTemplate): system = 'You are an AI assistant whose name is Phi-3.' def __init__(self): - Template.__init__(self, [''], ['<|user|>\n', [-100], '{{QUERY}}<|end|>\n<|assistant|>\n'], ['<|end|>\n'], - ['<|end|>'], self.system, ['<|system|>\n{{SYSTEM}}<|end|>\n']) + Template.__init__(self, [''], ['<|user|>\n{{QUERY}}<|end|>\n<|assistant|>\n'], ['<|end|>\n'], ['<|end|>'], + self.system, ['<|system|>\n{{SYSTEM}}<|end|>\n']) register_template( @@ -1320,6 +1323,34 @@ def __init__(self): 'and other non-computer science questions, you will refuse to answer\n'))) +class Llava1_5Template(Template): + + def __init__(self): + super().__init__([''], ['USER: {{QUERY}}\nASSISTANT:'], ['\n'], ['']) + + def replace_tag(self, media_type: Literal['image', 'video', 'audio'], index, example) -> List[Context]: + assert media_type == 'image' + return ['\n'] + + def encode(self, example: Dict[str, Any]) -> Tuple[Dict[str, Any], Dict[str, Any]]: + inputs, _ = super().encode(example) + if len(inputs) == 0: + return inputs, {} + images_path = example.get('images') or [] + images = [] + for image_path in images_path: + image = _read_from_path(image_path) + images.append(image) + image_processor = self.tokenizer.processor.image_processor + if images: + inputs['pixel_values'] = image_processor(images, return_tensors='pt')['pixel_values'].to(self.model.dtype) + return inputs, {} + + +register_template( + TemplateType.llava1_5, Llava1_5Template(), use_model=True, infer_media_type='round', lazy_tokenize=True) + + class LLavaTemplate(Template): def __init__(self): @@ -1387,8 +1418,8 @@ def __init__(self): class LLavaLlamaTemplate(Template): - llavallama_query_template = '<|start_header_id|>user<|end_header_id|>\n\n' \ - '{{QUERY}}<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n' + llavallama_query_template = ('<|start_header_id|>user<|end_header_id|>\n\n' + '{{QUERY}}<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n') def replace_tag(self, media_type: Literal['image', 'video', 'audio'], index, example): return ['\n'] @@ -1407,13 +1438,6 @@ def encode(self, example: Dict[str, Any]) -> Tuple[Dict[str, Any], Dict[str, Any inputs['pixel_values'] = pixel_values.to(self.model.dtype) return inputs, {} - def data_collator(self, batch: List[Dict[str, Any]], padding_to: Optional[int] = None) -> Dict[str, 
Any]: - res = super().data_collator(batch, padding_to) - pixel_values = [b['pixel_values'] for b in batch if 'pixel_values' in b] - if pixel_values: - res['pixel_values'] = torch.concat(pixel_values) - return res - register_template( TemplateType.llava_llama_instruct, @@ -1456,9 +1480,6 @@ def encode(self, example: Dict[str, Any]) -> Tuple[Dict[str, Any], Dict[str, Any def data_collator(self, batch: List[Dict[str, Any]], padding_to: Optional[int] = None) -> Dict[str, Any]: res = super().data_collator(batch, padding_to) - pixel_values = [b['pixel_values'] for b in batch if 'pixel_values' in b] - if pixel_values: - res['pixel_values'] = torch.concat(pixel_values) token_type_ids = [torch.tensor(b['token_type_ids']) for b in batch] token_type_ids = pad_sequence(token_type_ids, batch_first=True, padding_value=0) res['token_type_ids'] = token_type_ids @@ -1519,9 +1540,7 @@ def encode(self, example: Dict[str, Any]) -> Tuple[Dict[str, Any], Dict[str, Any def data_collator(self, batch: List[Dict[str, Any]], padding_to: Optional[int] = None) -> Dict[str, Any]: res = super().data_collator(batch, padding_to) - pixel_values = [b['pixel_values'] for b in batch if 'pixel_values' in b] - if pixel_values: - res['pixel_values'] = torch.concat(pixel_values) + if 'pixel_values' in res: res['image_sizes'] = torch.concat([b['image_sizes'] for b in batch if 'image_sizes' in b]) return res @@ -1554,7 +1573,7 @@ class LLavaQwenTemplate(LLavaTemplate): llavayi_query_template = 'You are a helpful assistant' def __init__(self): - Template.__init__(self, [], ['<|im_start|>user\n', '{{QUERY}}<|im_end|>\n<|im_start|>assistant\n'], + Template.__init__(self, [], ['<|im_start|>user\n{{QUERY}}<|im_end|>\n<|im_start|>assistant\n'], ['<|im_end|>\n'], ['<|im_end|>'], self.llavayi_query_template, ['<|im_start|>system\n{{SYSTEM}}<|im_end|>\n']) @@ -1827,7 +1846,7 @@ def get_generate_ids(generate_ids: Tensor, input_token_len: int) -> List[int]: register_template( TemplateType.minicpm_v, - MiniCPMVTemplate(['{{SYSTEM}}'], ['<用户>', '{{QUERY}}'], [], ['']), + MiniCPMVTemplate(['{{SYSTEM}}'], ['<用户>{{QUERY}}'], [], ['']), use_model=True, lazy_tokenize=True, infer_media_type='dialogue', @@ -1837,7 +1856,7 @@ def get_generate_ids(generate_ids: Tensor, input_token_len: int) -> List[int]: register_template( TemplateType.minicpm_v_v2_5, MiniCPMVTemplate(['<|begin_of_text|>{{SYSTEM}}'], [ - '<|start_header_id|>user<|end_header_id|>\n\n', '{{QUERY}}<|eot_id|>' + '<|start_header_id|>user<|end_header_id|>\n\n{{QUERY}}<|eot_id|>' '<|start_header_id|>assistant<|end_header_id|>\n\n' ], ['<|eot_id|>'], ['<|eot_id|>'], is_v2_5=True), @@ -1893,7 +1912,7 @@ def get_generate_ids(generate_ids: Tensor, input_token_len: int) -> List[int]: class mPlugOwl2Template(Template): def __init__(self): - super().__init__(['{{SYSTEM}}'], ['USER: ', '{{QUERY}}ASSISTANT:'], [''], [['eos_token_id']]) + super().__init__(['{{SYSTEM}}'], ['USER: {{QUERY}}ASSISTANT:'], [''], [['eos_token_id']]) def replace_tag(self, media_type: Literal['image', 'video', 'audio'], index, example) -> List[Context]: assert media_type == 'image' diff --git a/swift/llm/utils/utils.py b/swift/llm/utils/utils.py index 43029088a4..a47e63a223 100644 --- a/swift/llm/utils/utils.py +++ b/swift/llm/utils/utils.py @@ -93,9 +93,6 @@ def download_dataset(model_id: str, files: List[str], force_download: bool = Fal def _msdataset_ddp_load(*args, **kwargs): with safe_ddp_context(): dataset = _old_msdataset_load(*args, **kwargs) - - if is_dist(): # sync - dist.barrier() return dataset # monkey patching 
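The collator edits above, together with the earlier `template.py` hunk, move the `pixel_values` concatenation into the base `Template.data_collator`, so the llava / llava-next / llava-llama / internvl subclasses no longer duplicate it. A self-contained sketch of that shared collation step (the function name and tensor shapes are illustrative, not taken from swift):

```python
import torch
from typing import Any, Dict, List


def collate_pixel_values(batch: List[Dict[str, Any]]) -> Dict[str, Any]:
    # Mirrors the step added to the base collator: stack per-sample image tensors
    # along dim 0, but only when at least one sample in the batch carries them.
    res: Dict[str, Any] = {}
    pixel_values = [b['pixel_values'] for b in batch if b.get('pixel_values') is not None]
    if len(pixel_values) > 0:
        res['pixel_values'] = torch.concat(pixel_values)
    return res


# A text-only sample mixed with two image samples (1 and 2 image crops each).
batch = [
    {'input_ids': [1, 2, 3]},                                            # no image
    {'input_ids': [1, 2], 'pixel_values': torch.zeros(1, 3, 336, 336)},
    {'input_ids': [1], 'pixel_values': torch.zeros(2, 3, 336, 336)},
]
print(collate_pixel_values(batch)['pixel_values'].shape)  # torch.Size([3, 3, 336, 336])
```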
diff --git a/swift/utils/utils.py b/swift/utils/utils.py index ad0cae4754..519a5667b0 100644 --- a/swift/utils/utils.py +++ b/swift/utils/utils.py @@ -27,6 +27,8 @@ def safe_ddp_context(): yield if is_dist() and is_local_master(): dist.barrier() + if is_dist(): # sync + dist.barrier() def check_json_format(obj: Any) -> Any:
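The extra barrier added to `safe_ddp_context` re-synchronizes all ranks on exit, which is presumably why the explicit `dist.barrier()` after `_msdataset_ddp_load` could be dropped in the previous hunk. A minimal sketch of the master-first pattern this extends, with assumed helper names rather than swift's actual `is_dist`/`is_local_master` implementations; the pre-`yield` half is an assumption about the pattern, since the diff only shows the tail of the function:

```python
import os
from contextlib import contextmanager

import torch.distributed as dist


def _is_dist() -> bool:
    return dist.is_available() and dist.is_initialized()


def _is_local_master() -> bool:
    return int(os.getenv('LOCAL_RANK', '0')) == 0


@contextmanager
def master_first_context():
    if _is_dist() and not _is_local_master():
        dist.barrier()  # non-master ranks wait until the local master finishes the body
    yield
    if _is_dist() and _is_local_master():
        dist.barrier()  # the master releases the waiting ranks
    if _is_dist():
        dist.barrier()  # trailing sync (the behavior this diff adds): all ranks leave together
```

Wrapped around a cache-populating call such as a dataset download, the local master fills the cache while the other ranks block, and the trailing barrier keeps any rank from racing ahead before every rank has exited the context.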