@@ -44,6 +44,8 @@

 PHI3VISION_IDS = ['microsoft/phi-3-vision-128k-instruct']

+QWENVL_IDS = ['Qwen/Qwen-VL-Chat']
+
 results = []
 excludes = []

@@ -923,6 +925,12 @@ def run_transformer_int4_gpu_win(repo_id,
                                                      trust_remote_code=True, use_cache=True, cpu_embedding=cpu_embedding).eval()
         tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
         model = model.to('xpu')
+    elif repo_id in QWENVL_IDS:
+        model = AutoModelForCausalLM.from_pretrained(model_path, optimize_model=True, load_in_low_bit=low_bit,
+                                                     modules_to_not_convert=['c_fc', 'out_proj'],
+                                                     trust_remote_code=True, use_cache=True, cpu_embedding=cpu_embedding).eval()
+        tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
+        model = model.to('xpu')
     else:
         model = AutoModelForCausalLM.from_pretrained(model_path, optimize_model=True, load_in_low_bit=low_bit,
                                                      trust_remote_code=True, use_cache=True, cpu_embedding=cpu_embedding).eval()
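For context beyond the diff itself: the new `QWENVL_IDS` branch mirrors the existing `PHI3VISION_IDS` special case, except that it passes `modules_to_not_convert=['c_fc', 'out_proj']` so those linear layers are skipped during low-bit conversion and kept in their original precision. A minimal standalone sketch of the pattern, assuming the script's `AutoModelForCausalLM` is the `ipex_llm.transformers` wrapper (the import is not shown in this diff) and using a hypothetical `low_bit` value:

```python
# Minimal sketch of the new QWENVL_IDS branch.
# Assumption: AutoModelForCausalLM here is the ipex_llm wrapper; the import
# path below is not part of this diff.
from ipex_llm.transformers import AutoModelForCausalLM  # assumed import
from transformers import AutoTokenizer

model_path = 'Qwen/Qwen-VL-Chat'  # hypothetical repo id or local path
low_bit = 'sym_int4'              # hypothetical low-bit format

# modules_to_not_convert keeps the named linear layers ('c_fc', 'out_proj')
# out of the low-bit conversion; everything else is quantized to low_bit.
model = AutoModelForCausalLM.from_pretrained(model_path, optimize_model=True,
                                             load_in_low_bit=low_bit,
                                             modules_to_not_convert=['c_fc', 'out_proj'],
                                             trust_remote_code=True, use_cache=True,
                                             cpu_embedding=False).eval()
tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
model = model.to('xpu')  # move to the Intel GPU device
```

The fp16 variant in the next hunk is identical apart from additionally passing `torch_dtype=torch.float16` to `from_pretrained`.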
@@ -1038,6 +1046,13 @@ def run_transformer_int4_fp16_gpu_win(repo_id,
                                                      torch_dtype=torch.float16).eval()
         tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
         model = model.to('xpu')
+    elif repo_id in QWENVL_IDS:
+        model = AutoModelForCausalLM.from_pretrained(model_path, optimize_model=True, load_in_low_bit=low_bit,
+                                                     modules_to_not_convert=['c_fc', 'out_proj'],
+                                                     trust_remote_code=True, use_cache=True, cpu_embedding=cpu_embedding,
+                                                     torch_dtype=torch.float16).eval()
+        tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
+        model = model.to('xpu')
     else:
         model = AutoModelForCausalLM.from_pretrained(model_path, optimize_model=True, load_in_low_bit=low_bit,
                                                      trust_remote_code=True, use_cache=True, cpu_embedding=cpu_embedding,
@@ -1149,6 +1164,12 @@ def run_transformer_int4_loadlowbit_gpu_win(repo_id,
                                                   use_cache=True, cpu_embedding=cpu_embedding).eval()
         tokenizer = AutoTokenizer.from_pretrained(model_path+'-'+low_bit, trust_remote_code=True)
         model = model.to('xpu')
+    elif repo_id in QWENVL_IDS:
+        model = AutoModelForCausalLM.load_low_bit(model_path+'-'+low_bit, optimize_model=True, trust_remote_code=True,
+                                                  modules_to_not_convert=['c_fc', 'out_proj'],
+                                                  use_cache=True, cpu_embedding=cpu_embedding).eval()
+        tokenizer = AutoTokenizer.from_pretrained(model_path+'-'+low_bit, trust_remote_code=True)
+        model = model.to('xpu')
     else:
         model = AutoModelForCausalLM.load_low_bit(model_path+'-'+low_bit, optimize_model=True, trust_remote_code=True,
                                                   use_cache=True, cpu_embedding=cpu_embedding).eval()
@@ -1259,6 +1280,12 @@ def run_transformer_int4_fp16_loadlowbit_gpu_win(repo_id,
                                                   use_cache=True, cpu_embedding=cpu_embedding).eval()
         tokenizer = AutoTokenizer.from_pretrained(model_path+'-'+low_bit, trust_remote_code=True)
         model = model.half().to('xpu')
+    elif repo_id in QWENVL_IDS:
+        model = AutoModelForCausalLM.load_low_bit(model_path+'-'+low_bit, optimize_model=True, trust_remote_code=True,
+                                                  modules_to_not_convert=['c_fc', 'out_proj'],
+                                                  use_cache=True, cpu_embedding=cpu_embedding).eval()
+        tokenizer = AutoTokenizer.from_pretrained(model_path+'-'+low_bit, trust_remote_code=True)
+        model = model.half().to('xpu')
     else:
         model = AutoModelForCausalLM.load_low_bit(model_path+'-'+low_bit, optimize_model=True, trust_remote_code=True,
                                                   use_cache=True, cpu_embedding=cpu_embedding).eval()
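The two `load_low_bit` hunks restore a checkpoint previously saved in low-bit form under `model_path + '-' + low_bit`, and the diff passes `modules_to_not_convert` again at load time. A minimal sketch under the same assumptions (ipex_llm wrapper import and hypothetical paths; `save_low_bit` as the companion save call is an assumption based on the wrapper's usual save/restore pairing, not shown in this diff):

```python
# Sketch of the load-low-bit branches above; paths and low_bit are
# hypothetical, and the import path is an assumption (not in this diff).
from ipex_llm.transformers import AutoModelForCausalLM  # assumed import
from transformers import AutoTokenizer

model_path = '/models/Qwen-VL-Chat'     # hypothetical local path
low_bit = 'sym_int4'                    # hypothetical low-bit format
save_path = model_path + '-' + low_bit  # checkpoint written earlier, e.g. via save_low_bit

# Mirroring the diff: modules_to_not_convert is supplied again on restore,
# so 'c_fc' and 'out_proj' remain unconverted after the reload as well.
model = AutoModelForCausalLM.load_low_bit(save_path, optimize_model=True,
                                          trust_remote_code=True,
                                          modules_to_not_convert=['c_fc', 'out_proj'],
                                          use_cache=True, cpu_embedding=False).eval()
tokenizer = AutoTokenizer.from_pretrained(save_path, trust_remote_code=True)
model = model.half().to('xpu')  # fp16 variant: cast to half before moving to XPU
```

The only difference between the two hunks is the final device move: `run_transformer_int4_loadlowbit_gpu_win` uses `model.to('xpu')`, while the fp16 variant uses `model.half().to('xpu')`.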