Commit efff7f8

FEAT: support qwen2.5-vl-instruct (#2788)
1 parent 9eb0fd4 commit efff7f8

6 files changed: +191 -4 lines changed

doc/source/models/builtin/llm/index.rst

+8-1
@@ -133,7 +133,7 @@ The following is a list of built-in LLM in Xinference:

   * - :ref:`deepseek-r1-distill-qwen <models_llm_deepseek-r1-distill-qwen>`
     - chat
-    - 32768
+    - 131072
     - deepseek-r1-distill-qwen is distilled from DeepSeek-R1 based on Qwen

   * - :ref:`deepseek-v2 <models_llm_deepseek-v2>`
@@ -481,6 +481,11 @@ The following is a list of built-in LLM in Xinference:
     - 32768
     - Qwen2.5 is the latest series of Qwen large language models. For Qwen2.5, we release a number of base language models and instruction-tuned language models ranging from 0.5 to 72 billion parameters.

+   * - :ref:`qwen2.5-vl-instruct <models_llm_qwen2.5-vl-instruct>`
+     - chat, vision
+     - 128000
+     - Qwen2.5-VL is the latest version of the vision-language models in the Qwen model family.
+
   * - :ref:`qwq-32b-preview <models_llm_qwq-32b-preview>`
     - chat
     - 32768
@@ -777,6 +782,8 @@ The following is a list of built-in LLM in Xinference:

   qwen2.5-instruct

+   qwen2.5-vl-instruct
+
   qwq-32b-preview

   seallm_v2

doc/source/models/builtin/llm/qwen2.5-vl-instruct.rst

+63

@@ -0,0 +1,63 @@
+.. _models_llm_qwen2.5-vl-instruct:
+
+========================================
+qwen2.5-vl-instruct
+========================================
+
+- **Context Length:** 128000
+- **Model Name:** qwen2.5-vl-instruct
+- **Languages:** en, zh
+- **Abilities:** chat, vision
+- **Description:** Qwen2.5-VL is the latest version of the vision-language models in the Qwen model family.
+
+Specifications
+^^^^^^^^^^^^^^
+
+
+Model Spec 1 (pytorch, 3 Billion)
+++++++++++++++++++++++++++++++++++++++++
+
+- **Model Format:** pytorch
+- **Model Size (in billions):** 3
+- **Quantizations:** none
+- **Engines**: Transformers
+- **Model ID:** Qwen/Qwen2.5-VL-3B-Instruct
+- **Model Hubs**: `Hugging Face <https://huggingface.co/Qwen/Qwen2.5-VL-3B-Instruct>`__, `ModelScope <https://modelscope.cn/models/qwen/Qwen2.5-VL-3B-Instruct>`__
+
+Execute the following command to launch the model; remember to replace ``${engine}`` and ``${quantization}`` with your
+chosen engine and quantization method from the options listed above::
+
+  xinference launch --model-engine ${engine} --model-name qwen2.5-vl-instruct --size-in-billions 3 --model-format pytorch --quantization ${quantization}
+
+
+Model Spec 2 (pytorch, 7 Billion)
+++++++++++++++++++++++++++++++++++++++++
+
+- **Model Format:** pytorch
+- **Model Size (in billions):** 7
+- **Quantizations:** none
+- **Engines**: Transformers
+- **Model ID:** Qwen/Qwen2.5-VL-7B-Instruct
+- **Model Hubs**: `Hugging Face <https://huggingface.co/Qwen/Qwen2.5-VL-7B-Instruct>`__, `ModelScope <https://modelscope.cn/models/qwen/Qwen2.5-VL-7B-Instruct>`__
+
+Execute the following command to launch the model; remember to replace ``${engine}`` and ``${quantization}`` with your
+chosen engine and quantization method from the options listed above::
+
+  xinference launch --model-engine ${engine} --model-name qwen2.5-vl-instruct --size-in-billions 7 --model-format pytorch --quantization ${quantization}
+
+
+Model Spec 3 (pytorch, 72 Billion)
+++++++++++++++++++++++++++++++++++++++++
+
+- **Model Format:** pytorch
+- **Model Size (in billions):** 72
+- **Quantizations:** none
+- **Engines**: Transformers
+- **Model ID:** Qwen/Qwen2.5-VL-72B-Instruct
+- **Model Hubs**: `Hugging Face <https://huggingface.co/Qwen/Qwen2.5-VL-72B-Instruct>`__, `ModelScope <https://modelscope.cn/models/qwen/Qwen2.5-VL-72B-Instruct>`__
+
+Execute the following command to launch the model; remember to replace ``${engine}`` and ``${quantization}`` with your
+chosen engine and quantization method from the options listed above::
+
+  xinference launch --model-engine ${engine} --model-name qwen2.5-vl-instruct --size-in-billions 72 --model-format pytorch --quantization ${quantization}
+
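
Not part of this commit, but for context: once launched as above, the model can be exercised through Xinference's OpenAI-compatible API. A minimal sketch, assuming a local server on the default port 9997, that the model UID equals the model name (pass the UID returned by xinference launch otherwise), and a hypothetical local image file:

import base64

from openai import OpenAI

# Xinference exposes an OpenAI-compatible endpoint; host and port are the local defaults.
client = OpenAI(api_key="not-used", base_url="http://127.0.0.1:9997/v1")

# Hypothetical local image, sent inline as a base64 data URL.
with open("example.jpg", "rb") as f:
    image_b64 = base64.b64encode(f.read()).decode()

response = client.chat.completions.create(
    model="qwen2.5-vl-instruct",  # the model UID
    messages=[
        {
            "role": "user",
            "content": [
                {"type": "text", "text": "Describe this image."},
                {
                    "type": "image_url",
                    "image_url": {"url": f"data:image/jpeg;base64,{image_b64}"},
                },
            ],
        }
    ],
)
print(response.choices[0].message.content)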

xinference/model/llm/llm_family.json

+49
@@ -7125,6 +7125,55 @@
      "<|endoftext|>"
    ]
  },
+  {
+    "version": 1,
+    "context_length": 128000,
+    "model_name": "qwen2.5-vl-instruct",
+    "model_lang": [
+      "en",
+      "zh"
+    ],
+    "model_ability": [
+      "chat",
+      "vision"
+    ],
+    "model_description": "Qwen2.5-VL is the latest version of the vision-language models in the Qwen model family.",
+    "model_specs": [
+      {
+        "model_format": "pytorch",
+        "model_size_in_billions": 3,
+        "quantizations": [
+          "none"
+        ],
+        "model_id": "Qwen/Qwen2.5-VL-3B-Instruct"
+      },
+      {
+        "model_format": "pytorch",
+        "model_size_in_billions": 7,
+        "quantizations": [
+          "none"
+        ],
+        "model_id": "Qwen/Qwen2.5-VL-7B-Instruct"
+      },
+      {
+        "model_format": "pytorch",
+        "model_size_in_billions": 72,
+        "quantizations": [
+          "none"
+        ],
+        "model_id": "Qwen/Qwen2.5-VL-72B-Instruct"
+      }
+    ],
+    "chat_template": "{% set image_count = namespace(value=0) %}{% set video_count = namespace(value=0) %}{% for message in messages %}{% if loop.first and message['role'] != 'system' %}<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n{% endif %}<|im_start|>{{ message['role'] }}\n{% if message['content'] is string %}{{ message['content'] }}<|im_end|>\n{% else %}{% for content in message['content'] %}{% if content['type'] == 'image' or 'image' in content or 'image_url' in content %}{% set image_count.value = image_count.value + 1 %}{% if add_vision_id %}Picture {{ image_count.value }}: {% endif %}<|vision_start|><|image_pad|><|vision_end|>{% elif content['type'] == 'video' or 'video' in content %}{% set video_count.value = video_count.value + 1 %}{% if add_vision_id %}Video {{ video_count.value }}: {% endif %}<|vision_start|><|video_pad|><|vision_end|>{% elif 'text' in content %}{{ content['text'] }}{% endif %}{% endfor %}<|im_end|>\n{% endif %}{% endfor %}{% if add_generation_prompt %}<|im_start|>assistant\n{% endif %}",
+    "stop_token_ids": [
+      151645,
+      151643
+    ],
+    "stop": [
+      "<|im_end|>",
+      "<|endoftext|>"
+    ]
+  },
  {
    "version": 1,
    "context_length": 32768,

xinference/model/llm/llm_family_modelscope.json

+52
@@ -4825,6 +4825,58 @@
      "<|endoftext|>"
    ]
  },
+  {
+    "version": 1,
+    "context_length": 128000,
+    "model_name": "qwen2.5-vl-instruct",
+    "model_lang": [
+      "en",
+      "zh"
+    ],
+    "model_ability": [
+      "chat",
+      "vision"
+    ],
+    "model_description": "Qwen2.5-VL is the latest version of the vision-language models in the Qwen model family.",
+    "model_specs": [
+      {
+        "model_format": "pytorch",
+        "model_size_in_billions": 3,
+        "quantizations": [
+          "none"
+        ],
+        "model_hub": "modelscope",
+        "model_id": "qwen/Qwen2.5-VL-3B-Instruct"
+      },
+      {
+        "model_format": "pytorch",
+        "model_size_in_billions": 7,
+        "quantizations": [
+          "none"
+        ],
+        "model_hub": "modelscope",
+        "model_id": "qwen/Qwen2.5-VL-7B-Instruct"
+      },
+      {
+        "model_format": "pytorch",
+        "model_size_in_billions": 72,
+        "quantizations": [
+          "none"
+        ],
+        "model_hub": "modelscope",
+        "model_id": "qwen/Qwen2.5-VL-72B-Instruct"
+      }
+    ],
+    "chat_template": "{% set image_count = namespace(value=0) %}{% set video_count = namespace(value=0) %}{% for message in messages %}{% if loop.first and message['role'] != 'system' %}<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n{% endif %}<|im_start|>{{ message['role'] }}\n{% if message['content'] is string %}{{ message['content'] }}<|im_end|>\n{% else %}{% for content in message['content'] %}{% if content['type'] == 'image' or 'image' in content or 'image_url' in content %}{% set image_count.value = image_count.value + 1 %}{% if add_vision_id %}Picture {{ image_count.value }}: {% endif %}<|vision_start|><|image_pad|><|vision_end|>{% elif content['type'] == 'video' or 'video' in content %}{% set video_count.value = video_count.value + 1 %}{% if add_vision_id %}Video {{ video_count.value }}: {% endif %}<|vision_start|><|video_pad|><|vision_end|>{% elif 'text' in content %}{{ content['text'] }}{% endif %}{% endfor %}<|im_end|>\n{% endif %}{% endfor %}{% if add_generation_prompt %}<|im_start|>assistant\n{% endif %}",
+    "stop_token_ids": [
+      151645,
+      151643
+    ],
+    "stop": [
+      "<|im_end|>",
+      "<|endoftext|>"
+    ]
+  },
  {
    "version": 1,
    "context_length": 32768,

xinference/model/llm/transformers/core.py

+1
@@ -62,6 +62,7 @@
    "MiniCPM-V-2.6",
    "glm-4v",
    "qwen2-vl-instruct",
+    "qwen2.5-vl-instruct",
    "qwen2-audio",
    "qwen2-audio-instruct",
    "deepseek-v2",

xinference/model/llm/transformers/qwen2_vl.py

+18-3
@@ -48,13 +48,20 @@ def match(
        llm_family = model_family.model_family or model_family.model_name
        if "qwen2-vl-instruct".lower() in llm_family.lower():
            return True
+        if "qwen2.5-vl-instruct".lower() in llm_family.lower():
+            return True
        if "qvq-72b-preview".lower() in llm_family.lower():
            return True
        return False

    def load(self):
        from transformers import AutoProcessor, Qwen2VLForConditionalGeneration

+        try:
+            from transformers import Qwen2_5_VLForConditionalGeneration
+        except ImportError:
+            Qwen2_5_VLForConditionalGeneration = None
+
        device = self._pytorch_model_config.get("device", "auto")
        device = select_device(device)
        self._device = device
@@ -66,8 +73,16 @@ def load(self):
        )
        self._tokenizer = self._processor.tokenizer
        flash_attn_installed = importlib.util.find_spec("flash_attn") is not None
+        llm_family = self.model_family.model_family or self.model_family.model_name
+        model_cls = (
+            Qwen2_5_VLForConditionalGeneration
+            if "qwen2.5" in llm_family
+            else Qwen2VLForConditionalGeneration
+        )
+        if model_cls is None:
+            raise ImportError("`transformers` version is too old, please upgrade it")
        if flash_attn_installed:
-            self._model = Qwen2VLForConditionalGeneration.from_pretrained(
+            self._model = model_cls.from_pretrained(
                self.model_path,
                torch_dtype="bfloat16",
                device_map=device,
@@ -76,14 +91,14 @@ def load(self):
            ).eval()
        elif is_npu_available():
            # Ascend do not support bf16
-            self._model = Qwen2VLForConditionalGeneration.from_pretrained(
+            self._model = model_cls.from_pretrained(
                self.model_path,
                device_map="auto",
                trust_remote_code=True,
                torch_dtype="float16",
            ).eval()
        else:
-            self._model = Qwen2VLForConditionalGeneration.from_pretrained(
+            self._model = model_cls.from_pretrained(
                self.model_path, device_map=device, trust_remote_code=True
            ).eval()
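
Outside of Xinference, the class behind the guarded import can be exercised directly. A minimal sketch, assuming a transformers release that ships Qwen2_5_VLForConditionalGeneration and a hypothetical local image; older releases fall into the ImportError path shown above:

from PIL import Image
from transformers import AutoProcessor

try:
    from transformers import Qwen2_5_VLForConditionalGeneration
except ImportError as e:
    raise ImportError("`transformers` version is too old, please upgrade it") from e

model_id = "Qwen/Qwen2.5-VL-3B-Instruct"
processor = AutoProcessor.from_pretrained(model_id)
model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
    model_id, torch_dtype="auto", device_map="auto"
).eval()

image = Image.open("example.jpg")  # hypothetical local image
messages = [
    {
        "role": "user",
        "content": [
            {"type": "image"},
            {"type": "text", "text": "Describe this image."},
        ],
    }
]
# Build the prompt with the model's own chat template, then feed text + image
# through the processor and decode only the newly generated tokens.
prompt = processor.apply_chat_template(
    messages, tokenize=False, add_generation_prompt=True
)
inputs = processor(text=[prompt], images=[image], return_tensors="pt").to(model.device)
output_ids = model.generate(**inputs, max_new_tokens=128)
print(
    processor.batch_decode(
        output_ids[:, inputs["input_ids"].shape[1] :], skip_special_tokens=True
    )[0]
)

The try/except keeps qwen2-vl-instruct loadable on older transformers releases, while qwen2.5-vl-instruct fails fast with a clear upgrade message when the new class is unavailable.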
