Commit efff7f8

FEAT: support qwen2.5-vl-instruct (#2788)
1 parent 9eb0fd4 commit efff7f8

6 files changed: +191 -4 lines changed

doc/source/models/builtin/llm/index.rst

+8-1
@@ -133,7 +133,7 @@ The following is a list of built-in LLM in Xinference:

   * - :ref:`deepseek-r1-distill-qwen <models_llm_deepseek-r1-distill-qwen>`
     - chat
-    - 32768
+    - 131072
     - deepseek-r1-distill-qwen is distilled from DeepSeek-R1 based on Qwen

   * - :ref:`deepseek-v2 <models_llm_deepseek-v2>`
@@ -481,6 +481,11 @@ The following is a list of built-in LLM in Xinference:
     - 32768
     - Qwen2.5 is the latest series of Qwen large language models. For Qwen2.5, we release a number of base language models and instruction-tuned language models ranging from 0.5 to 72 billion parameters.

+   * - :ref:`qwen2.5-vl-instruct <models_llm_qwen2.5-vl-instruct>`
+     - chat, vision
+     - 128000
+     - Qwen2.5-VL is the latest version of the vision-language models in the Qwen model family.
+
   * - :ref:`qwq-32b-preview <models_llm_qwq-32b-preview>`
     - chat
     - 32768
@@ -777,6 +782,8 @@ The following is a list of built-in LLM in Xinference:

   qwen2.5-instruct

+   qwen2.5-vl-instruct
+
   qwq-32b-preview

   seallm_v2

doc/source/models/builtin/llm/qwen2.5-vl-instruct.rst

+63

@@ -0,0 +1,63 @@
+.. _models_llm_qwen2.5-vl-instruct:
+
+========================================
+qwen2.5-vl-instruct
+========================================
+
+- **Context Length:** 128000
+- **Model Name:** qwen2.5-vl-instruct
+- **Languages:** en, zh
+- **Abilities:** chat, vision
+- **Description:** Qwen2.5-VL is the latest version of the vision-language models in the Qwen model family.
+
+Specifications
+^^^^^^^^^^^^^^
+
+
+Model Spec 1 (pytorch, 3 Billion)
+++++++++++++++++++++++++++++++++++++++++
+
+- **Model Format:** pytorch
+- **Model Size (in billions):** 3
+- **Quantizations:** none
+- **Engines**: Transformers
+- **Model ID:** Qwen/Qwen2.5-VL-3B-Instruct
+- **Model Hubs**: `Hugging Face <https://huggingface.co/Qwen/Qwen2.5-VL-3B-Instruct>`__, `ModelScope <https://modelscope.cn/models/qwen/Qwen2.5-VL-3B-Instruct>`__
+
+Execute the following command to launch the model; remember to replace ``${engine}`` and ``${quantization}`` with your
+chosen engine and quantization method from the options listed above::
+
+  xinference launch --model-engine ${engine} --model-name qwen2.5-vl-instruct --size-in-billions 3 --model-format pytorch --quantization ${quantization}
+
+
+Model Spec 2 (pytorch, 7 Billion)
+++++++++++++++++++++++++++++++++++++++++
+
+- **Model Format:** pytorch
+- **Model Size (in billions):** 7
+- **Quantizations:** none
+- **Engines**: Transformers
+- **Model ID:** Qwen/Qwen2.5-VL-7B-Instruct
+- **Model Hubs**: `Hugging Face <https://huggingface.co/Qwen/Qwen2.5-VL-7B-Instruct>`__, `ModelScope <https://modelscope.cn/models/qwen/Qwen2.5-VL-7B-Instruct>`__
+
+Execute the following command to launch the model; remember to replace ``${engine}`` and ``${quantization}`` with your
+chosen engine and quantization method from the options listed above::
+
+  xinference launch --model-engine ${engine} --model-name qwen2.5-vl-instruct --size-in-billions 7 --model-format pytorch --quantization ${quantization}
+
+
+Model Spec 3 (pytorch, 72 Billion)
+++++++++++++++++++++++++++++++++++++++++
+
+- **Model Format:** pytorch
+- **Model Size (in billions):** 72
+- **Quantizations:** none
+- **Engines**: Transformers
+- **Model ID:** Qwen/Qwen2.5-VL-72B-Instruct
+- **Model Hubs**: `Hugging Face <https://huggingface.co/Qwen/Qwen2.5-VL-72B-Instruct>`__, `ModelScope <https://modelscope.cn/models/qwen/Qwen2.5-VL-72B-Instruct>`__
+
+Execute the following command to launch the model; remember to replace ``${engine}`` and ``${quantization}`` with your
+chosen engine and quantization method from the options listed above::
+
+  xinference launch --model-engine ${engine} --model-name qwen2.5-vl-instruct --size-in-billions 72 --model-format pytorch --quantization ${quantization}
+
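
Not part of this commit, but for context: once launched as above, the model can be exercised through Xinference's OpenAI-compatible API. A minimal sketch, assuming a local server on the default port 9997, that the model UID equals the model name (pass the UID returned by xinference launch otherwise), and a hypothetical local image file:

import base64

from openai import OpenAI

# Xinference exposes an OpenAI-compatible endpoint; host and port are the local defaults.
client = OpenAI(api_key="not-used", base_url="http://127.0.0.1:9997/v1")

# Hypothetical local image, sent inline as a base64 data URL.
with open("example.jpg", "rb") as f:
    image_b64 = base64.b64encode(f.read()).decode()

response = client.chat.completions.create(
    model="qwen2.5-vl-instruct",  # the model UID
    messages=[
        {
            "role": "user",
            "content": [
                {"type": "text", "text": "Describe this image."},
                {
                    "type": "image_url",
                    "image_url": {"url": f"data:image/jpeg;base64,{image_b64}"},
                },
            ],
        }
    ],
)
print(response.choices[0].message.content)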

xinference/model/llm/llm_family.json

+49
@@ -7125,6 +7125,55 @@
      "<|endoftext|>"
    ]
  },
+  {
+    "version": 1,
+    "context_length": 128000,
+    "model_name": "qwen2.5-vl-instruct",
+    "model_lang": [
+      "en",
+      "zh"
+    ],
+    "model_ability": [
+      "chat",
+      "vision"
+    ],
+    "model_description": "Qwen2.5-VL is the latest version of the vision-language models in the Qwen model family.",
+    "model_specs": [
+      {
+        "model_format": "pytorch",
+        "model_size_in_billions": 3,
+        "quantizations": [
+          "none"
+        ],
+        "model_id": "Qwen/Qwen2.5-VL-3B-Instruct"
+      },
+      {
+        "model_format": "pytorch",
+        "model_size_in_billions": 7,
+        "quantizations": [
+          "none"
+        ],
+        "model_id": "Qwen/Qwen2.5-VL-7B-Instruct"
+      },
+      {
+        "model_format": "pytorch",
+        "model_size_in_billions": 72,
+        "quantizations": [
+          "none"
+        ],
+        "model_id": "Qwen/Qwen2.5-VL-72B-Instruct"
+      }
+    ],
+    "chat_template": "{% set image_count = namespace(value=0) %}{% set video_count = namespace(value=0) %}{% for message in messages %}{% if loop.first and message['role'] != 'system' %}<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n{% endif %}<|im_start|>{{ message['role'] }}\n{% if message['content'] is string %}{{ message['content'] }}<|im_end|>\n{% else %}{% for content in message['content'] %}{% if content['type'] == 'image' or 'image' in content or 'image_url' in content %}{% set image_count.value = image_count.value + 1 %}{% if add_vision_id %}Picture {{ image_count.value }}: {% endif %}<|vision_start|><|image_pad|><|vision_end|>{% elif content['type'] == 'video' or 'video' in content %}{% set video_count.value = video_count.value + 1 %}{% if add_vision_id %}Video {{ video_count.value }}: {% endif %}<|vision_start|><|video_pad|><|vision_end|>{% elif 'text' in content %}{{ content['text'] }}{% endif %}{% endfor %}<|im_end|>\n{% endif %}{% endfor %}{% if add_generation_prompt %}<|im_start|>assistant\n{% endif %}",
+    "stop_token_ids": [
+      151645,
+      151643
+    ],
+    "stop": [
+      "<|im_end|>",
+      "<|endoftext|>"
+    ]
+  },
  {
    "version": 1,
    "context_length": 32768,

xinference/model/llm/llm_family_modelscope.json

+52
@@ -4825,6 +4825,58 @@
      "<|endoftext|>"
    ]
  },
+  {
+    "version": 1,
+    "context_length": 128000,
+    "model_name": "qwen2.5-vl-instruct",
+    "model_lang": [
+      "en",
+      "zh"
+    ],
+    "model_ability": [
+      "chat",
+      "vision"
+    ],
+    "model_description": "Qwen2.5-VL is the latest version of the vision-language models in the Qwen model family.",
+    "model_specs": [
+      {
+        "model_format": "pytorch",
+        "model_size_in_billions": 3,
+        "quantizations": [
+          "none"
+        ],
+        "model_hub": "modelscope",
+        "model_id": "qwen/Qwen2.5-VL-3B-Instruct"
+      },
+      {
+        "model_format": "pytorch",
+        "model_size_in_billions": 7,
+        "quantizations": [
+          "none"
+        ],
+        "model_hub": "modelscope",
+        "model_id": "qwen/Qwen2.5-VL-7B-Instruct"
+      },
+      {
+        "model_format": "pytorch",
+        "model_size_in_billions": 72,
+        "quantizations": [
+          "none"
+        ],
+        "model_hub": "modelscope",
+        "model_id": "qwen/Qwen2.5-VL-72B-Instruct"
+      }
+    ],
+    "chat_template": "{% set image_count = namespace(value=0) %}{% set video_count = namespace(value=0) %}{% for message in messages %}{% if loop.first and message['role'] != 'system' %}<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n{% endif %}<|im_start|>{{ message['role'] }}\n{% if message['content'] is string %}{{ message['content'] }}<|im_end|>\n{% else %}{% for content in message['content'] %}{% if content['type'] == 'image' or 'image' in content or 'image_url' in content %}{% set image_count.value = image_count.value + 1 %}{% if add_vision_id %}Picture {{ image_count.value }}: {% endif %}<|vision_start|><|image_pad|><|vision_end|>{% elif content['type'] == 'video' or 'video' in content %}{% set video_count.value = video_count.value + 1 %}{% if add_vision_id %}Video {{ video_count.value }}: {% endif %}<|vision_start|><|video_pad|><|vision_end|>{% elif 'text' in content %}{{ content['text'] }}{% endif %}{% endfor %}<|im_end|>\n{% endif %}{% endfor %}{% if add_generation_prompt %}<|im_start|>assistant\n{% endif %}",
+    "stop_token_ids": [
+      151645,
+      151643
+    ],
+    "stop": [
+      "<|im_end|>",
+      "<|endoftext|>"
+    ]
+  },
  {
    "version": 1,
    "context_length": 32768,

xinference/model/llm/transformers/core.py

+1
@@ -62,6 +62,7 @@
    "MiniCPM-V-2.6",
    "glm-4v",
    "qwen2-vl-instruct",
+    "qwen2.5-vl-instruct",
    "qwen2-audio",
    "qwen2-audio-instruct",
    "deepseek-v2",

xinference/model/llm/transformers/qwen2_vl.py

+18-3
@@ -48,13 +48,20 @@ def match(
        llm_family = model_family.model_family or model_family.model_name
        if "qwen2-vl-instruct".lower() in llm_family.lower():
            return True
+        if "qwen2.5-vl-instruct".lower() in llm_family.lower():
+            return True
        if "qvq-72b-preview".lower() in llm_family.lower():
            return True
        return False

    def load(self):
        from transformers import AutoProcessor, Qwen2VLForConditionalGeneration

+        try:
+            from transformers import Qwen2_5_VLForConditionalGeneration
+        except ImportError:
+            Qwen2_5_VLForConditionalGeneration = None
+
        device = self._pytorch_model_config.get("device", "auto")
        device = select_device(device)
        self._device = device
@@ -66,8 +73,16 @@ def load(self):
        )
        self._tokenizer = self._processor.tokenizer
        flash_attn_installed = importlib.util.find_spec("flash_attn") is not None
+        llm_family = self.model_family.model_family or self.model_family.model_name
+        model_cls = (
+            Qwen2_5_VLForConditionalGeneration
+            if "qwen2.5" in llm_family
+            else Qwen2VLForConditionalGeneration
+        )
+        if model_cls is None:
+            raise ImportError("`transformers` version is too old, please upgrade it")
        if flash_attn_installed:
-            self._model = Qwen2VLForConditionalGeneration.from_pretrained(
+            self._model = model_cls.from_pretrained(
                self.model_path,
                torch_dtype="bfloat16",
                device_map=device,
@@ -76,14 +91,14 @@ def load(self):
            ).eval()
        elif is_npu_available():
            # Ascend do not support bf16
-            self._model = Qwen2VLForConditionalGeneration.from_pretrained(
+            self._model = model_cls.from_pretrained(
                self.model_path,
                device_map="auto",
                trust_remote_code=True,
                torch_dtype="float16",
            ).eval()
        else:
-            self._model = Qwen2VLForConditionalGeneration.from_pretrained(
+            self._model = model_cls.from_pretrained(
                self.model_path, device_map=device, trust_remote_code=True
            ).eval()
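
Outside of Xinference, the class behind the guarded import can be exercised directly. A minimal sketch, assuming a transformers release that ships Qwen2_5_VLForConditionalGeneration and a hypothetical local image; older releases fall into the ImportError path shown above:

from PIL import Image
from transformers import AutoProcessor

try:
    from transformers import Qwen2_5_VLForConditionalGeneration
except ImportError as e:
    raise ImportError("`transformers` version is too old, please upgrade it") from e

model_id = "Qwen/Qwen2.5-VL-3B-Instruct"
processor = AutoProcessor.from_pretrained(model_id)
model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
    model_id, torch_dtype="auto", device_map="auto"
).eval()

image = Image.open("example.jpg")  # hypothetical local image
messages = [
    {
        "role": "user",
        "content": [
            {"type": "image"},
            {"type": "text", "text": "Describe this image."},
        ],
    }
]
# Build the prompt with the model's own chat template, then feed text + image
# through the processor and decode only the newly generated tokens.
prompt = processor.apply_chat_template(
    messages, tokenize=False, add_generation_prompt=True
)
inputs = processor(text=[prompt], images=[image], return_tensors="pt").to(model.device)
output_ids = model.generate(**inputs, max_new_tokens=128)
print(
    processor.batch_decode(
        output_ids[:, inputs["input_ids"].shape[1] :], skip_special_tokens=True
    )[0]
)

The try/except keeps qwen2-vl-instruct loadable on older transformers releases, while qwen2.5-vl-instruct fails fast with a clear upgrade message when the new class is unavailable.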
