From 56de933d3cc03761898eff66ec83f71b024f83f6 Mon Sep 17 00:00:00 2001 From: amumu96 <128140880+amumu96@users.noreply.github.com> Date: Fri, 13 Sep 2024 11:51:31 +0800 Subject: [PATCH 01/17] FEAT: support deepseek-v2 and 2.5 (#2292) Co-authored-by: wuzhaoxin <15667065080@162.com> --- xinference/model/llm/__init__.py | 6 + xinference/model/llm/llm_family.json | 147 ++++++++ .../model/llm/llm_family_modelscope.json | 153 ++++++++ xinference/model/llm/sglang/core.py | 3 + xinference/model/llm/transformers/core.py | 4 + .../model/llm/transformers/deepseek_v2.py | 340 ++++++++++++++++++ xinference/model/llm/utils.py | 26 ++ xinference/model/llm/vllm/core.py | 6 + 8 files changed, 685 insertions(+) create mode 100644 xinference/model/llm/transformers/deepseek_v2.py diff --git a/xinference/model/llm/__init__.py b/xinference/model/llm/__init__.py index 1980a4b81f..5a7895eb1a 100644 --- a/xinference/model/llm/__init__.py +++ b/xinference/model/llm/__init__.py @@ -136,6 +136,10 @@ def _install(): from .transformers.cogvlm2 import CogVLM2Model from .transformers.cogvlm2_video import CogVLM2VideoModel from .transformers.core import PytorchChatModel, PytorchModel + from .transformers.deepseek_v2 import ( + DeepSeekV2PytorchChatModel, + DeepSeekV2PytorchModel, + ) from .transformers.deepseek_vl import DeepSeekVLChatModel from .transformers.glm4v import Glm4VModel from .transformers.intern_vl import InternVLChatModel @@ -182,6 +186,8 @@ def _install(): MiniCPMV25Model, MiniCPMV26Model, Glm4VModel, + DeepSeekV2PytorchModel, + DeepSeekV2PytorchChatModel, ] ) if OmniLMMModel: # type: ignore diff --git a/xinference/model/llm/llm_family.json b/xinference/model/llm/llm_family.json index 7f428ee005..e997098e65 100644 --- a/xinference/model/llm/llm_family.json +++ b/xinference/model/llm/llm_family.json @@ -6946,5 +6946,152 @@ "", "" ] + }, + { + "version": 1, + "context_length": 128000, + "model_name": "deepseek-v2", + "model_lang": [ + "en", + "zh" + ], + "model_ability": [ + "generate" + ], + "model_description": "DeepSeek-V2, a strong Mixture-of-Experts (MoE) language model characterized by economical training and efficient inference. ", + "model_specs": [ + { + "model_format": "pytorch", + "model_size_in_billions": 16, + "quantizations": [ + "4-bit", + "8-bit", + "none" + ], + "model_id": "deepseek-ai/DeepSeek-V2-Lite", + "model_revision": "604d5664dddd88a0433dbae533b7fe9472482de0" + }, + { + "model_format": "pytorch", + "model_size_in_billions": 236, + "quantizations": [ + "4-bit", + "8-bit", + "none" + ], + "model_id": "deepseek-ai/DeepSeek-V2", + "model_revision": "4461458f186c35188585855f28f77af5661ad489" + } + ] + }, + { + "version": 1, + "context_length": 128000, + "model_name": "deepseek-v2-chat", + "model_lang": [ + "en", + "zh" + ], + "model_ability": [ + "chat" + ], + "model_description": "DeepSeek-V2, a strong Mixture-of-Experts (MoE) language model characterized by economical training and efficient inference. 
", + "model_specs": [ + { + "model_format": "pytorch", + "model_size_in_billions": 16, + "quantizations": [ + "4-bit", + "8-bit", + "none" + ], + "model_id": "deepseek-ai/DeepSeek-V2-Lite-Chat", + "model_revision": "85864749cd611b4353ce1decdb286193298f64c7" + }, + { + "model_format": "pytorch", + "model_size_in_billions": 236, + "quantizations": [ + "4-bit", + "8-bit", + "none" + ], + "model_id": "deepseek-ai/DeepSeek-V2-Chat", + "model_revision": "8e3f5f6c2226787e41ba3e9283a06389d178c926" + } + ], + "chat_template": "{% if not add_generation_prompt is defined %}{% set add_generation_prompt = false %}{% endif %}{{ '<|begin▁of▁sentence|>' }}{% for message in messages %}{% if message['role'] == 'user' %}{{ 'User: ' + message['content'] + '\n\n' }}{% elif message['role'] == 'assistant' %}{{ 'Assistant: ' + message['content'] + '<|end▁of▁sentence|>' }}{% elif message['role'] == 'system' %}{{ message['content'] + '\n\n' }}{% endif %}{% endfor %}{% if add_generation_prompt %}{{ 'Assistant:' }}{% endif %}", + "stop_token_ids": [ + 100001 + ], + "stop": [ + "<|end▁of▁sentence|>" + ] + }, + { + "version": 1, + "context_length": 128000, + "model_name": "deepseek-v2-chat-0628", + "model_lang": [ + "en", + "zh" + ], + "model_ability": [ + "chat" + ], + "model_description": "DeepSeek-V2-Chat-0628 is an improved version of DeepSeek-V2-Chat. ", + "model_specs": [ + { + "model_format": "pytorch", + "model_size_in_billions": 236, + "quantizations": [ + "4-bit", + "8-bit", + "none" + ], + "model_id": "deepseek-ai/DeepSeek-V2-Chat-0628", + "model_revision": "5d09e272c2b223830f4e84359cd9dd047a5d7c78" + } + ], + "chat_template": "{% if not add_generation_prompt is defined %}{% set add_generation_prompt = false %}{% endif %}{{ '<|begin▁of▁sentence|>' }}{% for message in messages %}{% if message['role'] == 'user' %}{{ '<|User|>' + message['content'] }}{% elif message['role'] == 'assistant' %}{{ '<|Assistant|>' + message['content'] + '<|end▁of▁sentence|>' }}{% elif message['role'] == 'system' %}{{ message['content'] + '\n\n' }}{% endif %}{% endfor %}{% if add_generation_prompt %}{{ '<|Assistant|>' }}{% endif %}", + "stop_token_ids": [ + 100001 + ], + "stop": [ + "<|end▁of▁sentence|>" + ] + }, + { + "version": 1, + "context_length": 128000, + "model_name": "deepseek-v2.5", + "model_lang": [ + "en", + "zh" + ], + "model_ability": [ + "chat" + ], + "model_description": "DeepSeek-V2.5 is an upgraded version that combines DeepSeek-V2-Chat and DeepSeek-Coder-V2-Instruct. 
The new model integrates the general and coding abilities of the two previous versions.", + "model_specs": [ + { + "model_format": "pytorch", + "model_size_in_billions": 236, + "quantizations": [ + "4-bit", + "8-bit", + "none" + ], + "model_id": "deepseek-ai/DeepSeek-V2.5", + "model_revision": "24b08cb750e0c2757de112d2e16327cb21ed4833" + } + ], + "chat_template": "{% if not add_generation_prompt is defined %}{% set add_generation_prompt = false %}{% endif %}{% set ns = namespace(is_first=false, is_tool=false, is_output_first=true, system_prompt='') %}{%- for message in messages %} {%- if message['role'] == 'system' %} {% set ns.system_prompt = message['content'] %} {%- endif %}{%- endfor %}{{'<|begin▁of▁sentence|>'}}{{ns.system_prompt}}{%- for message in messages %} {%- if message['role'] == 'user' %} {%- set ns.is_tool = false -%}{{'<|User|>' + message['content']}} {%- endif %} {%- if message['role'] == 'assistant' and message['content'] is none %} {%- set ns.is_tool = false -%} {%- for tool in message['tool_calls']%} {%- if not ns.is_first %}{{'<|Assistant|><|tool▁calls▁begin|><|tool▁call▁begin|>' + tool['type'] + '<|tool▁sep|>' + tool['function']['name'] + '\\n' + '```json' + '\\n' + tool['function']['arguments'] + '\\n' + '```' + '<|tool▁call▁end|>'}} {%- set ns.is_first = true -%} {%- else %}{{'\\n' + '<|tool▁call▁begin|>' + tool['type'] + '<|tool▁sep|>' + tool['function']['name'] + '\\n' + '```json' + '\\n' + tool['function']['arguments'] + '\\n' + '```' + '<|tool▁call▁end|>'}}{{'<|tool▁calls▁end|><|end▁of▁sentence|>'}} {%- endif %} {%- endfor %} {%- endif %} {%- if message['role'] == 'assistant' and message['content'] is not none %} {%- if ns.is_tool %}{{'<|tool▁outputs▁end|>' + message['content'] + '<|end▁of▁sentence|>'}} {%- set ns.is_tool = false -%} {%- else %}{{'<|Assistant|>' + message['content'] + '<|end▁of▁sentence|>'}} {%- endif %} {%- endif %} {%- if message['role'] == 'tool' %} {%- set ns.is_tool = true -%} {%- if ns.is_output_first %}{{'<|tool▁outputs▁begin|><|tool▁output▁begin|>' + message['content'] + '<|tool▁output▁end|>'}} {%- set ns.is_output_first = false %} {%- else %}{{'\\n<|tool▁output▁begin|>' + message['content'] + '<|tool▁output▁end|>'}} {%- endif %} {%- endif %}{%- endfor -%}{% if ns.is_tool %}{{'<|tool▁outputs▁end|>'}}{% endif %}{% if add_generation_prompt and not ns.is_tool %}{{'<|Assistant|>'}}{% endif %}", + "stop_token_ids": [ + 100001 + ], + "stop": [ + "<|end▁of▁sentence|>" + ] } ] diff --git a/xinference/model/llm/llm_family_modelscope.json b/xinference/model/llm/llm_family_modelscope.json index eb24dd8180..f4386e85fa 100644 --- a/xinference/model/llm/llm_family_modelscope.json +++ b/xinference/model/llm/llm_family_modelscope.json @@ -4655,5 +4655,158 @@ "", "" ] + }, + { + "version": 1, + "context_length": 128000, + "model_name": "deepseek-v2", + "model_lang": [ + "en", + "zh" + ], + "model_ability": [ + "chat" + ], + "model_description": "DeepSeek-V2, a strong Mixture-of-Experts (MoE) language model characterized by economical training and efficient inference. 
", + "model_specs": [ + { + "model_format": "pytorch", + "model_size_in_billions": 16, + "quantizations": [ + "4-bit", + "8-bit", + "none" + ], + "model_id": "deepseek-ai/DeepSeek-V2-Lite", + "model_hub": "modelscope", + "model_revision": "master" + }, + { + "model_format": "pytorch", + "model_size_in_billions": 236, + "quantizations": [ + "4-bit", + "8-bit", + "none" + ], + "model_id": "deepseek-ai/DeepSeek-V2", + "model_hub": "modelscope", + "model_revision": "master" + } + ] + }, + { + "version": 1, + "context_length": 128000, + "model_name": "deepseek-v2-chat", + "model_lang": [ + "en", + "zh" + ], + "model_ability": [ + "chat" + ], + "model_description": "DeepSeek-V2, a strong Mixture-of-Experts (MoE) language model characterized by economical training and efficient inference. ", + "model_specs": [ + { + "model_format": "pytorch", + "model_size_in_billions": 16, + "quantizations": [ + "4-bit", + "8-bit", + "none" + ], + "model_id": "deepseek-ai/DeepSeek-V2-Lite-Chat", + "model_hub": "modelscope", + "model_revision": "master" + }, + { + "model_format": "pytorch", + "model_size_in_billions": 236, + "quantizations": [ + "4-bit", + "8-bit", + "none" + ], + "model_id": "deepseek-ai/DeepSeek-V2-Chat", + "model_hub": "modelscope", + "model_revision": "master" + } + ], + "chat_template": "{% if not add_generation_prompt is defined %}{% set add_generation_prompt = false %}{% endif %}{{ '<|begin▁of▁sentence|>' }}{% for message in messages %}{% if message['role'] == 'user' %}{{ 'User: ' + message['content'] + '\n\n' }}{% elif message['role'] == 'assistant' %}{{ 'Assistant: ' + message['content'] + '<|end▁of▁sentence|>' }}{% elif message['role'] == 'system' %}{{ message['content'] + '\n\n' }}{% endif %}{% endfor %}{% if add_generation_prompt %}{{ 'Assistant:' }}{% endif %}", + "stop_token_ids": [ + 100001 + ], + "stop": [ + "<|end▁of▁sentence|>" + ] + }, + { + "version": 1, + "context_length": 128000, + "model_name": "deepseek-v2-chat-0628", + "model_lang": [ + "en", + "zh" + ], + "model_ability": [ + "chat" + ], + "model_description": "DeepSeek-V2-Chat-0628 is an improved version of DeepSeek-V2-Chat. ", + "model_specs": [ + { + "model_format": "pytorch", + "model_size_in_billions": 236, + "quantizations": [ + "4-bit", + "8-bit", + "none" + ], + "model_id": "deepseek-ai/DeepSeek-V2-Chat-0628", + "model_hub": "modelscope", + "model_revision": "master" + } + ], + "chat_template": "{% if not add_generation_prompt is defined %}{% set add_generation_prompt = false %}{% endif %}{{ '<|begin▁of▁sentence|>' }}{% for message in messages %}{% if message['role'] == 'user' %}{{ '<|User|>' + message['content'] }}{% elif message['role'] == 'assistant' %}{{ '<|Assistant|>' + message['content'] + '<|end▁of▁sentence|>' }}{% elif message['role'] == 'system' %}{{ message['content'] + '\n\n' }}{% endif %}{% endfor %}{% if add_generation_prompt %}{{ '<|Assistant|>' }}{% endif %}", + "stop_token_ids": [ + 100001 + ], + "stop": [ + "<|end▁of▁sentence|>" + ] + }, + { + "version": 1, + "context_length": 128000, + "model_name": "deepseek-v2.5", + "model_lang": [ + "en", + "zh" + ], + "model_ability": [ + "chat" + ], + "model_description": "DeepSeek-V2.5 is an upgraded version that combines DeepSeek-V2-Chat and DeepSeek-Coder-V2-Instruct. 
The new model integrates the general and coding abilities of the two previous versions.", + "model_specs": [ + { + "model_format": "pytorch", + "model_size_in_billions": 236, + "quantizations": [ + "4-bit", + "8-bit", + "none" + ], + "model_id": "deepseek-ai/DeepSeek-V2.5", + "model_hub": "modelscope", + "model_revision": "master" + } + ], + "chat_template": "{% if not add_generation_prompt is defined %}{% set add_generation_prompt = false %}{% endif %}{% set ns = namespace(is_first=false, is_tool=false, is_output_first=true, system_prompt='') %}{%- for message in messages %} {%- if message['role'] == 'system' %} {% set ns.system_prompt = message['content'] %} {%- endif %}{%- endfor %}{{'<|begin▁of▁sentence|>'}}{{ns.system_prompt}}{%- for message in messages %} {%- if message['role'] == 'user' %} {%- set ns.is_tool = false -%}{{'<|User|>' + message['content']}} {%- endif %} {%- if message['role'] == 'assistant' and message['content'] is none %} {%- set ns.is_tool = false -%} {%- for tool in message['tool_calls']%} {%- if not ns.is_first %}{{'<|Assistant|><|tool▁calls▁begin|><|tool▁call▁begin|>' + tool['type'] + '<|tool▁sep|>' + tool['function']['name'] + '\\n' + '```json' + '\\n' + tool['function']['arguments'] + '\\n' + '```' + '<|tool▁call▁end|>'}} {%- set ns.is_first = true -%} {%- else %}{{'\\n' + '<|tool▁call▁begin|>' + tool['type'] + '<|tool▁sep|>' + tool['function']['name'] + '\\n' + '```json' + '\\n' + tool['function']['arguments'] + '\\n' + '```' + '<|tool▁call▁end|>'}}{{'<|tool▁calls▁end|><|end▁of▁sentence|>'}} {%- endif %} {%- endfor %} {%- endif %} {%- if message['role'] == 'assistant' and message['content'] is not none %} {%- if ns.is_tool %}{{'<|tool▁outputs▁end|>' + message['content'] + '<|end▁of▁sentence|>'}} {%- set ns.is_tool = false -%} {%- else %}{{'<|Assistant|>' + message['content'] + '<|end▁of▁sentence|>'}} {%- endif %} {%- endif %} {%- if message['role'] == 'tool' %} {%- set ns.is_tool = true -%} {%- if ns.is_output_first %}{{'<|tool▁outputs▁begin|><|tool▁output▁begin|>' + message['content'] + '<|tool▁output▁end|>'}} {%- set ns.is_output_first = false %} {%- else %}{{'\\n<|tool▁output▁begin|>' + message['content'] + '<|tool▁output▁end|>'}} {%- endif %} {%- endif %}{%- endfor -%}{% if ns.is_tool %}{{'<|tool▁outputs▁end|>'}}{% endif %}{% if add_generation_prompt and not ns.is_tool %}{{'<|Assistant|>'}}{% endif %}", + "stop_token_ids": [ + 100001 + ], + "stop": [ + "<|end▁of▁sentence|>" + ] } ] diff --git a/xinference/model/llm/sglang/core.py b/xinference/model/llm/sglang/core.py index 578252324d..621b9b0a59 100644 --- a/xinference/model/llm/sglang/core.py +++ b/xinference/model/llm/sglang/core.py @@ -82,6 +82,9 @@ class SGLANGGenerateConfig(TypedDict, total=False): "mixtral-instruct-v0.1", "gemma-it", "gemma-2-it", + "deepseek-v2.5", + "deepseek-v2-chat", + "deepseek-v2-chat-0628", ] diff --git a/xinference/model/llm/transformers/core.py b/xinference/model/llm/transformers/core.py index 32419a56f1..a451b7accd 100644 --- a/xinference/model/llm/transformers/core.py +++ b/xinference/model/llm/transformers/core.py @@ -65,6 +65,10 @@ "MiniCPM-V-2.6", "glm-4v", "qwen2-vl-instruct", + "deepseek-v2", + "deepseek-v2-chat", + "deepseek-v2.5", + "deepseek-v2-chat-0628", ] diff --git a/xinference/model/llm/transformers/deepseek_v2.py b/xinference/model/llm/transformers/deepseek_v2.py new file mode 100644 index 0000000000..b6ce2b5e04 --- /dev/null +++ b/xinference/model/llm/transformers/deepseek_v2.py @@ -0,0 +1,340 @@ +# Copyright 2022-2023 XProbe Inc. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import logging +import uuid +from typing import Dict, Iterator, List, Optional, Union + +import torch + +from ....types import ( + ChatCompletion, + ChatCompletionChunk, + Completion, + CompletionChunk, + PytorchGenerateConfig, +) +from ..llm_family import LLMFamilyV1, LLMSpecV1 +from ..utils import ( + generate_chat_completion, + generate_completion, + generate_completion_chunk, +) +from .core import PytorchChatModel, PytorchModel + +logger = logging.getLogger(__name__) + + +class DeepSeekV2PytorchModel(PytorchModel): + def _load_model(self, **kwargs): + try: + from transformers import ( + AutoModelForCausalLM, + AutoTokenizer, + GenerationConfig, + ) + except ImportError: + error_message = "Failed to import module 'transformers'" + installation_guide = [ + "Please make sure 'transformers' is installed. ", + "You can install it by `pip install transformers`\n", + ] + + raise ImportError(f"{error_message}\n\n{''.join(installation_guide)}") + + tokenizer = AutoTokenizer.from_pretrained( + self.model_path, + trust_remote_code=kwargs["trust_remote_code"], + ) + model = AutoModelForCausalLM.from_pretrained( + self.model_path, + attn_implementation="eager", + torch_dtype=torch.bfloat16, + trust_remote_code=True, + device_map="auto", + ) + model.generation_config = GenerationConfig.from_pretrained(self.model_path) + model.generation_config.pad_token_id = model.generation_config.eos_token_id + return model, tokenizer + + @classmethod + def match( + cls, llm_family: "LLMFamilyV1", llm_spec: "LLMSpecV1", quantization: str + ) -> bool: + if llm_spec.model_format != "pytorch": + return False + model_family = llm_family.model_family or llm_family.model_name + if "deepseek-v2" not in model_family: + return False + if "generate" not in llm_family.model_ability: + return False + return True + + def generate( + self, prompt: str, generate_config: Optional[PytorchGenerateConfig] = None + ) -> Union[Completion, Iterator[CompletionChunk]]: + input_tensor = self._tokenizer(prompt, return_tensors="pt") + generate_config = self._sanitize_generate_config(generate_config) + default_generate_config = self._model.generation_config + generate_kwargs = { + "input_ids": input_tensor["input_ids"].cuda(), + "attention_mask": input_tensor["attention_mask"].cuda(), + "temperature": float( + generate_config.get("temperature", default_generate_config.temperature) + ), + "repetition_penalty": float(generate_config.get("repetition_penalty", 1.0)), + "top_p": float(generate_config.get("top_p", default_generate_config.top_p)), + "top_k": int(generate_config.get("top_k", -1)), + "max_new_tokens": generate_config.get("max_tokens", 512), + "bos_token_id": default_generate_config.bos_token_id, + "do_sample": default_generate_config.do_sample, + "eos_token_id": default_generate_config.eos_token_id, + } + + stream = generate_config.get("stream", False) + if stream: + return self._generate_stream(generate_kwargs, input_tensor) + else: + return self._generate(generate_kwargs, input_tensor) + 
+ def _generate(self, generate_kwargs, input_ids) -> Completion: + prompt_tokens = len(input_ids[0]) + logger.info(f"generate_kwargs:{generate_kwargs}") + generation_output = self._model.generate(**generate_kwargs) + completion_tokens = len(generation_output[0]) + response = self._tokenizer.decode( + generation_output[0], skip_special_tokens=True + ) + return generate_completion( + self.model_uid, + response, + prompt_tokens=prompt_tokens, + completion_tokens=completion_tokens, + total_tokens=prompt_tokens + completion_tokens, + ) + + def _generate_stream(self, generate_kwargs, input_ids): + from threading import Thread + + from transformers import TextIteratorStreamer + + # Initialize the streamer + streamer = TextIteratorStreamer( + self._tokenizer, skip_prompt=True, skip_special_tokens=True, timeout=10 + ) + # Define the generation configuration + generate_kwargs["streamer"] = streamer + # Start the model chat in a separate thread + thread = Thread( + target=self._model.generate, + kwargs=generate_kwargs, + ) + thread.start() + + completion_id = str(uuid.uuid1()) + prompt_tokens = len(input_ids[0]) + total_tokens, completion_tokens = 0, 0 + # Loop through the streamer to get the new text as it is generated + for i, new_text in enumerate(streamer): + completion_tokens = i + total_tokens = prompt_tokens + completion_tokens + yield generate_completion_chunk( + chunk_text=new_text, + finish_reason=None, + chunk_id=completion_id, + model_uid=self.model_uid, + prompt_tokens=prompt_tokens, + completion_tokens=completion_tokens, + total_tokens=total_tokens, + ) + yield generate_completion_chunk( + chunk_text=None, + finish_reason="stop", + chunk_id=completion_id, + model_uid=self.model_uid, + prompt_tokens=prompt_tokens, + completion_tokens=completion_tokens, + total_tokens=total_tokens, + has_choice=True, + has_content=False, + ) + + +class DeepSeekV2PytorchChatModel(PytorchChatModel): + def _load_model(self, **kwargs): + try: + from transformers import ( + AutoModelForCausalLM, + AutoTokenizer, + GenerationConfig, + ) + except ImportError: + error_message = "Failed to import module 'transformers'" + installation_guide = [ + "Please make sure 'transformers' is installed. 
", + "You can install it by `pip install transformers`\n", + ] + + raise ImportError(f"{error_message}\n\n{''.join(installation_guide)}") + + tokenizer = AutoTokenizer.from_pretrained( + self.model_path, + trust_remote_code=kwargs["trust_remote_code"], + ) + logger.info(f"kwargs:{kwargs}") + model = AutoModelForCausalLM.from_pretrained( + self.model_path, + attn_implementation="eager", + torch_dtype=torch.bfloat16, + trust_remote_code=True, + device_map="auto", + ) + model.generation_config = GenerationConfig.from_pretrained(self.model_path) + model.generation_config.pad_token_id = model.generation_config.eos_token_id + return model, tokenizer + + @classmethod + def match( + cls, llm_family: "LLMFamilyV1", llm_spec: "LLMSpecV1", quantization: str + ) -> bool: + if llm_spec.model_format != "pytorch": + return False + model_family = llm_family.model_family or llm_family.model_name + if "deepseek-v2" not in model_family: + return False + if "chat" not in llm_family.model_ability: + return False + return True + + def chat( + self, + messages: List[Dict], + generate_config: Optional[PytorchGenerateConfig] = None, + ) -> Union[ChatCompletion, Iterator[ChatCompletionChunk]]: + assert self.model_family.chat_template is not None + full_prompt = self.get_full_context( + messages, + self.model_family.chat_template, + tokenizer=self._tokenizer, + ) + input_tensor = self._tokenizer.encode( + full_prompt, + padding=False, + truncation=False, + max_length=None, + add_special_tokens=False, + return_tensors="pt", + ) + + generate_config = self._sanitize_generate_config(generate_config) + default_generate_config = self._model.generation_config + generate_kwargs = { + "input_ids": input_tensor.cuda(), + "temperature": float( + generate_config.get("temperature", default_generate_config.temperature) + ), + "repetition_penalty": float(generate_config.get("repetition_penalty", 1.0)), + "top_p": float(generate_config.get("top_p", default_generate_config.top_p)), + "top_k": int(generate_config.get("top_k", -1)), + "max_new_tokens": generate_config.get("max_tokens", 512), + "bos_token_id": default_generate_config.bos_token_id, + "do_sample": default_generate_config.do_sample, + "eos_token_id": default_generate_config.eos_token_id, + } + + stream = generate_config.get("stream", False) + stream_options = generate_config.get("stream_options", None) + include_usage = ( + stream_options["include_usage"] + if isinstance(stream_options, dict) + else False + ) + if stream: + chunk = self._generate_stream(generate_kwargs, input_tensor, include_usage) + return self._to_chat_completion_chunks(chunk) + else: + return self._generate(generate_kwargs, input_tensor) + + def _generate(self, generate_kwargs, input_ids) -> ChatCompletion: + prompt_tokens = len(input_ids[0]) + generation_output = self._model.generate(**generate_kwargs) + completion_tokens = len(generation_output[0]) + response = self._tokenizer.decode( + generation_output[0][input_ids.shape[1] :], skip_special_tokens=True + ) + return generate_chat_completion( + self.model_uid, + response, + prompt_tokens=prompt_tokens, + completion_tokens=completion_tokens, + total_tokens=prompt_tokens + completion_tokens, + ) + + def _generate_stream(self, generate_kwargs, input_ids, include_usage): + from threading import Thread + + from transformers import TextIteratorStreamer + + # Initialize the streamer + streamer = TextIteratorStreamer( + self._tokenizer, skip_prompt=True, skip_special_tokens=True, timeout=10 + ) + # Define the generation configuration + 
generate_kwargs["streamer"] = streamer + # Start the model chat in a separate thread + thread = Thread( + target=self._model.generate, + kwargs=generate_kwargs, + ) + thread.start() + + completion_id = str(uuid.uuid1()) + prompt_tokens = len(input_ids[0]) + total_tokens, completion_tokens = 0, 0 + # Loop through the streamer to get the new text as it is generated + for i, new_text in enumerate(streamer): + completion_tokens = max(completion_tokens, len(streamer.token_cache)) + total_tokens = prompt_tokens + completion_tokens + yield generate_completion_chunk( + chunk_text=new_text, + finish_reason=None, + chunk_id=completion_id, + model_uid=self.model_uid, + prompt_tokens=prompt_tokens, + completion_tokens=completion_tokens, + total_tokens=total_tokens, + ) + yield generate_completion_chunk( + chunk_text=None, + finish_reason="stop", + chunk_id=completion_id, + model_uid=self.model_uid, + prompt_tokens=prompt_tokens, + completion_tokens=completion_tokens, + total_tokens=total_tokens, + has_choice=True, + has_content=False, + ) + + if include_usage: + yield generate_completion_chunk( + chunk_text=None, + finish_reason=None, + chunk_id=completion_id, + model_uid=self.model_uid, + prompt_tokens=prompt_tokens, + completion_tokens=completion_tokens, + total_tokens=total_tokens, + has_choice=False, + has_content=False, + ) diff --git a/xinference/model/llm/utils.py b/xinference/model/llm/utils.py index 0ae802c01c..c5b26027fb 100644 --- a/xinference/model/llm/utils.py +++ b/xinference/model/llm/utils.py @@ -549,6 +549,32 @@ def generate_completion_chunk( ) +def generate_completion( + model_uid: str, + response: str, + prompt_tokens=-1, + completion_tokens=-1, + total_tokens=-1, + finish_reason="stop", +) -> Completion: + return Completion( + id=str(uuid.uuid1()), + object="text_completion", + created=int(time.time()), + model=model_uid, + choices=[ + CompletionChoice( + text=response, index=0, logprobs=None, finish_reason=finish_reason + ) + ], + usage=CompletionUsage( + prompt_tokens=prompt_tokens, + completion_tokens=completion_tokens, + total_tokens=total_tokens, + ), + ) + + def generate_chat_completion( model_uid: str, response: str, diff --git a/xinference/model/llm/vllm/core.py b/xinference/model/llm/vllm/core.py index 8869f7fb4a..e531769a18 100644 --- a/xinference/model/llm/vllm/core.py +++ b/xinference/model/llm/vllm/core.py @@ -149,6 +149,12 @@ class VLLMGenerateConfig(TypedDict, total=False): VLLM_SUPPORTED_CHAT_MODELS.append("qwen2-moe-instruct") VLLM_SUPPORTED_CHAT_MODELS.append("c4ai-command-r-v01") +if VLLM_INSTALLED and vllm.__version__ >= "0.5.1": + VLLM_SUPPORTED_CHAT_MODELS.append("deepseek-v2-chat") + VLLM_SUPPORTED_CHAT_MODELS.append("deepseek-v2-chat-0628") + VLLM_SUPPORTED_CHAT_MODELS.append("deepseek-v2.5") + + if VLLM_INSTALLED and vllm.__version__ >= "0.5.3": VLLM_SUPPORTED_CHAT_MODELS.append("gemma-2-it") VLLM_SUPPORTED_CHAT_MODELS.append("mistral-nemo-instruct") From 42745077b24a2b517e565235756c5ff317f98f77 Mon Sep 17 00:00:00 2001 From: Poet <42093310+LaureatePoet@users.noreply.github.com> Date: Fri, 13 Sep 2024 11:52:47 +0800 Subject: [PATCH 02/17] FEAT: Update Qwen2-VL-Model to support flash_attention_2 implementation (#2289) Co-authored-by: qinxuye --- xinference/model/llm/transformers/qwen2_vl.py | 36 ++++++++++++++++--- 1 file changed, 31 insertions(+), 5 deletions(-) diff --git a/xinference/model/llm/transformers/qwen2_vl.py b/xinference/model/llm/transformers/qwen2_vl.py index 6b27a05139..3eccc0c736 100644 --- a/xinference/model/llm/transformers/qwen2_vl.py 
+++ b/xinference/model/llm/transformers/qwen2_vl.py @@ -11,7 +11,9 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. +import importlib.util import logging +import sys import uuid from typing import Iterator, List, Optional, Union @@ -59,9 +61,19 @@ def load(self): self.model_path, trust_remote_code=True ) self._tokenizer = self._processor.tokenizer - self._model = Qwen2VLForConditionalGeneration.from_pretrained( - self.model_path, device_map=device, trust_remote_code=True - ).eval() + flash_attn_installed = importlib.util.find_spec("flash_attn") is not None + if flash_attn_installed: + self._model = Qwen2VLForConditionalGeneration.from_pretrained( + self.model_path, + torch_dtype="bfloat16", + device_map=device, + attn_implementation="flash_attention_2", + trust_remote_code=True, + ).eval() + else: + self._model = Qwen2VLForConditionalGeneration.from_pretrained( + self.model_path, device_map=device, trust_remote_code=True + ).eval() def _transform_messages( self, @@ -177,8 +189,18 @@ def _generate_stream( "streamer": streamer, **inputs, } - - thread = Thread(target=self._model.generate, kwargs=gen_kwargs) + error = None + + def model_generate(): + try: + return self._model.generate(**gen_kwargs) + except Exception: + nonlocal error + error = sys.exc_info() + streamer.end() + raise + + thread = Thread(target=model_generate) thread.start() completion_id = str(uuid.uuid1()) @@ -195,6 +217,10 @@ def _generate_stream( has_content=True, ) + if error: + _, err, tb = error # type: ignore + raise err.with_traceback(tb) + yield generate_completion_chunk( chunk_text=None, finish_reason="stop", From 8f73b0550d1a55328fe165c46ada66dee45abf27 Mon Sep 17 00:00:00 2001 From: codingl2k1 <138426806+codingl2k1@users.noreply.github.com> Date: Fri, 13 Sep 2024 06:02:31 +0200 Subject: [PATCH 03/17] ENH: Support fish speech 1.4 (#2295) --- setup.cfg | 2 + xinference/deploy/docker/requirements.txt | 1 + xinference/deploy/docker/requirements_cpu.txt | 1 + xinference/model/audio/fish_speech.py | 14 +- xinference/model/audio/model_spec.json | 6 +- .../model/audio/tests/test_fish_speech.py | 2 +- .../fish_speech/configs/firefly_gan_vq.yaml | 5 +- .../configs/text2semantic_finetune.yaml | 2 +- .../fish_speech/i18n/locale/en_US.json | 2 +- .../fish_speech/i18n/locale/es_ES.json | 2 +- .../fish_speech/i18n/locale/ja_JP.json | 2 +- .../fish_speech/i18n/locale/pt_BR.json | 2 +- .../fish_speech/i18n/locale/zh_CN.json | 2 +- .../fish_speech/models/text2semantic/llama.py | 4 +- .../fish_speech/models/vqgan/__init__.py | 3 - .../fish_speech/models/vqgan/lit_module.py | 442 ------------------ .../models/vqgan/modules/discriminator.py | 44 -- .../models/vqgan/modules/firefly.py | 367 +++++++-------- .../fish_speech/models/vqgan/modules/fsq.py | 31 +- .../models/vqgan/modules/reference.py | 115 ----- .../models/vqgan/modules/wavenet.py | 225 --------- .../fish_speech/fish_speech/text/clean.py | 56 +-- .../fish_speech/fish_speech/text/spliter.py | 4 +- .../fish_speech/fish_speech/train.py | 2 + .../fish_speech/fish_speech/webui/manage.py | 22 +- .../thirdparty/fish_speech/tools/api.py | 213 ++++----- .../fish_speech/tools/auto_rerank.py | 159 ------- .../thirdparty/fish_speech/tools/commons.py | 35 ++ .../fish_speech/tools/download_models.py | 6 +- .../thirdparty/fish_speech/tools/file.py | 17 + .../thirdparty/fish_speech/tools/gen_ref.py | 36 -- .../fish_speech/tools/llama/build_dataset.py | 2 +- 
.../fish_speech/tools/llama/generate.py | 53 ++- .../fish_speech/tools/llama/merge_lora.py | 2 +- .../fish_speech/tools/llama/quantize.py | 4 +- .../fish_speech/tools/merge_asr_files.py | 55 --- .../fish_speech/tools/msgpack_api.py | 34 ++ .../thirdparty/fish_speech/tools/post_api.py | 129 +++-- .../fish_speech/tools/sensevoice/fun_asr.py | 2 +- .../thirdparty/fish_speech/tools/smart_pad.py | 19 +- .../fish_speech/tools/vqgan/extract_vq.py | 4 +- .../fish_speech/tools/vqgan/inference.py | 6 +- .../thirdparty/fish_speech/tools/webui.py | 158 +------ 43 files changed, 544 insertions(+), 1748 deletions(-) delete mode 100644 xinference/thirdparty/fish_speech/fish_speech/models/vqgan/lit_module.py delete mode 100644 xinference/thirdparty/fish_speech/fish_speech/models/vqgan/modules/discriminator.py delete mode 100644 xinference/thirdparty/fish_speech/fish_speech/models/vqgan/modules/reference.py delete mode 100644 xinference/thirdparty/fish_speech/fish_speech/models/vqgan/modules/wavenet.py delete mode 100644 xinference/thirdparty/fish_speech/tools/auto_rerank.py create mode 100644 xinference/thirdparty/fish_speech/tools/commons.py delete mode 100644 xinference/thirdparty/fish_speech/tools/gen_ref.py delete mode 100644 xinference/thirdparty/fish_speech/tools/merge_asr_files.py create mode 100644 xinference/thirdparty/fish_speech/tools/msgpack_api.py diff --git a/setup.cfg b/setup.cfg index e95ba7ca3a..55f5117c14 100644 --- a/setup.cfg +++ b/setup.cfg @@ -127,6 +127,7 @@ all = loguru # For Fish Speech natsort # For Fish Speech loralib # For Fish Speech + ormsgpack # For Fish Speech qwen-vl-utils # For qwen2-vl datamodel_code_generator # for minicpm-4B jsonschema # for minicpm-4B @@ -198,6 +199,7 @@ audio = loguru # For Fish Speech natsort # For Fish Speech loralib # For Fish Speech + ormsgpack # For Fish Speech doc = ipython>=6.5.0 sphinx>=3.0.0 diff --git a/xinference/deploy/docker/requirements.txt b/xinference/deploy/docker/requirements.txt index b5ac62c254..d23d72c3f9 100644 --- a/xinference/deploy/docker/requirements.txt +++ b/xinference/deploy/docker/requirements.txt @@ -70,6 +70,7 @@ jj-pytorchvideo # For CogVLM2-video loguru # For Fish Speech natsort # For Fish Speech loralib # For Fish Speech +ormsgpack # For Fish Speech qwen-vl-utils # For qwen2-vl datamodel_code_generator # for minicpm-4B jsonschema # for minicpm-4B diff --git a/xinference/deploy/docker/requirements_cpu.txt b/xinference/deploy/docker/requirements_cpu.txt index cb1d27dc44..493f558da2 100644 --- a/xinference/deploy/docker/requirements_cpu.txt +++ b/xinference/deploy/docker/requirements_cpu.txt @@ -65,6 +65,7 @@ jj-pytorchvideo # For CogVLM2-video loguru # For Fish Speech natsort # For Fish Speech loralib # For Fish Speech +ormsgpack # For Fish Speech qwen-vl-utils # For qwen2-vl datamodel_code_generator # for minicpm-4B jsonschema # for minicpm-4B diff --git a/xinference/model/audio/fish_speech.py b/xinference/model/audio/fish_speech.py index 96766a7d27..4a6412f04a 100644 --- a/xinference/model/audio/fish_speech.py +++ b/xinference/model/audio/fish_speech.py @@ -92,7 +92,7 @@ def load(self): checkpoint_path = os.path.join( self._model_path, - "firefly-gan-vq-fsq-4x1024-42hz-generator.pth", + "firefly-gan-vq-fsq-8x1024-21hz-generator.pth", ) self._model = load_decoder_model( config_name="firefly_gan_vq", @@ -213,12 +213,12 @@ def speech( text=input, enable_reference_audio=False, reference_audio=None, - reference_text="", - max_new_tokens=0, - chunk_length=100, - top_p=0.7, - repetition_penalty=1.2, - temperature=0.7, + 
reference_text=kwargs.get("reference_text", ""), + max_new_tokens=kwargs.get("max_new_tokens", 1024), + chunk_length=kwargs.get("chunk_length", 200), + top_p=kwargs.get("top_p", 0.7), + repetition_penalty=kwargs.get("repetition_penalty", 1.2), + temperature=kwargs.get("temperature", 0.7), ) ) sample_rate, audio = result[0][1] diff --git a/xinference/model/audio/model_spec.json b/xinference/model/audio/model_spec.json index 6d546a0921..6762d84a18 100644 --- a/xinference/model/audio/model_spec.json +++ b/xinference/model/audio/model_spec.json @@ -148,10 +148,10 @@ "multilingual": true }, { - "model_name": "FishSpeech-1.2-SFT", + "model_name": "FishSpeech-1.4", "model_family": "FishAudio", - "model_id": "fishaudio/fish-speech-1.2-sft", - "model_revision": "180288e21ec5c50cfc564023a22f789e4b88a0e0", + "model_id": "fishaudio/fish-speech-1.4", + "model_revision": "3c49651b8e583b6b13f55e375432e0d57e1aa84d", "model_ability": "text-to-audio", "multilingual": true } diff --git a/xinference/model/audio/tests/test_fish_speech.py b/xinference/model/audio/tests/test_fish_speech.py index 8b339290ad..ce57566b19 100644 --- a/xinference/model/audio/tests/test_fish_speech.py +++ b/xinference/model/audio/tests/test_fish_speech.py @@ -22,7 +22,7 @@ def test_fish_speech(setup): client = Client(endpoint) model_uid = client.launch_model( - model_name="FishSpeech-1.2-SFT", + model_name="FishSpeech-1.4", model_type="audio", ) model = client.get_model(model_uid) diff --git a/xinference/thirdparty/fish_speech/fish_speech/configs/firefly_gan_vq.yaml b/xinference/thirdparty/fish_speech/fish_speech/configs/firefly_gan_vq.yaml index 7417623b03..10aa8d4a52 100644 --- a/xinference/thirdparty/fish_speech/fish_speech/configs/firefly_gan_vq.yaml +++ b/xinference/thirdparty/fish_speech/fish_speech/configs/firefly_gan_vq.yaml @@ -22,13 +22,12 @@ head: resblock_dilation_sizes: [[1, 3, 5], [1, 3, 5], [1, 3, 5]] num_mels: 512 upsample_initial_channel: 512 - use_template: false pre_conv_kernel_size: 13 post_conv_kernel_size: 13 quantizer: _target_: fish_speech.models.vqgan.modules.fsq.DownsampleFiniteScalarQuantize input_dim: 512 - n_groups: 4 + n_groups: 8 n_codebooks: 1 levels: [8, 5, 5, 5] - downsample_factor: [2] + downsample_factor: [2, 2] diff --git a/xinference/thirdparty/fish_speech/fish_speech/configs/text2semantic_finetune.yaml b/xinference/thirdparty/fish_speech/fish_speech/configs/text2semantic_finetune.yaml index 1bf8fd6b6d..f4c1993023 100644 --- a/xinference/thirdparty/fish_speech/fish_speech/configs/text2semantic_finetune.yaml +++ b/xinference/thirdparty/fish_speech/fish_speech/configs/text2semantic_finetune.yaml @@ -4,7 +4,7 @@ defaults: project: text2semantic_finetune_dual_ar max_length: 4096 -pretrained_ckpt_path: checkpoints/fish-speech-1.2-sft +pretrained_ckpt_path: checkpoints/fish-speech-1.4 # Lightning Trainer trainer: diff --git a/xinference/thirdparty/fish_speech/fish_speech/i18n/locale/en_US.json b/xinference/thirdparty/fish_speech/fish_speech/i18n/locale/en_US.json index cf6ad6ca1e..6e280c236e 100644 --- a/xinference/thirdparty/fish_speech/fish_speech/i18n/locale/en_US.json +++ b/xinference/thirdparty/fish_speech/fish_speech/i18n/locale/en_US.json @@ -72,7 +72,7 @@ "Put your text here.": "Put your text here.", "Reference Audio": "Reference Audio", "Reference Text": "Reference Text", - "Related code are released under BSD-3-Clause License, and weights are released under CC BY-NC-SA 4.0 License.": "Related code are released under BSD-3-Clause License, and weights are released under CC BY-NC-SA 4.0 
License.", + "Related code and weights are released under CC BY-NC-SA 4.0 License.": "Related code and weights are released under CC BY-NC-SA 4.0 License.", "Remove Selected Data": "Remove Selected Data", "Removed path successfully!": "Removed path successfully!", "Repetition Penalty": "Repetition Penalty", diff --git a/xinference/thirdparty/fish_speech/fish_speech/i18n/locale/es_ES.json b/xinference/thirdparty/fish_speech/fish_speech/i18n/locale/es_ES.json index 1ea5988213..3285341f68 100644 --- a/xinference/thirdparty/fish_speech/fish_speech/i18n/locale/es_ES.json +++ b/xinference/thirdparty/fish_speech/fish_speech/i18n/locale/es_ES.json @@ -72,7 +72,7 @@ "Put your text here.": "Ponga su texto aquí.", "Reference Audio": "Audio de Referencia", "Reference Text": "Texto de Referencia", - "Related code are released under BSD-3-Clause License, and weights are released under CC BY-NC-SA 4.0 License.": "El código relacionado se publica bajo la Licencia BSD-3-Clause, y los pesos se publican bajo la Licencia CC BY-NC-SA 4.0.", + "Related code and weights are released under CC BY-NC-SA 4.0 License.": "El código relacionado y los pesos se publican bajo la Licencia CC BY-NC-SA 4.0.", "Remove Selected Data": "Eliminar Datos Seleccionados", "Removed path successfully!": "¡Ruta eliminada exitosamente!", "Repetition Penalty": "Penalización por Repetición", diff --git a/xinference/thirdparty/fish_speech/fish_speech/i18n/locale/ja_JP.json b/xinference/thirdparty/fish_speech/fish_speech/i18n/locale/ja_JP.json index e7817eb0c5..d30bac7bcd 100644 --- a/xinference/thirdparty/fish_speech/fish_speech/i18n/locale/ja_JP.json +++ b/xinference/thirdparty/fish_speech/fish_speech/i18n/locale/ja_JP.json @@ -72,7 +72,7 @@ "Put your text here.": "ここにテキストを入力してください。", "Reference Audio": "リファレンスオーディオ", "Reference Text": "リファレンステキスト", - "Related code are released under BSD-3-Clause License, and weights are released under CC BY-NC-SA 4.0 License.": "関連コードはBSD-3-Clauseライセンスの下でリリースされ、重みはCC BY-NC-SA 4.0ライセンスの下でリリースされます。", + "Related code and weights are released under CC BY-NC-SA 4.0 License.": "関連コードと重みはCC BY-NC-SA 4.0ライセンスの下でリリースされます。", "Remove Selected Data": "選択したデータを削除", "Removed path successfully!": "パスの削除に成功しました!", "Repetition Penalty": "反復ペナルティ", diff --git a/xinference/thirdparty/fish_speech/fish_speech/i18n/locale/pt_BR.json b/xinference/thirdparty/fish_speech/fish_speech/i18n/locale/pt_BR.json index c3df431a40..385f20272e 100644 --- a/xinference/thirdparty/fish_speech/fish_speech/i18n/locale/pt_BR.json +++ b/xinference/thirdparty/fish_speech/fish_speech/i18n/locale/pt_BR.json @@ -84,7 +84,7 @@ "Reference Text": "Texto de Referência", "warning": "Aviso", "Pre-processing begins...": "O pré-processamento começou!", - "Related code are released under BSD-3-Clause License, and weights are released under CC BY-NC-SA 4.0 License.": "O código relacionado é licenciado sob a Licença BSD-3-Clause, e os pesos sob a Licença CC BY-NC-SA 4.0.", + "Related code and weights are released under CC BY-NC-SA 4.0 License.": "O código relacionado e os pesos são licenciados sob a Licença CC BY-NC-SA 4.0.", "Remove Selected Data": "Remover Dados Selecionados", "Removed path successfully!": "Caminho removido com sucesso!", "Repetition Penalty": "Penalidade de Repetição", diff --git a/xinference/thirdparty/fish_speech/fish_speech/i18n/locale/zh_CN.json b/xinference/thirdparty/fish_speech/fish_speech/i18n/locale/zh_CN.json index da81eef1cf..3dd1a5cd1c 100644 --- a/xinference/thirdparty/fish_speech/fish_speech/i18n/locale/zh_CN.json +++ 
b/xinference/thirdparty/fish_speech/fish_speech/i18n/locale/zh_CN.json @@ -72,7 +72,7 @@ "Put your text here.": "在此处输入文本.", "Reference Audio": "参考音频", "Reference Text": "参考文本", - "Related code are released under BSD-3-Clause License, and weights are released under CC BY-NC-SA 4.0 License.": "相关代码使用 BSD-3-Clause 许可证发布,权重使用 CC BY-NC-SA 4.0 许可证发布.", + "Related code and weights are released under CC BY-NC-SA 4.0 License.": "相关代码和权重使用 CC BY-NC-SA 4.0 许可证发布.", "Remove Selected Data": "移除选中数据", "Removed path successfully!": "移除路径成功!", "Repetition Penalty": "重复惩罚", diff --git a/xinference/thirdparty/fish_speech/fish_speech/models/text2semantic/llama.py b/xinference/thirdparty/fish_speech/fish_speech/models/text2semantic/llama.py index 4eef92b0ba..0725dfb9b7 100644 --- a/xinference/thirdparty/fish_speech/fish_speech/models/text2semantic/llama.py +++ b/xinference/thirdparty/fish_speech/fish_speech/models/text2semantic/llama.py @@ -353,7 +353,7 @@ def from_pretrained( if "int8" in str(Path(path)): logger.info("Using int8 weight-only quantization!") - from ...tools.llama.quantize import WeightOnlyInt8QuantHandler + from tools.llama.quantize import WeightOnlyInt8QuantHandler simple_quantizer = WeightOnlyInt8QuantHandler(model) model = simple_quantizer.convert_for_runtime() @@ -363,7 +363,7 @@ def from_pretrained( path_comps = path.name.split("-") assert path_comps[-2].startswith("g") groupsize = int(path_comps[-2][1:]) - from ...tools.llama.quantize import WeightOnlyInt4QuantHandler + from tools.llama.quantize import WeightOnlyInt4QuantHandler simple_quantizer = WeightOnlyInt4QuantHandler(model, groupsize) model = simple_quantizer.convert_for_runtime() diff --git a/xinference/thirdparty/fish_speech/fish_speech/models/vqgan/__init__.py b/xinference/thirdparty/fish_speech/fish_speech/models/vqgan/__init__.py index 401c6df468..e69de29bb2 100644 --- a/xinference/thirdparty/fish_speech/fish_speech/models/vqgan/__init__.py +++ b/xinference/thirdparty/fish_speech/fish_speech/models/vqgan/__init__.py @@ -1,3 +0,0 @@ -from .lit_module import VQGAN - -__all__ = ["VQGAN"] diff --git a/xinference/thirdparty/fish_speech/fish_speech/models/vqgan/lit_module.py b/xinference/thirdparty/fish_speech/fish_speech/models/vqgan/lit_module.py deleted file mode 100644 index d5fa2ccabb..0000000000 --- a/xinference/thirdparty/fish_speech/fish_speech/models/vqgan/lit_module.py +++ /dev/null @@ -1,442 +0,0 @@ -import itertools -import math -from typing import Any, Callable - -import lightning as L -import torch -import torch.nn.functional as F -# import wandb -from lightning.pytorch.loggers import TensorBoardLogger, WandbLogger -from matplotlib import pyplot as plt -from torch import nn - -from fish_speech.models.vqgan.modules.discriminator import Discriminator -from fish_speech.models.vqgan.modules.wavenet import WaveNet -from fish_speech.models.vqgan.utils import avg_with_mask, plot_mel, sequence_mask - - -class VQGAN(L.LightningModule): - def __init__( - self, - optimizer: Callable, - lr_scheduler: Callable, - encoder: WaveNet, - quantizer: nn.Module, - decoder: WaveNet, - discriminator: Discriminator, - vocoder: nn.Module, - encode_mel_transform: nn.Module, - gt_mel_transform: nn.Module, - weight_adv: float = 1.0, - weight_vq: float = 1.0, - weight_mel: float = 1.0, - sampling_rate: int = 44100, - freeze_encoder: bool = False, - ): - super().__init__() - - # Model parameters - self.optimizer_builder = optimizer - self.lr_scheduler_builder = lr_scheduler - - # Modules - self.encoder = encoder - self.quantizer = quantizer - 
self.decoder = decoder - self.vocoder = vocoder - self.discriminator = discriminator - self.encode_mel_transform = encode_mel_transform - self.gt_mel_transform = gt_mel_transform - - # A simple linear layer to project quality to condition channels - self.quality_projection = nn.Linear(1, 768) - - # Freeze vocoder - for param in self.vocoder.parameters(): - param.requires_grad = False - - # Loss weights - self.weight_adv = weight_adv - self.weight_vq = weight_vq - self.weight_mel = weight_mel - - # Other parameters - self.sampling_rate = sampling_rate - - # Disable strict loading - self.strict_loading = False - - # If encoder is frozen - if freeze_encoder: - for param in self.encoder.parameters(): - param.requires_grad = False - - for param in self.quantizer.parameters(): - param.requires_grad = False - - self.automatic_optimization = False - - def on_save_checkpoint(self, checkpoint): - # Do not save vocoder - state_dict = checkpoint["state_dict"] - for name in list(state_dict.keys()): - if "vocoder" in name: - state_dict.pop(name) - - def configure_optimizers(self): - optimizer_generator = self.optimizer_builder( - itertools.chain( - self.encoder.parameters(), - self.quantizer.parameters(), - self.decoder.parameters(), - self.quality_projection.parameters(), - ) - ) - optimizer_discriminator = self.optimizer_builder( - self.discriminator.parameters() - ) - - lr_scheduler_generator = self.lr_scheduler_builder(optimizer_generator) - lr_scheduler_discriminator = self.lr_scheduler_builder(optimizer_discriminator) - - return ( - { - "optimizer": optimizer_generator, - "lr_scheduler": { - "scheduler": lr_scheduler_generator, - "interval": "step", - "name": "optimizer/generator", - }, - }, - { - "optimizer": optimizer_discriminator, - "lr_scheduler": { - "scheduler": lr_scheduler_discriminator, - "interval": "step", - "name": "optimizer/discriminator", - }, - }, - ) - - def training_step(self, batch, batch_idx): - optim_g, optim_d = self.optimizers() - - audios, audio_lengths = batch["audios"], batch["audio_lengths"] - - audios = audios.float() - audios = audios[:, None, :] - - with torch.no_grad(): - encoded_mels = self.encode_mel_transform(audios) - gt_mels = self.gt_mel_transform(audios) - quality = ((gt_mels.mean(-1) > -8).sum(-1) - 90) / 10 - quality = quality.unsqueeze(-1) - - mel_lengths = audio_lengths // self.gt_mel_transform.hop_length - mel_masks = sequence_mask(mel_lengths, gt_mels.shape[2]) - mel_masks_float_conv = mel_masks[:, None, :].float() - gt_mels = gt_mels * mel_masks_float_conv - encoded_mels = encoded_mels * mel_masks_float_conv - - # Encode - encoded_features = self.encoder(encoded_mels) * mel_masks_float_conv - - # Quantize - vq_result = self.quantizer(encoded_features) - loss_vq = getattr("vq_result", "loss", 0.0) - vq_recon_features = vq_result.z * mel_masks_float_conv - vq_recon_features = ( - vq_recon_features + self.quality_projection(quality)[:, :, None] - ) - - # VQ Decode - gen_mel = ( - self.decoder( - torch.randn_like(vq_recon_features) * mel_masks_float_conv, - condition=vq_recon_features, - ) - * mel_masks_float_conv - ) - - # Discriminator - real_logits = self.discriminator(gt_mels) - fake_logits = self.discriminator(gen_mel.detach()) - d_mask = F.interpolate( - mel_masks_float_conv, size=(real_logits.shape[2],), mode="nearest" - ) - - loss_real = avg_with_mask((real_logits - 1) ** 2, d_mask) - loss_fake = avg_with_mask(fake_logits**2, d_mask) - - loss_d = loss_real + loss_fake - - self.log( - "train/discriminator/loss", - loss_d, - on_step=True, - 
on_epoch=False, - prog_bar=True, - logger=True, - ) - - # Discriminator backward - optim_d.zero_grad() - self.manual_backward(loss_d) - self.clip_gradients( - optim_d, gradient_clip_val=1000.0, gradient_clip_algorithm="norm" - ) - optim_d.step() - - # Mel Loss, applying l1, using a weighted sum - mel_distance = ( - gen_mel - gt_mels - ).abs() # * 0.5 + self.ssim(gen_mel, gt_mels) * 0.5 - loss_mel_low_freq = avg_with_mask(mel_distance[:, :40, :], mel_masks_float_conv) - loss_mel_mid_freq = avg_with_mask( - mel_distance[:, 40:70, :], mel_masks_float_conv - ) - loss_mel_high_freq = avg_with_mask( - mel_distance[:, 70:, :], mel_masks_float_conv - ) - loss_mel = ( - loss_mel_low_freq * 0.6 + loss_mel_mid_freq * 0.3 + loss_mel_high_freq * 0.1 - ) - - # Adversarial Loss - fake_logits = self.discriminator(gen_mel) - loss_adv = avg_with_mask((fake_logits - 1) ** 2, d_mask) - - # Total loss - loss = ( - self.weight_vq * loss_vq - + self.weight_mel * loss_mel - + self.weight_adv * loss_adv - ) - - # Log losses - self.log( - "train/generator/loss", - loss, - on_step=True, - on_epoch=False, - prog_bar=True, - logger=True, - ) - self.log( - "train/generator/loss_vq", - loss_vq, - on_step=True, - on_epoch=False, - prog_bar=False, - logger=True, - ) - self.log( - "train/generator/loss_mel", - loss_mel, - on_step=True, - on_epoch=False, - prog_bar=False, - logger=True, - ) - self.log( - "train/generator/loss_adv", - loss_adv, - on_step=True, - on_epoch=False, - prog_bar=False, - logger=True, - ) - - # Generator backward - optim_g.zero_grad() - self.manual_backward(loss) - self.clip_gradients( - optim_g, gradient_clip_val=1000.0, gradient_clip_algorithm="norm" - ) - optim_g.step() - - scheduler_g, scheduler_d = self.lr_schedulers() - scheduler_g.step() - scheduler_d.step() - - def validation_step(self, batch: Any, batch_idx: int): - audios, audio_lengths = batch["audios"], batch["audio_lengths"] - - audios = audios.float() - audios = audios[:, None, :] - - encoded_mels = self.encode_mel_transform(audios) - gt_mels = self.gt_mel_transform(audios) - - mel_lengths = audio_lengths // self.gt_mel_transform.hop_length - mel_masks = sequence_mask(mel_lengths, gt_mels.shape[2]) - mel_masks_float_conv = mel_masks[:, None, :].float() - gt_mels = gt_mels * mel_masks_float_conv - encoded_mels = encoded_mels * mel_masks_float_conv - - # Encode - encoded_features = self.encoder(encoded_mels) * mel_masks_float_conv - - # Quantize - vq_recon_features = self.quantizer(encoded_features).z * mel_masks_float_conv - vq_recon_features = ( - vq_recon_features - + self.quality_projection( - torch.ones( - vq_recon_features.shape[0], 1, device=vq_recon_features.device - ) - * 2 - )[:, :, None] - ) - - # VQ Decode - gen_aux_mels = ( - self.decoder( - torch.randn_like(vq_recon_features) * mel_masks_float_conv, - condition=vq_recon_features, - ) - * mel_masks_float_conv - ) - loss_mel = avg_with_mask((gen_aux_mels - gt_mels).abs(), mel_masks_float_conv) - - self.log( - "val/loss_mel", - loss_mel, - on_step=False, - on_epoch=True, - prog_bar=False, - logger=True, - sync_dist=True, - ) - - recon_audios = self.vocoder(gt_mels) - gen_aux_audios = self.vocoder(gen_aux_mels) - - # only log the first batch - if batch_idx != 0: - return - - for idx, ( - gt_mel, - gen_aux_mel, - audio, - gen_aux_audio, - recon_audio, - audio_len, - ) in enumerate( - zip( - gt_mels, - gen_aux_mels, - audios.cpu().float(), - gen_aux_audios.cpu().float(), - recon_audios.cpu().float(), - audio_lengths, - ) - ): - if idx > 4: - break - - mel_len = audio_len // 
self.gt_mel_transform.hop_length - - image_mels = plot_mel( - [ - gt_mel[:, :mel_len], - gen_aux_mel[:, :mel_len], - ], - [ - "Ground-Truth", - "Auxiliary", - ], - ) - - if isinstance(self.logger, WandbLogger): - self.logger.experiment.log( - { - "reconstruction_mel": wandb.Image(image_mels, caption="mels"), - "wavs": [ - wandb.Audio( - audio[0, :audio_len], - sample_rate=self.sampling_rate, - caption="gt", - ), - wandb.Audio( - gen_aux_audio[0, :audio_len], - sample_rate=self.sampling_rate, - caption="aux", - ), - wandb.Audio( - recon_audio[0, :audio_len], - sample_rate=self.sampling_rate, - caption="recon", - ), - ], - }, - ) - - if isinstance(self.logger, TensorBoardLogger): - self.logger.experiment.add_figure( - f"sample-{idx}/mels", - image_mels, - global_step=self.global_step, - ) - self.logger.experiment.add_audio( - f"sample-{idx}/wavs/gt", - audio[0, :audio_len], - self.global_step, - sample_rate=self.sampling_rate, - ) - self.logger.experiment.add_audio( - f"sample-{idx}/wavs/gen", - gen_aux_audio[0, :audio_len], - self.global_step, - sample_rate=self.sampling_rate, - ) - self.logger.experiment.add_audio( - f"sample-{idx}/wavs/recon", - recon_audio[0, :audio_len], - self.global_step, - sample_rate=self.sampling_rate, - ) - - plt.close(image_mels) - - def encode(self, audios, audio_lengths): - audios = audios.float() - - mels = self.encode_mel_transform(audios) - mel_lengths = audio_lengths // self.encode_mel_transform.hop_length - mel_masks = sequence_mask(mel_lengths, mels.shape[2]) - mel_masks_float_conv = mel_masks[:, None, :].float() - mels = mels * mel_masks_float_conv - - # Encode - encoded_features = self.encoder(mels) * mel_masks_float_conv - feature_lengths = mel_lengths // math.prod(self.quantizer.downsample_factor) - - return self.quantizer.encode(encoded_features), feature_lengths - - def decode(self, indices, feature_lengths, return_audios=False): - factor = math.prod(self.quantizer.downsample_factor) - mel_masks = sequence_mask(feature_lengths * factor, indices.shape[2] * factor) - mel_masks_float_conv = mel_masks[:, None, :].float() - - z = self.quantizer.decode(indices) * mel_masks_float_conv - z = ( - z - + self.quality_projection(torch.ones(z.shape[0], 1, device=z.device) * 2)[ - :, :, None - ] - ) - - gen_mel = ( - self.decoder( - torch.randn_like(z) * mel_masks_float_conv, - condition=z, - ) - * mel_masks_float_conv - ) - - if return_audios: - return self.vocoder(gen_mel) - - return gen_mel diff --git a/xinference/thirdparty/fish_speech/fish_speech/models/vqgan/modules/discriminator.py b/xinference/thirdparty/fish_speech/fish_speech/models/vqgan/modules/discriminator.py deleted file mode 100644 index 69c7df4103..0000000000 --- a/xinference/thirdparty/fish_speech/fish_speech/models/vqgan/modules/discriminator.py +++ /dev/null @@ -1,44 +0,0 @@ -import torch -from torch import nn -from torch.nn.utils.parametrizations import weight_norm - - -class Discriminator(nn.Module): - def __init__(self): - super().__init__() - - blocks = [] - convs = [ - (1, 64, (3, 9), 1, (1, 4)), - (64, 128, (3, 9), (1, 2), (1, 4)), - (128, 256, (3, 9), (1, 2), (1, 4)), - (256, 512, (3, 9), (1, 2), (1, 4)), - (512, 1024, (3, 3), 1, (1, 1)), - (1024, 1, (3, 3), 1, (1, 1)), - ] - - for idx, (in_channels, out_channels, kernel_size, stride, padding) in enumerate( - convs - ): - blocks.append( - weight_norm( - nn.Conv2d(in_channels, out_channels, kernel_size, stride, padding) - ) - ) - - if idx != len(convs) - 1: - blocks.append(nn.SiLU(inplace=True)) - - self.blocks = nn.Sequential(*blocks) - 
- def forward(self, x): - return self.blocks(x[:, None])[:, 0] - - -if __name__ == "__main__": - model = Discriminator() - print(sum(p.numel() for p in model.parameters()) / 1_000_000) - x = torch.randn(1, 128, 1024) - y = model(x) - print(y.shape) - print(y) diff --git a/xinference/thirdparty/fish_speech/fish_speech/models/vqgan/modules/firefly.py b/xinference/thirdparty/fish_speech/fish_speech/models/vqgan/modules/firefly.py index 4ca0ff5882..aa21839b54 100644 --- a/xinference/thirdparty/fish_speech/fish_speech/models/vqgan/modules/firefly.py +++ b/xinference/thirdparty/fish_speech/fish_speech/models/vqgan/modules/firefly.py @@ -1,25 +1,26 @@ -# A inference only version of the FireflyGAN model - import math from functools import partial from math import prod from typing import Callable -import numpy as np import torch import torch.nn.functional as F from torch import nn -from torch.nn import Conv1d from torch.nn.utils.parametrizations import weight_norm from torch.nn.utils.parametrize import remove_parametrizations from torch.utils.checkpoint import checkpoint -from fish_speech.models.vqgan.utils import sequence_mask + +def sequence_mask(length, max_length=None): + if max_length is None: + max_length = length.max() + x = torch.arange(max_length, dtype=length.dtype, device=length.device) + return x.unsqueeze(0) < length.unsqueeze(1) def init_weights(m, mean=0.0, std=0.01): classname = m.__class__.__name__ - if classname.find("Conv") != -1: + if classname.find("Conv1D") != -1: m.weight.data.normal_(mean, std) @@ -27,78 +28,141 @@ def get_padding(kernel_size, dilation=1): return (kernel_size * dilation - dilation) // 2 +def unpad1d(x: torch.Tensor, paddings: tuple[int, int]): + """Remove padding from x, handling properly zero padding. Only for 1d!""" + padding_left, padding_right = paddings + assert padding_left >= 0 and padding_right >= 0, (padding_left, padding_right) + assert (padding_left + padding_right) <= x.shape[-1] + end = x.shape[-1] - padding_right + return x[..., padding_left:end] + + +def get_extra_padding_for_conv1d( + x: torch.Tensor, kernel_size: int, stride: int, padding_total: int = 0 +) -> int: + """See `pad_for_conv1d`.""" + length = x.shape[-1] + n_frames = (length - kernel_size + padding_total) / stride + 1 + ideal_length = (math.ceil(n_frames) - 1) * stride + (kernel_size - padding_total) + return ideal_length - length + + +def pad1d( + x: torch.Tensor, + paddings: tuple[int, int], + mode: str = "zeros", + value: float = 0.0, +): + """Tiny wrapper around F.pad, just to allow for reflect padding on small input. + If this is the case, we insert extra 0 padding to the right + before the reflection happen. 
+ """ + length = x.shape[-1] + padding_left, padding_right = paddings + assert padding_left >= 0 and padding_right >= 0, (padding_left, padding_right) + if mode == "reflect": + max_pad = max(padding_left, padding_right) + extra_pad = 0 + if length <= max_pad: + extra_pad = max_pad - length + 1 + x = F.pad(x, (0, extra_pad)) + padded = F.pad(x, paddings, mode, value) + end = padded.shape[-1] - extra_pad + return padded[..., :end] + else: + return F.pad(x, paddings, mode, value) + + +class FishConvNet(nn.Module): + def __init__( + self, in_channels, out_channels, kernel_size, dilation=1, stride=1, groups=1 + ): + super(FishConvNet, self).__init__() + self.conv = nn.Conv1d( + in_channels, + out_channels, + kernel_size, + stride=stride, + dilation=dilation, + groups=groups, + ) + self.stride = stride + self.kernel_size = (kernel_size - 1) * dilation + 1 + self.dilation = dilation + + def forward(self, x): + pad = self.kernel_size - self.stride + extra_padding = get_extra_padding_for_conv1d( + x, self.kernel_size, self.stride, pad + ) + x = pad1d(x, (pad, extra_padding), mode="constant", value=0) + return self.conv(x).contiguous() + + def weight_norm(self, name="weight", dim=0): + self.conv = weight_norm(self.conv, name=name, dim=dim) + return self + + def remove_weight_norm(self): + self.conv = remove_parametrizations(self.conv) + return self + + +class FishTransConvNet(nn.Module): + def __init__(self, in_channels, out_channels, kernel_size, dilation=1, stride=1): + super(FishTransConvNet, self).__init__() + self.conv = nn.ConvTranspose1d( + in_channels, out_channels, kernel_size, stride=stride, dilation=dilation + ) + self.stride = stride + self.kernel_size = kernel_size + + def forward(self, x): + x = self.conv(x) + pad = self.kernel_size - self.stride + padding_right = math.ceil(pad) + padding_left = pad - padding_right + x = unpad1d(x, (padding_left, padding_right)) + return x.contiguous() + + def weight_norm(self, name="weight", dim=0): + self.conv = weight_norm(self.conv, name=name, dim=dim) + return self + + def remove_weight_norm(self): + self.conv = remove_parametrizations(self.conv) + return self + + class ResBlock1(torch.nn.Module): def __init__(self, channels, kernel_size=3, dilation=(1, 3, 5)): super().__init__() self.convs1 = nn.ModuleList( [ - weight_norm( - Conv1d( - channels, - channels, - kernel_size, - 1, - dilation=dilation[0], - padding=get_padding(kernel_size, dilation[0]), - ) - ), - weight_norm( - Conv1d( - channels, - channels, - kernel_size, - 1, - dilation=dilation[1], - padding=get_padding(kernel_size, dilation[1]), - ) - ), - weight_norm( - Conv1d( - channels, - channels, - kernel_size, - 1, - dilation=dilation[2], - padding=get_padding(kernel_size, dilation[2]), - ) - ), + FishConvNet( + channels, channels, kernel_size, stride=1, dilation=dilation[0] + ).weight_norm(), + FishConvNet( + channels, channels, kernel_size, stride=1, dilation=dilation[1] + ).weight_norm(), + FishConvNet( + channels, channels, kernel_size, stride=1, dilation=dilation[2] + ).weight_norm(), ] ) self.convs1.apply(init_weights) self.convs2 = nn.ModuleList( [ - weight_norm( - Conv1d( - channels, - channels, - kernel_size, - 1, - dilation=1, - padding=get_padding(kernel_size, 1), - ) - ), - weight_norm( - Conv1d( - channels, - channels, - kernel_size, - 1, - dilation=1, - padding=get_padding(kernel_size, 1), - ) - ), - weight_norm( - Conv1d( - channels, - channels, - kernel_size, - 1, - dilation=1, - padding=get_padding(kernel_size, 1), - ) - ), + FishConvNet( + channels, channels, 
kernel_size, stride=1, dilation=dilation[0] + ).weight_norm(), + FishConvNet( + channels, channels, kernel_size, stride=1, dilation=dilation[1] + ).weight_norm(), + FishConvNet( + channels, channels, kernel_size, stride=1, dilation=dilation[2] + ).weight_norm(), ] ) self.convs2.apply(init_weights) @@ -119,7 +183,7 @@ def remove_parametrizations(self): remove_parametrizations(conv, tensor_name="weight") -class ParralelBlock(nn.Module): +class ParallelBlock(nn.Module): def __init__( self, channels: int, @@ -153,7 +217,6 @@ def __init__( resblock_dilation_sizes: tuple[tuple[int]] = ((1, 3, 5), (1, 3, 5), (1, 3, 5)), num_mels: int = 128, upsample_initial_channel: int = 512, - use_template: bool = True, pre_conv_kernel_size: int = 7, post_conv_kernel_size: int = 7, post_activation: Callable = partial(nn.SiLU, inplace=True), @@ -164,85 +227,51 @@ def __init__( prod(upsample_rates) == hop_length ), f"hop_length must be {prod(upsample_rates)}" - self.conv_pre = weight_norm( - nn.Conv1d( - num_mels, - upsample_initial_channel, - pre_conv_kernel_size, - 1, - padding=get_padding(pre_conv_kernel_size), - ) - ) + self.conv_pre = FishConvNet( + num_mels, + upsample_initial_channel, + pre_conv_kernel_size, + stride=1, + ).weight_norm() self.num_upsamples = len(upsample_rates) self.num_kernels = len(resblock_kernel_sizes) self.noise_convs = nn.ModuleList() - self.use_template = use_template self.ups = nn.ModuleList() for i, (u, k) in enumerate(zip(upsample_rates, upsample_kernel_sizes)): - c_cur = upsample_initial_channel // (2 ** (i + 1)) self.ups.append( - weight_norm( - nn.ConvTranspose1d( - upsample_initial_channel // (2**i), - upsample_initial_channel // (2 ** (i + 1)), - k, - u, - padding=(k - u) // 2, - ) - ) + FishTransConvNet( + upsample_initial_channel // (2**i), + upsample_initial_channel // (2 ** (i + 1)), + k, + stride=u, + ).weight_norm() ) - if not use_template: - continue - - if i + 1 < len(upsample_rates): - stride_f0 = np.prod(upsample_rates[i + 1 :]) - self.noise_convs.append( - Conv1d( - 1, - c_cur, - kernel_size=stride_f0 * 2, - stride=stride_f0, - padding=stride_f0 // 2, - ) - ) - else: - self.noise_convs.append(Conv1d(1, c_cur, kernel_size=1)) - self.resblocks = nn.ModuleList() for i in range(len(self.ups)): ch = upsample_initial_channel // (2 ** (i + 1)) self.resblocks.append( - ParralelBlock(ch, resblock_kernel_sizes, resblock_dilation_sizes) + ParallelBlock(ch, resblock_kernel_sizes, resblock_dilation_sizes) ) self.activation_post = post_activation() - self.conv_post = weight_norm( - nn.Conv1d( - ch, - 1, - post_conv_kernel_size, - 1, - padding=get_padding(post_conv_kernel_size), - ) - ) + self.conv_post = FishConvNet( + ch, 1, post_conv_kernel_size, stride=1 + ).weight_norm() self.ups.apply(init_weights) self.conv_post.apply(init_weights) - def forward(self, x, template=None): + def forward(self, x): x = self.conv_pre(x) for i in range(self.num_upsamples): x = F.silu(x, inplace=True) x = self.ups[i](x) - if self.use_template: - x = x + self.noise_convs[i](template) - - if self.training: + if self.training and self.checkpointing: x = checkpoint( self.resblocks[i], x, @@ -364,11 +393,11 @@ def __init__( ): super().__init__() - self.dwconv = nn.Conv1d( + self.dwconv = FishConvNet( dim, dim, kernel_size=kernel_size, - padding=int(dilation * (kernel_size - 1) / 2), + # padding=int(dilation * (kernel_size - 1) / 2), groups=dim, ) # depthwise conv self.norm = LayerNorm(dim, eps=1e-6) @@ -421,12 +450,13 @@ def __init__( self.downsample_layers = nn.ModuleList() stem = nn.Sequential( - 
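# Editorial sketch, not part of the patch: the invariant asserted in
# HiFiGANGenerator.__init__ above -- the upsample rates must multiply out to the
# mel hop length, so each mel frame expands to exactly hop_length audio samples.
# The (8, 8, 2, 2, 2) / 512 pairing is the combination that appears elsewhere in
# this file and is used here only as an illustrative assumption.
from math import prod

upsample_rates = (8, 8, 2, 2, 2)
hop_length = 512
assert prod(upsample_rates) == hop_length  # 8 * 8 * 2 * 2 * 2 == 512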
nn.Conv1d( + FishConvNet( input_channels, dims[0], - kernel_size=kernel_size, - padding=kernel_size // 2, - padding_mode="zeros", + kernel_size=7, + # padding=3, + # padding_mode="replicate", + # padding_mode="zeros", ), LayerNorm(dims[0], eps=1e-6, data_format="channels_first"), ) @@ -491,6 +521,7 @@ def __init__( self.head = head self.quantizer = quantizer self.spec_transform = spec_transform + self.downsample_factor = math.prod(self.quantizer.downsample_factor) def forward(self, x: torch.Tensor, template=None, mask=None) -> torch.Tensor: if self.spec_transform is not None: @@ -512,7 +543,7 @@ def forward(self, x: torch.Tensor, template=None, mask=None) -> torch.Tensor: if x.ndim == 2: x = x[:, None, :] - if self.quantizer is not None: + if self.vq is not None: return x, vq_result return x @@ -528,25 +559,30 @@ def encode(self, audios, audio_lengths): # Encode encoded_features = self.backbone(mels) * mel_masks_float_conv - feature_lengths = mel_lengths // math.prod(self.quantizer.downsample_factor) + feature_lengths = mel_lengths // self.downsample_factor return self.quantizer.encode(encoded_features), feature_lengths def decode(self, indices, feature_lengths) -> torch.Tensor: - factor = math.prod(self.quantizer.downsample_factor) - mel_masks = sequence_mask(feature_lengths * factor, indices.shape[2] * factor) + mel_masks = sequence_mask( + feature_lengths * self.downsample_factor, + indices.shape[2] * self.downsample_factor, + ) mel_masks_float_conv = mel_masks[:, None, :].float() + audio_lengths = ( + feature_lengths * self.downsample_factor * self.spec_transform.hop_length + ) audio_masks = sequence_mask( - feature_lengths * factor * self.spec_transform.hop_length, - indices.shape[2] * factor * self.spec_transform.hop_length, + audio_lengths, + indices.shape[2] * self.downsample_factor * self.spec_transform.hop_length, ) audio_masks_float_conv = audio_masks[:, None, :].float() z = self.quantizer.decode(indices) * mel_masks_float_conv x = self.head(z) * audio_masks_float_conv - return x + return x, audio_lengths def remove_parametrizations(self): if hasattr(self.backbone, "remove_parametrizations"): @@ -558,68 +594,3 @@ def remove_parametrizations(self): @property def device(self): return next(self.parameters()).device - - -class FireflyBase(nn.Module): - def __init__(self, ckpt_path: str = None, pretrained: bool = True): - super().__init__() - - self.backbone = ConvNeXtEncoder( - input_channels=128, - depths=[3, 3, 9, 3], - dims=[128, 256, 384, 512], - drop_path_rate=0.2, - kernel_size=7, - ) - - self.head = HiFiGANGenerator( - hop_length=512, - upsample_rates=[8, 8, 2, 2, 2], - upsample_kernel_sizes=[16, 16, 4, 4, 4], - resblock_kernel_sizes=[3, 7, 11], - resblock_dilation_sizes=[[1, 3, 5], [1, 3, 5], [1, 3, 5]], - num_mels=512, - upsample_initial_channel=512, - use_template=False, - pre_conv_kernel_size=13, - post_conv_kernel_size=13, - ) - - if ckpt_path is not None: - state_dict = torch.load(ckpt_path, map_location="cpu") - elif pretrained: - state_dict = torch.hub.load_state_dict_from_url( - "https://github.com/fishaudio/vocoder/releases/download/1.0.0/firefly-gan-base-generator.ckpt", - map_location="cpu", - model_dir="checkpoints", - ) - - if "state_dict" in state_dict: - state_dict = state_dict["state_dict"] - - if any("generator." in k for k in state_dict): - state_dict = { - k.replace("generator.", ""): v - for k, v in state_dict.items() - if "generator." 
in k - } - - self.load_state_dict(state_dict, strict=True) - self.head.remove_parametrizations() - - @torch.no_grad() - def forward(self, x: torch.Tensor) -> torch.Tensor: - x = self.backbone(x) - x = self.head(x) - if x.ndim == 2: - x = x[:, None, :] - return x - - -if __name__ == "__main__": - model = FireflyBase() - model.eval() - x = torch.randn(1, 128, 128) - with torch.no_grad(): - y = model(x) - print(y.shape) diff --git a/xinference/thirdparty/fish_speech/fish_speech/models/vqgan/modules/fsq.py b/xinference/thirdparty/fish_speech/fish_speech/models/vqgan/modules/fsq.py index c837d6aee5..7ea4853376 100644 --- a/xinference/thirdparty/fish_speech/fish_speech/models/vqgan/modules/fsq.py +++ b/xinference/thirdparty/fish_speech/fish_speech/models/vqgan/modules/fsq.py @@ -6,7 +6,7 @@ from einops import rearrange from vector_quantize_pytorch import GroupedResidualFSQ -from .firefly import ConvNeXtBlock +from .firefly import ConvNeXtBlock, FishConvNet, FishTransConvNet @dataclass @@ -20,7 +20,7 @@ class DownsampleFiniteScalarQuantize(nn.Module): def __init__( self, input_dim: int = 512, - n_codebooks: int = 1, + n_codebooks: int = 9, n_groups: int = 1, levels: tuple[int] = (8, 5, 5, 5), # Approximate 2**10 downsample_factor: tuple[int] = (2, 2), @@ -46,7 +46,7 @@ def __init__( self.downsample = nn.Sequential( *[ nn.Sequential( - nn.Conv1d( + FishConvNet( all_dims[idx], all_dims[idx + 1], kernel_size=factor, @@ -61,7 +61,7 @@ def __init__( self.upsample = nn.Sequential( *[ nn.Sequential( - nn.ConvTranspose1d( + FishTransConvNet( all_dims[idx + 1], all_dims[idx], kernel_size=factor, @@ -114,26 +114,3 @@ def decode(self, indices: torch.Tensor): z_q = self.residual_fsq.get_output_from_indices(indices) z_q = self.upsample(z_q.mT) return z_q - - # def from_latents(self, latents: torch.Tensor): - # z_q, z_p, codes = super().from_latents(latents) - # z_q = self.upsample(z_q) - # return z_q, z_p, codes - - -if __name__ == "__main__": - rvq = DownsampleFiniteScalarQuantize( - n_codebooks=1, - downsample_factor=(2, 2), - ) - x = torch.randn(16, 512, 80) - - result = rvq(x) - print(rvq) - print(result.latents.shape, result.codes.shape, result.z.shape) - - # y = rvq.from_codes(result.codes) - # print(y[0].shape) - - # y = rvq.from_latents(result.latents) - # print(y[0].shape) diff --git a/xinference/thirdparty/fish_speech/fish_speech/models/vqgan/modules/reference.py b/xinference/thirdparty/fish_speech/fish_speech/models/vqgan/modules/reference.py deleted file mode 100644 index 0d9c8c8359..0000000000 --- a/xinference/thirdparty/fish_speech/fish_speech/models/vqgan/modules/reference.py +++ /dev/null @@ -1,115 +0,0 @@ -from typing import Optional - -import torch -import torch.nn.functional as F -from torch import nn - -from fish_speech.utils import autocast_exclude_mps - -from .wavenet import WaveNet - - -class ReferenceEncoder(WaveNet): - def __init__( - self, - input_channels: Optional[int] = None, - output_channels: Optional[int] = None, - residual_channels: int = 512, - residual_layers: int = 20, - dilation_cycle: Optional[int] = 4, - num_heads: int = 8, - latent_len: int = 4, - ): - super().__init__( - input_channels=input_channels, - residual_channels=residual_channels, - residual_layers=residual_layers, - dilation_cycle=dilation_cycle, - ) - - self.head_dim = residual_channels // num_heads - self.num_heads = num_heads - - self.latent_len = latent_len - self.latent = nn.Parameter(torch.zeros(1, self.latent_len, residual_channels)) - - self.q = nn.Linear(residual_channels, residual_channels, 
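# Editorial sketch, not part of the patch: the frame-rate arithmetic behind the
# new "firefly-gan-vq-fsq-8x1024-21hz" checkpoint name this patch switches to.
# The 44.1 kHz sample rate, the hop_length of 512 and the quantizer's default
# downsample_factor of (2, 2) are taken from other hunks in this diff and
# should be read as assumptions for this illustration.
import math

sample_rate = 44_100
hop_length = 512
downsample_factor = math.prod((2, 2))

codes_per_second = sample_rate / (hop_length * downsample_factor)
print(round(codes_per_second, 1))  # ~21.5, i.e. the "21hz" in the checkpoint name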
bias=True) - self.kv = nn.Linear(residual_channels, residual_channels * 2, bias=True) - self.q_norm = nn.LayerNorm(self.head_dim) - self.k_norm = nn.LayerNorm(self.head_dim) - self.proj = nn.Linear(residual_channels, residual_channels) - self.proj_drop = nn.Dropout(0.1) - - self.norm = nn.LayerNorm(residual_channels) - self.mlp = nn.Sequential( - nn.Linear(residual_channels, residual_channels * 4), - nn.SiLU(), - nn.Linear(residual_channels * 4, residual_channels), - ) - self.output_projection_attn = nn.Linear(residual_channels, output_channels) - - torch.nn.init.trunc_normal_(self.latent, std=0.02) - self.apply(self.init_weights) - - def init_weights(self, m): - if isinstance(m, nn.Linear): - torch.nn.init.trunc_normal_(m.weight, std=0.02) - if m.bias is not None: - torch.nn.init.constant_(m.bias, 0) - - def forward(self, x, attn_mask=None): - x = super().forward(x).mT - B, N, C = x.shape - - # Calculate mask - if attn_mask is not None: - assert attn_mask.shape == (B, N) and attn_mask.dtype == torch.bool - - attn_mask = attn_mask[:, None, None, :].expand( - B, self.num_heads, self.latent_len, N - ) - - q_latent = self.latent.expand(B, -1, -1) - q = ( - self.q(q_latent) - .reshape(B, self.latent_len, self.num_heads, self.head_dim) - .transpose(1, 2) - ) - - kv = ( - self.kv(x) - .reshape(B, N, 2, self.num_heads, self.head_dim) - .permute(2, 0, 3, 1, 4) - ) - k, v = kv.unbind(0) - - q, k = self.q_norm(q), self.k_norm(k) - x = F.scaled_dot_product_attention(q, k, v, attn_mask=attn_mask) - - x = x.transpose(1, 2).reshape(B, self.latent_len, C) - x = self.proj(x) - x = self.proj_drop(x) - - x = x + self.mlp(self.norm(x)) - x = self.output_projection_attn(x) - x = x.mean(1) - - return x - - -if __name__ == "__main__": - with autocast_exclude_mps(device_type="cpu", dtype=torch.bfloat16): - model = ReferenceEncoder( - input_channels=128, - output_channels=64, - residual_channels=384, - residual_layers=20, - dilation_cycle=4, - num_heads=8, - ) - x = torch.randn(4, 128, 64) - mask = torch.ones(4, 64, dtype=torch.bool) - y = model(x, mask) - print(y.shape) - loss = F.mse_loss(y, torch.randn(4, 64)) - loss.backward() diff --git a/xinference/thirdparty/fish_speech/fish_speech/models/vqgan/modules/wavenet.py b/xinference/thirdparty/fish_speech/fish_speech/models/vqgan/modules/wavenet.py deleted file mode 100644 index e7cc011c3e..0000000000 --- a/xinference/thirdparty/fish_speech/fish_speech/models/vqgan/modules/wavenet.py +++ /dev/null @@ -1,225 +0,0 @@ -import math -from typing import Optional - -import torch -import torch.nn.functional as F -from torch import nn - - -class Mish(nn.Module): - def forward(self, x): - return x * torch.tanh(F.softplus(x)) - - -class DiffusionEmbedding(nn.Module): - """Diffusion Step Embedding""" - - def __init__(self, d_denoiser): - super(DiffusionEmbedding, self).__init__() - self.dim = d_denoiser - - def forward(self, x): - device = x.device - half_dim = self.dim // 2 - emb = math.log(10000) / (half_dim - 1) - emb = torch.exp(torch.arange(half_dim, device=device) * -emb) - emb = x[:, None] * emb[None, :] - emb = torch.cat((emb.sin(), emb.cos()), dim=-1) - return emb - - -class LinearNorm(nn.Module): - """LinearNorm Projection""" - - def __init__(self, in_features, out_features, bias=False): - super(LinearNorm, self).__init__() - self.linear = nn.Linear(in_features, out_features, bias) - - nn.init.xavier_uniform_(self.linear.weight) - if bias: - nn.init.constant_(self.linear.bias, 0.0) - - def forward(self, x): - x = self.linear(x) - return x - - -class 
ConvNorm(nn.Module): - """1D Convolution""" - - def __init__( - self, - in_channels, - out_channels, - kernel_size=1, - stride=1, - padding=None, - dilation=1, - bias=True, - w_init_gain="linear", - ): - super(ConvNorm, self).__init__() - - if padding is None: - assert kernel_size % 2 == 1 - padding = int(dilation * (kernel_size - 1) / 2) - - self.conv = nn.Conv1d( - in_channels, - out_channels, - kernel_size=kernel_size, - stride=stride, - padding=padding, - dilation=dilation, - bias=bias, - ) - nn.init.kaiming_normal_(self.conv.weight) - - def forward(self, signal): - conv_signal = self.conv(signal) - - return conv_signal - - -class ResidualBlock(nn.Module): - """Residual Block""" - - def __init__( - self, - residual_channels, - use_linear_bias=False, - dilation=1, - condition_channels=None, - ): - super(ResidualBlock, self).__init__() - self.conv_layer = ConvNorm( - residual_channels, - 2 * residual_channels, - kernel_size=3, - stride=1, - padding=dilation, - dilation=dilation, - ) - - if condition_channels is not None: - self.diffusion_projection = LinearNorm( - residual_channels, residual_channels, use_linear_bias - ) - self.condition_projection = ConvNorm( - condition_channels, 2 * residual_channels, kernel_size=1 - ) - - self.output_projection = ConvNorm( - residual_channels, 2 * residual_channels, kernel_size=1 - ) - - def forward(self, x, condition=None, diffusion_step=None): - y = x - - if diffusion_step is not None: - diffusion_step = self.diffusion_projection(diffusion_step).unsqueeze(-1) - y = y + diffusion_step - - y = self.conv_layer(y) - - if condition is not None: - condition = self.condition_projection(condition) - y = y + condition - - gate, filter = torch.chunk(y, 2, dim=1) - y = torch.sigmoid(gate) * torch.tanh(filter) - - y = self.output_projection(y) - residual, skip = torch.chunk(y, 2, dim=1) - - return (x + residual) / math.sqrt(2.0), skip - - -class WaveNet(nn.Module): - def __init__( - self, - input_channels: Optional[int] = None, - output_channels: Optional[int] = None, - residual_channels: int = 512, - residual_layers: int = 20, - dilation_cycle: Optional[int] = 4, - is_diffusion: bool = False, - condition_channels: Optional[int] = None, - ): - super().__init__() - - # Input projection - self.input_projection = None - if input_channels is not None and input_channels != residual_channels: - self.input_projection = ConvNorm( - input_channels, residual_channels, kernel_size=1 - ) - - if input_channels is None: - input_channels = residual_channels - - self.input_channels = input_channels - - # Residual layers - self.residual_layers = nn.ModuleList( - [ - ResidualBlock( - residual_channels=residual_channels, - use_linear_bias=False, - dilation=2 ** (i % dilation_cycle) if dilation_cycle else 1, - condition_channels=condition_channels, - ) - for i in range(residual_layers) - ] - ) - - # Skip projection - self.skip_projection = ConvNorm( - residual_channels, residual_channels, kernel_size=1 - ) - - # Output projection - self.output_projection = None - if output_channels is not None and output_channels != residual_channels: - self.output_projection = ConvNorm( - residual_channels, output_channels, kernel_size=1 - ) - - if is_diffusion: - self.diffusion_embedding = DiffusionEmbedding(residual_channels) - self.mlp = nn.Sequential( - LinearNorm(residual_channels, residual_channels * 4, False), - Mish(), - LinearNorm(residual_channels * 4, residual_channels, False), - ) - - self.apply(self._init_weights) - - def _init_weights(self, m): - if isinstance(m, (nn.Conv1d, 
nn.Linear)): - nn.init.trunc_normal_(m.weight, std=0.02) - if getattr(m, "bias", None) is not None: - nn.init.constant_(m.bias, 0) - - def forward(self, x, t=None, condition=None): - if self.input_projection is not None: - x = self.input_projection(x) - x = F.silu(x) - - if t is not None: - t = self.diffusion_embedding(t) - t = self.mlp(t) - - skip = [] - for layer in self.residual_layers: - x, skip_connection = layer(x, condition, t) - skip.append(skip_connection) - - x = torch.sum(torch.stack(skip), dim=0) / math.sqrt(len(self.residual_layers)) - x = self.skip_projection(x) - - if self.output_projection is not None: - x = F.silu(x) - x = self.output_projection(x) - - return x diff --git a/xinference/thirdparty/fish_speech/fish_speech/text/clean.py b/xinference/thirdparty/fish_speech/fish_speech/text/clean.py index 76d9dc9033..c228dfcd13 100644 --- a/xinference/thirdparty/fish_speech/fish_speech/text/clean.py +++ b/xinference/thirdparty/fish_speech/fish_speech/text/clean.py @@ -1,61 +1,24 @@ -import itertools import re -LANGUAGE_UNICODE_RANGE_MAP = { - "ZH": [(0x4E00, 0x9FFF)], - "JP": [(0x4E00, 0x9FFF), (0x3040, 0x309F), (0x30A0, 0x30FF), (0x31F0, 0x31FF)], - "EN": [(0x0000, 0x007F)], -} - SYMBOLS_MAPPING = { - ":": ",", - ";": ",", - ",": ",", - "。": ".", - "!": "!", - "?": "?", - "\n": ".", - "·": ",", - "、": ",", - "...": "…", "“": "'", "”": "'", "‘": "'", "’": "'", - "(": "'", - ")": "'", - "(": "'", - ")": "'", - "《": "'", - "》": "'", - "【": "'", - "】": "'", - "[": "'", - "]": "'", - "—": "-", - "~": "-", - "~": "-", - "・": "-", - "「": "'", - "」": "'", - ";": ",", - ":": ",", + "【": "", + "】": "", + "[": "", + "]": "", + "(": "", + ")": "", + "(": "", + ")": "", + "・": "·", } REPLACE_SYMBOL_REGEX = re.compile( "|".join(re.escape(p) for p in SYMBOLS_MAPPING.keys()) ) -ALL_KNOWN_UTF8_RANGE = list( - itertools.chain.from_iterable(LANGUAGE_UNICODE_RANGE_MAP.values()) -) -REMOVE_UNKNOWN_SYMBOL_REGEX = re.compile( - "[^" - + "".join( - f"{re.escape(chr(start))}-{re.escape(chr(end))}" - for start, end in ALL_KNOWN_UTF8_RANGE - ) - + "]" -) def clean_text(text): @@ -64,6 +27,5 @@ def clean_text(text): # Replace all chinese symbols with their english counterparts text = REPLACE_SYMBOL_REGEX.sub(lambda x: SYMBOLS_MAPPING[x.group()], text) - text = REMOVE_UNKNOWN_SYMBOL_REGEX.sub("", text) return text diff --git a/xinference/thirdparty/fish_speech/fish_speech/text/spliter.py b/xinference/thirdparty/fish_speech/fish_speech/text/spliter.py index 5528cd3a63..d4bb995487 100644 --- a/xinference/thirdparty/fish_speech/fish_speech/text/spliter.py +++ b/xinference/thirdparty/fish_speech/fish_speech/text/spliter.py @@ -71,9 +71,9 @@ def split_text(text, length): texts = [text] texts = map(protect_float, texts) - texts = break_text(texts, length, {".", "!", "?"}) + texts = break_text(texts, length, {".", "!", "?", "。", "!", "?"}) texts = map(unprotect_float, texts) - texts = break_text(texts, length, {","}) + texts = break_text(texts, length, {",", ","}) texts = break_text(texts, length, {" "}) texts = list(break_text_by_length(texts, length)) diff --git a/xinference/thirdparty/fish_speech/fish_speech/train.py b/xinference/thirdparty/fish_speech/fish_speech/train.py index a6a344097a..41b3642f88 100644 --- a/xinference/thirdparty/fish_speech/fish_speech/train.py +++ b/xinference/thirdparty/fish_speech/fish_speech/train.py @@ -1,4 +1,6 @@ import os + +os.environ["USE_LIBUV"] = "0" import sys from typing import Optional diff --git a/xinference/thirdparty/fish_speech/fish_speech/webui/manage.py 
b/xinference/thirdparty/fish_speech/fish_speech/webui/manage.py index 9c183acd7c..4ec3fcac25 100644 --- a/xinference/thirdparty/fish_speech/fish_speech/webui/manage.py +++ b/xinference/thirdparty/fish_speech/fish_speech/webui/manage.py @@ -1,9 +1,11 @@ from __future__ import annotations +import os + +os.environ["USE_LIBUV"] = "0" import datetime import html import json -import os import platform import shutil import signal @@ -469,7 +471,7 @@ def train_process( "--config-name", "firefly_gan_vq", "--checkpoint-path", - "checkpoints/fish-speech-1.2-sft/firefly-gan-vq-fsq-4x1024-42hz-generator.pth", + "checkpoints/fish-speech-1.4/firefly-gan-vq-fsq-8x1024-21hz-generator.pth", ] ) @@ -485,7 +487,7 @@ def train_process( "16", ] ) - ckpt_path = "checkpoints/fish-speech-1.2-sft/model.pth" + ckpt_path = "checkpoints/fish-speech-1.4/model.pth" lora_prefix = "lora_" if llama_use_lora else "" llama_name = lora_prefix + "text2semantic_" + new_project latest = next( @@ -862,7 +864,7 @@ def llama_quantify(llama_weight, quantify_mode): minimum=1, maximum=32, step=1, - value=4, + value=2, ) llama_data_max_length_slider = gr.Slider( label=i18n("Maximum Length per Sample"), @@ -870,7 +872,7 @@ def llama_quantify(llama_weight, quantify_mode): minimum=1024, maximum=4096, step=128, - value=1024, + value=2048, ) with gr.Row(equal_height=False): llama_precision_dropdown = gr.Dropdown( @@ -925,9 +927,9 @@ def llama_quantify(llama_weight, quantify_mode): "Type the path or select from the dropdown" ), choices=[ - "checkpoints/fish-speech-1.2-sft/model.pth", + "checkpoints/fish-speech-1.4/model.pth", ], - value="checkpoints/fish-speech-1.2-sft/model.pth", + value="checkpoints/fish-speech-1.4/model.pth", allow_custom_value=True, interactive=True, ) @@ -979,7 +981,7 @@ def llama_quantify(llama_weight, quantify_mode): "Type the path or select from the dropdown" ), choices=list_llama_models(), - value="checkpoints/fish-speech-1.2-sft", + value="checkpoints/fish-speech-1.4", allow_custom_value=True, interactive=True, ) @@ -1042,7 +1044,7 @@ def llama_quantify(llama_weight, quantify_mode): "Type the path or select from the dropdown" ), choices=list_decoder_models(), - value="checkpoints/fish-speech-1.2-sft/firefly-gan-vq-fsq-4x1024-42hz-generator.pth", + value="checkpoints/fish-speech-1.4/firefly-gan-vq-fsq-8x1024-21hz-generator.pth", allow_custom_value=True, ) infer_decoder_config = gr.Dropdown( @@ -1060,7 +1062,7 @@ def llama_quantify(llama_weight, quantify_mode): info=i18n( "Type the path or select from the dropdown" ), - value="checkpoints/fish-speech-1.2-sft", + value="checkpoints/fish-speech-1.4", choices=list_llama_models(), allow_custom_value=True, ) diff --git a/xinference/thirdparty/fish_speech/tools/api.py b/xinference/thirdparty/fish_speech/tools/api.py index 29869b267f..7fcc9330ae 100644 --- a/xinference/thirdparty/fish_speech/tools/api.py +++ b/xinference/thirdparty/fish_speech/tools/api.py @@ -9,16 +9,20 @@ from argparse import ArgumentParser from http import HTTPStatus from pathlib import Path -from typing import Annotated, Literal, Optional +from typing import Annotated, Any, Literal, Optional import numpy as np +import ormsgpack # import pyrootutils import soundfile as sf import torch import torchaudio +# from baize.datastructures import ContentType # from kui.asgi import ( # Body, +# FactoryClass, # HTTPException, +# HttpRequest, # HttpView, # JSONResponse, # Kui, @@ -27,14 +31,16 @@ # ) # from kui.asgi.routing import MultimethodRoutes from loguru import logger -from pydantic import BaseModel, Field 
+from pydantic import BaseModel, Field, conint # pyrootutils.setup_root(__file__, indicator=".project-root", pythonpath=True) # from fish_speech.models.vqgan.lit_module import VQGAN from fish_speech.models.vqgan.modules.firefly import FireflyArchitecture +from fish_speech.text.chn_text_norm.text import Text as ChnNormedText from fish_speech.utils import autocast_exclude_mps -# from tools.auto_rerank import batch_asr, calculate_wer, is_chinese, load_model +from tools.commons import ServeReferenceAudio, ServeTTSRequest +from tools.file import AUDIO_EXTENSIONS, audio_to_bytes, list_files, read_ref_text from tools.llama.generate import ( GenerateRequest, GenerateResponse, @@ -82,11 +88,8 @@ async def other_exception_handler(exc: "Exception"): def load_audio(reference_audio, sr): if len(reference_audio) > 255 or not Path(reference_audio).exists(): - try: - audio_data = base64.b64decode(reference_audio) - reference_audio = io.BytesIO(audio_data) - except base64.binascii.Error: - raise ValueError("Invalid path or base64 string") + audio_data = reference_audio + reference_audio = io.BytesIO(audio_data) waveform, original_sr = torchaudio.load( reference_audio, backend="sox" if sys.platform == "linux" else "soundfile" @@ -145,7 +148,7 @@ def decode_vq_tokens( return decoder_model.decode( indices=codes[None], feature_lengths=feature_lengths, - ).squeeze() + )[0].squeeze() raise ValueError(f"Unknown model type: {type(decoder_model)}") @@ -153,58 +156,6 @@ def decode_vq_tokens( # routes = MultimethodRoutes(base_class=HttpView) -def get_random_paths(base_path, data, speaker, emotion): - if base_path and data and speaker and emotion and (Path(base_path).exists()): - if speaker in data and emotion in data[speaker]: - files = data[speaker][emotion] - lab_files = [f for f in files if f.endswith(".lab")] - wav_files = [f for f in files if f.endswith(".wav")] - - if lab_files and wav_files: - selected_lab = random.choice(lab_files) - selected_wav = random.choice(wav_files) - - lab_path = Path(base_path) / speaker / emotion / selected_lab - wav_path = Path(base_path) / speaker / emotion / selected_wav - if lab_path.exists() and wav_path.exists(): - return lab_path, wav_path - - return None, None - - -def load_json(json_file): - if not json_file: - logger.info("Not using a json file") - return None - try: - with open(json_file, "r", encoding="utf-8") as file: - data = json.load(file) - except FileNotFoundError: - logger.warning(f"ref json not found: {json_file}") - data = None - except Exception as e: - logger.warning(f"Loading json failed: {e}") - data = None - return data - - -class InvokeRequest(BaseModel): - text: str = "你说的对, 但是原神是一款由米哈游自主研发的开放世界手游." 
- reference_text: Optional[str] = None - reference_audio: Optional[str] = None - max_new_tokens: int = 1024 - chunk_length: Annotated[int, Field(ge=0, le=500, strict=True)] = 100 - top_p: Annotated[float, Field(ge=0.1, le=1.0, strict=True)] = 0.7 - repetition_penalty: Annotated[float, Field(ge=0.9, le=2.0, strict=True)] = 1.2 - temperature: Annotated[float, Field(ge=0.1, le=1.0, strict=True)] = 0.7 - emotion: Optional[str] = None - format: Literal["wav", "mp3", "flac"] = "wav" - streaming: bool = False - ref_json: Optional[str] = "ref_data.json" - ref_base: Optional[str] = "ref_data" - speaker: Optional[str] = None - - def get_content_type(audio_format): if audio_format == "wav": return "audio/wav" @@ -217,35 +168,52 @@ def get_content_type(audio_format): @torch.inference_mode() -def inference(req: InvokeRequest): - # Parse reference audio aka prompt - prompt_tokens = None - - ref_data = load_json(req.ref_json) - ref_base = req.ref_base - - lab_path, wav_path = get_random_paths(ref_base, ref_data, req.speaker, req.emotion) - - if lab_path and wav_path: - with open(lab_path, "r", encoding="utf-8") as lab_file: - ref_text = lab_file.read() - req.reference_audio = wav_path - req.reference_text = ref_text - logger.info("ref_path: " + str(wav_path)) - logger.info("ref_text: " + ref_text) - - # Parse reference audio aka prompt - prompt_tokens = encode_reference( - decoder_model=decoder_model, - reference_audio=req.reference_audio, - enable_reference_audio=req.reference_audio is not None, - ) - logger.info(f"ref_text: {req.reference_text}") +def inference(req: ServeTTSRequest): + + idstr: str | None = req.reference_id + if idstr is not None: + ref_folder = Path("references") / idstr + ref_folder.mkdir(parents=True, exist_ok=True) + ref_audios = list_files( + ref_folder, AUDIO_EXTENSIONS, recursive=True, sort=False + ) + prompt_tokens = [ + encode_reference( + decoder_model=decoder_model, + reference_audio=audio_to_bytes(str(ref_audio)), + enable_reference_audio=True, + ) + for ref_audio in ref_audios + ] + prompt_texts = [ + read_ref_text(str(ref_audio.with_suffix(".lab"))) + for ref_audio in ref_audios + ] + + else: + # Parse reference audio aka prompt + refs = req.references + if refs is None: + refs = [] + prompt_tokens = [ + encode_reference( + decoder_model=decoder_model, + reference_audio=ref.audio, + enable_reference_audio=True, + ) + for ref in refs + ] + prompt_texts = [ref.text for ref in refs] + # LLAMA Inference request = dict( device=decoder_model.device, max_new_tokens=req.max_new_tokens, - text=req.text, + text=( + req.text + if not req.normalize + else ChnNormedText(raw_text=req.text).normalize() + ), top_p=req.top_p, repetition_penalty=req.repetition_penalty, temperature=req.temperature, @@ -254,7 +222,7 @@ def inference(req: InvokeRequest): chunk_length=req.chunk_length, max_length=2048, prompt_tokens=prompt_tokens, - prompt_text=req.reference_text, + prompt_text=prompt_texts, ) response_queue = queue.Queue() @@ -307,40 +275,7 @@ def inference(req: InvokeRequest): yield fake_audios -def auto_rerank_inference(req: InvokeRequest, use_auto_rerank: bool = True): - if not use_auto_rerank: - # 如果不使用 auto_rerank,直接调用原始的 inference 函数 - return inference(req) - - zh_model, en_model = load_model() - max_attempts = 5 - best_wer = float("inf") - best_audio = None - - for attempt in range(max_attempts): - # 调用原始的 inference 函数 - audio_generator = inference(req) - fake_audios = next(audio_generator) - - asr_result = batch_asr( - zh_model if is_chinese(req.text) else en_model, [fake_audios], 
44100 - )[0] - wer = calculate_wer(req.text, asr_result["text"]) - - if wer <= 0.1 and not asr_result["huge_gap"]: - return fake_audios - - if wer < best_wer: - best_wer = wer - best_audio = fake_audios - - if attempt == max_attempts - 1: - break - - return best_audio - - -async def inference_async(req: InvokeRequest): +async def inference_async(req: ServeTTSRequest): for chunk in inference(req): yield chunk @@ -349,9 +284,9 @@ async def buffer_to_async_generator(buffer): yield buffer -# @routes.http.post("/v1/invoke") +# @routes.http.post("/v1/tts") # async def api_invoke_model( -# req: Annotated[InvokeRequest, Body(exclusive=True)], +# req: Annotated[ServeTTSRequest, Body(exclusive=True)], # ): # """ # Invoke model and generate audio @@ -410,21 +345,20 @@ def parse_args(): parser.add_argument( "--llama-checkpoint-path", type=str, - default="checkpoints/fish-speech-1.2-sft", + default="checkpoints/fish-speech-1.4", ) parser.add_argument( "--decoder-checkpoint-path", type=str, - default="checkpoints/fish-speech-1.2-sft/firefly-gan-vq-fsq-4x1024-42hz-generator.pth", + default="checkpoints/fish-speech-1.4/firefly-gan-vq-fsq-8x1024-21hz-generator.pth", ) parser.add_argument("--decoder-config-name", type=str, default="firefly_gan_vq") parser.add_argument("--device", type=str, default="cuda") parser.add_argument("--half", action="store_true") parser.add_argument("--compile", action="store_true") parser.add_argument("--max-text-length", type=int, default=0) - parser.add_argument("--listen", type=str, default="127.0.0.1:8000") + parser.add_argument("--listen", type=str, default="127.0.0.1:8080") parser.add_argument("--workers", type=int, default=1) - parser.add_argument("--use-auto-rerank", type=bool, default=True) return parser.parse_args() @@ -436,18 +370,30 @@ def parse_args(): # }, # ).routes # +# +# class MsgPackRequest(HttpRequest): +# async def data(self) -> Annotated[Any, ContentType("application/msgpack")]: +# if self.content_type == "application/msgpack": +# return ormsgpack.unpackb(await self.body) +# +# raise HTTPException( +# HTTPStatus.UNSUPPORTED_MEDIA_TYPE, +# headers={"Accept": "application/msgpack"}, +# ) +# +# # app = Kui( # routes=routes + openapi[1:], # Remove the default route # exception_handlers={ # HTTPException: http_execption_handler, # Exception: other_exception_handler, # }, +# factory_class=FactoryClass(http=MsgPackRequest), # cors_config={}, # ) if __name__ == "__main__": - import threading import uvicorn @@ -474,18 +420,17 @@ def parse_args(): # Dry run to check if the model is loaded correctly and avoid the first-time latency list( inference( - InvokeRequest( + ServeTTSRequest( text="Hello world.", - reference_text=None, - reference_audio=None, - max_new_tokens=0, + references=[], + reference_id=None, + max_new_tokens=1024, + chunk_length=200, top_p=0.7, repetition_penalty=1.2, temperature=0.7, emotion=None, format="wav", - ref_base=None, - ref_json=None, ) ) ) diff --git a/xinference/thirdparty/fish_speech/tools/auto_rerank.py b/xinference/thirdparty/fish_speech/tools/auto_rerank.py deleted file mode 100644 index 0297d63d77..0000000000 --- a/xinference/thirdparty/fish_speech/tools/auto_rerank.py +++ /dev/null @@ -1,159 +0,0 @@ -import os - -os.environ["MODELSCOPE_CACHE"] = ".cache/" - -import string -import time -from threading import Lock - -import librosa -import numpy as np -import opencc -import torch -from faster_whisper import WhisperModel - -t2s_converter = opencc.OpenCC("t2s") - - -def load_model(*, device="cuda"): - model = WhisperModel( - "medium", - 
device=device, - compute_type="float16", - download_root="faster_whisper", - ) - print("faster_whisper loaded!") - return model - - -@torch.no_grad() -def batch_asr_internal(model: WhisperModel, audios, sr): - resampled_audios = [] - for audio in audios: - - if isinstance(audio, np.ndarray): - audio = torch.from_numpy(audio).float() - - if audio.dim() > 1: - audio = audio.squeeze() - - assert audio.dim() == 1 - audio_np = audio.numpy() - resampled_audio = librosa.resample(audio_np, orig_sr=sr, target_sr=16000) - resampled_audios.append(resampled_audio) - - trans_results = [] - - for resampled_audio in resampled_audios: - segments, info = model.transcribe( - resampled_audio, - language=None, - beam_size=5, - initial_prompt="Punctuation is needed in any language.", - ) - trans_results.append(list(segments)) - - results = [] - for trans_res, audio in zip(trans_results, audios): - - duration = len(audio) / sr * 1000 - huge_gap = False - max_gap = 0.0 - - text = None - last_tr = None - - for tr in trans_res: - delta = tr.text.strip() - if tr.id > 1: - max_gap = max(tr.start - last_tr.end, max_gap) - text += delta - else: - text = delta - - last_tr = tr - if max_gap > 3.0: - huge_gap = True - break - - sim_text = t2s_converter.convert(text) - results.append( - { - "text": sim_text, - "duration": duration, - "huge_gap": huge_gap, - } - ) - - return results - - -global_lock = Lock() - - -def batch_asr(model, audios, sr): - return batch_asr_internal(model, audios, sr) - - -def is_chinese(text): - return True - - -def calculate_wer(text1, text2, debug=False): - chars1 = remove_punctuation(text1) - chars2 = remove_punctuation(text2) - - m, n = len(chars1), len(chars2) - - if m > n: - chars1, chars2 = chars2, chars1 - m, n = n, m - - prev = list(range(m + 1)) # row 0 distance: [0, 1, 2, ...] 
- curr = [0] * (m + 1) - - for j in range(1, n + 1): - curr[0] = j - for i in range(1, m + 1): - if chars1[i - 1] == chars2[j - 1]: - curr[i] = prev[i - 1] - else: - curr[i] = min(prev[i], curr[i - 1], prev[i - 1]) + 1 - prev, curr = curr, prev - - edits = prev[m] - tot = max(len(chars1), len(chars2)) - wer = edits / tot - - if debug: - print(" gt: ", chars1) - print(" pred: ", chars2) - print(" edits/tot = wer: ", edits, "/", tot, "=", wer) - - return wer - - -def remove_punctuation(text): - chinese_punctuation = ( - " \n\t”“!?。。"#$%&'()*+,-/:;<=>@[\]^_`{|}~⦅⦆「」、、〃《》「」『』【】〔〕〖〗〘〙〚〛〜〝〞〟〰〾〿–—" - '‛""„‟…‧﹏' - ) - all_punctuation = string.punctuation + chinese_punctuation - translator = str.maketrans("", "", all_punctuation) - text_without_punctuation = text.translate(translator) - return text_without_punctuation - - -if __name__ == "__main__": - model = load_model() - audios = [ - librosa.load("44100.wav", sr=44100)[0], - librosa.load("lengyue.wav", sr=44100)[0], - ] - print(np.array(audios[0])) - print(batch_asr(model, audios, 44100)) - - start_time = time.time() - for _ in range(10): - print(batch_asr(model, audios, 44100)) - print("Time taken:", time.time() - start_time) diff --git a/xinference/thirdparty/fish_speech/tools/commons.py b/xinference/thirdparty/fish_speech/tools/commons.py new file mode 100644 index 0000000000..f81cadec1e --- /dev/null +++ b/xinference/thirdparty/fish_speech/tools/commons.py @@ -0,0 +1,35 @@ +from typing import Annotated, Literal, Optional + +from pydantic import BaseModel, Field, conint + + +class ServeReferenceAudio(BaseModel): + audio: bytes + text: str + + +class ServeTTSRequest(BaseModel): + text: str + chunk_length: Annotated[int, conint(ge=100, le=300, strict=True)] = 200 + # Audio format + format: Literal["wav", "pcm", "mp3"] = "wav" + mp3_bitrate: Literal[64, 128, 192] = 128 + # References audios for in-context learning + references: list[ServeReferenceAudio] = [] + # Reference id + # For example, if you want use https://fish.audio/m/7f92f8afb8ec43bf81429cc1c9199cb1/ + # Just pass 7f92f8afb8ec43bf81429cc1c9199cb1 + reference_id: str | None = None + # Normalize text for en & zh, this increase stability for numbers + normalize: bool = True + mp3_bitrate: Optional[int] = 64 + opus_bitrate: Optional[int] = -1000 + # Balance mode will reduce latency to 300ms, but may decrease stability + latency: Literal["normal", "balanced"] = "normal" + # not usually used below + streaming: bool = False + emotion: Optional[str] = None + max_new_tokens: int = 1024 + top_p: Annotated[float, Field(ge=0.1, le=1.0, strict=True)] = 0.7 + repetition_penalty: Annotated[float, Field(ge=0.9, le=2.0, strict=True)] = 1.2 + temperature: Annotated[float, Field(ge=0.1, le=1.0, strict=True)] = 0.7 diff --git a/xinference/thirdparty/fish_speech/tools/download_models.py b/xinference/thirdparty/fish_speech/tools/download_models.py index 480f3be0f4..9e79c34c43 100644 --- a/xinference/thirdparty/fish_speech/tools/download_models.py +++ b/xinference/thirdparty/fish_speech/tools/download_models.py @@ -22,8 +22,8 @@ def check_and_download_files(repo_id, file_list, local_dir): # 1st -repo_id_1 = "fishaudio/fish-speech-1.2-sft" -local_dir_1 = "./checkpoints/fish-speech-1.2-sft" +repo_id_1 = "fishaudio/fish-speech-1.4" +local_dir_1 = "./checkpoints/fish-speech-1.4" files_1 = [ "model.pth", "README.md", @@ -31,7 +31,7 @@ def check_and_download_files(repo_id, file_list, local_dir): "tokenizer_config.json", "tokenizer.json", "config.json", - "firefly-gan-vq-fsq-4x1024-42hz-generator.pth", + 
"firefly-gan-vq-fsq-8x1024-21hz-generator.pth", ] # 3rd diff --git a/xinference/thirdparty/fish_speech/tools/file.py b/xinference/thirdparty/fish_speech/tools/file.py index b4b8051d6f..f7a0597365 100644 --- a/xinference/thirdparty/fish_speech/tools/file.py +++ b/xinference/thirdparty/fish_speech/tools/file.py @@ -1,3 +1,4 @@ +import base64 from pathlib import Path from typing import Union @@ -23,6 +24,22 @@ } +def audio_to_bytes(file_path): + if not file_path or not Path(file_path).exists(): + return None + with open(file_path, "rb") as wav_file: + wav = wav_file.read() + return wav + + +def read_ref_text(ref_text): + path = Path(ref_text) + if path.exists() and path.is_file(): + with path.open("r", encoding="utf-8") as file: + return file.read() + return ref_text + + def list_files( path: Union[Path, str], extensions: set[str] = None, diff --git a/xinference/thirdparty/fish_speech/tools/gen_ref.py b/xinference/thirdparty/fish_speech/tools/gen_ref.py deleted file mode 100644 index a771903b02..0000000000 --- a/xinference/thirdparty/fish_speech/tools/gen_ref.py +++ /dev/null @@ -1,36 +0,0 @@ -import json -from pathlib import Path - - -def scan_folder(base_path): - wav_lab_pairs = {} - - base = Path(base_path) - for suf in ["wav", "lab"]: - for f in base.rglob(f"*.{suf}"): - relative_path = f.relative_to(base) - parts = relative_path.parts - print(parts) - if len(parts) >= 3: - character = parts[0] - emotion = parts[1] - - if character not in wav_lab_pairs: - wav_lab_pairs[character] = {} - if emotion not in wav_lab_pairs[character]: - wav_lab_pairs[character][emotion] = [] - wav_lab_pairs[character][emotion].append(str(f.name)) - - return wav_lab_pairs - - -def save_to_json(data, output_file): - with open(output_file, "w", encoding="utf-8") as file: - json.dump(data, file, ensure_ascii=False, indent=2) - - -base_path = "ref_data" -out_ref_file = "ref_data.json" - -wav_lab_pairs = scan_folder(base_path) -save_to_json(wav_lab_pairs, out_ref_file) diff --git a/xinference/thirdparty/fish_speech/tools/llama/build_dataset.py b/xinference/thirdparty/fish_speech/tools/llama/build_dataset.py index 20e2219956..fc5ef120cc 100644 --- a/xinference/thirdparty/fish_speech/tools/llama/build_dataset.py +++ b/xinference/thirdparty/fish_speech/tools/llama/build_dataset.py @@ -13,7 +13,7 @@ from fish_speech.datasets.protos.text_data_pb2 import Semantics, Sentence, TextData from fish_speech.datasets.protos.text_data_stream import pack_pb_stream -from fish_speech.utils.file import load_filelist +from tools.file import load_filelist # To avoid CPU overload os.environ["MKL_NUM_THREADS"] = "1" diff --git a/xinference/thirdparty/fish_speech/tools/llama/generate.py b/xinference/thirdparty/fish_speech/tools/llama/generate.py index 934c185145..ad9c549996 100644 --- a/xinference/thirdparty/fish_speech/tools/llama/generate.py +++ b/xinference/thirdparty/fish_speech/tools/llama/generate.py @@ -2,6 +2,7 @@ import queue import threading import time +from contextlib import nullcontext from dataclasses import dataclass from pathlib import Path from typing import Literal, Optional, Tuple, Union @@ -93,15 +94,20 @@ def decode_one_token_ar( **sampling_kwargs, ) -> torch.Tensor: x = model.forward_generate(x, input_pos) + + sampling_kwargs_main = sampling_kwargs.copy() + sampling_kwargs_main["temperature"] = 0.1 + sampling_kwargs_main["top_p"] = 0.1 + sampling_kwargs_main["repetition_penalty"] = 1.0 + codebooks = [ sample( x.logits, - previous_tokens=( - previous_tokens[0] if previous_tokens is not None else None - ), # Disable 
repetition penalty for the token codebook - **sampling_kwargs, + previous_tokens=None, # Disable repetition penalty for the token codebook + **sampling_kwargs_main, )[0] ] + x = x.hidden_states # Cleanup the cache @@ -136,11 +142,16 @@ def decode_one_token_naive( ) -> torch.Tensor: x = model.forward_generate(x, input_pos) + sampling_kwargs_main = sampling_kwargs.copy() + sampling_kwargs_main["temperature"] = 0.1 + sampling_kwargs_main["top_p"] = 0.1 + sampling_kwargs_main["repetition_penalty"] = 1.0 + codebooks = [ sample( - x.token_logits, + x.logits, previous_tokens=None, # Disable repetition penalty for the token codebook - **sampling_kwargs, + **sampling_kwargs_main, )[0] ] @@ -181,8 +192,12 @@ def decode_n_tokens( else: window = previous_tokens[:, i - win_size : i] - with torch.backends.cuda.sdp_kernel( - enable_flash=False, enable_mem_efficient=False, enable_math=True + with ( + torch.backends.cuda.sdp_kernel( + enable_flash=False, enable_mem_efficient=False, enable_math=True + ) + if torch.cuda.is_available() + else nullcontext() ): # Actually better for Inductor to codegen attention here next_token = decode_one_token( model=model, @@ -222,25 +237,11 @@ def generate( # create an empty tensor of the expected final shape and fill in the current tokens T = prompt.size(1) - if max_new_tokens: - if T + max_new_tokens > model.config.max_seq_len: - max_new_tokens = model.config.max_seq_len - T - logger.info(f"Truncating max_new_tokens to {max_new_tokens}") - - T_new = T + max_new_tokens - else: - T_new = model.config.max_seq_len - max_new_tokens = T_new - T - device, dtype = prompt.device, prompt.dtype - with torch.device(device): - model.setup_caches( - max_batch_size=1, max_seq_len=T_new, dtype=next(model.parameters()).dtype - ) codebook_dim = 1 + model.config.num_codebooks # create an empty tensor of the expected final shape and fill in the current tokens - empty = torch.empty((codebook_dim, T_new), dtype=dtype, device=device) + empty = torch.empty((codebook_dim, max_new_tokens), dtype=dtype, device=device) empty[:, :T] = prompt seq = empty input_pos = torch.arange(0, T, device=device) @@ -560,6 +561,10 @@ def worker(): model, decode_one_token = load_model( checkpoint_path, device, precision, compile=compile ) + with torch.device(device): + model.setup_caches( + max_batch_size=1, max_seq_len=2048, dtype=next(model.parameters()).dtype + ) init_event.set() while True: @@ -607,7 +612,7 @@ def worker(): @click.option( "--checkpoint-path", type=click.Path(path_type=Path, exists=True), - default="checkpoints/fish-speech-1.2-sft", + default="checkpoints/fish-speech-1.4", ) @click.option("--device", type=str, default="cuda") @click.option("--compile/--no-compile", default=False) diff --git a/xinference/thirdparty/fish_speech/tools/llama/merge_lora.py b/xinference/thirdparty/fish_speech/tools/llama/merge_lora.py index f12eece8d2..c1bd3cbd72 100644 --- a/xinference/thirdparty/fish_speech/tools/llama/merge_lora.py +++ b/xinference/thirdparty/fish_speech/tools/llama/merge_lora.py @@ -15,7 +15,7 @@ @click.command() @click.option("--lora-config", type=str, default="r_8_alpha_16") -@click.option("--base-weight", type=str, default="checkpoints/fish-speech-1.2-sft") +@click.option("--base-weight", type=str, default="checkpoints/fish-speech-1.4") @click.option("--lora-weight", type=str, required=True) @click.option("--output", type=str, required=True) def merge(lora_config, base_weight, lora_weight, output): diff --git a/xinference/thirdparty/fish_speech/tools/llama/quantize.py 
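# Editorial sketch, not part of the patch: the conditional context-manager
# pattern introduced in decode_n_tokens above, shown in isolation. When CUDA is
# unavailable, contextlib.nullcontext() stands in for the
# torch.backends.cuda.sdp_kernel(...) guard so the same `with` block still runs
# on CPU.
from contextlib import nullcontext

import torch

ctx = (
    torch.backends.cuda.sdp_kernel(
        enable_flash=False, enable_mem_efficient=False, enable_math=True
    )
    if torch.cuda.is_available()
    else nullcontext()
)
with ctx:
    pass  # the per-token decode call would run here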
b/xinference/thirdparty/fish_speech/tools/llama/quantize.py index aae32fcce7..e629d944b5 100644 --- a/xinference/thirdparty/fish_speech/tools/llama/quantize.py +++ b/xinference/thirdparty/fish_speech/tools/llama/quantize.py @@ -428,7 +428,7 @@ def generate_folder_name(): @click.option( "--checkpoint-path", type=click.Path(path_type=Path, exists=True), - default="checkpoints/fish-speech-1.2-sft", + default="checkpoints/fish-speech-1.4", ) @click.option( "--mode", type=str, default="int8", help="type of quantization to perform" @@ -451,7 +451,7 @@ def quantize(checkpoint_path: Path, mode: str, groupsize: int, timestamp: str) - precision=precision, compile=False, ) - vq_model = "firefly-gan-vq-fsq-4x1024-42hz-generator.pth" + vq_model = "firefly-gan-vq-fsq-8x1024-21hz-generator.pth" now = timestamp if timestamp != "None" else generate_folder_name() if mode == "int8": diff --git a/xinference/thirdparty/fish_speech/tools/merge_asr_files.py b/xinference/thirdparty/fish_speech/tools/merge_asr_files.py deleted file mode 100644 index cc12062095..0000000000 --- a/xinference/thirdparty/fish_speech/tools/merge_asr_files.py +++ /dev/null @@ -1,55 +0,0 @@ -import os -from pathlib import Path - -from pydub import AudioSegment -from tqdm import tqdm - -from tools.file import AUDIO_EXTENSIONS, list_files - - -def merge_and_delete_files(save_dir, original_files): - save_path = Path(save_dir) - audio_slice_files = list_files( - path=save_dir, extensions=AUDIO_EXTENSIONS.union([".lab"]), recursive=True - ) - audio_files = {} - label_files = {} - for file_path in tqdm(audio_slice_files, desc="Merging audio files"): - rel_path = Path(file_path).relative_to(save_path) - (save_path / rel_path.parent).mkdir(parents=True, exist_ok=True) - if file_path.suffix == ".wav": - prefix = rel_path.parent / file_path.stem.rsplit("-", 1)[0] - if prefix == rel_path.parent / file_path.stem: - continue - audio = AudioSegment.from_wav(file_path) - if prefix in audio_files.keys(): - audio_files[prefix] = audio_files[prefix] + audio - else: - audio_files[prefix] = audio - - elif file_path.suffix == ".lab": - prefix = rel_path.parent / file_path.stem.rsplit("-", 1)[0] - if prefix == rel_path.parent / file_path.stem: - continue - with open(file_path, "r", encoding="utf-8") as f: - label = f.read() - if prefix in label_files.keys(): - label_files[prefix] = label_files[prefix] + ", " + label - else: - label_files[prefix] = label - - for prefix, audio in audio_files.items(): - output_audio_path = save_path / f"{prefix}.wav" - audio.export(output_audio_path, format="wav") - - for prefix, label in label_files.items(): - output_label_path = save_path / f"{prefix}.lab" - with open(output_label_path, "w", encoding="utf-8") as f: - f.write(label) - - for file_path in original_files: - os.remove(file_path) - - -if __name__ == "__main__": - merge_and_delete_files("/made/by/spicysama/laziman", [__file__]) diff --git a/xinference/thirdparty/fish_speech/tools/msgpack_api.py b/xinference/thirdparty/fish_speech/tools/msgpack_api.py new file mode 100644 index 0000000000..67f907bf55 --- /dev/null +++ b/xinference/thirdparty/fish_speech/tools/msgpack_api.py @@ -0,0 +1,34 @@ +import httpx +import ormsgpack + +from tools.commons import ServeReferenceAudio, ServeTTSRequest + +# priority: ref_id > references +request = ServeTTSRequest( + text="你说的对, 但是原神是一款由米哈游自主研发的开放世界手游.", + # reference_id="114514", + references=[ + ServeReferenceAudio( + audio=open("lengyue.wav", "rb").read(), + text=open("lengyue.lab", "r", encoding="utf-8").read(), + ) + ], + 
streaming=True, +) + +with ( + httpx.Client() as client, + open("hello.wav", "wb") as f, +): + with client.stream( + "POST", + "http://127.0.0.1:8080/v1/tts", + content=ormsgpack.packb(request, option=ormsgpack.OPT_SERIALIZE_PYDANTIC), + headers={ + "authorization": "Bearer YOUR_API_KEY", + "content-type": "application/msgpack", + }, + timeout=None, + ) as response: + for chunk in response.iter_bytes(): + f.write(chunk) diff --git a/xinference/thirdparty/fish_speech/tools/post_api.py b/xinference/thirdparty/fish_speech/tools/post_api.py index 153893078e..c20dc455c3 100644 --- a/xinference/thirdparty/fish_speech/tools/post_api.py +++ b/xinference/thirdparty/fish_speech/tools/post_api.py @@ -1,40 +1,19 @@ import argparse import base64 -import json import wave -from pathlib import Path +import ormsgpack import pyaudio import requests +from pydub import AudioSegment +from pydub.playback import play +from tools.commons import ServeReferenceAudio, ServeTTSRequest +from tools.file import audio_to_bytes, read_ref_text -def wav_to_base64(file_path): - if not file_path or not Path(file_path).exists(): - return None - with open(file_path, "rb") as wav_file: - wav_content = wav_file.read() - base64_encoded = base64.b64encode(wav_content) - return base64_encoded.decode("utf-8") +def parse_args(): -def read_ref_text(ref_text): - path = Path(ref_text) - if path.exists() and path.is_file(): - with path.open("r", encoding="utf-8") as file: - return file.read() - return ref_text - - -def play_audio(audio_content, format, channels, rate): - p = pyaudio.PyAudio() - stream = p.open(format=format, channels=channels, rate=rate, output=True) - stream.write(audio_content) - stream.stop_stream() - stream.close() - p.terminate() - - -if __name__ == "__main__": parser = argparse.ArgumentParser( description="Send a WAV file and text to a server and receive synthesized audio." 
) @@ -43,16 +22,24 @@ def play_audio(audio_content, format, channels, rate): "--url", "-u", type=str, - default="http://127.0.0.1:8080/v1/invoke", + default="http://127.0.0.1:8080/v1/tts", help="URL of the server", ) parser.add_argument( "--text", "-t", type=str, required=True, help="Text to be synthesized" ) + parser.add_argument( + "--reference_id", + "-id", + type=str, + default=None, + help="ID of the reference model o be used for the speech", + ) parser.add_argument( "--reference_audio", "-ra", type=str, + nargs="+", default=None, help="Path to the WAV file", ) @@ -60,9 +47,30 @@ def play_audio(audio_content, format, channels, rate): "--reference_text", "-rt", type=str, + nargs="+", default=None, help="Reference text for voice synthesis", ) + parser.add_argument( + "--output", + "-o", + type=str, + default="generated_audio", + help="Output audio file name", + ) + parser.add_argument( + "--play", + type=bool, + default=True, + help="Whether to play audio after receiving data", + ) + parser.add_argument("--normalize", type=bool, default=True) + parser.add_argument( + "--format", type=str, choices=["wav", "mp3", "flac"], default="wav" + ) + parser.add_argument("--mp3_bitrate", type=int, default=64) + parser.add_argument("--opus_bitrate", type=int, default=-1000) + parser.add_argument("--latency", type=str, default="normal", help="延迟选项") parser.add_argument( "--max_new_tokens", type=int, @@ -88,7 +96,6 @@ def play_audio(audio_content, format, channels, rate): "--speaker", type=str, default=None, help="Speaker ID for voice synthesis" ) parser.add_argument("--emotion", type=str, default=None, help="Speaker's Emotion") - parser.add_argument("--format", type=str, default="wav", help="Audio format") parser.add_argument( "--streaming", type=bool, default=False, help="Enable streaming response" ) @@ -97,18 +104,42 @@ def play_audio(audio_content, format, channels, rate): ) parser.add_argument("--rate", type=int, default=44100, help="Sample rate for audio") - args = parser.parse_args() + return parser.parse_args() - base64_audio = wav_to_base64(args.reference_audio) - ref_text = args.reference_text - if ref_text: - ref_text = read_ref_text(ref_text) +if __name__ == "__main__": + + args = parse_args() + + idstr: str | None = args.reference_id + # priority: ref_id > [{text, audio},...] 
+ if idstr is None: + ref_audios = args.reference_audio + ref_texts = args.reference_text + if ref_audios is None: + byte_audios = [] + else: + byte_audios = [audio_to_bytes(ref_audio) for ref_audio in ref_audios] + if ref_texts is None: + ref_texts = [] + else: + ref_texts = [read_ref_text(ref_text) for ref_text in ref_texts] + else: + byte_audios = [] + ref_texts = [] + pass # in api.py data = { "text": args.text, - "reference_text": ref_text, - "reference_audio": base64_audio, + "references": [ + ServeReferenceAudio(audio=ref_audio, text=ref_text) + for ref_text, ref_audio in zip(ref_texts, byte_audios) + ], + "reference_id": idstr, + "normalize": args.normalize, + "format": args.format, + "mp3_bitrate": args.mp3_bitrate, + "opus_bitrate": args.opus_bitrate, "max_new_tokens": args.max_new_tokens, "chunk_length": args.chunk_length, "top_p": args.top_p, @@ -116,22 +147,30 @@ def play_audio(audio_content, format, channels, rate): "temperature": args.temperature, "speaker": args.speaker, "emotion": args.emotion, - "format": args.format, "streaming": args.streaming, } - response = requests.post(args.url, json=data, stream=args.streaming) + pydantic_data = ServeTTSRequest(**data) - audio_format = pyaudio.paInt16 # Assuming 16-bit PCM format + response = requests.post( + args.url, + data=ormsgpack.packb(pydantic_data, option=ormsgpack.OPT_SERIALIZE_PYDANTIC), + stream=args.streaming, + headers={ + "authorization": "Bearer YOUR_API_KEY", + "content-type": "application/msgpack", + }, + ) if response.status_code == 200: if args.streaming: p = pyaudio.PyAudio() + audio_format = pyaudio.paInt16 # Assuming 16-bit PCM format stream = p.open( format=audio_format, channels=args.channels, rate=args.rate, output=True ) - wf = wave.open("generated_audio.wav", "wb") + wf = wave.open(f"{args.output}.wav", "wb") wf.setnchannels(args.channels) wf.setsampwidth(p.get_sample_size(audio_format)) wf.setframerate(args.rate) @@ -153,12 +192,14 @@ def play_audio(audio_content, format, channels, rate): wf.close() else: audio_content = response.content - - with open("generated_audio.wav", "wb") as audio_file: + audio_path = f"{args.output}.{args.format}" + with open(audio_path, "wb") as audio_file: audio_file.write(audio_content) - play_audio(audio_content, audio_format, args.channels, args.rate) - print("Audio has been saved to 'generated_audio.wav'.") + audio = AudioSegment.from_file(audio_path, format=args.format) + if args.play: + play(audio) + print(f"Audio has been saved to '{audio_path}'.") else: print(f"Request failed with status code {response.status_code}") print(response.json()) diff --git a/xinference/thirdparty/fish_speech/tools/sensevoice/fun_asr.py b/xinference/thirdparty/fish_speech/tools/sensevoice/fun_asr.py index 02c15a5976..6789316d51 100644 --- a/xinference/thirdparty/fish_speech/tools/sensevoice/fun_asr.py +++ b/xinference/thirdparty/fish_speech/tools/sensevoice/fun_asr.py @@ -26,7 +26,7 @@ def uvr5_cli( output_folder: Path, audio_files: list[Path] | None = None, output_format: str = "flac", - model: str = "BS-Roformer-Viperx-1296.ckpt", + model: str = "BS-Roformer-Viperx-1297.ckpt", ): # ["BS-Roformer-Viperx-1297.ckpt", "BS-Roformer-Viperx-1296.ckpt", "BS-Roformer-Viperx-1053.ckpt", "Mel-Roformer-Viperx-1143.ckpt"] sepr = Separator( diff --git a/xinference/thirdparty/fish_speech/tools/smart_pad.py b/xinference/thirdparty/fish_speech/tools/smart_pad.py index 9772168f51..de9dc154f2 100644 --- a/xinference/thirdparty/fish_speech/tools/smart_pad.py +++ 
b/xinference/thirdparty/fish_speech/tools/smart_pad.py @@ -15,21 +15,34 @@ def process(file): waveform, sample_rate = torchaudio.load(str(file), backend="sox") + if waveform.size(0) > 1: + waveform = waveform.mean(dim=0, keepdim=True) + loudness = librosa.feature.rms( y=waveform.numpy().squeeze(), frame_length=2048, hop_length=512, center=True )[0] + for i in range(len(loudness) - 1, 0, -1): if loudness[i] > threshold: break - silent_time = (len(loudness) - i) * 512 / sample_rate + end_silent_time = (len(loudness) - i) * 512 / sample_rate - if silent_time <= 0.3: - random_time = random.uniform(0.3, 0.7) + if end_silent_time <= 0.3: + random_time = random.uniform(0.3, 0.7) - end_silent_time waveform = F.pad( waveform, (0, int(random_time * sample_rate)), mode="constant", value=0 ) + for i in range(len(loudness)): + if loudness[i] > threshold: + break + + start_silent_time = i * 512 / sample_rate + + if start_silent_time > 0.02: + waveform = waveform[:, int((start_silent_time - 0.02) * sample_rate) :] + torchaudio.save(uri=str(file), src=waveform, sample_rate=sample_rate) diff --git a/xinference/thirdparty/fish_speech/tools/vqgan/extract_vq.py b/xinference/thirdparty/fish_speech/tools/vqgan/extract_vq.py index bc6bc40830..c24eb3f46a 100644 --- a/xinference/thirdparty/fish_speech/tools/vqgan/extract_vq.py +++ b/xinference/thirdparty/fish_speech/tools/vqgan/extract_vq.py @@ -42,7 +42,7 @@ @lru_cache(maxsize=1) def get_model( config_name: str = "firefly_gan_vq", - checkpoint_path: str = "checkpoints/fish-speech-1.2-sft/firefly-gan-vq-fsq-4x1024-42hz-generator.pth", + checkpoint_path: str = "checkpoints/fish-speech-1.4/firefly-gan-vq-fsq-8x1024-21hz-generator.pth", device: str | torch.device = "cuda", ): with initialize(version_base="1.3", config_path="../../fish_speech/configs"): @@ -133,7 +133,7 @@ def process_batch(files: list[Path], model) -> float: @click.option("--config-name", default="firefly_gan_vq") @click.option( "--checkpoint-path", - default="checkpoints/fish-speech-1.2-sft/firefly-gan-vq-fsq-4x1024-42hz-generator.pth", + default="checkpoints/fish-speech-1.4/firefly-gan-vq-fsq-8x1024-21hz-generator.pth", ) @click.option("--batch-size", default=64) @click.option("--filelist", default=None, type=Path) diff --git a/xinference/thirdparty/fish_speech/tools/vqgan/inference.py b/xinference/thirdparty/fish_speech/tools/vqgan/inference.py index 17c9034d7b..b6bc7531c4 100644 --- a/xinference/thirdparty/fish_speech/tools/vqgan/inference.py +++ b/xinference/thirdparty/fish_speech/tools/vqgan/inference.py @@ -59,7 +59,7 @@ def load_model(config_name, checkpoint_path, device="cuda"): @click.option("--config-name", default="firefly_gan_vq") @click.option( "--checkpoint-path", - default="checkpoints/fish-speech-1.2-sft/firefly-gan-vq-fsq-4x1024-42hz-generator.pth", + default="checkpoints/fish-speech-1.4/firefly-gan-vq-fsq-8x1024-21hz-generator.pth", ) @click.option( "--device", @@ -103,7 +103,9 @@ def main(input_path, output_path, config_name, checkpoint_path, device): # Restore feature_lengths = torch.tensor([indices.shape[1]], device=device) - fake_audios = model.decode(indices=indices[None], feature_lengths=feature_lengths) + fake_audios, _ = model.decode( + indices=indices[None], feature_lengths=feature_lengths + ) audio_time = fake_audios.shape[-1] / model.spec_transform.sample_rate logger.info( diff --git a/xinference/thirdparty/fish_speech/tools/webui.py b/xinference/thirdparty/fish_speech/tools/webui.py index f64ff923b0..a52f548cc9 100644 --- 
a/xinference/thirdparty/fish_speech/tools/webui.py +++ b/xinference/thirdparty/fish_speech/tools/webui.py @@ -23,7 +23,6 @@ from fish_speech.text.chn_text_norm.text import Text as ChnNormedText from fish_speech.utils import autocast_exclude_mps from tools.api import decode_vq_tokens, encode_reference -from tools.auto_rerank import batch_asr, calculate_wer, is_chinese, load_model from tools.llama.generate import ( GenerateRequest, GenerateResponse, @@ -40,9 +39,9 @@ {i18n("A text-to-speech model based on VQ-GAN and Llama developed by [Fish Audio](https://fish.audio).")} -{i18n("You can find the source code [here](https://github.com/fishaudio/fish-speech) and models [here](https://huggingface.co/fishaudio/fish-speech-1).")} +{i18n("You can find the source code [here](https://github.com/fishaudio/fish-speech) and models [here](https://huggingface.co/fishaudio/fish-speech-1.4).")} -{i18n("Related code are released under BSD-3-Clause License, and weights are released under CC BY-NC-SA 4.0 License.")} +{i18n("Related code and weights are released under CC BY-NC-SA 4.0 License.")} {i18n("We are not responsible for any misuse of the model, please consider your local laws and regulations before using it.")} """ @@ -160,66 +159,6 @@ def inference( gc.collect() -def inference_with_auto_rerank( - text, - enable_reference_audio, - reference_audio, - reference_text, - max_new_tokens, - chunk_length, - top_p, - repetition_penalty, - temperature, - use_auto_rerank, - streaming=False, -): - - max_attempts = 2 if use_auto_rerank else 1 - best_wer = float("inf") - best_audio = None - best_sample_rate = None - - for attempt in range(max_attempts): - audio_generator = inference( - text, - enable_reference_audio, - reference_audio, - reference_text, - max_new_tokens, - chunk_length, - top_p, - repetition_penalty, - temperature, - streaming=False, - ) - - # 获取音频数据 - for _ in audio_generator: - pass - _, (sample_rate, audio), message = _ - - if audio is None: - return None, None, message - - if not use_auto_rerank: - return None, (sample_rate, audio), None - - asr_result = batch_asr(asr_model, [audio], sample_rate)[0] - wer = calculate_wer(text, asr_result["text"]) - if wer <= 0.3 and not asr_result["huge_gap"]: - return None, (sample_rate, audio), None - - if wer < best_wer: - best_wer = wer - best_audio = audio - best_sample_rate = sample_rate - - if attempt == max_attempts - 1: - break - - return None, (best_sample_rate, best_audio), None - - inference_stream = partial(inference, streaming=True) n_audios = 4 @@ -239,13 +178,12 @@ def inference_wrapper( repetition_penalty, temperature, batch_infer_num, - if_load_asr_model, ): audios = [] errors = [] for _ in range(batch_infer_num): - result = inference_with_auto_rerank( + result = inference( text, enable_reference_audio, reference_audio, @@ -255,10 +193,9 @@ def inference_wrapper( top_p, repetition_penalty, temperature, - if_load_asr_model, ) - _, audio_data, error_message = result + _, audio_data, error_message = next(result) audios.append( gr.Audio(value=audio_data if audio_data else None, visible=True), @@ -301,42 +238,6 @@ def normalize_text(user_input, use_normalization): asr_model = None -def change_if_load_asr_model(if_load): - global asr_model - - if if_load: - gr.Warning("Loading faster whisper model...") - if asr_model is None: - asr_model = load_model() - return gr.Checkbox(label="Unload faster whisper model", value=if_load) - - if if_load is False: - gr.Warning("Unloading faster whisper model...") - del asr_model - asr_model = None - if 
torch.cuda.is_available(): - torch.cuda.empty_cache() - gc.collect() - return gr.Checkbox(label="Load faster whisper model", value=if_load) - - -def change_if_auto_label(if_load, if_auto_label, enable_ref, ref_audio, ref_text): - if if_load and asr_model is not None: - if ( - if_auto_label - and enable_ref - and ref_audio is not None - and ref_text.strip() == "" - ): - data, sample_rate = librosa.load(ref_audio) - res = batch_asr(asr_model, [data], sample_rate)[0] - ref_text = res["text"] - else: - gr.Warning("Whisper model not loaded!") - - return gr.Textbox(value=ref_text) - - def build_app(): with gr.Blocks(theme=gr.themes.Base()) as app: gr.Markdown(HEADER_MD) @@ -367,23 +268,17 @@ def build_app(): with gr.Row(): if_refine_text = gr.Checkbox( label=i18n("Text Normalization"), - value=True, - scale=1, - ) - - if_load_asr_model = gr.Checkbox( - label=i18n("Load / Unload ASR model for auto-reranking"), value=False, - scale=3, + scale=1, ) with gr.Row(): with gr.Tab(label=i18n("Advanced Config")): chunk_length = gr.Slider( label=i18n("Iterative Prompt Length, 0 means off"), - minimum=0, - maximum=500, - value=100, + minimum=50, + maximum=300, + value=200, step=8, ) @@ -434,12 +329,6 @@ def build_app(): type="filepath", ) with gr.Row(): - if_auto_label = gr.Checkbox( - label=i18n("Auto Labeling"), - min_width=100, - scale=0, - value=False, - ) reference_text = gr.Textbox( label=i18n("Reference Text"), lines=1, @@ -494,28 +383,6 @@ def build_app(): fn=normalize_text, inputs=[text, if_refine_text], outputs=[refined_text] ) - if_load_asr_model.change( - fn=change_if_load_asr_model, - inputs=[if_load_asr_model], - outputs=[if_load_asr_model], - ) - - if_auto_label.change( - fn=lambda: gr.Textbox(value=""), - inputs=[], - outputs=[reference_text], - ).then( - fn=change_if_auto_label, - inputs=[ - if_load_asr_model, - if_auto_label, - enable_reference_audio, - reference_audio, - reference_text, - ], - outputs=[reference_text], - ) - # # Submit generate.click( inference_wrapper, @@ -530,7 +397,6 @@ def build_app(): repetition_penalty, temperature, batch_infer_num, - if_load_asr_model, ], [stream_audio, *global_audio_list, *global_error_list], concurrency_limit=1, @@ -560,12 +426,12 @@ def parse_args(): parser.add_argument( "--llama-checkpoint-path", type=Path, - default="checkpoints/fish-speech-1.2-sft", + default="checkpoints/fish-speech-1.4", ) parser.add_argument( "--decoder-checkpoint-path", type=Path, - default="checkpoints/fish-speech-1.2-sft/firefly-gan-vq-fsq-4x1024-42hz-generator.pth", + default="checkpoints/fish-speech-1.4/firefly-gan-vq-fsq-8x1024-21hz-generator.pth", ) parser.add_argument("--decoder-config-name", type=str, default="firefly_gan_vq") parser.add_argument("--device", type=str, default="cuda") @@ -605,8 +471,8 @@ def parse_args(): enable_reference_audio=False, reference_audio=None, reference_text="", - max_new_tokens=0, - chunk_length=100, + max_new_tokens=1024, + chunk_length=200, top_p=0.7, repetition_penalty=1.2, temperature=0.7, From 42d9c340c4ae084c71b215c88d30c2c9d636508e Mon Sep 17 00:00:00 2001 From: Xuye Qin Date: Fri, 13 Sep 2024 12:16:59 +0800 Subject: [PATCH 04/17] FEAT: support sdapi/img2img (#2293) --- xinference/api/restful_api.py | 67 ++++++++++++++++++++++++++- xinference/core/model.py | 14 ++++++ xinference/model/image/sdapi.py | 80 +++++++++++++++++++++++++++------ xinference/types.py | 2 +- 4 files changed, 147 insertions(+), 16 deletions(-) diff --git a/xinference/api/restful_api.py b/xinference/api/restful_api.py index 15b5cc52d4..d12273ba13 100644 --- 
a/xinference/api/restful_api.py +++ b/xinference/api/restful_api.py @@ -63,7 +63,7 @@ CreateCompletion, ImageList, PeftModelConfig, - SDAPITxt2imgResult, + SDAPIResult, VideoList, max_tokens_field, ) @@ -138,6 +138,24 @@ class SDAPITxt2imgRequst(BaseModel): width: Optional[int] = 512 height: Optional[int] = 512 sampler_name: Optional[str] = None + denoising_strength: Optional[float] = None + kwargs: Optional[str] = None + user: Optional[str] = None + + +class SDAPIImg2imgRequst(BaseModel): + model: Optional[str] + init_images: Optional[list] + prompt: Optional[str] = "" + negative_prompt: Optional[str] = "" + steps: Optional[int] = None + seed: Optional[int] = -1 + cfg_scale: Optional[float] = 7.0 + override_settings: Optional[dict] = {} + width: Optional[int] = 512 + height: Optional[int] = 512 + sampler_name: Optional[str] = None + denoising_strength: Optional[float] = None kwargs: Optional[str] = None user: Optional[str] = None @@ -574,7 +592,18 @@ async def internal_exception_handler(request: Request, exc: Exception): "/sdapi/v1/txt2img", self.sdapi_txt2img, methods=["POST"], - response_model=SDAPITxt2imgResult, + response_model=SDAPIResult, + dependencies=( + [Security(self._auth_service, scopes=["models:read"])] + if self.is_authenticated() + else None + ), + ) + self._router.add_api_route( + "/sdapi/v1/img2img", + self.sdapi_img2img, + methods=["POST"], + response_model=SDAPIResult, dependencies=( [Security(self._auth_service, scopes=["models:read"])] if self.is_authenticated() @@ -1569,6 +1598,40 @@ async def sdapi_txt2img(self, request: Request) -> Response: await self._report_error_event(model_uid, str(e)) raise HTTPException(status_code=500, detail=str(e)) + async def sdapi_img2img(self, request: Request) -> Response: + body = SDAPIImg2imgRequst.parse_obj(await request.json()) + model_uid = body.model or body.override_settings.get("sd_model_checkpoint") + + try: + if not model_uid: + raise ValueError("Unknown model") + model = await (await self._get_supervisor_ref()).get_model(model_uid) + except ValueError as ve: + logger.error(str(ve), exc_info=True) + await self._report_error_event(model_uid, str(ve)) + raise HTTPException(status_code=400, detail=str(ve)) + except Exception as e: + logger.error(e, exc_info=True) + await self._report_error_event(model_uid, str(e)) + raise HTTPException(status_code=500, detail=str(e)) + + try: + kwargs = dict(body) + kwargs.update(json.loads(body.kwargs) if body.kwargs else {}) + image_list = await model.img2img( + **kwargs, + ) + return Response(content=image_list, media_type="application/json") + except RuntimeError as re: + logger.error(re, exc_info=True) + await self._report_error_event(model_uid, str(re)) + self.handle_request_limit_error(re) + raise HTTPException(status_code=400, detail=str(re)) + except Exception as e: + logger.error(e, exc_info=True) + await self._report_error_event(model_uid, str(e)) + raise HTTPException(status_code=500, detail=str(e)) + async def create_variations( self, model: str = Form(...), diff --git a/xinference/core/model.py b/xinference/core/model.py index 327582163c..1f711fb117 100644 --- a/xinference/core/model.py +++ b/xinference/core/model.py @@ -793,6 +793,20 @@ async def image_to_image( f"Model {self._model.model_spec} is not for creating image." 
) + @request_limit + @log_async(logger=logger) + async def img2img( + self, + **kwargs, + ): + kwargs.pop("request_id", None) + if hasattr(self._model, "img2img"): + return await self._call_wrapper_json( + self._model.img2img, + **kwargs, + ) + raise AttributeError(f"Model {self._model.model_spec} is not for img2img.") + @log_async( logger=logger, ignore_kwargs=["image"], diff --git a/xinference/model/image/sdapi.py b/xinference/model/image/sdapi.py index 10337b114d..b3af166299 100644 --- a/xinference/model/image/sdapi.py +++ b/xinference/model/image/sdapi.py @@ -11,30 +11,48 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. - +import base64 +import io import warnings +from PIL import Image + class SDAPIToDiffusersConverter: - txt2img_identical_args = [ + txt2img_identical_args = { "prompt", "negative_prompt", "seed", "width", "height", "sampler_name", - ] + } txt2img_arg_mapping = { "steps": "num_inference_steps", "cfg_scale": "guidance_scale", + "denoising_strength": "strength", + } + img2img_identical_args = { + "prompt", + "negative_prompt", + "seed", + "width", + "height", + "sampler_name", + } + img2img_arg_mapping = { + "init_images": "image", + "steps": "num_inference_steps", + "cfg_scale": "guidance_scale", + "denoising_strength": "strength", } @staticmethod - def convert_txt2img_to_diffusers(params: dict) -> dict: + def convert_to_diffusers(sd_type: str, params: dict) -> dict: diffusers_params = {} - identical_args = set(SDAPIToDiffusersConverter.txt2img_identical_args) - mapping_args = SDAPIToDiffusersConverter.txt2img_arg_mapping + identical_args = getattr(SDAPIToDiffusersConverter, f"{sd_type}_identical_args") + mapping_args = getattr(SDAPIToDiffusersConverter, f"{sd_type}_arg_mapping") for param, value in params.items(): if param in identical_args: diffusers_params[param] = value @@ -45,13 +63,17 @@ def convert_txt2img_to_diffusers(params: dict) -> dict: return diffusers_params + @staticmethod + def get_available_args(sd_type: str) -> set: + identical_args = getattr(SDAPIToDiffusersConverter, f"{sd_type}_identical_args") + mapping_args = getattr(SDAPIToDiffusersConverter, f"{sd_type}_arg_mapping") + return identical_args.union(mapping_args) + class SDAPIDiffusionModelMixin: - def txt2img(self, **kwargs): - available_args = set( - SDAPIToDiffusersConverter.txt2img_identical_args - + list(SDAPIToDiffusersConverter.txt2img_arg_mapping) - ) + @staticmethod + def _check_kwargs(sd_type: str, kwargs: dict): + available_args = SDAPIToDiffusersConverter.get_available_args(sd_type) unknown_args = [] available_kwargs = {} for arg, value in kwargs.items(): @@ -64,14 +86,20 @@ def txt2img(self, **kwargs): f"Some args are not supported for now and will be ignored: {unknown_args}" ) - converted_kwargs = SDAPIToDiffusersConverter.convert_txt2img_to_diffusers( - available_kwargs + converted_kwargs = SDAPIToDiffusersConverter.convert_to_diffusers( + sd_type, available_kwargs ) + width, height = converted_kwargs.pop("width", None), converted_kwargs.pop( "height", None ) if width and height: converted_kwargs["size"] = f"{width}*{height}" + + return converted_kwargs + + def txt2img(self, **kwargs): + converted_kwargs = self._check_kwargs("txt2img", kwargs) result = self.text_to_image(response_format="b64_json", **converted_kwargs) # type: ignore # convert to SD API result @@ -80,3 +108,29 @@ def txt2img(self, **kwargs): "info": {"created": result["created"]}, 
"parameters": {}, } + + @staticmethod + def _decode_b64_img(img_str: str) -> Image: + # img_str in a format: "data:image/png;base64," + raw_b64_img(image) + f, data = img_str.split(",", 1) + f, encode_type = f.split(";", 1) + assert encode_type == "base64" + f = f.split("/", 1)[1] + b = base64.b64decode(data) + return Image.open(io.BytesIO(b), formats=[f]) + + def img2img(self, **kwargs): + init_images = kwargs.pop("init_images", []) + kwargs["init_images"] = [self._decode_b64_img(i) for i in init_images] + clip_skip = kwargs.get("override_settings", {}).get("clip_skip") + converted_kwargs = self._check_kwargs("img2img", kwargs) + if clip_skip: + converted_kwargs["clip_skip"] = clip_skip + result = self.image_to_image(response_format="b64_json", **converted_kwargs) # type: ignore + + # convert to SD API result + return { + "images": [r["b64_json"] for r in result["data"]], + "info": {"created": result["created"]}, + "parameters": {}, + } diff --git a/xinference/types.py b/xinference/types.py index 31c0c28635..613d8709bb 100644 --- a/xinference/types.py +++ b/xinference/types.py @@ -47,7 +47,7 @@ class ImageList(TypedDict): data: List[Image] -class SDAPITxt2imgResult(TypedDict): +class SDAPIResult(TypedDict): images: List[str] parameters: dict info: dict From a9380becb24eec1e747a83cf7319a895c5dc3e71 Mon Sep 17 00:00:00 2001 From: Xuye Qin Date: Fri, 13 Sep 2024 15:24:52 +0800 Subject: [PATCH 05/17] FEAT: support flux.1 image2image and inpainting (#2296) --- .github/workflows/python.yaml | 1 + xinference/model/image/model_spec.json | 8 ++++++-- .../model/image/model_spec_modelscope.json | 8 ++++++-- .../model/image/stable_diffusion/core.py | 20 ++++++++++++++++--- 4 files changed, 30 insertions(+), 7 deletions(-) diff --git a/.github/workflows/python.yaml b/.github/workflows/python.yaml index 5be70aa4a0..5c75b2814c 100644 --- a/.github/workflows/python.yaml +++ b/.github/workflows/python.yaml @@ -171,6 +171,7 @@ jobs: ${{ env.SELF_HOST_PYTHON }} -m pip install -U "loguru" ${{ env.SELF_HOST_PYTHON }} -m pip install -U "natsort" ${{ env.SELF_HOST_PYTHON }} -m pip install -U "loralib" + ${{ env.SELF_HOST_PYTHON }} -m pip install -U "ormsgpack" ${{ env.SELF_HOST_PYTHON }} -m pip uninstall -y opencc ${{ env.SELF_HOST_PYTHON }} -m pip uninstall -y "faster_whisper" ${{ env.SELF_HOST_PYTHON }} -m pytest --timeout=1500 \ diff --git a/xinference/model/image/model_spec.json b/xinference/model/image/model_spec.json index 891e9d5765..04386dd2e5 100644 --- a/xinference/model/image/model_spec.json +++ b/xinference/model/image/model_spec.json @@ -5,7 +5,9 @@ "model_id": "black-forest-labs/FLUX.1-schnell", "model_revision": "768d12a373ed5cc9ef9a9dea7504dc09fcc14842", "model_ability": [ - "text2image" + "text2image", + "image2image", + "inpainting" ] }, { @@ -14,7 +16,9 @@ "model_id": "black-forest-labs/FLUX.1-dev", "model_revision": "01aa605f2c300568dd6515476f04565a954fcb59", "model_ability": [ - "text2image" + "text2image", + "image2image", + "inpainting" ] }, { diff --git a/xinference/model/image/model_spec_modelscope.json b/xinference/model/image/model_spec_modelscope.json index bbc5d57010..b39bfc543d 100644 --- a/xinference/model/image/model_spec_modelscope.json +++ b/xinference/model/image/model_spec_modelscope.json @@ -6,7 +6,9 @@ "model_id": "AI-ModelScope/FLUX.1-schnell", "model_revision": "master", "model_ability": [ - "text2image" + "text2image", + "image2image", + "inpainting" ] }, { @@ -16,7 +18,9 @@ "model_id": "AI-ModelScope/FLUX.1-dev", "model_revision": "master", "model_ability": [ - 
"text2image" + "text2image", + "image2image", + "inpainting" ] }, { diff --git a/xinference/model/image/stable_diffusion/core.py b/xinference/model/image/stable_diffusion/core.py index 5344e62de2..eed9739b2c 100644 --- a/xinference/model/image/stable_diffusion/core.py +++ b/xinference/model/image/stable_diffusion/core.py @@ -14,6 +14,7 @@ import base64 import contextlib +import inspect import logging import os import re @@ -408,12 +409,24 @@ def image_to_image( width, height = image.size kwargs["width"] = width kwargs["height"] = height - + else: + # SD3 image2image cannot accept width and height + parameters = inspect.signature(model.__call__).parameters # type: ignore + allow_width_height = False + for param in parameters.values(): + if param.kind == inspect.Parameter.VAR_KEYWORD: + allow_width_height = True + break + if "width" in parameters or "height" in parameters: + allow_width_height = True + if allow_width_height: + kwargs["width"], kwargs["height"] = image.size + + kwargs["negative_prompt"] = negative_prompt self._filter_kwargs(kwargs) return self._call_model( image=image, prompt=prompt, - negative_prompt=negative_prompt, num_images_per_prompt=n, response_format=response_format, model=model, @@ -463,11 +476,12 @@ def inpainting( # calculate actual image size after padding width, height = image.size + kwargs["negative_prompt"] = negative_prompt + self._filter_kwargs(kwargs) return self._call_model( image=image, mask_image=mask_image, prompt=prompt, - negative_prompt=negative_prompt, height=height, width=width, num_images_per_prompt=n, From b7c70229886ab06f4e8d0d58ddeb91093f7801bd Mon Sep 17 00:00:00 2001 From: Jun-Howie <62869005+Jun-Howie@users.noreply.github.com> Date: Fri, 13 Sep 2024 22:22:37 +0800 Subject: [PATCH 06/17] FEAT: Support yi-coder-chat (#2302) Co-authored-by: JunHowie --- xinference/model/llm/llm_family.json | 77 ++++++++++++++++++ .../model/llm/llm_family_modelscope.json | 81 +++++++++++++++++++ xinference/model/llm/vllm/core.py | 2 + 3 files changed, 160 insertions(+) diff --git a/xinference/model/llm/llm_family.json b/xinference/model/llm/llm_family.json index e997098e65..1dfeca1fb4 100644 --- a/xinference/model/llm/llm_family.json +++ b/xinference/model/llm/llm_family.json @@ -7093,5 +7093,82 @@ "stop": [ "<|end▁of▁sentence|>" ] + }, + { + "version": 1, + "context_length": 131072, + "model_name": "yi-coder-chat", + "model_lang": [ + "en" + ], + "model_ability": [ + "chat" + ], + "model_description": "Yi-Coder is a series of open-source code language models that delivers state-of-the-art coding performance with fewer than 10 billion parameters.Excelling in long-context understanding with a maximum context length of 128K tokens.Supporting 52 major programming languages, including popular ones such as Java, Python, JavaScript, and C++.", + "model_specs": [ + { + "model_format": "pytorch", + "model_size_in_billions": 9, + "quantizations": [ + "none" + ], + "model_id": "01ai/Yi-Coder-9B-Chat", + "model_revision": "356a1f8d4e4a606d0b879e54191ca809918576b8" + }, + { + "model_format": "pytorch", + "model_size_in_billions": "1_5", + "quantizations": [ + "none" + ], + "model_id": "01ai/Yi-Coder-1.5B-Chat", + "model_revision": "92fdd1b2f1539ac990e7f4a921db5601da2f0299" + } + ], + "chat_template": "{% if messages[0]['role'] == 'system' %}{% set system_message = messages[0]['content'] %}{% endif %}{% if system_message is defined %}{{ '<|im_start|>system\n' + system_message + '<|im_end|>\n' }}{% endif %}{% for message in messages %}{% set content = message['content'] %}{% 
if message['role'] == 'user' %}{{ '<|im_start|>user\n' + content + '<|im_end|>\n<|im_start|>assistant\n' }}{% elif message['role'] == 'assistant' %}{{ content + '<|im_end|>' + '\n' }}{% endif %}{% endfor %}", + "stop_token_ids": [ + 1, + 2, + 6, + 7 + ], + "stop": [ + "<|startoftext|>", + "<|endoftext|>", + "<|im_start|>", + "<|im_end|>" + ] + }, + { + "version": 1, + "context_length": 131072, + "model_name": "yi-coder", + "model_lang": [ + "en" + ], + "model_ability": [ + "generate" + ], + "model_description": "Yi-Coder is a series of open-source code language models that delivers state-of-the-art coding performance with fewer than 10 billion parameters.Excelling in long-context understanding with a maximum context length of 128K tokens.Supporting 52 major programming languages, including popular ones such as Java, Python, JavaScript, and C++.", + "model_specs": [ + { + "model_format": "pytorch", + "model_size_in_billions": 9, + "quantizations": [ + "none" + ], + "model_id": "01-ai/Yi-Coder-9B", + "model_revision": "e20f8087a9507ac8bce409dc5db5d0c608124238" + }, + { + "model_format": "pytorch", + "model_size_in_billions": "1_5", + "quantizations": [ + "none" + ], + "model_id": "01-ai/Yi-Coder-1.5B", + "model_revision": "00e59e64f47d3c78e4cfbdd345888479797e8109" + } + ] } ] diff --git a/xinference/model/llm/llm_family_modelscope.json b/xinference/model/llm/llm_family_modelscope.json index f4386e85fa..b7b0da1b13 100644 --- a/xinference/model/llm/llm_family_modelscope.json +++ b/xinference/model/llm/llm_family_modelscope.json @@ -4808,5 +4808,86 @@ "stop": [ "<|end▁of▁sentence|>" ] + }, + { + "version": 1, + "context_length": 131072, + "model_name": "yi-coder-chat", + "model_lang": [ + "en" + ], + "model_ability": [ + "chat" + ], + "model_description": "Yi-Coder is a series of open-source code language models that delivers state-of-the-art coding performance with fewer than 10 billion parameters.Excelling in long-context understanding with a maximum context length of 128K tokens.Supporting 52 major programming languages, including popular ones such as Java, Python, JavaScript, and C++.", + "model_specs": [ + { + "model_format": "pytorch", + "model_size_in_billions": 9, + "quantizations": [ + "none" + ], + "model_hub": "modelscope", + "model_id": "01ai/Yi-Coder-9B-Chat", + "model_revision": "master" + }, + { + "model_format": "pytorch", + "model_size_in_billions": "1_5", + "quantizations": [ + "none" + ], + "model_hub": "modelscope", + "model_id": "01ai/Yi-Coder-1.5B-Chat", + "model_revision": "master" + } + ], + "chat_template": "{% if messages[0]['role'] == 'system' %}{% set system_message = messages[0]['content'] %}{% endif %}{% if system_message is defined %}{{ '<|im_start|>system\n' + system_message + '<|im_end|>\n' }}{% endif %}{% for message in messages %}{% set content = message['content'] %}{% if message['role'] == 'user' %}{{ '<|im_start|>user\n' + content + '<|im_end|>\n<|im_start|>assistant\n' }}{% elif message['role'] == 'assistant' %}{{ content + '<|im_end|>' + '\n' }}{% endif %}{% endfor %}", + "stop_token_ids": [ + 1, + 2, + 6, + 7 + ], + "stop": [ + "<|startoftext|>", + "<|endoftext|>", + "<|im_start|>", + "<|im_end|>" + ] + }, + { + "version": 1, + "context_length": 131072, + "model_name": "yi-coder", + "model_lang": [ + "en" + ], + "model_ability": [ + "generate" + ], + "model_description": "Yi-Coder is a series of open-source code language models that delivers state-of-the-art coding performance with fewer than 10 billion parameters.Excelling in long-context understanding 
with a maximum context length of 128K tokens.Supporting 52 major programming languages, including popular ones such as Java, Python, JavaScript, and C++.", + "model_specs": [ + { + "model_format": "pytorch", + "model_size_in_billions": 9, + "quantizations": [ + "none" + ], + "model_hub": "modelscope", + "model_id": "01ai/Yi-Coder-9B", + "model_revision": "master" + }, + { + "model_format": "pytorch", + "model_size_in_billions": "1_5", + "quantizations": [ + "none" + ], + "model_hub": "modelscope", + "model_id": "01ai/Yi-Coder-1.5B", + "model_revision": "master" + } + ] } ] diff --git a/xinference/model/llm/vllm/core.py b/xinference/model/llm/vllm/core.py index e531769a18..811fd5d342 100644 --- a/xinference/model/llm/vllm/core.py +++ b/xinference/model/llm/vllm/core.py @@ -104,6 +104,7 @@ class VLLMGenerateConfig(TypedDict, total=False): "code-llama-python", "deepseek", "deepseek-coder", + "yi-coder", ] VLLM_SUPPORTED_CHAT_MODELS = [ "llama-2-chat", @@ -130,6 +131,7 @@ class VLLMGenerateConfig(TypedDict, total=False): "codegeex4", "deepseek-chat", "deepseek-coder-instruct", + "yi-coder-chat", ] if VLLM_INSTALLED and vllm.__version__ >= "0.3.0": VLLM_SUPPORTED_CHAT_MODELS.append("qwen1.5-chat") From 26666356f80f61d21cca80389f7fc47ea9c1caa7 Mon Sep 17 00:00:00 2001 From: Xuye Qin Date: Sat, 14 Sep 2024 12:27:04 +0800 Subject: [PATCH 07/17] BUG: fix sampler_name for img2img (#2301) --- xinference/model/image/sdapi.py | 2 +- .../model/image/stable_diffusion/core.py | 69 +++++++++---------- 2 files changed, 32 insertions(+), 39 deletions(-) diff --git a/xinference/model/image/sdapi.py b/xinference/model/image/sdapi.py index b3af166299..6ef21d48ab 100644 --- a/xinference/model/image/sdapi.py +++ b/xinference/model/image/sdapi.py @@ -30,7 +30,7 @@ class SDAPIToDiffusersConverter: txt2img_arg_mapping = { "steps": "num_inference_steps", "cfg_scale": "guidance_scale", - "denoising_strength": "strength", + # "denoising_strength": "strength", } img2img_identical_args = { "prompt", diff --git a/xinference/model/image/stable_diffusion/core.py b/xinference/model/image/stable_diffusion/core.py index eed9739b2c..5a7e99fe33 100644 --- a/xinference/model/image/stable_diffusion/core.py +++ b/xinference/model/image/stable_diffusion/core.py @@ -24,7 +24,7 @@ from concurrent.futures import ThreadPoolExecutor from functools import partial from io import BytesIO -from typing import TYPE_CHECKING, Dict, List, Optional, Union +from typing import TYPE_CHECKING, Any, Dict, List, Optional, Union import PIL.Image import torch @@ -168,7 +168,9 @@ def load(self): self._kwargs[text_encoder_name] = text_encoder self._kwargs["device_map"] = "balanced" - logger.debug("Loading model %s", AutoPipelineModel) + logger.debug( + "Loading model from %s, kwargs: %s", self._model_path, self._kwargs + ) self._model = AutoPipelineModel.from_pretrained( self._model_path, **self._kwargs, @@ -183,11 +185,12 @@ def load(self): self._model.enable_attention_slicing() self._apply_lora() - def _get_scheduler(self, sampler_name: str): + @staticmethod + def _get_scheduler(model: Any, sampler_name: str): if not sampler_name: return - assert self._model is not None + assert model is not None import diffusers @@ -195,80 +198,73 @@ def _get_scheduler(self, sampler_name: str): # to get A1111 <> Diffusers Scheduler mapping if sampler_name == "DPM++ 2M": return diffusers.DPMSolverMultistepScheduler.from_config( - self._model.scheduler.config + model.scheduler.config ) elif sampler_name == "DPM++ 2M Karras": return 
diffusers.DPMSolverMultistepScheduler.from_config( - self._model.scheduler.config, use_karras_sigmas=True + model.scheduler.config, use_karras_sigmas=True ) elif sampler_name == "DPM++ 2M SDE": return diffusers.DPMSolverMultistepScheduler.from_config( - self._model.scheduler.config, algorithm_type="sde-dpmsolver++" + model.scheduler.config, algorithm_type="sde-dpmsolver++" ) elif sampler_name == "DPM++ 2M SDE Karras": return diffusers.DPMSolverMultistepScheduler.from_config( - self._model.scheduler.config, + model.scheduler.config, algorithm_type="sde-dpmsolver++", use_karras_sigmas=True, ) elif sampler_name == "DPM++ SDE": return diffusers.DPMSolverSinglestepScheduler.from_config( - self._model.scheduler.config + model.scheduler.config ) elif sampler_name == "DPM++ SDE Karras": return diffusers.DPMSolverSinglestepScheduler.from_config( - self._model.scheduler.config, use_karras_sigmas=True + model.scheduler.config, use_karras_sigmas=True ) elif sampler_name == "DPM2": - return diffusers.KDPM2DiscreteScheduler.from_config( - self._model.scheduler.config - ) + return diffusers.KDPM2DiscreteScheduler.from_config(model.scheduler.config) elif sampler_name == "DPM2 Karras": return diffusers.KDPM2DiscreteScheduler.from_config( - self._model.scheduler.config, use_karras_sigmas=True + model.scheduler.config, use_karras_sigmas=True ) elif sampler_name == "DPM2 a": return diffusers.KDPM2AncestralDiscreteScheduler.from_config( - self._model.scheduler.config + model.scheduler.config ) elif sampler_name == "DPM2 a Karras": return diffusers.KDPM2AncestralDiscreteScheduler.from_config( - self._model.scheduler.config, use_karras_sigmas=True + model.scheduler.config, use_karras_sigmas=True ) elif sampler_name == "Euler": - return diffusers.EulerDiscreteScheduler.from_config( - self._model.scheduler.config - ) + return diffusers.EulerDiscreteScheduler.from_config(model.scheduler.config) elif sampler_name == "Euler a": return diffusers.EulerAncestralDiscreteScheduler.from_config( - self._model.scheduler.config + model.scheduler.config ) elif sampler_name == "Heun": - return diffusers.HeunDiscreteScheduler.from_config( - self._model.scheduler.config - ) + return diffusers.HeunDiscreteScheduler.from_config(model.scheduler.config) elif sampler_name == "LMS": - return diffusers.LMSDiscreteScheduler.from_config( - self._model.scheduler.config - ) + return diffusers.LMSDiscreteScheduler.from_config(model.scheduler.config) elif sampler_name == "LMS Karras": return diffusers.LMSDiscreteScheduler.from_config( - self._model.scheduler.config, use_karras_sigmas=True + model.scheduler.config, use_karras_sigmas=True ) else: raise ValueError(f"Unknown sampler: {sampler_name}") + @staticmethod @contextlib.contextmanager - def _reset_when_done(self, sampler_name: str): - assert self._model is not None - scheduler = self._get_scheduler(sampler_name) + def _reset_when_done(model: Any, sampler_name: str): + assert model is not None + scheduler = DiffusionModel._get_scheduler(model, sampler_name) if scheduler: - default_scheduler = self._model.scheduler - self._model.scheduler = scheduler + default_scheduler = model.scheduler + model.scheduler = scheduler try: yield finally: - self._model.scheduler = default_scheduler + model.scheduler = default_scheduler else: yield @@ -292,11 +288,8 @@ def _call_model( kwargs["generator"] = generator.manual_seed(seed) sampler_name = kwargs.pop("sampler_name", None) assert callable(model) - with self._reset_when_done(sampler_name): - logger.debug( - "stable diffusion args: %s", - kwargs, - ) + 
with self._reset_when_done(model, sampler_name): + logger.debug("stable diffusion args: %s, model: %s", kwargs, model) images = model(**kwargs).images # revert padding if padded From 961d355102007e3cd7963a353105b2422a31d4fd Mon Sep 17 00:00:00 2001 From: codingl2k1 <138426806+codingl2k1@users.noreply.github.com> Date: Sat, 14 Sep 2024 07:22:13 +0200 Subject: [PATCH 08/17] FEAT: qwen2 audio (#2271) --- xinference/core/tests/test_restful_api.py | 74 ++++++++ xinference/model/llm/__init__.py | 2 + xinference/model/llm/llm_family.json | 74 ++++++++ xinference/model/llm/llm_family.py | 4 +- .../model/llm/llm_family_modelscope.json | 68 +++++++ xinference/model/llm/transformers/core.py | 2 + .../model/llm/transformers/qwen2_audio.py | 168 ++++++++++++++++++ 7 files changed, 391 insertions(+), 1 deletion(-) create mode 100644 xinference/model/llm/transformers/qwen2_audio.py diff --git a/xinference/core/tests/test_restful_api.py b/xinference/core/tests/test_restful_api.py index 0c50eb256d..af22ca7a8b 100644 --- a/xinference/core/tests/test_restful_api.py +++ b/xinference/core/tests/test_restful_api.py @@ -1240,3 +1240,77 @@ def test_launch_model_by_version(setup): # delete again url = f"{endpoint}/v1/models/test_qwen15" requests.delete(url) + + +@pytest.mark.skip(reason="Cost too many resources.") +def test_restful_api_for_qwen_audio(setup): + model_name = "qwen2-audio-instruct" + + endpoint, _ = setup + url = f"{endpoint}/v1/models" + + # list + response = requests.get(url) + response_data = response.json() + assert len(response_data["data"]) == 0 + + # launch + payload = { + "model_uid": "test_audio", + "model_name": model_name, + "model_engine": "transformers", + "model_size_in_billions": 7, + "model_format": "pytorch", + "quantization": "none", + } + + response = requests.post(url, json=payload) + response_data = response.json() + model_uid_res = response_data["model_uid"] + assert model_uid_res == "test_audio" + + response = requests.get(url) + response_data = response.json() + assert len(response_data["data"]) == 1 + + url = f"{endpoint}/v1/chat/completions" + payload = { + "model": model_uid_res, + "messages": [ + {"role": "system", "content": "You are a helpful assistant."}, + { + "role": "user", + "content": [ + { + "type": "audio", + "audio_url": "https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen2-Audio/audio/glass-breaking-151256.mp3", + }, + {"type": "text", "text": "What's that sound?"}, + ], + }, + {"role": "assistant", "content": "It is the sound of glass shattering."}, + { + "role": "user", + "content": [ + {"type": "text", "text": "What can you do when you hear that?"}, + ], + }, + { + "role": "assistant", + "content": "Stay alert and cautious, and check if anyone is hurt or if there is any damage to property.", + }, + { + "role": "user", + "content": [ + { + "type": "audio", + "audio_url": "https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen2-Audio/audio/1272-128104-0000.flac", + }, + {"type": "text", "text": "What does the person say?"}, + ], + }, + ], + } + response = requests.post(url, json=payload) + completion = response.json() + assert len(completion["choices"][0]["message"]) > 0 diff --git a/xinference/model/llm/__init__.py b/xinference/model/llm/__init__.py index 5a7895eb1a..f971e65661 100644 --- a/xinference/model/llm/__init__.py +++ b/xinference/model/llm/__init__.py @@ -146,6 +146,7 @@ def _install(): from .transformers.internlm2 import Internlm2PytorchChatModel from .transformers.minicpmv25 import MiniCPMV25Model from .transformers.minicpmv26 import 
MiniCPMV26Model + from .transformers.qwen2_audio import Qwen2AudioChatModel from .transformers.qwen2_vl import Qwen2VLChatModel from .transformers.qwen_vl import QwenVLChatModel from .transformers.yi_vl import YiVLChatModel @@ -177,6 +178,7 @@ def _install(): Internlm2PytorchChatModel, QwenVLChatModel, Qwen2VLChatModel, + Qwen2AudioChatModel, YiVLChatModel, DeepSeekVLChatModel, InternVLChatModel, diff --git a/xinference/model/llm/llm_family.json b/xinference/model/llm/llm_family.json index 1dfeca1fb4..77dda1a84d 100644 --- a/xinference/model/llm/llm_family.json +++ b/xinference/model/llm/llm_family.json @@ -6947,6 +6947,80 @@ "" ] }, + { + "version":1, + "context_length":32768, + "model_name":"qwen2-audio-instruct", + "model_lang":[ + "en", + "zh" + ], + "model_ability":[ + "chat", + "audio" + ], + "model_description":"Qwen2-Audio: A large-scale audio-language model which is capable of accepting various audio signal inputs and performing audio analysis or direct textual responses with regard to speech instructions.", + "model_specs":[ + { + "model_format":"pytorch", + "model_size_in_billions":7, + "quantizations":[ + "none" + ], + "model_id":"Qwen/Qwen2-Audio-7B-Instruct", + "model_revision":"bac62d2c6808845904c709c17a0402d817558c64" + } + ], + "prompt_style":{ + "style_name":"QWEN", + "system_prompt":"You are a helpful assistant", + "roles":[ + "user", + "assistant" + ], + "stop": [ + "<|im_end|>", + "<|endoftext|>" + ] + } + }, + { + "version":1, + "context_length":32768, + "model_name":"qwen2-audio", + "model_lang":[ + "en", + "zh" + ], + "model_ability":[ + "chat", + "audio" + ], + "model_description":"Qwen2-Audio: A large-scale audio-language model which is capable of accepting various audio signal inputs and performing audio analysis or direct textual responses with regard to speech instructions.", + "model_specs":[ + { + "model_format":"pytorch", + "model_size_in_billions":7, + "quantizations":[ + "none" + ], + "model_id":"Qwen/Qwen2-Audio-7B", + "model_revision":"8577bc71d330c8fa32ffe9f8a1374100759f2466" + } + ], + "prompt_style":{ + "style_name":"QWEN", + "system_prompt":"You are a helpful assistant", + "roles":[ + "user", + "assistant" + ], + "stop": [ + "<|im_end|>", + "<|endoftext|>" + ] + } + }, { "version": 1, "context_length": 128000, diff --git a/xinference/model/llm/llm_family.py b/xinference/model/llm/llm_family.py index 555921f18f..413b4229ae 100644 --- a/xinference/model/llm/llm_family.py +++ b/xinference/model/llm/llm_family.py @@ -132,7 +132,9 @@ class LLMFamilyV1(BaseModel): context_length: Optional[int] = DEFAULT_CONTEXT_LENGTH model_name: str model_lang: List[str] - model_ability: List[Literal["embed", "generate", "chat", "tools", "vision"]] + model_ability: List[ + Literal["embed", "generate", "chat", "tools", "vision", "audio"] + ] model_description: Optional[str] # reason for not required str here: legacy registration model_family: Optional[str] diff --git a/xinference/model/llm/llm_family_modelscope.json b/xinference/model/llm/llm_family_modelscope.json index b7b0da1b13..fdaab458aa 100644 --- a/xinference/model/llm/llm_family_modelscope.json +++ b/xinference/model/llm/llm_family_modelscope.json @@ -4656,6 +4656,74 @@ "" ] }, + { + "version": 1, + "context_length": 32768, + "model_name": "qwen2-audio-instruct", + "model_lang": [ + "en", + "zh" + ], + "model_ability": [ + "chat", + "audio" + ], + "model_description": "Qwen2-Audio: A large-scale audio-language model which is capable of accepting various audio signal inputs and performing audio analysis or direct 
textual responses with regard to speech instructions.", + "model_specs": [ + { + "model_format": "pytorch", + "model_size_in_billions": 7, + "quantizations": [ + "none" + ], + "model_hub": "modelscope", + "model_id": "qwen/Qwen2-Audio-7B-Instruct", + "model_revision": "master" + } + ], + "prompt_style": { + "style_name": "QWEN", + "system_prompt": "You are a helpful assistant", + "roles": [ + "user", + "assistant" + ] + } + }, + { + "version": 1, + "context_length": 32768, + "model_name": "qwen2-audio", + "model_lang": [ + "en", + "zh" + ], + "model_ability": [ + "chat", + "audio" + ], + "model_description": "Qwen2-Audio: A large-scale audio-language model which is capable of accepting various audio signal inputs and performing audio analysis or direct textual responses with regard to speech instructions.", + "model_specs": [ + { + "model_format": "pytorch", + "model_size_in_billions": 7, + "quantizations": [ + "none" + ], + "model_hub": "modelscope", + "model_id": "qwen/Qwen2-Audio-7B", + "model_revision": "master" + } + ], + "prompt_style": { + "style_name": "QWEN", + "system_prompt": "You are a helpful assistant", + "roles": [ + "user", + "assistant" + ] + } + }, { "version": 1, "context_length": 128000, diff --git a/xinference/model/llm/transformers/core.py b/xinference/model/llm/transformers/core.py index a451b7accd..e42ca6d513 100644 --- a/xinference/model/llm/transformers/core.py +++ b/xinference/model/llm/transformers/core.py @@ -65,6 +65,8 @@ "MiniCPM-V-2.6", "glm-4v", "qwen2-vl-instruct", + "qwen2-audio", + "qwen2-audio-instruct", "deepseek-v2", "deepseek-v2-chat", "deepseek-v2.5", diff --git a/xinference/model/llm/transformers/qwen2_audio.py b/xinference/model/llm/transformers/qwen2_audio.py new file mode 100644 index 0000000000..653f7217f8 --- /dev/null +++ b/xinference/model/llm/transformers/qwen2_audio.py @@ -0,0 +1,168 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
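+# Qwen2AudioChatModel wires Qwen2-Audio into the transformers backend:
+# chat messages may carry {"type": "audio", "audio_url": ...} items, which
+# are downloaded, resampled with librosa to the processor's sampling rate,
+# and fed to Qwen2AudioForConditionalGeneration together with the
+# chat-templated text. Both blocking and streaming generation are supported.
+#
+# A minimal sketch of exercising the model over the REST API (assuming a
+# locally running Xinference server and a model launched with uid
+# "my-qwen2-audio"; the endpoint, uid and sample URL are placeholders):
+#
+#     import requests
+#     payload = {
+#         "model": "my-qwen2-audio",
+#         "messages": [{
+#             "role": "user",
+#             "content": [
+#                 {"type": "audio", "audio_url": "https://example.com/sample.flac"},
+#                 {"type": "text", "text": "What does the person say?"},
+#             ],
+#         }],
+#     }
+#     r = requests.post("http://127.0.0.1:9997/v1/chat/completions", json=payload)
+#     print(r.json()["choices"][0]["message"])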
+import logging +import uuid +from io import BytesIO +from typing import Dict, Iterator, List, Optional, Union +from urllib.request import urlopen + +import numpy as np + +from ....model.utils import select_device +from ....types import ChatCompletion, ChatCompletionChunk, CompletionChunk +from ..llm_family import LLMFamilyV1, LLMSpecV1 +from ..utils import generate_chat_completion, generate_completion_chunk +from .core import PytorchChatModel, PytorchGenerateConfig + +logger = logging.getLogger(__name__) + + +class Qwen2AudioChatModel(PytorchChatModel): + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + self._processor = None + self._model = None + self._device = None + + @classmethod + def match( + cls, model_family: "LLMFamilyV1", model_spec: "LLMSpecV1", quantization: str + ) -> bool: + llm_family = model_family.model_family or model_family.model_name + if "qwen2-audio".lower() in llm_family.lower(): + return True + return False + + def load(self): + from transformers import AutoProcessor, Qwen2AudioForConditionalGeneration + + device = self._pytorch_model_config.get("device", "auto") + device = select_device(device) + self._device = device + # for multiple GPU, set back to auto to make multiple devices work + device = "auto" if device == "cuda" else device + + self._processor = AutoProcessor.from_pretrained( + self.model_path, + device_map=device, + # trust_remote_code=True, + code_revision=self.model_spec.model_revision, + ) + self._model = Qwen2AudioForConditionalGeneration.from_pretrained( + self.model_path, + device_map=device, + # trust_remote_code=True, + revision=self.model_spec.model_revision, + ) + + def _transform_messages( + self, + messages: List[Dict], + ): + import librosa + + text = self._processor.apply_chat_template( + messages, add_generation_prompt=True, tokenize=False + ) + audios: List[np.ndarray] = [] + for msg in messages: + content = msg["content"] + if isinstance(content, List): + for item in content: # type: ignore + if item.get("type") == "audio" and "audio_url" in item: + audio = librosa.load( + BytesIO(urlopen(item["audio_url"]).read()), + sr=self._processor.feature_extractor.sampling_rate, + )[0] + audios.append(audio) + + return text, audios + + def chat( + self, + messages: List[Dict], + generate_config: Optional[PytorchGenerateConfig] = None, + ) -> Union[ChatCompletion, Iterator[ChatCompletionChunk]]: + text, audios = self._transform_messages(messages) + inputs = self._processor( + text=text, audios=audios, return_tensors="pt", padding=True + ) + inputs.input_ids = inputs.input_ids.to(self._device) + generate_config = generate_config if generate_config else {} + stream = generate_config.get("stream", False) if generate_config else False + + if stream: + it = self._generate_stream(inputs, generate_config) + return self._to_chat_completion_chunks(it) + else: + c = self._generate(inputs, generate_config) + return c + + def _generate(self, inputs, config: PytorchGenerateConfig = {}) -> ChatCompletion: + generate_ids = self._model.generate( + **inputs, + max_length=config.get("max_tokens", 512), + ) + generate_ids = generate_ids[:, inputs.input_ids.size(1) :] + response = self._processor.batch_decode( + generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False + )[0] + return generate_chat_completion(self.model_uid, response) + + def _generate_stream( + self, inputs, config: PytorchGenerateConfig = {} + ) -> Iterator[CompletionChunk]: + from threading import Thread + + from transformers import TextIteratorStreamer 
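+        # Generation runs in a background thread; TextIteratorStreamer then
+        # yields decoded text pieces as they are produced, and each piece is
+        # wrapped into a completion chunk below.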
+ + tokenizer = self._processor.tokenizer + streamer = TextIteratorStreamer( + tokenizer, timeout=60.0, skip_prompt=True, skip_special_tokens=True + ) + + gen_kwargs = { + "max_new_tokens": config.get("max_tokens", 512), + "streamer": streamer, + **inputs, + } + + thread = Thread(target=self._model.generate, kwargs=gen_kwargs) + thread.start() + + completion_id = str(uuid.uuid1()) + for new_text in streamer: + yield generate_completion_chunk( + chunk_text=new_text, + finish_reason=None, + chunk_id=completion_id, + model_uid=self.model_uid, + prompt_tokens=-1, + completion_tokens=-1, + total_tokens=-1, + has_choice=True, + has_content=True, + ) + + yield generate_completion_chunk( + chunk_text=None, + finish_reason="stop", + chunk_id=completion_id, + model_uid=self.model_uid, + prompt_tokens=-1, + completion_tokens=-1, + total_tokens=-1, + has_choice=True, + has_content=False, + ) From 4aa58615ae4fd4dc3313411f6b485274f7d31c18 Mon Sep 17 00:00:00 2001 From: amumu96 <128140880+amumu96@users.noreply.github.com> Date: Sat, 14 Sep 2024 15:23:21 +0800 Subject: [PATCH 09/17] BUG: modify vllm image version (#2311) --- xinference/deploy/docker/Dockerfile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/xinference/deploy/docker/Dockerfile b/xinference/deploy/docker/Dockerfile index 5ee3f11771..810a440ecd 100644 --- a/xinference/deploy/docker/Dockerfile +++ b/xinference/deploy/docker/Dockerfile @@ -1,4 +1,4 @@ -FROM vllm/vllm-openai:latest +FROM vllm/vllm-openai:0.6.0 COPY . /opt/inference WORKDIR /opt/inference From 4c5e752920fba416432cad7debd3722a75c3e8a2 Mon Sep 17 00:00:00 2001 From: amumu96 <128140880+amumu96@users.noreply.github.com> Date: Sat, 14 Sep 2024 15:37:10 +0800 Subject: [PATCH 10/17] Bug: modify vllm image version (#2312) Co-authored-by: wuzhaoxin <15667065080@162.com> --- xinference/deploy/docker/Dockerfile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/xinference/deploy/docker/Dockerfile b/xinference/deploy/docker/Dockerfile index 810a440ecd..3d6afc44c3 100644 --- a/xinference/deploy/docker/Dockerfile +++ b/xinference/deploy/docker/Dockerfile @@ -1,4 +1,4 @@ -FROM vllm/vllm-openai:0.6.0 +FROM vllm/vllm-openai:v0.6.0 COPY . /opt/inference WORKDIR /opt/inference From 91c0fe85cd153158780e717d41bb3fd8036e43ff Mon Sep 17 00:00:00 2001 From: yiboyasss <143868051+yiboyasss@users.noreply.github.com> Date: Sun, 15 Sep 2024 17:19:21 +0800 Subject: [PATCH 11/17] BUG: [UI] Fix registration page bug. 
(#2315) --- xinference/web/ui/src/scenes/register_model/registerModel.js | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/xinference/web/ui/src/scenes/register_model/registerModel.js b/xinference/web/ui/src/scenes/register_model/registerModel.js index ca02e781b4..f35196b3b5 100644 --- a/xinference/web/ui/src/scenes/register_model/registerModel.js +++ b/xinference/web/ui/src/scenes/register_model/registerModel.js @@ -686,12 +686,12 @@ const RegisterModelComponent = ({ modelType, customData }) => { const handleFamilyAlert = () => { if ( - formData.model_ability.includes('vision') && + formData.model_ability?.includes('vision') && !family?.vision?.includes(formData.model_family) ) { return true } else if ( - formData.model_ability.includes('tools') && + formData.model_ability?.includes('tools') && !family?.tools?.includes(formData.model_family) ) { return true From 065686edc64e0af418ca0071f2f5d541dcdbe440 Mon Sep 17 00:00:00 2001 From: codingl2k1 <138426806+codingl2k1@users.noreply.github.com> Date: Wed, 18 Sep 2024 11:54:00 +0200 Subject: [PATCH 12/17] BUG: Fix CosyVoice missing output (#2320) --- xinference/model/audio/cosyvoice.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/xinference/model/audio/cosyvoice.py b/xinference/model/audio/cosyvoice.py index 39bcb7aa6c..9be452f473 100644 --- a/xinference/model/audio/cosyvoice.py +++ b/xinference/model/audio/cosyvoice.py @@ -122,10 +122,10 @@ def _generator_stream(): last_pos = new_last_pos def _generator_block(): - chunk = next(output) - assert isinstance(chunk, dict), "Expected data to be of type dict" + chunks = [o["tts_speech"] for o in output] + t = torch.cat(chunks, dim=1) with BytesIO() as out: - torchaudio.save(out, chunk["tts_speech"], 22050, format=response_format) + torchaudio.save(out, t, 22050, format=response_format) return out.getvalue() return _generator_stream() if stream else _generator_block() From a461ad926fa088d567cd7c96a6aba3468b0a0779 Mon Sep 17 00:00:00 2001 From: Jun-Howie <62869005+Jun-Howie@users.noreply.github.com> Date: Thu, 19 Sep 2024 05:40:53 +0800 Subject: [PATCH 13/17] FEAT: Support Qwen 2.5 (#2325) --- xinference/model/llm/llm_family.json | 360 ++++++++++++++++ .../model/llm/llm_family_modelscope.json | 388 ++++++++++++++++++ xinference/model/llm/vllm/core.py | 1 + 3 files changed, 749 insertions(+) diff --git a/xinference/model/llm/llm_family.json b/xinference/model/llm/llm_family.json index 77dda1a84d..70b17daa61 100644 --- a/xinference/model/llm/llm_family.json +++ b/xinference/model/llm/llm_family.json @@ -7244,5 +7244,365 @@ "model_revision": "00e59e64f47d3c78e4cfbdd345888479797e8109" } ] + }, + { + "version": 1, + "context_length": 131072, + "model_name": "qwen2.5-instruct", + "model_lang": [ + "en", + "zh" + ], + "model_ability": [ + "chat", + "tools" + ], + "model_description": "Qwen2.5 is the latest series of Qwen large language models. 
For Qwen2.5, we release a number of base language models and instruction-tuned language models ranging from 0.5 to 72 billion parameters.", + "model_specs": [ + { + "model_format": "pytorch", + "model_size_in_billions": "0_5", + "quantizations": [ + "4-bit", + "8-bit", + "none" + ], + "model_id": "Qwen/Qwen2.5-0.5B-Instruct" + }, + { + "model_format": "pytorch", + "model_size_in_billions": "1_5", + "quantizations": [ + "4-bit", + "8-bit", + "none" + ], + "model_id": "Qwen/Qwen2.5-1.5B-Instruct" + }, + { + "model_format": "pytorch", + "model_size_in_billions": 3, + "quantizations": [ + "4-bit", + "8-bit", + "none" + ], + "model_id": "Qwen/Qwen2.5-3B-Instruct" + }, + { + "model_format": "pytorch", + "model_size_in_billions": 7, + "quantizations": [ + "4-bit", + "8-bit", + "none" + ], + "model_id": "Qwen/Qwen2.5-7B-Instruct" + }, + { + "model_format": "pytorch", + "model_size_in_billions": 14, + "quantizations": [ + "4-bit", + "8-bit", + "none" + ], + "model_id": "Qwen/Qwen2.5-14B-Instruct" + }, + { + "model_format": "pytorch", + "model_size_in_billions": 32, + "quantizations": [ + "4-bit", + "8-bit", + "none" + ], + "model_id": "Qwen/Qwen2.5-32B-Instruct" + }, + { + "model_format": "pytorch", + "model_size_in_billions": 72, + "quantizations": [ + "4-bit", + "8-bit", + "none" + ], + "model_id": "Qwen/Qwen2.5-72B-Instruct" + }, + { + "model_format": "gptq", + "model_size_in_billions": "0_5", + "quantizations": [ + "Int4", + "Int8" + ], + "model_id": "Qwen/Qwen2.5-0.5B-Instruct-GPTQ-{quantization}" + }, + { + "model_format": "gptq", + "model_size_in_billions": "1_5", + "quantizations": [ + "Int4", + "Int8" + ], + "model_id": "Qwen/Qwen2.5-1.5B-Instruct-GPTQ-{quantization}" + }, + { + "model_format": "gptq", + "model_size_in_billions": 3, + "quantizations": [ + "Int4", + "Int8" + ], + "model_id": "Qwen/Qwen2.5-3B-Instruct-GPTQ-{quantization}" + }, + { + "model_format": "gptq", + "model_size_in_billions": 7, + "quantizations": [ + "Int4", + "Int8" + ], + "model_id": "Qwen/Qwen2.5-7B-Instruct-GPTQ-{quantization}" + }, + { + "model_format": "gptq", + "model_size_in_billions": 14, + "quantizations": [ + "Int4", + "Int8" + ], + "model_id": "Qwen/Qwen2.5-14B-Instruct-GPTQ-{quantization}" + }, + { + "model_format": "gptq", + "model_size_in_billions": 32, + "quantizations": [ + "Int4", + "Int8" + ], + "model_id": "Qwen/Qwen2.5-32B-Instruct-GPTQ-{quantization}" + }, + { + "model_format": "gptq", + "model_size_in_billions": 72, + "quantizations": [ + "Int4", + "Int8" + ], + "model_id": "Qwen/Qwen2.5-72B-Instruct-GPTQ-{quantization}" + }, + { + "model_format": "awq", + "model_size_in_billions": "0_5", + "quantizations": [ + "Int4" + ], + "model_id": "Qwen/Qwen2.5-0.5B-Instruct-AWQ" + }, + { + "model_format": "awq", + "model_size_in_billions": "1_5", + "quantizations": [ + "Int4" + ], + "model_id": "Qwen/Qwen2.5-1.5B-Instruct-AWQ" + }, + { + "model_format": "awq", + "model_size_in_billions": 3, + "quantizations": [ + "Int4" + ], + "model_id": "Qwen/Qwen2.5-3B-Instruct-AWQ" + }, + { + "model_format": "awq", + "model_size_in_billions": 7, + "quantizations": [ + "Int4" + ], + "model_id": "Qwen/Qwen2.5-7B-Instruct-AWQ" + }, + { + "model_format": "awq", + "model_size_in_billions": 14, + "quantizations": [ + "Int4" + ], + "model_id": "Qwen/Qwen2.5-14B-Instruct-AWQ" + }, + { + "model_format": "awq", + "model_size_in_billions": 32, + "quantizations": [ + "Int4" + ], + "model_id": "Qwen/Qwen2.5-32B-Instruct-AWQ" + }, + { + "model_format": "awq", + "model_size_in_billions": 72, + "quantizations": [ + "Int4" + ], + 
"model_id": "Qwen/Qwen2.5-72B-Instruct-AWQ" + }, + { + "model_format": "ggufv2", + "model_size_in_billions": "0_5", + "quantizations": [ + "q2_k", + "q3_k_m", + "q4_0", + "q4_k_m", + "q5_0", + "q5_k_m", + "q6_k", + "q8_0", + "fp16" + ], + "model_id": "Qwen/Qwen2.5-0.5B-Instruct-GGUF", + "model_file_name_template": "qwen2_5-0_5b-instruct-{quantization}.gguf" + }, + { + "model_format": "ggufv2", + "model_size_in_billions": "1_5", + "quantizations": [ + "q2_k", + "q3_k_m", + "q4_0", + "q4_k_m", + "q5_0", + "q5_k_m", + "q6_k", + "q8_0", + "fp16" + ], + "model_id": "Qwen/Qwen2.5-1.5B-Instruct-GGUF", + "model_file_name_template": "qwen2_5-1_5b-instruct-{quantization}.gguf" + }, + { + "model_format": "ggufv2", + "model_size_in_billions": 3, + "quantizations": [ + "q2_k", + "q3_k_m", + "q4_0", + "q4_k_m", + "q5_0", + "q5_k_m", + "q6_k", + "q8_0", + "fp16" + ], + "model_id": "Qwen/Qwen2.5-3B-Instruct-GGUF", + "model_file_name_template": "qwen2_5-3b-instruct-{quantization}.gguf" + }, + { + "model_format": "ggufv2", + "model_size_in_billions": 7, + "quantizations": [ + "q2_k", + "q3_k_m", + "q4_0", + "q4_k_m", + "q5_0", + "q5_k_m", + "q6_k", + "q8_0", + "fp16" + ], + "model_id": "Qwen/Qwen2.5-7B-Instruct-GGUF", + "model_file_name_template": "qwen2_5-7b-instruct-{quantization}.gguf" + }, + { + "model_format": "ggufv2", + "model_size_in_billions": 14, + "quantizations": [ + "q2_k", + "q3_k_m", + "q4_0", + "q4_k_m", + "q5_0", + "q5_k_m", + "q6_k", + "q8_0", + "fp16" + ], + "model_id": "Qwen/Qwen2.5-14B-Instruct-GGUF", + "model_file_name_template": "qwen2_5-14b-instruct-{quantization}.gguf" + }, + { + "model_format": "ggufv2", + "model_size_in_billions": 32, + "quantizations": [ + "q2_k", + "q3_k_m", + "q4_0", + "q4_k_m", + "q5_0", + "q5_k_m", + "q6_k", + "q8_0", + "fp16" + ], + "model_id": "Qwen/Qwen2.5-32B-Instruct-GGUF", + "model_file_name_template": "qwen2_5-32b-instruct-{quantization}.gguf" + }, + { + "model_format": "ggufv2", + "model_size_in_billions": 72, + "quantizations": [ + "q2_k", + "q3_k_m", + "q4_0", + "q4_k_m", + "q5_0", + "q5_k_m", + "q6_k", + "q8_0", + "fp16" + ], + "model_id": "Qwen/Qwen2.5-72B-Instruct-GGUF", + "model_file_name_template": "qwen2_5-72b-instruct-{quantization}.gguf", + "model_file_name_split_template": "qwen2_5-72b-instruct-{quantization}-{part}.gguf", + "quantization_parts": { + "q5_0": [ + "00001-of-00002", + "00002-of-00002" + ], + "q5_k_m": [ + "00001-of-00002", + "00002-of-00002" + ], + "q6_k": [ + "00001-of-00002", + "00002-of-00002" + ], + "q8_0": [ + "00001-of-00002", + "00002-of-00002" + ], + "fp16": [ + "00001-of-00004", + "00002-of-00004", + "00003-of-00004", + "00004-of-00004" + ] + } + } + ], + "chat_template": "{%- if tools %}\n {{- '<|im_start|>system\\n' }}\n {%- if messages[0]['role'] == 'system' %}\n {{- messages[0]['content'] }}\n {%- else %}\n {{- 'You are Qwen, created by Alibaba Cloud. You are a helpful assistant.' 
}}\n {%- endif %}\n {{- \"\\n\\n# Tools\\n\\nYou may call one or more functions to assist with the user query.\\n\\nYou are provided with function signatures within XML tags:\\n\" }}\n {%- for tool in tools %}\n {{- \"\\n\" }}\n {{- tool | tojson }}\n {%- endfor %}\n {{- \"\\n\\n\\nFor each function call, return a json object with function name and arguments within XML tags:\\n\\n{{\\\"name\\\": , \\\"arguments\\\": }}\\n<|im_end|>\\n\" }}\n{%- else %}\n {%- if messages[0]['role'] == 'system' %}\n {{- '<|im_start|>system\\n' + messages[0]['content'] + '<|im_end|>\\n' }}\n {%- else %}\n {{- '<|im_start|>system\\nYou are Qwen, created by Alibaba Cloud. You are a helpful assistant.<|im_end|>\\n' }}\n {%- endif %}\n{%- endif %}\n{%- for message in messages %}\n {%- if (message.role == \"user\") or (message.role == \"system\" and not loop.first) or (message.role == \"assistant\" and not message.tool_calls) %}\n {{- '<|im_start|>' + message.role + '\\n' + message.content + '<|im_end|>' + '\\n' }}\n {%- elif message.role == \"assistant\" %}\n {{- '<|im_start|>' + message.role }}\n {%- if message.content %}\n {{- '\\n' + message.content }}\n {%- endif %}\n {%- for tool_call in message.tool_calls %}\n {%- if tool_call.function is defined %}\n {%- set tool_call = tool_call.function %}\n {%- endif %}\n {{- '\\n\\n{\"name\": \"' }}\n {{- tool_call.name }}\n {{- '\", \"arguments\": ' }}\n {{- tool_call.arguments | tojson }}\n {{- '}\\n' }}\n {%- endfor %}\n {{- '<|im_end|>\\n' }}\n {%- elif message.role == \"tool\" %}\n {%- if (loop.index0 == 0) or (messages[loop.index0 - 1].role != \"tool\") %}\n {{- '<|im_start|>user' }}\n {%- endif %}\n {{- '\\n\\n' }}\n {{- message.content }}\n {{- '\\n' }}\n {%- if loop.last or (messages[loop.index0 + 1].role != \"tool\") %}\n {{- '<|im_end|>\\n' }}\n {%- endif %}\n {%- endif %}\n{%- endfor %}\n{%- if add_generation_prompt %}\n {{- '<|im_start|>assistant\\n' }}\n{%- endif %}\n", + "stop_token_ids": [ + 151643, + 151644, + 151645 + ], + "stop": [ + "<|endoftext|>", + "<|im_start|>", + "<|im_end|>" + ] } ] diff --git a/xinference/model/llm/llm_family_modelscope.json b/xinference/model/llm/llm_family_modelscope.json index fdaab458aa..7309ee9651 100644 --- a/xinference/model/llm/llm_family_modelscope.json +++ b/xinference/model/llm/llm_family_modelscope.json @@ -4957,5 +4957,393 @@ "model_revision": "master" } ] + }, + { + "version": 1, + "context_length": 131072, + "model_name": "qwen2.5-instruct", + "model_lang": [ + "en", + "zh" + ], + "model_ability": [ + "chat", + "tools" + ], + "model_description": "Qwen2.5 is the latest series of Qwen large language models. 
For Qwen2.5, we release a number of base language models and instruction-tuned language models ranging from 0.5 to 72 billion parameters.", + "model_specs": [ + { + "model_format": "pytorch", + "model_size_in_billions": "0_5", + "quantizations": [ + "4-bit", + "8-bit", + "none" + ], + "model_id": "qwen/Qwen2.5-0.5B-Instruct", + "model_hub": "modelscope" + }, + { + "model_format": "pytorch", + "model_size_in_billions": "1_5", + "quantizations": [ + "4-bit", + "8-bit", + "none" + ], + "model_id": "qwen/Qwen2.5-1.5B-Instruct", + "model_hub": "modelscope" + }, + { + "model_format": "pytorch", + "model_size_in_billions": 3, + "quantizations": [ + "4-bit", + "8-bit", + "none" + ], + "model_id": "qwen/Qwen2.5-3B-Instruct", + "model_hub": "modelscope" + }, + { + "model_format": "pytorch", + "model_size_in_billions": 7, + "quantizations": [ + "4-bit", + "8-bit", + "none" + ], + "model_id": "qwen/Qwen2.5-7B-Instruct", + "model_hub": "modelscope" + }, + { + "model_format": "pytorch", + "model_size_in_billions": 14, + "quantizations": [ + "4-bit", + "8-bit", + "none" + ], + "model_id": "qwen/Qwen2.5-14B-Instruct", + "model_hub": "modelscope" + }, + { + "model_format": "pytorch", + "model_size_in_billions": 32, + "quantizations": [ + "4-bit", + "8-bit", + "none" + ], + "model_id": "qwen/Qwen2.5-32B-Instruct", + "model_hub": "modelscope" + }, + { + "model_format": "pytorch", + "model_size_in_billions": 72, + "quantizations": [ + "4-bit", + "8-bit", + "none" + ], + "model_id": "qwen/Qwen2.5-72B-Instruct", + "model_hub": "modelscope" + }, + { + "model_format": "gptq", + "model_size_in_billions": "0_5", + "quantizations": [ + "Int4", + "Int8" + ], + "model_id": "qwen/Qwen2.5-0.5B-Instruct-GPTQ-{quantization}", + "model_hub": "modelscope" + }, + { + "model_format": "gptq", + "model_size_in_billions": "1_5", + "quantizations": [ + "Int4", + "Int8" + ], + "model_id": "qwen/Qwen2.5-1.5B-Instruct-GPTQ-{quantization}", + "model_hub": "modelscope" + }, + { + "model_format": "gptq", + "model_size_in_billions": 3, + "quantizations": [ + "Int4", + "Int8" + ], + "model_id": "qwen/Qwen2.5-3B-Instruct-GPTQ-{quantization}", + "model_hub": "modelscope" + }, + { + "model_format": "gptq", + "model_size_in_billions": 7, + "quantizations": [ + "Int4", + "Int8" + ], + "model_id": "qwen/Qwen2.5-7B-Instruct-GPTQ-{quantization}", + "model_hub": "modelscope" + }, + { + "model_format": "gptq", + "model_size_in_billions": 14, + "quantizations": [ + "Int4", + "Int8" + ], + "model_id": "qwen/Qwen2.5-14B-Instruct-GPTQ-{quantization}", + "model_hub": "modelscope" + }, + { + "model_format": "gptq", + "model_size_in_billions": 32, + "quantizations": [ + "Int4", + "Int8" + ], + "model_id": "qwen/Qwen2.5-32B-Instruct-GPTQ-{quantization}", + "model_hub": "modelscope" + }, + { + "model_format": "gptq", + "model_size_in_billions": 72, + "quantizations": [ + "Int4", + "Int8" + ], + "model_id": "qwen/Qwen2.5-72B-Instruct-GPTQ-{quantization}", + "model_hub": "modelscope" + }, + { + "model_format": "awq", + "model_size_in_billions": "0_5", + "quantizations": [ + "Int4" + ], + "model_id": "qwen/Qwen2-0.5B-Instruct-AWQ", + "model_hub": "modelscope" + }, + { + "model_format": "awq", + "model_size_in_billions": "1_5", + "quantizations": [ + "Int4" + ], + "model_id": "qwen/Qwen2-1.5B-Instruct-AWQ", + "model_hub": "modelscope" + }, + { + "model_format": "awq", + "model_size_in_billions": 3, + "quantizations": [ + "Int4" + ], + "model_id": "qwen/Qwen2.5-3B-Instruct-AWQ", + "model_hub": "modelscope" + }, + { + "model_format": "awq", + 
"model_size_in_billions": 7, + "quantizations": [ + "Int4" + ], + "model_id": "qwen/Qwen2.5-7B-Instruct-AWQ", + "model_hub": "modelscope" + }, + { + "model_format": "awq", + "model_size_in_billions":14, + "quantizations": [ + "Int4" + ], + "model_id": "qwen/Qwen2.5-14B-Instruct-AWQ", + "model_hub": "modelscope" + }, + { + "model_format": "awq", + "model_size_in_billions": 32, + "quantizations": [ + "Int4" + ], + "model_id": "qwen/Qwen2.5-32B-Instruct-AWQ", + "model_hub": "modelscope" + }, + { + "model_format": "awq", + "model_size_in_billions": 72, + "quantizations": [ + "Int4" + ], + "model_id": "qwen/Qwen2.5-72B-Instruct-AWQ", + "model_hub": "modelscope" + }, + { + "model_format": "ggufv2", + "model_size_in_billions": "0_5", + "quantizations": [ + "q2_k", + "q3_k_m", + "q4_0", + "q4_k_m", + "q5_0", + "q5_k_m", + "q6_k", + "q8_0", + "fp16" + ], + "model_id": "qwen/Qwen2.5-0.5B-Instruct-GGUF", + "model_file_name_template": "qwen2_5-0_5b-instruct-{quantization}.gguf", + "model_hub": "modelscope" + }, + { + "model_format": "ggufv2", + "model_size_in_billions": "1_5", + "quantizations": [ + "q2_k", + "q3_k_m", + "q4_0", + "q4_k_m", + "q5_0", + "q5_k_m", + "q6_k", + "q8_0", + "fp16" + ], + "model_id": "qwen/Qwen2.5-1.5B-Instruct-GGUF", + "model_file_name_template": "qwen2_5-1_5b-instruct-{quantization}.gguf", + "model_hub": "modelscope" + }, + { + "model_format": "ggufv2", + "model_size_in_billions": 3, + "quantizations": [ + "q2_k", + "q3_k_m", + "q4_0", + "q4_k_m", + "q5_0", + "q5_k_m", + "q6_k", + "q8_0", + "fp16" + ], + "model_id": "qwen/Qwen2.5-3B-Instruct-GGUF", + "model_file_name_template": "qwen2_5-3b-instruct-{quantization}.gguf", + "model_hub": "modelscope" + }, + { + "model_format": "ggufv2", + "model_size_in_billions": 7, + "quantizations": [ + "q2_k", + "q3_k_m", + "q4_0", + "q4_k_m", + "q5_0", + "q5_k_m", + "q6_k", + "q8_0", + "fp16" + ], + "model_id": "qwen/Qwen2.5-7B-Instruct-GGUF", + "model_file_name_template": "qwen2_5-7b-instruct-{quantization}.gguf", + "model_hub": "modelscope" + }, + { + "model_format": "ggufv2", + "model_size_in_billions": 14, + "quantizations": [ + "q2_k", + "q3_k_m", + "q4_0", + "q4_k_m", + "q5_0", + "q5_k_m", + "q6_k", + "q8_0", + "fp16" + ], + "model_id": "qwen/Qwen2.5-14B-Instruct-GGUF", + "model_file_name_template": "qwen2_5-14b-instruct-{quantization}.gguf", + "model_hub": "modelscope" + }, + { + "model_format": "ggufv2", + "model_size_in_billions": 32, + "quantizations": [ + "q2_k", + "q3_k_m", + "q4_0", + "q4_k_m", + "q5_0", + "q5_k_m", + "q6_k", + "q8_0", + "fp16" + ], + "model_id": "qwen/Qwen2.5-32B-Instruct-GGUF", + "model_file_name_template": "qwen2_5-32b-instruct-{quantization}.gguf", + "model_hub": "modelscope" + }, + { + "model_format": "ggufv2", + "model_size_in_billions": 72, + "quantizations": [ + "q2_k", + "q3_k_m", + "q4_0", + "q4_k_m", + "q5_0", + "q5_k_m", + "q6_k", + "q8_0", + "fp16" + ], + "model_id": "qwen/Qwen2.5-72B-Instruct-GGUF", + "model_hub": "modelscope", + "model_file_name_template": "qwen2_5-72b-instruct-{quantization}.gguf", + "model_file_name_split_template": "qwen2_5-72b-instruct-{quantization}-{part}.gguf", + "quantization_parts": { + "q5_0": [ + "00001-of-00002", + "00002-of-00002" + ], + "q5_k_m": [ + "00001-of-00002", + "00002-of-00002" + ], + "q6_k": [ + "00001-of-00002", + "00002-of-00002" + ], + "q8_0": [ + "00001-of-00002", + "00002-of-00002" + ], + "fp16": [ + "00001-of-00004", + "00002-of-00004", + "00003-of-00004", + "00004-of-00004" + ] + } + } + ], + "chat_template": "{%- if tools %}\n {{- 
'<|im_start|>system\\n' }}\n {%- if messages[0]['role'] == 'system' %}\n {{- messages[0]['content'] }}\n {%- else %}\n {{- 'You are Qwen, created by Alibaba Cloud. You are a helpful assistant.' }}\n {%- endif %}\n {{- \"\\n\\n# Tools\\n\\nYou may call one or more functions to assist with the user query.\\n\\nYou are provided with function signatures within XML tags:\\n\" }}\n {%- for tool in tools %}\n {{- \"\\n\" }}\n {{- tool | tojson }}\n {%- endfor %}\n {{- \"\\n\\n\\nFor each function call, return a json object with function name and arguments within XML tags:\\n\\n{{\\\"name\\\": , \\\"arguments\\\": }}\\n<|im_end|>\\n\" }}\n{%- else %}\n {%- if messages[0]['role'] == 'system' %}\n {{- '<|im_start|>system\\n' + messages[0]['content'] + '<|im_end|>\\n' }}\n {%- else %}\n {{- '<|im_start|>system\\nYou are Qwen, created by Alibaba Cloud. You are a helpful assistant.<|im_end|>\\n' }}\n {%- endif %}\n{%- endif %}\n{%- for message in messages %}\n {%- if (message.role == \"user\") or (message.role == \"system\" and not loop.first) or (message.role == \"assistant\" and not message.tool_calls) %}\n {{- '<|im_start|>' + message.role + '\\n' + message.content + '<|im_end|>' + '\\n' }}\n {%- elif message.role == \"assistant\" %}\n {{- '<|im_start|>' + message.role }}\n {%- if message.content %}\n {{- '\\n' + message.content }}\n {%- endif %}\n {%- for tool_call in message.tool_calls %}\n {%- if tool_call.function is defined %}\n {%- set tool_call = tool_call.function %}\n {%- endif %}\n {{- '\\n\\n{\"name\": \"' }}\n {{- tool_call.name }}\n {{- '\", \"arguments\": ' }}\n {{- tool_call.arguments | tojson }}\n {{- '}\\n' }}\n {%- endfor %}\n {{- '<|im_end|>\\n' }}\n {%- elif message.role == \"tool\" %}\n {%- if (loop.index0 == 0) or (messages[loop.index0 - 1].role != \"tool\") %}\n {{- '<|im_start|>user' }}\n {%- endif %}\n {{- '\\n\\n' }}\n {{- message.content }}\n {{- '\\n' }}\n {%- if loop.last or (messages[loop.index0 + 1].role != \"tool\") %}\n {{- '<|im_end|>\\n' }}\n {%- endif %}\n {%- endif %}\n{%- endfor %}\n{%- if add_generation_prompt %}\n {{- '<|im_start|>assistant\\n' }}\n{%- endif %}\n", + "stop_token_ids": [ + 151643, + 151644, + 151645 + ], + "stop": [ + "<|endoftext|>", + "<|im_start|>", + "<|im_end|>" + ] } ] diff --git a/xinference/model/llm/vllm/core.py b/xinference/model/llm/vllm/core.py index 811fd5d342..3aaee0738f 100644 --- a/xinference/model/llm/vllm/core.py +++ b/xinference/model/llm/vllm/core.py @@ -138,6 +138,7 @@ class VLLMGenerateConfig(TypedDict, total=False): VLLM_SUPPORTED_MODELS.append("codeqwen1.5") VLLM_SUPPORTED_CHAT_MODELS.append("codeqwen1.5-chat") VLLM_SUPPORTED_CHAT_MODELS.append("qwen2-instruct") + VLLM_SUPPORTED_CHAT_MODELS.append("qwen2.5-instruct") if VLLM_INSTALLED and vllm.__version__ >= "0.3.2": VLLM_SUPPORTED_CHAT_MODELS.append("gemma-it") From 9820786a56ade6af783c69e96a8f30319b30f1f2 Mon Sep 17 00:00:00 2001 From: amumu96 <128140880+amumu96@users.noreply.github.com> Date: Fri, 20 Sep 2024 14:32:00 +0800 Subject: [PATCH 14/17] BUG: support old register llm format (#2335) --- xinference/model/llm/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/xinference/model/llm/__init__.py b/xinference/model/llm/__init__.py index f971e65661..a9f05a9b25 100644 --- a/xinference/model/llm/__init__.py +++ b/xinference/model/llm/__init__.py @@ -121,7 +121,7 @@ def register_custom_model(): with codecs.open( os.path.join(user_defined_llm_dir, f), encoding="utf-8" ) as fd: - user_defined_llm_family = 
CustomLLMFamilyV1.parse_obj(json.load(fd)) + user_defined_llm_family = CustomLLMFamilyV1.parse_raw(fd.read()) register_llm(user_defined_llm_family, persist=False) except Exception as e: warnings.warn(f"{user_defined_llm_dir}/{f} has error, {e}") From 3cc9bc525667e2161ae072d3d892d33b2723b2a9 Mon Sep 17 00:00:00 2001 From: Xuye Qin Date: Fri, 20 Sep 2024 15:06:33 +0800 Subject: [PATCH 15/17] BUG: fix stable diffusion from dify tool (#2336) --- xinference/core/model.py | 4 +- .../model/image/stable_diffusion/core.py | 49 ++++++++++++------- 2 files changed, 32 insertions(+), 21 deletions(-) diff --git a/xinference/core/model.py b/xinference/core/model.py index 1f711fb117..2274f422c0 100644 --- a/xinference/core/model.py +++ b/xinference/core/model.py @@ -769,7 +769,7 @@ async def image_to_image( self, image: "PIL.Image", prompt: str, - negative_prompt: str, + negative_prompt: Optional[str] = None, n: int = 1, size: Optional[str] = None, response_format: str = "url", @@ -777,12 +777,12 @@ async def image_to_image( **kwargs, ): kwargs.pop("request_id", None) + kwargs["negative_prompt"] = negative_prompt if hasattr(self._model, "image_to_image"): return await self._call_wrapper_json( self._model.image_to_image, image, prompt, - negative_prompt, n, size, response_format, diff --git a/xinference/model/image/stable_diffusion/core.py b/xinference/model/image/stable_diffusion/core.py index 5a7e99fe33..53151b2c19 100644 --- a/xinference/model/image/stable_diffusion/core.py +++ b/xinference/model/image/stable_diffusion/core.py @@ -21,6 +21,7 @@ import sys import time import uuid +import warnings from concurrent.futures import ThreadPoolExecutor from functools import partial from io import BytesIO @@ -31,7 +32,7 @@ from PIL import ImageOps from ....constants import XINFERENCE_IMAGE_DIR -from ....device_utils import move_model_to_available_device +from ....device_utils import get_available_device, move_model_to_available_device from ....types import Image, ImageList, LoRA from ..sdapi import SDAPIDiffusionModelMixin @@ -60,6 +61,23 @@ ] +def model_accept_param(params: Union[str, List[str]], model: Any) -> bool: + params = [params] if isinstance(params, str) else params + # model is diffusers Pipeline + parameters = inspect.signature(model.__call__).parameters # type: ignore + allow_params = False + for param in parameters.values(): + if param.kind == inspect.Parameter.VAR_KEYWORD: + # the __call__ can accept **kwargs, + # we treat it as it can accept any parameters + allow_params = True + break + if not allow_params: + if all(param in parameters for param in params): + allow_params = True + return allow_params + + class DiffusionModel(SDAPIDiffusionModelMixin): def __init__( self, @@ -187,7 +205,7 @@ def load(self): @staticmethod def _get_scheduler(model: Any, sampler_name: str): - if not sampler_name: + if not sampler_name or sampler_name == "default": return assert model is not None @@ -283,13 +301,14 @@ def _call_model( origin_size = kwargs.pop("origin_size", None) seed = kwargs.pop("seed", None) if seed is not None: - kwargs["generator"] = generator = torch.Generator(device=self._model.device) # type: ignore + kwargs["generator"] = generator = torch.Generator(device=get_available_device()) # type: ignore if seed != -1: kwargs["generator"] = generator.manual_seed(seed) sampler_name = kwargs.pop("sampler_name", None) assert callable(model) with self._reset_when_done(model, sampler_name): logger.debug("stable diffusion args: %s, model: %s", kwargs, model) + self._filter_kwargs(model, kwargs) images = 
model(**kwargs).images # revert padding if padded @@ -328,11 +347,17 @@ def _gen_base64_image(_img): raise ValueError(f"Unsupported response format: {response_format}") @classmethod - def _filter_kwargs(cls, kwargs: dict): + def _filter_kwargs(cls, model, kwargs: dict): for arg in ["negative_prompt", "num_inference_steps"]: if not kwargs.get(arg): kwargs.pop(arg, None) + for key in list(kwargs): + allow_key = model_accept_param(key, model) + if not allow_key: + warnings.warn(f"{type(model)} cannot accept `{key}`, will ignore it") + kwargs.pop(key) + def text_to_image( self, prompt: str, @@ -346,7 +371,6 @@ def text_to_image( width, height = map(int, re.split(r"[^\d]+", size)) generate_kwargs = self._model_spec.default_generate_config.copy() # type: ignore generate_kwargs.update({k: v for k, v in kwargs.items() if v is not None}) - self._filter_kwargs(generate_kwargs) return self._call_model( prompt=prompt, height=height, @@ -368,7 +392,6 @@ def image_to_image( self, image: PIL.Image, prompt: Optional[Union[str, List[str]]] = None, - negative_prompt: Optional[Union[str, List[str]]] = None, n: int = 1, size: Optional[str] = None, response_format: str = "url", @@ -404,19 +427,10 @@ def image_to_image( kwargs["height"] = height else: # SD3 image2image cannot accept width and height - parameters = inspect.signature(model.__call__).parameters # type: ignore - allow_width_height = False - for param in parameters.values(): - if param.kind == inspect.Parameter.VAR_KEYWORD: - allow_width_height = True - break - if "width" in parameters or "height" in parameters: - allow_width_height = True + allow_width_height = model_accept_param(["width", "height"], model) if allow_width_height: kwargs["width"], kwargs["height"] = image.size - kwargs["negative_prompt"] = negative_prompt - self._filter_kwargs(kwargs) return self._call_model( image=image, prompt=prompt, @@ -431,7 +445,6 @@ def inpainting( image: PIL.Image, mask_image: PIL.Image, prompt: Optional[Union[str, List[str]]] = None, - negative_prompt: Optional[Union[str, List[str]]] = None, n: int = 1, size: str = "1024*1024", response_format: str = "url", @@ -469,8 +482,6 @@ def inpainting( # calculate actual image size after padding width, height = image.size - kwargs["negative_prompt"] = negative_prompt - self._filter_kwargs(kwargs) return self._call_model( image=image, mask_image=mask_image, From 67bd4db700b060948a3b77d6f53920b90c947a6d Mon Sep 17 00:00:00 2001 From: Xuye Qin Date: Fri, 20 Sep 2024 16:11:16 +0800 Subject: [PATCH 16/17] DOC: update models for doc and readme (#2330) --- README.md | 8 +- README_zh_CN.md | 8 +- doc/source/getting_started/installation.rst | 4 +- .../builtin/audio/fishspeech-1.2-sft.rst | 19 - .../models/builtin/audio/fishspeech-1.4.rst | 19 + doc/source/models/builtin/audio/index.rst | 2 +- .../models/builtin/image/flux.1-dev.rst | 2 +- .../models/builtin/image/flux.1-schnell.rst | 2 +- .../builtin/llm/deepseek-v2-chat-0628.rst | 31 ++ .../models/builtin/llm/deepseek-v2-chat.rst | 47 ++ .../models/builtin/llm/deepseek-v2.5.rst | 31 ++ doc/source/models/builtin/llm/deepseek-v2.rst | 47 ++ doc/source/models/builtin/llm/index.rst | 63 +++ .../builtin/llm/qwen2-audio-instruct.rst | 31 ++ doc/source/models/builtin/llm/qwen2-audio.rst | 31 ++ .../models/builtin/llm/qwen2.5-instruct.rst | 463 ++++++++++++++++++ .../models/builtin/llm/yi-coder-chat.rst | 47 ++ doc/source/models/builtin/llm/yi-coder.rst | 47 ++ doc/source/user_guide/backends.rst | 4 +- 19 files changed, 874 insertions(+), 32 deletions(-) delete mode 100644 
doc/source/models/builtin/audio/fishspeech-1.2-sft.rst create mode 100644 doc/source/models/builtin/audio/fishspeech-1.4.rst create mode 100644 doc/source/models/builtin/llm/deepseek-v2-chat-0628.rst create mode 100644 doc/source/models/builtin/llm/deepseek-v2-chat.rst create mode 100644 doc/source/models/builtin/llm/deepseek-v2.5.rst create mode 100644 doc/source/models/builtin/llm/deepseek-v2.rst create mode 100644 doc/source/models/builtin/llm/qwen2-audio-instruct.rst create mode 100644 doc/source/models/builtin/llm/qwen2-audio.rst create mode 100644 doc/source/models/builtin/llm/qwen2.5-instruct.rst create mode 100644 doc/source/models/builtin/llm/yi-coder-chat.rst create mode 100644 doc/source/models/builtin/llm/yi-coder.rst diff --git a/README.md b/README.md index 576dff498e..f478bfd37b 100644 --- a/README.md +++ b/README.md @@ -34,14 +34,14 @@ potential of cutting-edge AI models. - Support speech recognition model: [#929](https://github.com/xorbitsai/inference/pull/929) - Metrics support: [#906](https://github.com/xorbitsai/inference/pull/906) ### New Models +- Built-in support for [Qwen 2.5 Series](https://qwenlm.github.io/blog/qwen2.5/): [#2325](https://github.com/xorbitsai/inference/pull/2325) +- Built-in support for [Fish Speech V1.4](https://huggingface.co/fishaudio/fish-speech-1.4): [#2295](https://github.com/xorbitsai/inference/pull/2295) +- Built-in support for [DeepSeek-V2.5](https://huggingface.co/deepseek-ai/DeepSeek-V2.5): [#2292](https://github.com/xorbitsai/inference/pull/2292) +- Built-in support for [Qwen2-Audio](https://github.com/QwenLM/Qwen2-Audio): [#2271](https://github.com/xorbitsai/inference/pull/2271) - Built-in support for [Qwen2-vl-instruct](https://github.com/QwenLM/Qwen2-VL): [#2205](https://github.com/xorbitsai/inference/pull/2205) - Built-in support for [MiniCPM3-4B](https://huggingface.co/openbmb/MiniCPM3-4B): [#2263](https://github.com/xorbitsai/inference/pull/2263) - Built-in support for [CogVideoX](https://github.com/THUDM/CogVideo): [#2049](https://github.com/xorbitsai/inference/pull/2049) - Built-in support for [flux.1-schnell & flux.1-dev](https://www.basedlabs.ai/tools/flux1): [#2007](https://github.com/xorbitsai/inference/pull/2007) -- Built-in support for [MiniCPM-V 2.6](https://github.com/OpenBMB/MiniCPM-V): [#2031](https://github.com/xorbitsai/inference/pull/2031) -- Built-in support for [Kolors](https://huggingface.co/Kwai-Kolors/Kolors): [#2028](https://github.com/xorbitsai/inference/pull/2028) -- Built-in support for [SenseVoice](https://github.com/FunAudioLLM/SenseVoice): [#2008](https://github.com/xorbitsai/inference/pull/2008) -- Built-in support for [Mistral Large 2](https://mistral.ai/news/mistral-large-2407/): [#1944](https://github.com/xorbitsai/inference/pull/1944) ### Integrations - [Dify](https://docs.dify.ai/advanced/model-configuration/xinference): an LLMOps platform that enables developers (and even non-developers) to quickly build useful applications based on large language models, ensuring they are visual, operable, and improvable. - [FastGPT](https://github.com/labring/FastGPT): a knowledge-based platform built on the LLM, offers out-of-the-box data processing and model invocation capabilities, allows for workflow orchestration through Flow visualization. 
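As a quick smoke test for the newly documented models, the sketch below queries a running Xinference server through its OpenAI-compatible API. It is a minimal illustration only: the endpoint ``http://127.0.0.1:9997/v1``, the placeholder API key, and the model UID ``qwen2.5-instruct`` are assumptions for a default local deployment and should be adjusted to match how the model was actually launched::

    # Minimal sketch: call a newly supported model (e.g. qwen2.5-instruct) through
    # Xinference's OpenAI-compatible endpoint. The endpoint, key, and model UID below
    # are assumptions for a default local deployment, not values taken from this patch.
    from openai import OpenAI

    client = OpenAI(
        base_url="http://127.0.0.1:9997/v1",  # default local Xinference address; adjust as needed
        api_key="not-used",                   # ignored unless authentication is enabled on the server
    )

    resp = client.chat.completions.create(
        model="qwen2.5-instruct",  # the model UID used at launch time (assumed here to equal the model name)
        messages=[
            {"role": "user", "content": "Summarize what a Mixture-of-Experts model is in one sentence."}
        ],
        max_tokens=128,
    )
    print(resp.choices[0].message.content)
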
diff --git a/README_zh_CN.md b/README_zh_CN.md index 08a1f80b27..cd155e3997 100644 --- a/README_zh_CN.md +++ b/README_zh_CN.md @@ -31,14 +31,14 @@ Xorbits Inference(Xinference)是一个性能强大且功能全面的分布 - 支持语音识别模型: [#929](https://github.com/xorbitsai/inference/pull/929) - 增加 Metrics 统计信息: [#906](https://github.com/xorbitsai/inference/pull/906) ### 新模型 +- 内置 [Qwen 2.5 Series](https://qwenlm.github.io/blog/qwen2.5/): [#2325](https://github.com/xorbitsai/inference/pull/2325) +- 内置 [Fish Speech V1.4](https://huggingface.co/fishaudio/fish-speech-1.4): [#2295](https://github.com/xorbitsai/inference/pull/2295) +- 内置 [DeepSeek-V2.5](https://huggingface.co/deepseek-ai/DeepSeek-V2.5): [#2292](https://github.com/xorbitsai/inference/pull/2292) +- 内置 [Qwen2-Audio](https://github.com/QwenLM/Qwen2-Audio): [#2271](https://github.com/xorbitsai/inference/pull/2271) - 内置 [Qwen2-vl-instruct](https://github.com/QwenLM/Qwen2-VL): [#2205](https://github.com/xorbitsai/inference/pull/2205) - 内置 [MiniCPM3-4B](https://huggingface.co/openbmb/MiniCPM3-4B): [#2263](https://github.com/xorbitsai/inference/pull/2263) - 内置 [CogVideoX](https://github.com/THUDM/CogVideo): [#2049](https://github.com/xorbitsai/inference/pull/2049) - 内置 [flux.1-schnell & flux.1-dev](https://www.basedlabs.ai/tools/flux1): [#2007](https://github.com/xorbitsai/inference/pull/2007) -- 内置 [MiniCPM-V 2.6](https://github.com/OpenBMB/MiniCPM-V): [#2031](https://github.com/xorbitsai/inference/pull/2031) -- 内置 [Kolors](https://huggingface.co/Kwai-Kolors/Kolors): [#2028](https://github.com/xorbitsai/inference/pull/2028) -- 内置 [SenseVoice](https://github.com/FunAudioLLM/SenseVoice): [#2008](https://github.com/xorbitsai/inference/pull/2008) -- 内置 [Mistral Large 2](https://mistral.ai/news/mistral-large-2407/): [#1944](https://github.com/xorbitsai/inference/pull/1944) ### 集成 - [FastGPT](https://doc.fastai.site/docs/development/custom-models/xinference/):一个基于 LLM 大模型的开源 AI 知识库构建平台。提供了开箱即用的数据处理、模型调用、RAG 检索、可视化 AI 工作流编排等能力,帮助您轻松实现复杂的问答场景。 - [Dify](https://docs.dify.ai/advanced/model-configuration/xinference): 一个涵盖了大型语言模型开发、部署、维护和优化的 LLMOps 平台。 diff --git a/doc/source/getting_started/installation.rst b/doc/source/getting_started/installation.rst index e52384bee7..8490f93439 100644 --- a/doc/source/getting_started/installation.rst +++ b/doc/source/getting_started/installation.rst @@ -44,7 +44,8 @@ Currently, supported models include: - ``codestral-v0.1`` - ``Yi``, ``Yi-1.5``, ``Yi-chat``, ``Yi-1.5-chat``, ``Yi-1.5-chat-16k`` - ``code-llama``, ``code-llama-python``, ``code-llama-instruct`` -- ``deepseek``, ``deepseek-coder``, ``deepseek-chat``, ``deepseek-coder-instruct`` +- ``deepseek``, ``deepseek-coder``, ``deepseek-chat``, ``deepseek-coder-instruct``, ``deepseek-v2-chat``, ``deepseek-v2-chat-0628``, ``deepseek-v2.5`` +- ``yi-coder``, ``yi-coder-chat`` - ``codeqwen1.5``, ``codeqwen1.5-chat`` - ``baichuan-2-chat`` - ``internlm2-chat`` @@ -56,6 +57,7 @@ Currently, supported models include: - ``codegeex4`` - ``qwen1.5-chat``, ``qwen1.5-moe-chat`` - ``qwen2-instruct``, ``qwen2-moe-instruct`` +- ``qwen2.5-instruct`` - ``gemma-it``, ``gemma-2-it`` - ``orion-chat``, ``orion-chat-rag`` - ``c4ai-command-r-v01`` diff --git a/doc/source/models/builtin/audio/fishspeech-1.2-sft.rst b/doc/source/models/builtin/audio/fishspeech-1.2-sft.rst deleted file mode 100644 index 3afac1f7e3..0000000000 --- a/doc/source/models/builtin/audio/fishspeech-1.2-sft.rst +++ /dev/null @@ -1,19 +0,0 @@ -.. 
_models_builtin_fishspeech-1.2-sft: - -================== -FishSpeech-1.2-SFT -================== - -- **Model Name:** FishSpeech-1.2-SFT -- **Model Family:** FishAudio -- **Abilities:** text-to-audio -- **Multilingual:** True - -Specifications -^^^^^^^^^^^^^^ - -- **Model ID:** fishaudio/fish-speech-1.2-sft - -Execute the following command to launch the model:: - - xinference launch --model-name FishSpeech-1.2-SFT --model-type audio \ No newline at end of file diff --git a/doc/source/models/builtin/audio/fishspeech-1.4.rst b/doc/source/models/builtin/audio/fishspeech-1.4.rst new file mode 100644 index 0000000000..c256495d67 --- /dev/null +++ b/doc/source/models/builtin/audio/fishspeech-1.4.rst @@ -0,0 +1,19 @@ +.. _models_builtin_fishspeech-1.4: + +============== +FishSpeech-1.4 +============== + +- **Model Name:** FishSpeech-1.4 +- **Model Family:** FishAudio +- **Abilities:** text-to-audio +- **Multilingual:** True + +Specifications +^^^^^^^^^^^^^^ + +- **Model ID:** fishaudio/fish-speech-1.4 + +Execute the following command to launch the model:: + + xinference launch --model-name FishSpeech-1.4 --model-type audio \ No newline at end of file diff --git a/doc/source/models/builtin/audio/index.rst b/doc/source/models/builtin/audio/index.rst index 8959b2b94f..d4b6b886ac 100644 --- a/doc/source/models/builtin/audio/index.rst +++ b/doc/source/models/builtin/audio/index.rst @@ -25,7 +25,7 @@ The following is a list of built-in audio models in Xinference: cosyvoice-300m-sft - fishspeech-1.2-sft + fishspeech-1.4 sensevoicesmall diff --git a/doc/source/models/builtin/image/flux.1-dev.rst b/doc/source/models/builtin/image/flux.1-dev.rst index 829bcbfd75..3a16cfe0a7 100644 --- a/doc/source/models/builtin/image/flux.1-dev.rst +++ b/doc/source/models/builtin/image/flux.1-dev.rst @@ -6,7 +6,7 @@ FLUX.1-dev - **Model Name:** FLUX.1-dev - **Model Family:** stable_diffusion -- **Abilities:** text2image +- **Abilities:** text2image, image2image, inpainting - **Available ControlNet:** None Specifications diff --git a/doc/source/models/builtin/image/flux.1-schnell.rst b/doc/source/models/builtin/image/flux.1-schnell.rst index 268f5a1720..df82d2069f 100644 --- a/doc/source/models/builtin/image/flux.1-schnell.rst +++ b/doc/source/models/builtin/image/flux.1-schnell.rst @@ -6,7 +6,7 @@ FLUX.1-schnell - **Model Name:** FLUX.1-schnell - **Model Family:** stable_diffusion -- **Abilities:** text2image +- **Abilities:** text2image, image2image, inpainting - **Available ControlNet:** None Specifications diff --git a/doc/source/models/builtin/llm/deepseek-v2-chat-0628.rst b/doc/source/models/builtin/llm/deepseek-v2-chat-0628.rst new file mode 100644 index 0000000000..d6e91cb248 --- /dev/null +++ b/doc/source/models/builtin/llm/deepseek-v2-chat-0628.rst @@ -0,0 +1,31 @@ +.. _models_llm_deepseek-v2-chat-0628: + +======================================== +deepseek-v2-chat-0628 +======================================== + +- **Context Length:** 128000 +- **Model Name:** deepseek-v2-chat-0628 +- **Languages:** en, zh +- **Abilities:** chat +- **Description:** DeepSeek-V2-Chat-0628 is an improved version of DeepSeek-V2-Chat. 
+ +Specifications +^^^^^^^^^^^^^^ + + +Model Spec 1 (pytorch, 236 Billion) +++++++++++++++++++++++++++++++++++++++++ + +- **Model Format:** pytorch +- **Model Size (in billions):** 236 +- **Quantizations:** 4-bit, 8-bit, none +- **Engines**: vLLM, Transformers, SGLang (vLLM and SGLang only available for quantization none) +- **Model ID:** deepseek-ai/DeepSeek-V2-Chat-0628 +- **Model Hubs**: `Hugging Face `__, `ModelScope `__ + +Execute the following command to launch the model, remember to replace ``${quantization}`` with your +chosen quantization method from the options listed above:: + + xinference launch --model-engine ${engine} --model-name deepseek-v2-chat-0628 --size-in-billions 236 --model-format pytorch --quantization ${quantization} + diff --git a/doc/source/models/builtin/llm/deepseek-v2-chat.rst b/doc/source/models/builtin/llm/deepseek-v2-chat.rst new file mode 100644 index 0000000000..84595c2bbb --- /dev/null +++ b/doc/source/models/builtin/llm/deepseek-v2-chat.rst @@ -0,0 +1,47 @@ +.. _models_llm_deepseek-v2-chat: + +======================================== +deepseek-v2-chat +======================================== + +- **Context Length:** 128000 +- **Model Name:** deepseek-v2-chat +- **Languages:** en, zh +- **Abilities:** chat +- **Description:** DeepSeek-V2, a strong Mixture-of-Experts (MoE) language model characterized by economical training and efficient inference. + +Specifications +^^^^^^^^^^^^^^ + + +Model Spec 1 (pytorch, 16 Billion) +++++++++++++++++++++++++++++++++++++++++ + +- **Model Format:** pytorch +- **Model Size (in billions):** 16 +- **Quantizations:** 4-bit, 8-bit, none +- **Engines**: vLLM, Transformers, SGLang (vLLM and SGLang only available for quantization none) +- **Model ID:** deepseek-ai/DeepSeek-V2-Lite-Chat +- **Model Hubs**: `Hugging Face `__, `ModelScope `__ + +Execute the following command to launch the model, remember to replace ``${quantization}`` with your +chosen quantization method from the options listed above:: + + xinference launch --model-engine ${engine} --model-name deepseek-v2-chat --size-in-billions 16 --model-format pytorch --quantization ${quantization} + + +Model Spec 2 (pytorch, 236 Billion) +++++++++++++++++++++++++++++++++++++++++ + +- **Model Format:** pytorch +- **Model Size (in billions):** 236 +- **Quantizations:** 4-bit, 8-bit, none +- **Engines**: vLLM, Transformers, SGLang (vLLM and SGLang only available for quantization none) +- **Model ID:** deepseek-ai/DeepSeek-V2-Chat +- **Model Hubs**: `Hugging Face `__, `ModelScope `__ + +Execute the following command to launch the model, remember to replace ``${quantization}`` with your +chosen quantization method from the options listed above:: + + xinference launch --model-engine ${engine} --model-name deepseek-v2-chat --size-in-billions 236 --model-format pytorch --quantization ${quantization} + diff --git a/doc/source/models/builtin/llm/deepseek-v2.5.rst b/doc/source/models/builtin/llm/deepseek-v2.5.rst new file mode 100644 index 0000000000..5f5b9475d4 --- /dev/null +++ b/doc/source/models/builtin/llm/deepseek-v2.5.rst @@ -0,0 +1,31 @@ +.. _models_llm_deepseek-v2.5: + +======================================== +deepseek-v2.5 +======================================== + +- **Context Length:** 128000 +- **Model Name:** deepseek-v2.5 +- **Languages:** en, zh +- **Abilities:** chat +- **Description:** DeepSeek-V2.5 is an upgraded version that combines DeepSeek-V2-Chat and DeepSeek-Coder-V2-Instruct. 
The new model integrates the general and coding abilities of the two previous versions. + +Specifications +^^^^^^^^^^^^^^ + + +Model Spec 1 (pytorch, 236 Billion) +++++++++++++++++++++++++++++++++++++++++ + +- **Model Format:** pytorch +- **Model Size (in billions):** 236 +- **Quantizations:** 4-bit, 8-bit, none +- **Engines**: vLLM, Transformers, SGLang (vLLM and SGLang only available for quantization none) +- **Model ID:** deepseek-ai/DeepSeek-V2.5 +- **Model Hubs**: `Hugging Face `__, `ModelScope `__ + +Execute the following command to launch the model, remember to replace ``${quantization}`` with your +chosen quantization method from the options listed above:: + + xinference launch --model-engine ${engine} --model-name deepseek-v2.5 --size-in-billions 236 --model-format pytorch --quantization ${quantization} + diff --git a/doc/source/models/builtin/llm/deepseek-v2.rst b/doc/source/models/builtin/llm/deepseek-v2.rst new file mode 100644 index 0000000000..4102b9568c --- /dev/null +++ b/doc/source/models/builtin/llm/deepseek-v2.rst @@ -0,0 +1,47 @@ +.. _models_llm_deepseek-v2: + +======================================== +deepseek-v2 +======================================== + +- **Context Length:** 128000 +- **Model Name:** deepseek-v2 +- **Languages:** en, zh +- **Abilities:** generate +- **Description:** DeepSeek-V2, a strong Mixture-of-Experts (MoE) language model characterized by economical training and efficient inference. + +Specifications +^^^^^^^^^^^^^^ + + +Model Spec 1 (pytorch, 16 Billion) +++++++++++++++++++++++++++++++++++++++++ + +- **Model Format:** pytorch +- **Model Size (in billions):** 16 +- **Quantizations:** 4-bit, 8-bit, none +- **Engines**: Transformers +- **Model ID:** deepseek-ai/DeepSeek-V2-Lite +- **Model Hubs**: `Hugging Face `__, `ModelScope `__ + +Execute the following command to launch the model, remember to replace ``${quantization}`` with your +chosen quantization method from the options listed above:: + + xinference launch --model-engine ${engine} --model-name deepseek-v2 --size-in-billions 16 --model-format pytorch --quantization ${quantization} + + +Model Spec 2 (pytorch, 236 Billion) +++++++++++++++++++++++++++++++++++++++++ + +- **Model Format:** pytorch +- **Model Size (in billions):** 236 +- **Quantizations:** 4-bit, 8-bit, none +- **Engines**: Transformers +- **Model ID:** deepseek-ai/DeepSeek-V2 +- **Model Hubs**: `Hugging Face `__, `ModelScope `__ + +Execute the following command to launch the model, remember to replace ``${quantization}`` with your +chosen quantization method from the options listed above:: + + xinference launch --model-engine ${engine} --model-name deepseek-v2 --size-in-billions 236 --model-format pytorch --quantization ${quantization} + diff --git a/doc/source/models/builtin/llm/index.rst b/doc/source/models/builtin/llm/index.rst index bab4b1093d..73bd2b9894 100644 --- a/doc/source/models/builtin/llm/index.rst +++ b/doc/source/models/builtin/llm/index.rst @@ -126,6 +126,26 @@ The following is a list of built-in LLM in Xinference: - 16384 - deepseek-coder-instruct is a model initialized from deepseek-coder-base and fine-tuned on 2B tokens of instruction data. + * - :ref:`deepseek-v2 ` + - generate + - 128000 + - DeepSeek-V2, a strong Mixture-of-Experts (MoE) language model characterized by economical training and efficient inference. + + * - :ref:`deepseek-v2-chat ` + - chat + - 128000 + - DeepSeek-V2, a strong Mixture-of-Experts (MoE) language model characterized by economical training and efficient inference. 
+ + * - :ref:`deepseek-v2-chat-0628 ` + - chat + - 128000 + - DeepSeek-V2-Chat-0628 is an improved version of DeepSeek-V2-Chat. + + * - :ref:`deepseek-v2.5 ` + - chat + - 128000 + - DeepSeek-V2.5 is an upgraded version that combines DeepSeek-V2-Chat and DeepSeek-Coder-V2-Instruct. The new model integrates the general and coding abilities of the two previous versions. + * - :ref:`deepseek-vl-chat ` - chat, vision - 4096 @@ -371,6 +391,16 @@ The following is a list of built-in LLM in Xinference: - 32768 - Qwen1.5-MoE is a transformer-based MoE decoder-only language model pretrained on a large amount of data. + * - :ref:`qwen2-audio ` + - chat, audio + - 32768 + - Qwen2-Audio: A large-scale audio-language model which is capable of accepting various audio signal inputs and performing audio analysis or direct textual responses with regard to speech instructions. + + * - :ref:`qwen2-audio-instruct ` + - chat, audio + - 32768 + - Qwen2-Audio: A large-scale audio-language model which is capable of accepting various audio signal inputs and performing audio analysis or direct textual responses with regard to speech instructions. + * - :ref:`qwen2-instruct ` - chat, tools - 32768 @@ -386,6 +416,11 @@ The following is a list of built-in LLM in Xinference: - 32768 - Qwen2-VL: To See the World More Clearly.Qwen2-VL is the latest version of the vision language models in the Qwen model familities. + * - :ref:`qwen2.5-instruct ` + - chat, tools + - 131072 + - Qwen2.5 is the latest series of Qwen large language models. For Qwen2.5, we release a number of base language models and instruction-tuned language models ranging from 0.5 to 72 billion parameters. + * - :ref:`seallm_v2 ` - generate - 8192 @@ -471,6 +506,16 @@ The following is a list of built-in LLM in Xinference: - 4096 - The Yi series models are large language models trained from scratch by developers at 01.AI. + * - :ref:`yi-coder ` + - generate + - 131072 + - Yi-Coder is a series of open-source code language models that delivers state-of-the-art coding performance with fewer than 10 billion parameters.Excelling in long-context understanding with a maximum context length of 128K tokens.Supporting 52 major programming languages, including popular ones such as Java, Python, JavaScript, and C++. + + * - :ref:`yi-coder-chat ` + - chat + - 131072 + - Yi-Coder is a series of open-source code language models that delivers state-of-the-art coding performance with fewer than 10 billion parameters.Excelling in long-context understanding with a maximum context length of 128K tokens.Supporting 52 major programming languages, including popular ones such as Java, Python, JavaScript, and C++. + * - :ref:`yi-vl-chat ` - chat, vision - 4096 @@ -525,6 +570,14 @@ The following is a list of built-in LLM in Xinference: deepseek-coder-instruct + deepseek-v2 + + deepseek-v2-chat + + deepseek-v2-chat-0628 + + deepseek-v2.5 + deepseek-vl-chat gemma-2-it @@ -623,12 +676,18 @@ The following is a list of built-in LLM in Xinference: qwen1.5-moe-chat + qwen2-audio + + qwen2-audio-instruct + qwen2-instruct qwen2-moe-instruct qwen2-vl-instruct + qwen2.5-instruct + seallm_v2 seallm_v2.5 @@ -663,6 +722,10 @@ The following is a list of built-in LLM in Xinference: yi-chat + yi-coder + + yi-coder-chat + yi-vl-chat diff --git a/doc/source/models/builtin/llm/qwen2-audio-instruct.rst b/doc/source/models/builtin/llm/qwen2-audio-instruct.rst new file mode 100644 index 0000000000..2d126a387e --- /dev/null +++ b/doc/source/models/builtin/llm/qwen2-audio-instruct.rst @@ -0,0 +1,31 @@ +.. 
_models_llm_qwen2-audio-instruct: + +======================================== +qwen2-audio-instruct +======================================== + +- **Context Length:** 32768 +- **Model Name:** qwen2-audio-instruct +- **Languages:** en, zh +- **Abilities:** chat, audio +- **Description:** Qwen2-Audio: A large-scale audio-language model which is capable of accepting various audio signal inputs and performing audio analysis or direct textual responses with regard to speech instructions. + +Specifications +^^^^^^^^^^^^^^ + + +Model Spec 1 (pytorch, 7 Billion) +++++++++++++++++++++++++++++++++++++++++ + +- **Model Format:** pytorch +- **Model Size (in billions):** 7 +- **Quantizations:** none +- **Engines**: Transformers +- **Model ID:** Qwen/Qwen2-Audio-7B-Instruct +- **Model Hubs**: `Hugging Face `__, `ModelScope `__ + +Execute the following command to launch the model, remember to replace ``${quantization}`` with your +chosen quantization method from the options listed above:: + + xinference launch --model-engine ${engine} --model-name qwen2-audio-instruct --size-in-billions 7 --model-format pytorch --quantization ${quantization} + diff --git a/doc/source/models/builtin/llm/qwen2-audio.rst b/doc/source/models/builtin/llm/qwen2-audio.rst new file mode 100644 index 0000000000..2973390c44 --- /dev/null +++ b/doc/source/models/builtin/llm/qwen2-audio.rst @@ -0,0 +1,31 @@ +.. _models_llm_qwen2-audio: + +======================================== +qwen2-audio +======================================== + +- **Context Length:** 32768 +- **Model Name:** qwen2-audio +- **Languages:** en, zh +- **Abilities:** chat, audio +- **Description:** Qwen2-Audio: A large-scale audio-language model which is capable of accepting various audio signal inputs and performing audio analysis or direct textual responses with regard to speech instructions. + +Specifications +^^^^^^^^^^^^^^ + + +Model Spec 1 (pytorch, 7 Billion) +++++++++++++++++++++++++++++++++++++++++ + +- **Model Format:** pytorch +- **Model Size (in billions):** 7 +- **Quantizations:** none +- **Engines**: Transformers +- **Model ID:** Qwen/Qwen2-Audio-7B +- **Model Hubs**: `Hugging Face `__, `ModelScope `__ + +Execute the following command to launch the model, remember to replace ``${quantization}`` with your +chosen quantization method from the options listed above:: + + xinference launch --model-engine ${engine} --model-name qwen2-audio --size-in-billions 7 --model-format pytorch --quantization ${quantization} + diff --git a/doc/source/models/builtin/llm/qwen2.5-instruct.rst b/doc/source/models/builtin/llm/qwen2.5-instruct.rst new file mode 100644 index 0000000000..6e6b4db35e --- /dev/null +++ b/doc/source/models/builtin/llm/qwen2.5-instruct.rst @@ -0,0 +1,463 @@ +.. _models_llm_qwen2.5-instruct: + +======================================== +qwen2.5-instruct +======================================== + +- **Context Length:** 131072 +- **Model Name:** qwen2.5-instruct +- **Languages:** en, zh +- **Abilities:** chat, tools +- **Description:** Qwen2.5 is the latest series of Qwen large language models. For Qwen2.5, we release a number of base language models and instruction-tuned language models ranging from 0.5 to 72 billion parameters. 
+ +Specifications +^^^^^^^^^^^^^^ + + +Model Spec 1 (pytorch, 0_5 Billion) +++++++++++++++++++++++++++++++++++++++++ + +- **Model Format:** pytorch +- **Model Size (in billions):** 0_5 +- **Quantizations:** 4-bit, 8-bit, none +- **Engines**: vLLM, Transformers (vLLM only available for quantization none) +- **Model ID:** Qwen/Qwen2.5-0.5B-Instruct +- **Model Hubs**: `Hugging Face `__, `ModelScope `__ + +Execute the following command to launch the model, remember to replace ``${quantization}`` with your +chosen quantization method from the options listed above:: + + xinference launch --model-engine ${engine} --model-name qwen2.5-instruct --size-in-billions 0_5 --model-format pytorch --quantization ${quantization} + + +Model Spec 2 (pytorch, 1_5 Billion) +++++++++++++++++++++++++++++++++++++++++ + +- **Model Format:** pytorch +- **Model Size (in billions):** 1_5 +- **Quantizations:** 4-bit, 8-bit, none +- **Engines**: vLLM, Transformers (vLLM only available for quantization none) +- **Model ID:** Qwen/Qwen2.5-1.5B-Instruct +- **Model Hubs**: `Hugging Face `__, `ModelScope `__ + +Execute the following command to launch the model, remember to replace ``${quantization}`` with your +chosen quantization method from the options listed above:: + + xinference launch --model-engine ${engine} --model-name qwen2.5-instruct --size-in-billions 1_5 --model-format pytorch --quantization ${quantization} + + +Model Spec 3 (pytorch, 3 Billion) +++++++++++++++++++++++++++++++++++++++++ + +- **Model Format:** pytorch +- **Model Size (in billions):** 3 +- **Quantizations:** 4-bit, 8-bit, none +- **Engines**: vLLM, Transformers (vLLM only available for quantization none) +- **Model ID:** Qwen/Qwen2.5-3B-Instruct +- **Model Hubs**: `Hugging Face `__, `ModelScope `__ + +Execute the following command to launch the model, remember to replace ``${quantization}`` with your +chosen quantization method from the options listed above:: + + xinference launch --model-engine ${engine} --model-name qwen2.5-instruct --size-in-billions 3 --model-format pytorch --quantization ${quantization} + + +Model Spec 4 (pytorch, 7 Billion) +++++++++++++++++++++++++++++++++++++++++ + +- **Model Format:** pytorch +- **Model Size (in billions):** 7 +- **Quantizations:** 4-bit, 8-bit, none +- **Engines**: vLLM, Transformers (vLLM only available for quantization none) +- **Model ID:** Qwen/Qwen2.5-7B-Instruct +- **Model Hubs**: `Hugging Face `__, `ModelScope `__ + +Execute the following command to launch the model, remember to replace ``${quantization}`` with your +chosen quantization method from the options listed above:: + + xinference launch --model-engine ${engine} --model-name qwen2.5-instruct --size-in-billions 7 --model-format pytorch --quantization ${quantization} + + +Model Spec 5 (pytorch, 14 Billion) +++++++++++++++++++++++++++++++++++++++++ + +- **Model Format:** pytorch +- **Model Size (in billions):** 14 +- **Quantizations:** 4-bit, 8-bit, none +- **Engines**: vLLM, Transformers (vLLM only available for quantization none) +- **Model ID:** Qwen/Qwen2.5-14B-Instruct +- **Model Hubs**: `Hugging Face `__, `ModelScope `__ + +Execute the following command to launch the model, remember to replace ``${quantization}`` with your +chosen quantization method from the options listed above:: + + xinference launch --model-engine ${engine} --model-name qwen2.5-instruct --size-in-billions 14 --model-format pytorch --quantization ${quantization} + + +Model Spec 6 (pytorch, 32 Billion) +++++++++++++++++++++++++++++++++++++++++ + +- **Model Format:** 
pytorch +- **Model Size (in billions):** 32 +- **Quantizations:** 4-bit, 8-bit, none +- **Engines**: vLLM, Transformers (vLLM only available for quantization none) +- **Model ID:** Qwen/Qwen2.5-32B-Instruct +- **Model Hubs**: `Hugging Face `__, `ModelScope `__ + +Execute the following command to launch the model, remember to replace ``${quantization}`` with your +chosen quantization method from the options listed above:: + + xinference launch --model-engine ${engine} --model-name qwen2.5-instruct --size-in-billions 32 --model-format pytorch --quantization ${quantization} + + +Model Spec 7 (pytorch, 72 Billion) +++++++++++++++++++++++++++++++++++++++++ + +- **Model Format:** pytorch +- **Model Size (in billions):** 72 +- **Quantizations:** 4-bit, 8-bit, none +- **Engines**: vLLM, Transformers (vLLM only available for quantization none) +- **Model ID:** Qwen/Qwen2.5-72B-Instruct +- **Model Hubs**: `Hugging Face `__, `ModelScope `__ + +Execute the following command to launch the model, remember to replace ``${quantization}`` with your +chosen quantization method from the options listed above:: + + xinference launch --model-engine ${engine} --model-name qwen2.5-instruct --size-in-billions 72 --model-format pytorch --quantization ${quantization} + + +Model Spec 8 (gptq, 0_5 Billion) +++++++++++++++++++++++++++++++++++++++++ + +- **Model Format:** gptq +- **Model Size (in billions):** 0_5 +- **Quantizations:** Int4, Int8 +- **Engines**: vLLM, Transformers +- **Model ID:** Qwen/Qwen2.5-0.5B-Instruct-GPTQ-{quantization} +- **Model Hubs**: `Hugging Face `__, `ModelScope `__ + +Execute the following command to launch the model, remember to replace ``${quantization}`` with your +chosen quantization method from the options listed above:: + + xinference launch --model-engine ${engine} --model-name qwen2.5-instruct --size-in-billions 0_5 --model-format gptq --quantization ${quantization} + + +Model Spec 9 (gptq, 1_5 Billion) +++++++++++++++++++++++++++++++++++++++++ + +- **Model Format:** gptq +- **Model Size (in billions):** 1_5 +- **Quantizations:** Int4, Int8 +- **Engines**: vLLM, Transformers +- **Model ID:** Qwen/Qwen2.5-1.5B-Instruct-GPTQ-{quantization} +- **Model Hubs**: `Hugging Face `__, `ModelScope `__ + +Execute the following command to launch the model, remember to replace ``${quantization}`` with your +chosen quantization method from the options listed above:: + + xinference launch --model-engine ${engine} --model-name qwen2.5-instruct --size-in-billions 1_5 --model-format gptq --quantization ${quantization} + + +Model Spec 10 (gptq, 3 Billion) +++++++++++++++++++++++++++++++++++++++++ + +- **Model Format:** gptq +- **Model Size (in billions):** 3 +- **Quantizations:** Int4, Int8 +- **Engines**: vLLM, Transformers +- **Model ID:** Qwen/Qwen2.5-3B-Instruct-GPTQ-{quantization} +- **Model Hubs**: `Hugging Face `__, `ModelScope `__ + +Execute the following command to launch the model, remember to replace ``${quantization}`` with your +chosen quantization method from the options listed above:: + + xinference launch --model-engine ${engine} --model-name qwen2.5-instruct --size-in-billions 3 --model-format gptq --quantization ${quantization} + + +Model Spec 11 (gptq, 7 Billion) +++++++++++++++++++++++++++++++++++++++++ + +- **Model Format:** gptq +- **Model Size (in billions):** 7 +- **Quantizations:** Int4, Int8 +- **Engines**: vLLM, Transformers +- **Model ID:** Qwen/Qwen2.5-7B-Instruct-GPTQ-{quantization} +- **Model Hubs**: `Hugging Face `__, `ModelScope `__ + +Execute the following command to 
launch the model, remember to replace ``${quantization}`` with your +chosen quantization method from the options listed above:: + + xinference launch --model-engine ${engine} --model-name qwen2.5-instruct --size-in-billions 7 --model-format gptq --quantization ${quantization} + + +Model Spec 12 (gptq, 14 Billion) +++++++++++++++++++++++++++++++++++++++++ + +- **Model Format:** gptq +- **Model Size (in billions):** 14 +- **Quantizations:** Int4, Int8 +- **Engines**: vLLM, Transformers +- **Model ID:** Qwen/Qwen2.5-14B-Instruct-GPTQ-{quantization} +- **Model Hubs**: `Hugging Face `__, `ModelScope `__ + +Execute the following command to launch the model, remember to replace ``${quantization}`` with your +chosen quantization method from the options listed above:: + + xinference launch --model-engine ${engine} --model-name qwen2.5-instruct --size-in-billions 14 --model-format gptq --quantization ${quantization} + + +Model Spec 13 (gptq, 32 Billion) +++++++++++++++++++++++++++++++++++++++++ + +- **Model Format:** gptq +- **Model Size (in billions):** 32 +- **Quantizations:** Int4, Int8 +- **Engines**: vLLM, Transformers +- **Model ID:** Qwen/Qwen2.5-32B-Instruct-GPTQ-{quantization} +- **Model Hubs**: `Hugging Face `__, `ModelScope `__ + +Execute the following command to launch the model, remember to replace ``${quantization}`` with your +chosen quantization method from the options listed above:: + + xinference launch --model-engine ${engine} --model-name qwen2.5-instruct --size-in-billions 32 --model-format gptq --quantization ${quantization} + + +Model Spec 14 (gptq, 72 Billion) +++++++++++++++++++++++++++++++++++++++++ + +- **Model Format:** gptq +- **Model Size (in billions):** 72 +- **Quantizations:** Int4, Int8 +- **Engines**: vLLM, Transformers +- **Model ID:** Qwen/Qwen2.5-72B-Instruct-GPTQ-{quantization} +- **Model Hubs**: `Hugging Face `__, `ModelScope `__ + +Execute the following command to launch the model, remember to replace ``${quantization}`` with your +chosen quantization method from the options listed above:: + + xinference launch --model-engine ${engine} --model-name qwen2.5-instruct --size-in-billions 72 --model-format gptq --quantization ${quantization} + + +Model Spec 15 (awq, 0_5 Billion) +++++++++++++++++++++++++++++++++++++++++ + +- **Model Format:** awq +- **Model Size (in billions):** 0_5 +- **Quantizations:** Int4 +- **Engines**: vLLM, Transformers +- **Model ID:** Qwen/Qwen2.5-0.5B-Instruct-AWQ +- **Model Hubs**: `Hugging Face `__, `ModelScope `__ + +Execute the following command to launch the model, remember to replace ``${quantization}`` with your +chosen quantization method from the options listed above:: + + xinference launch --model-engine ${engine} --model-name qwen2.5-instruct --size-in-billions 0_5 --model-format awq --quantization ${quantization} + + +Model Spec 16 (awq, 1_5 Billion) +++++++++++++++++++++++++++++++++++++++++ + +- **Model Format:** awq +- **Model Size (in billions):** 1_5 +- **Quantizations:** Int4 +- **Engines**: vLLM, Transformers +- **Model ID:** Qwen/Qwen2.5-1.5B-Instruct-AWQ +- **Model Hubs**: `Hugging Face `__, `ModelScope `__ + +Execute the following command to launch the model, remember to replace ``${quantization}`` with your +chosen quantization method from the options listed above:: + + xinference launch --model-engine ${engine} --model-name qwen2.5-instruct --size-in-billions 1_5 --model-format awq --quantization ${quantization} + + +Model Spec 17 (awq, 3 Billion) +++++++++++++++++++++++++++++++++++++++++ + +- **Model Format:** awq 
+- **Model Size (in billions):** 3 +- **Quantizations:** Int4 +- **Engines**: vLLM, Transformers +- **Model ID:** Qwen/Qwen2.5-3B-Instruct-AWQ +- **Model Hubs**: `Hugging Face `__, `ModelScope `__ + +Execute the following command to launch the model, remember to replace ``${quantization}`` with your +chosen quantization method from the options listed above:: + + xinference launch --model-engine ${engine} --model-name qwen2.5-instruct --size-in-billions 3 --model-format awq --quantization ${quantization} + + +Model Spec 18 (awq, 7 Billion) +++++++++++++++++++++++++++++++++++++++++ + +- **Model Format:** awq +- **Model Size (in billions):** 7 +- **Quantizations:** Int4 +- **Engines**: vLLM, Transformers +- **Model ID:** Qwen/Qwen2.5-7B-Instruct-AWQ +- **Model Hubs**: `Hugging Face `__, `ModelScope `__ + +Execute the following command to launch the model, remember to replace ``${quantization}`` with your +chosen quantization method from the options listed above:: + + xinference launch --model-engine ${engine} --model-name qwen2.5-instruct --size-in-billions 7 --model-format awq --quantization ${quantization} + + +Model Spec 19 (awq, 14 Billion) +++++++++++++++++++++++++++++++++++++++++ + +- **Model Format:** awq +- **Model Size (in billions):** 14 +- **Quantizations:** Int4 +- **Engines**: vLLM, Transformers +- **Model ID:** Qwen/Qwen2.5-14B-Instruct-AWQ +- **Model Hubs**: `Hugging Face `__, `ModelScope `__ + +Execute the following command to launch the model, remember to replace ``${quantization}`` with your +chosen quantization method from the options listed above:: + + xinference launch --model-engine ${engine} --model-name qwen2.5-instruct --size-in-billions 14 --model-format awq --quantization ${quantization} + + +Model Spec 20 (awq, 32 Billion) +++++++++++++++++++++++++++++++++++++++++ + +- **Model Format:** awq +- **Model Size (in billions):** 32 +- **Quantizations:** Int4 +- **Engines**: vLLM, Transformers +- **Model ID:** Qwen/Qwen2.5-32B-Instruct-AWQ +- **Model Hubs**: `Hugging Face `__, `ModelScope `__ + +Execute the following command to launch the model, remember to replace ``${quantization}`` with your +chosen quantization method from the options listed above:: + + xinference launch --model-engine ${engine} --model-name qwen2.5-instruct --size-in-billions 32 --model-format awq --quantization ${quantization} + + +Model Spec 21 (awq, 72 Billion) +++++++++++++++++++++++++++++++++++++++++ + +- **Model Format:** awq +- **Model Size (in billions):** 72 +- **Quantizations:** Int4 +- **Engines**: vLLM, Transformers +- **Model ID:** Qwen/Qwen2.5-72B-Instruct-AWQ +- **Model Hubs**: `Hugging Face `__, `ModelScope `__ + +Execute the following command to launch the model, remember to replace ``${quantization}`` with your +chosen quantization method from the options listed above:: + + xinference launch --model-engine ${engine} --model-name qwen2.5-instruct --size-in-billions 72 --model-format awq --quantization ${quantization} + + +Model Spec 22 (ggufv2, 0_5 Billion) +++++++++++++++++++++++++++++++++++++++++ + +- **Model Format:** ggufv2 +- **Model Size (in billions):** 0_5 +- **Quantizations:** q2_k, q3_k_m, q4_0, q4_k_m, q5_0, q5_k_m, q6_k, q8_0, fp16 +- **Engines**: llama.cpp +- **Model ID:** Qwen/Qwen2.5-0.5B-Instruct-GGUF +- **Model Hubs**: `Hugging Face `__, `ModelScope `__ + +Execute the following command to launch the model, remember to replace ``${quantization}`` with your +chosen quantization method from the options listed above:: + + xinference launch --model-engine ${engine} 
--model-name qwen2.5-instruct --size-in-billions 0_5 --model-format ggufv2 --quantization ${quantization} + + +Model Spec 23 (ggufv2, 1_5 Billion) +++++++++++++++++++++++++++++++++++++++++ + +- **Model Format:** ggufv2 +- **Model Size (in billions):** 1_5 +- **Quantizations:** q2_k, q3_k_m, q4_0, q4_k_m, q5_0, q5_k_m, q6_k, q8_0, fp16 +- **Engines**: llama.cpp +- **Model ID:** Qwen/Qwen2.5-1.5B-Instruct-GGUF +- **Model Hubs**: `Hugging Face `__, `ModelScope `__ + +Execute the following command to launch the model, remember to replace ``${quantization}`` with your +chosen quantization method from the options listed above:: + + xinference launch --model-engine ${engine} --model-name qwen2.5-instruct --size-in-billions 1_5 --model-format ggufv2 --quantization ${quantization} + + +Model Spec 24 (ggufv2, 3 Billion) +++++++++++++++++++++++++++++++++++++++++ + +- **Model Format:** ggufv2 +- **Model Size (in billions):** 3 +- **Quantizations:** q2_k, q3_k_m, q4_0, q4_k_m, q5_0, q5_k_m, q6_k, q8_0, fp16 +- **Engines**: llama.cpp +- **Model ID:** Qwen/Qwen2.5-3B-Instruct-GGUF +- **Model Hubs**: `Hugging Face `__, `ModelScope `__ + +Execute the following command to launch the model, remember to replace ``${quantization}`` with your +chosen quantization method from the options listed above:: + + xinference launch --model-engine ${engine} --model-name qwen2.5-instruct --size-in-billions 3 --model-format ggufv2 --quantization ${quantization} + + +Model Spec 25 (ggufv2, 7 Billion) +++++++++++++++++++++++++++++++++++++++++ + +- **Model Format:** ggufv2 +- **Model Size (in billions):** 7 +- **Quantizations:** q2_k, q3_k_m, q4_0, q4_k_m, q5_0, q5_k_m, q6_k, q8_0, fp16 +- **Engines**: llama.cpp +- **Model ID:** Qwen/Qwen2.5-7B-Instruct-GGUF +- **Model Hubs**: `Hugging Face `__, `ModelScope `__ + +Execute the following command to launch the model, remember to replace ``${quantization}`` with your +chosen quantization method from the options listed above:: + + xinference launch --model-engine ${engine} --model-name qwen2.5-instruct --size-in-billions 7 --model-format ggufv2 --quantization ${quantization} + + +Model Spec 26 (ggufv2, 14 Billion) +++++++++++++++++++++++++++++++++++++++++ + +- **Model Format:** ggufv2 +- **Model Size (in billions):** 14 +- **Quantizations:** q2_k, q3_k_m, q4_0, q4_k_m, q5_0, q5_k_m, q6_k, q8_0, fp16 +- **Engines**: llama.cpp +- **Model ID:** Qwen/Qwen2.5-14B-Instruct-GGUF +- **Model Hubs**: `Hugging Face `__, `ModelScope `__ + +Execute the following command to launch the model, remember to replace ``${quantization}`` with your +chosen quantization method from the options listed above:: + + xinference launch --model-engine ${engine} --model-name qwen2.5-instruct --size-in-billions 14 --model-format ggufv2 --quantization ${quantization} + + +Model Spec 27 (ggufv2, 32 Billion) +++++++++++++++++++++++++++++++++++++++++ + +- **Model Format:** ggufv2 +- **Model Size (in billions):** 32 +- **Quantizations:** q2_k, q3_k_m, q4_0, q4_k_m, q5_0, q5_k_m, q6_k, q8_0, fp16 +- **Engines**: llama.cpp +- **Model ID:** Qwen/Qwen2.5-32B-Instruct-GGUF +- **Model Hubs**: `Hugging Face `__, `ModelScope `__ + +Execute the following command to launch the model, remember to replace ``${quantization}`` with your +chosen quantization method from the options listed above:: + + xinference launch --model-engine ${engine} --model-name qwen2.5-instruct --size-in-billions 32 --model-format ggufv2 --quantization ${quantization} + + +Model Spec 28 (ggufv2, 72 Billion) +++++++++++++++++++++++++++++++++++++++++ + +- 
**Model Format:** ggufv2
+- **Model Size (in billions):** 72
+- **Quantizations:** q2_k, q3_k_m, q4_0, q4_k_m, q5_0, q5_k_m, q6_k, q8_0, fp16
+- **Engines**: llama.cpp
+- **Model ID:** Qwen/Qwen2.5-72B-Instruct-GGUF
+- **Model Hubs**: `Hugging Face `__, `ModelScope `__
+
+Execute the following command to launch the model, remember to replace ``${quantization}`` with your
+chosen quantization method from the options listed above::
+
+   xinference launch --model-engine ${engine} --model-name qwen2.5-instruct --size-in-billions 72 --model-format ggufv2 --quantization ${quantization}
+
diff --git a/doc/source/models/builtin/llm/yi-coder-chat.rst b/doc/source/models/builtin/llm/yi-coder-chat.rst
new file mode 100644
index 0000000000..af4368ae98
--- /dev/null
+++ b/doc/source/models/builtin/llm/yi-coder-chat.rst
@@ -0,0 +1,47 @@
+.. _models_llm_yi-coder-chat:
+
+========================================
+yi-coder-chat
+========================================
+
+- **Context Length:** 131072
+- **Model Name:** yi-coder-chat
+- **Languages:** en
+- **Abilities:** chat
+- **Description:** Yi-Coder is a series of open-source code language models that delivers state-of-the-art coding performance with fewer than 10 billion parameters. It excels in long-context understanding with a maximum context length of 128K tokens and supports 52 major programming languages, including popular ones such as Java, Python, JavaScript, and C++.
+
+Specifications
+^^^^^^^^^^^^^^
+
+
+Model Spec 1 (pytorch, 9 Billion)
+++++++++++++++++++++++++++++++++++++++++
+
+- **Model Format:** pytorch
+- **Model Size (in billions):** 9
+- **Quantizations:** none
+- **Engines**: vLLM, Transformers
+- **Model ID:** 01ai/Yi-Coder-9B-Chat
+- **Model Hubs**: `Hugging Face `__, `ModelScope `__
+
+Execute the following command to launch the model, remember to replace ``${quantization}`` with your
+chosen quantization method from the options listed above::
+
+   xinference launch --model-engine ${engine} --model-name yi-coder-chat --size-in-billions 9 --model-format pytorch --quantization ${quantization}
+
+
+Model Spec 2 (pytorch, 1_5 Billion)
+++++++++++++++++++++++++++++++++++++++++
+
+- **Model Format:** pytorch
+- **Model Size (in billions):** 1_5
+- **Quantizations:** none
+- **Engines**: vLLM, Transformers
+- **Model ID:** 01ai/Yi-Coder-1.5B-Chat
+- **Model Hubs**: `Hugging Face `__, `ModelScope `__
+
+Execute the following command to launch the model, remember to replace ``${quantization}`` with your
+chosen quantization method from the options listed above::
+
+   xinference launch --model-engine ${engine} --model-name yi-coder-chat --size-in-billions 1_5 --model-format pytorch --quantization ${quantization}
+
diff --git a/doc/source/models/builtin/llm/yi-coder.rst b/doc/source/models/builtin/llm/yi-coder.rst
new file mode 100644
index 0000000000..347a3bc9d1
--- /dev/null
+++ b/doc/source/models/builtin/llm/yi-coder.rst
@@ -0,0 +1,47 @@
+.. _models_llm_yi-coder:
+
+========================================
+yi-coder
+========================================
+
+- **Context Length:** 131072
+- **Model Name:** yi-coder
+- **Languages:** en
+- **Abilities:** generate
+- **Description:** Yi-Coder is a series of open-source code language models that delivers state-of-the-art coding performance with fewer than 10 billion parameters. It excels in long-context understanding with a maximum context length of 128K tokens and supports 52 major programming languages, including popular ones such as Java, Python, JavaScript, and C++.
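+
+Besides the ``xinference launch`` commands listed under each spec below, the model can also be launched and
+queried programmatically through the Xinference Python client. The snippet that follows is a minimal sketch,
+assuming a supervisor already running at ``http://127.0.0.1:9997`` and the 9 billion parameter pytorch spec
+from the Specifications section below; adjust the endpoint, engine, and generate config to your own deployment::
+
+   from xinference.client import Client
+
+   # Assumes a Xinference supervisor is reachable at this endpoint.
+   client = Client("http://127.0.0.1:9997")
+
+   # Launch the 9B pytorch build; vLLM and Transformers are the engines listed for it.
+   uid = client.launch_model(
+       model_name="yi-coder",
+       model_engine="transformers",
+       model_format="pytorch",
+       model_size_in_billions=9,
+       quantization="none",
+   )
+
+   # yi-coder is a generate-style model, so call generate() rather than chat().
+   completion = client.get_model(uid).generate(
+       "def quicksort(arr):",
+       generate_config={"max_tokens": 128},
+   )
+   print(completion["choices"][0]["text"])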
+ +Specifications +^^^^^^^^^^^^^^ + + +Model Spec 1 (pytorch, 9 Billion) +++++++++++++++++++++++++++++++++++++++++ + +- **Model Format:** pytorch +- **Model Size (in billions):** 9 +- **Quantizations:** none +- **Engines**: vLLM, Transformers +- **Model ID:** 01-ai/Yi-Coder-9B +- **Model Hubs**: `Hugging Face `__, `ModelScope `__ + +Execute the following command to launch the model, remember to replace ``${quantization}`` with your +chosen quantization method from the options listed above:: + + xinference launch --model-engine ${engine} --model-name yi-coder --size-in-billions 9 --model-format pytorch --quantization ${quantization} + + +Model Spec 2 (pytorch, 1_5 Billion) +++++++++++++++++++++++++++++++++++++++++ + +- **Model Format:** pytorch +- **Model Size (in billions):** 1_5 +- **Quantizations:** none +- **Engines**: vLLM, Transformers +- **Model ID:** 01-ai/Yi-Coder-1.5B +- **Model Hubs**: `Hugging Face `__, `ModelScope `__ + +Execute the following command to launch the model, remember to replace ``${quantization}`` with your +chosen quantization method from the options listed above:: + + xinference launch --model-engine ${engine} --model-name yi-coder --size-in-billions 1_5 --model-format pytorch --quantization ${quantization} + diff --git a/doc/source/user_guide/backends.rst b/doc/source/user_guide/backends.rst index 57126871e8..2cbb924e03 100644 --- a/doc/source/user_guide/backends.rst +++ b/doc/source/user_guide/backends.rst @@ -51,7 +51,8 @@ Currently, supported model includes: - ``codestral-v0.1`` - ``Yi``, ``Yi-1.5``, ``Yi-chat``, ``Yi-1.5-chat``, ``Yi-1.5-chat-16k`` - ``code-llama``, ``code-llama-python``, ``code-llama-instruct`` -- ``deepseek``, ``deepseek-coder``, ``deepseek-chat``, ``deepseek-coder-instruct`` +- ``deepseek``, ``deepseek-coder``, ``deepseek-chat``, ``deepseek-coder-instruct``, ``deepseek-v2-chat``, ``deepseek-v2-chat-0628``, ``deepseek-v2.5`` +- ``yi-coder``, ``yi-coder-chat`` - ``codeqwen1.5``, ``codeqwen1.5-chat`` - ``baichuan-2-chat`` - ``internlm2-chat`` @@ -63,6 +64,7 @@ Currently, supported model includes: - ``codegeex4`` - ``qwen1.5-chat``, ``qwen1.5-moe-chat`` - ``qwen2-instruct``, ``qwen2-moe-instruct`` +- ``qwen2.5-instruct`` - ``gemma-it``, ``gemma-2-it`` - ``orion-chat``, ``orion-chat-rag`` - ``c4ai-command-r-v01`` From 5de46e94c23785fa7e17e3e1d00c3afb6cb1c919 Mon Sep 17 00:00:00 2001 From: amumu96 <128140880+amumu96@users.noreply.github.com> Date: Fri, 20 Sep 2024 16:58:06 +0800 Subject: [PATCH 17/17] FEAT: support qwen2.5-coder-instruct and qwen2.5 sglang (#2332) Co-authored-by: wuzhaoxin <15667065080@162.com> --- xinference/model/llm/llm_family.json | 550 ++++++++++++++++- .../model/llm/llm_family_modelscope.json | 565 +++++++++++++++++- xinference/model/llm/sglang/core.py | 4 + xinference/model/llm/vllm/core.py | 4 + 4 files changed, 1062 insertions(+), 61 deletions(-) diff --git a/xinference/model/llm/llm_family.json b/xinference/model/llm/llm_family.json index 70b17daa61..471b4febc3 100644 --- a/xinference/model/llm/llm_family.json +++ b/xinference/model/llm/llm_family.json @@ -6874,7 +6874,7 @@ "model_id":"Qwen/Qwen2-VL-7B-Instruct-GPTQ-Int8", "model_revision":"3d152a77eaccfd72d59baedb0b183a1b8fd56e48" }, - { + { "model_format":"gptq", "model_size_in_billions":7, "quantizations":[ @@ -6883,7 +6883,7 @@ "model_id":"Qwen/Qwen2-VL-7B-Instruct-GPTQ-Int4", "model_revision":"5ab897112fa83b9699826be8753ef9184585c77d" }, - { + { "model_format":"awq", "model_size_in_billions":7, "quantizations":[ @@ -6891,6 +6891,31 @@ ], 
"model_id":"Qwen/Qwen2-VL-7B-Instruct-AWQ", "model_revision":"f94216e8b513933bccd567bcd9b7350199f32538" + }, + { + "model_format":"pytorch", + "model_size_in_billions":72, + "quantizations":[ + "none" + ], + "model_id":"Qwen/Qwen2-VL-72B-Instruct" + }, + { + "model_format":"awq", + "model_size_in_billions":72, + "quantizations":[ + "Int4" + ], + "model_id":"Qwen/Qwen2-VL-72B-Instruct-AWQ" + }, + { + "model_format":"gptq", + "model_size_in_billions":72, + "quantizations":[ + "Int4", + "Int8" + ], + "model_id":"Qwen/Qwen2-VL-72B-Instruct-GPTQ-{quantization}" } ], "prompt_style":{ @@ -7247,7 +7272,99 @@ }, { "version": 1, - "context_length": 131072, + "context_length": 32768, + "model_name": "qwen2.5", + "model_lang": [ + "en", + "zh" + ], + "model_ability": [ + "generate" + ], + "model_description": "Qwen2.5 is the latest series of Qwen large language models. For Qwen2.5, we release a number of base language models and instruction-tuned language models ranging from 0.5 to 72 billion parameters.", + "model_specs": [ + { + "model_format": "pytorch", + "model_size_in_billions": "0_5", + "quantizations": [ + "4-bit", + "8-bit", + "none" + ], + "model_id": "Qwen/Qwen2.5-0.5B", + "model_revision": "2630d3d2321bc1f1878f702166d1b2af019a7310" + }, + { + "model_format": "pytorch", + "model_size_in_billions": "1_5", + "quantizations": [ + "4-bit", + "8-bit", + "none" + ], + "model_id": "Qwen/Qwen2.5-1.5B", + "model_revision": "e5dfabbcffd9b0c7b31d89b82c5a6b72e663f32c" + }, + { + "model_format": "pytorch", + "model_size_in_billions": 3, + "quantizations": [ + "4-bit", + "8-bit", + "none" + ], + "model_id": "Qwen/Qwen2.5-3B", + "model_revision": "e4aa5ac50aa507415cda96cc99eb77ad0a3d2d34" + }, + { + "model_format": "pytorch", + "model_size_in_billions": 7, + "quantizations": [ + "4-bit", + "8-bit", + "none" + ], + "model_id": "Qwen/Qwen2.5-7B", + "model_revision": "09a0bac5707b43ec44508eab308b0846320c1ed4" + }, + { + "model_format": "pytorch", + "model_size_in_billions": 14, + "quantizations": [ + "4-bit", + "8-bit", + "none" + ], + "model_id": "Qwen/Qwen2.5-14B", + "model_revision": "d02b64ba1ce86bf9948668a13f82709600431ccc" + }, + { + "model_format": "pytorch", + "model_size_in_billions": 32, + "quantizations": [ + "4-bit", + "8-bit", + "none" + ], + "model_id": "Qwen/Qwen2.5-32B", + "model_revision": "ff23665d01c3665be5fdb271d18a62090b65c06d" + }, + { + "model_format": "pytorch", + "model_size_in_billions": 72, + "quantizations": [ + "4-bit", + "8-bit", + "none" + ], + "model_id": "Qwen/Qwen2.5-72B", + "model_revision": "587cc4061cf6a7cc0d429d05c109447e5cf063af" + } + ] + }, + { + "version": 1, + "context_length": 32768, "model_name": "qwen2.5-instruct", "model_lang": [ "en", @@ -7459,11 +7576,10 @@ "q5_0", "q5_k_m", "q6_k", - "q8_0", - "fp16" + "q8_0" ], "model_id": "Qwen/Qwen2.5-0.5B-Instruct-GGUF", - "model_file_name_template": "qwen2_5-0_5b-instruct-{quantization}.gguf" + "model_file_name_template": "qwen2.5-0.5b-instruct-{quantization}.gguf" }, { "model_format": "ggufv2", @@ -7476,11 +7592,10 @@ "q5_0", "q5_k_m", "q6_k", - "q8_0", - "fp16" + "q8_0" ], "model_id": "Qwen/Qwen2.5-1.5B-Instruct-GGUF", - "model_file_name_template": "qwen2_5-1_5b-instruct-{quantization}.gguf" + "model_file_name_template": "qwen2.5-1.5b-instruct-{quantization}.gguf" }, { "model_format": "ggufv2", @@ -7493,11 +7608,10 @@ "q5_0", "q5_k_m", "q6_k", - "q8_0", - "fp16" + "q8_0" ], "model_id": "Qwen/Qwen2.5-3B-Instruct-GGUF", - "model_file_name_template": "qwen2_5-3b-instruct-{quantization}.gguf" + "model_file_name_template": 
"qwen2.5-3b-instruct-{quantization}.gguf" }, { "model_format": "ggufv2", @@ -7510,11 +7624,37 @@ "q5_0", "q5_k_m", "q6_k", - "q8_0", - "fp16" + "q8_0" ], "model_id": "Qwen/Qwen2.5-7B-Instruct-GGUF", - "model_file_name_template": "qwen2_5-7b-instruct-{quantization}.gguf" + "model_file_name_template": "qwen2.5-7b-instruct-{quantization}.gguf", + "model_file_name_split_template": "qwen2.5-7b-instruct-{quantization}-{part}.gguf", + "quantization_parts": { + "q4_0": [ + "00001-of-00002", + "00002-of-00002" + ], + "q4_k_m": [ + "00001-of-00002", + "00002-of-00002" + ], + "q5_0": [ + "00001-of-00002", + "00002-of-00002" + ], + "q5_k_m": [ + "00001-of-00002", + "00002-of-00002" + ], + "q6_k": [ + "00001-of-00002", + "00002-of-00002" + ], + "q8_0": [ + "00001-of-00002", + "00002-of-00002" + ] + } }, { "model_format": "ggufv2", @@ -7527,11 +7667,53 @@ "q5_0", "q5_k_m", "q6_k", - "q8_0", - "fp16" + "q8_0" ], "model_id": "Qwen/Qwen2.5-14B-Instruct-GGUF", - "model_file_name_template": "qwen2_5-14b-instruct-{quantization}.gguf" + "model_file_name_template": "qwen2.5-14b-instruct-{quantization}.gguf", + "model_file_name_split_template": "qwen2.5-14b-instruct-{quantization}-{part}.gguf", + "quantization_parts": { + "q2_k": [ + "00001-of-00002", + "00002-of-00002" + ], + "q3_k_m": [ + "00001-of-00002", + "00002-of-00002" + ], + "q4_0": [ + "00001-of-00003", + "00002-of-00003", + "00003-of-00003" + ], + "q4_k_m": [ + "00001-of-00003", + "00002-of-00003", + "00003-of-00003" + ], + "q5_0": [ + "00001-of-00003", + "00002-of-00003", + "00003-of-00003" + ], + "q5_k_m": [ + "00001-of-00003", + "00002-of-00003", + "00003-of-00003" + ], + "q6_k": [ + "00001-of-00004", + "00002-of-00004", + "00003-of-00004", + "00004-of-00004" + ], + "q8_0": [ + "00001-of-00004", + "00002-of-00004", + "00003-of-00004", + "00004-of-00004" + ] + } }, { "model_format": "ggufv2", @@ -7544,11 +7726,76 @@ "q5_0", "q5_k_m", "q6_k", - "q8_0", - "fp16" + "q8_0" ], "model_id": "Qwen/Qwen2.5-32B-Instruct-GGUF", - "model_file_name_template": "qwen2_5-32b-instruct-{quantization}.gguf" + "model_file_name_template": "qwen2_5-32b-instruct-{quantization}.gguf", + "model_file_name_split_template": "qwen2.5-32b-instruct-{quantization}-{part}.gguf", + "quantization_parts": { + "q2_k": [ + "00001-of-00004", + "00002-of-00004", + "00003-of-00004", + "00004-of-00004" + ], + "q3_k_m": [ + "00001-of-00005", + "00002-of-00005", + "00003-of-00005", + "00004-of-00005", + "00005-of-00005" + ], + "q4_0": [ + "00001-of-00005", + "00002-of-00005", + "00003-of-00005", + "00004-of-00005", + "00005-of-00005" + ], + "q4_k_m": [ + "00001-of-00005", + "00002-of-00005", + "00003-of-00005", + "00004-of-00005", + "00005-of-00005" + ], + "q5_0": [ + "00001-of-00006", + "00002-of-00006", + "00003-of-00006", + "00004-of-00006", + "00005-of-00006", + "00006-of-00006" + ], + "q5_k_m": [ + "00001-of-00006", + "00002-of-00006", + "00003-of-00006", + "00004-of-00006", + "00005-of-00006", + "00006-of-00006" + ], + "q6_k": [ + "00001-of-00007", + "00002-of-00007", + "00003-of-00007", + "00004-of-00007", + "00005-of-00007", + "00006-of-00007", + "00007-of-00007" + ], + "q8_0": [ + "00001-of-00009", + "00002-of-00009", + "00003-of-00009", + "00004-of-00009", + "00005-of-00009", + "00006-of-00009", + "00007-of-00009", + "00008-of-00009", + "00009-of-00009" + ] + } }, { "model_format": "ggufv2", @@ -7566,8 +7813,254 @@ ], "model_id": "Qwen/Qwen2.5-72B-Instruct-GGUF", "model_file_name_template": "qwen2_5-72b-instruct-{quantization}.gguf", - "model_file_name_split_template": 
"qwen2_5-72b-instruct-{quantization}-{part}.gguf", + "model_file_name_split_template": "qwen2.5-72b-instruct-{quantization}-{part}.gguf", + "quantization_parts": { + "q2_k": [ + "00001-of-00007", + "00002-of-00007", + "00003-of-00007", + "00004-of-00007", + "00005-of-00007", + "00006-of-00007", + "00007-of-00007" + ], + "q3_k_m": [ + "00001-of-00009", + "00002-of-00009", + "00003-of-00009", + "00004-of-00009", + "00005-of-00009", + "00006-of-00009", + "00007-of-00009", + "00008-of-00009", + "00009-of-00009" + ], + "q4_0": [ + "00001-of-00011", + "00002-of-00011", + "00003-of-00011", + "00004-of-00011", + "00005-of-00011", + "00006-of-00011", + "00007-of-00011", + "00008-of-00011", + "00009-of-00011", + "00010-of-00011", + "00011-of-00011" + ], + "q4_k_m": [ + "00001-of-00012", + "00002-of-00012", + "00003-of-00012", + "00004-of-00012", + "00005-of-00012", + "00006-of-00012", + "00007-of-00012", + "00008-of-00012", + "00009-of-00012", + "00010-of-00012", + "00011-of-00012", + "00012-of-00012" + ], + "q5_0": [ + "00001-of-00013", + "00002-of-00013", + "00003-of-00013", + "00004-of-00013", + "00005-of-00013", + "00006-of-00013", + "00007-of-00013", + "00008-of-00013", + "00009-of-00013", + "00010-of-00013", + "00011-of-00013", + "00012-of-00013", + "00013-of-00013" + ], + "q5_k_m": [ + "00001-of-00014", + "00002-of-00014", + "00003-of-00014", + "00004-of-00014", + "00005-of-00014", + "00006-of-00014", + "00007-of-00014", + "00008-of-00014", + "00009-of-00014", + "00010-of-00014", + "00011-of-00014", + "00012-of-00014", + "00013-of-00014", + "00014-of-00014" + ], + "q6_k": [ + "00001-of-00016", + "00002-of-00016", + "00003-of-00016", + "00004-of-00016", + "00005-of-00016", + "00006-of-00016", + "00007-of-00016", + "00008-of-00016", + "00009-of-00016", + "00010-of-00016", + "00011-of-00016", + "00012-of-00016", + "00013-of-00016", + "00014-of-00016", + "00015-of-00016", + "00016-of-00016" + ], + "q8_0": [ + "00001-of-00021", + "00002-of-00021", + "00003-of-00021", + "00004-of-00021", + "00005-of-00021", + "00006-of-00021", + "00007-of-00021", + "00008-of-00021", + "00009-of-00021", + "00010-of-00021", + "00011-of-00021", + "00012-of-00021", + "00013-of-00021", + "00014-of-00021", + "00015-of-00021", + "00016-of-00021", + "00017-of-00021", + "00018-of-00021", + "00019-of-00021", + "00020-of-00021", + "00021-of-00021" + ] + } + } + ], + "chat_template": "{%- if tools %}\n {{- '<|im_start|>system\\n' }}\n {%- if messages[0]['role'] == 'system' %}\n {{- messages[0]['content'] }}\n {%- else %}\n {{- 'You are Qwen, created by Alibaba Cloud. You are a helpful assistant.' }}\n {%- endif %}\n {{- \"\\n\\n# Tools\\n\\nYou may call one or more functions to assist with the user query.\\n\\nYou are provided with function signatures within XML tags:\\n\" }}\n {%- for tool in tools %}\n {{- \"\\n\" }}\n {{- tool | tojson }}\n {%- endfor %}\n {{- \"\\n\\n\\nFor each function call, return a json object with function name and arguments within XML tags:\\n\\n{{\\\"name\\\": , \\\"arguments\\\": }}\\n<|im_end|>\\n\" }}\n{%- else %}\n {%- if messages[0]['role'] == 'system' %}\n {{- '<|im_start|>system\\n' + messages[0]['content'] + '<|im_end|>\\n' }}\n {%- else %}\n {{- '<|im_start|>system\\nYou are Qwen, created by Alibaba Cloud. 
You are a helpful assistant.<|im_end|>\\n' }}\n {%- endif %}\n{%- endif %}\n{%- for message in messages %}\n {%- if (message.role == \"user\") or (message.role == \"system\" and not loop.first) or (message.role == \"assistant\" and not message.tool_calls) %}\n {{- '<|im_start|>' + message.role + '\\n' + message.content + '<|im_end|>' + '\\n' }}\n {%- elif message.role == \"assistant\" %}\n {{- '<|im_start|>' + message.role }}\n {%- if message.content %}\n {{- '\\n' + message.content }}\n {%- endif %}\n {%- for tool_call in message.tool_calls %}\n {%- if tool_call.function is defined %}\n {%- set tool_call = tool_call.function %}\n {%- endif %}\n {{- '\\n\\n{\"name\": \"' }}\n {{- tool_call.name }}\n {{- '\", \"arguments\": ' }}\n {{- tool_call.arguments | tojson }}\n {{- '}\\n' }}\n {%- endfor %}\n {{- '<|im_end|>\\n' }}\n {%- elif message.role == \"tool\" %}\n {%- if (loop.index0 == 0) or (messages[loop.index0 - 1].role != \"tool\") %}\n {{- '<|im_start|>user' }}\n {%- endif %}\n {{- '\\n\\n' }}\n {{- message.content }}\n {{- '\\n' }}\n {%- if loop.last or (messages[loop.index0 + 1].role != \"tool\") %}\n {{- '<|im_end|>\\n' }}\n {%- endif %}\n {%- endif %}\n{%- endfor %}\n{%- if add_generation_prompt %}\n {{- '<|im_start|>assistant\\n' }}\n{%- endif %}\n", + "stop_token_ids": [ + 151643, + 151644, + 151645 + ], + "stop": [ + "<|endoftext|>", + "<|im_start|>", + "<|im_end|>" + ] + }, + { + "version": 1, + "context_length": 32768, + "model_name": "qwen2.5-coder", + "model_lang": [ + "en", + "zh" + ], + "model_ability": [ + "generate" + ], + "model_description": "Qwen2.5-Coder is the latest series of Code-Specific Qwen large language models (formerly known as CodeQwen).", + "model_specs": [ + { + "model_format": "pytorch", + "model_size_in_billions": "1_5", + "quantizations": [ + "4-bit", + "8-bit", + "none" + ], + "model_id": "Qwen/Qwen2.5-Coder-1.5B", + "model_revision": "d3586cfe793730945f8e4d7ef31032a3ee50247d" + }, + { + "model_format": "pytorch", + "model_size_in_billions": 7, + "quantizations": [ + "4-bit", + "8-bit", + "none" + ], + "model_id": "Qwen/Qwen2.5-Coder-7B", + "model_revision": "30b6a7e874a78d46b80fa1db3194ea427dd41b08" + } + ] + }, + { + "version": 1, + "context_length": 32768, + "model_name": "qwen2.5-coder-instruct", + "model_lang": [ + "en", + "zh" + ], + "model_ability": [ + "chat", + "tools" + ], + "model_description": "Qwen2.5-Coder is the latest series of Code-Specific Qwen large language models (formerly known as CodeQwen).", + "model_specs": [ + { + "model_format": "pytorch", + "model_size_in_billions": "1_5", + "quantizations": [ + "4-bit", + "8-bit", + "none" + ], + "model_id": "Qwen/Qwen2.5-Coder-1.5B-Instruct" + }, + { + "model_format": "pytorch", + "model_size_in_billions": 7, + "quantizations": [ + "4-bit", + "8-bit", + "none" + ], + "model_id": "Qwen/Qwen2.5-Coder-7B-Instruct" + }, + { + "model_format": "ggufv2", + "model_size_in_billions": "1_5", + "quantizations": [ + "q2_k", + "q3_k_m", + "q4_0", + "q4_k_m", + "q5_0", + "q5_k_m", + "q6_k", + "q8_0" + ], + "model_id": "Qwen/Qwen2.5-Coder-1.5B-Instruct-GGUF", + "model_file_name_template": "qwen2.5-coder-1.5b-instruct-{quantization}.gguf" + }, + { + "model_format": "ggufv2", + "model_size_in_billions": 7, + "quantizations": [ + "q2_k", + "q3_k_m", + "q4_0", + "q4_k_m", + "q5_0", + "q5_k_m", + "q6_k", + "q8_0" + ], + "model_id": "Qwen/Qwen2.5-Coder-7B-Instruct-GGUF", + "model_file_name_template": "qwen2.5-coder-7b-instruct-{quantization}.gguf", + "model_file_name_split_template": 
"qwen2.5-coder-7b-instruct-{quantization}-{part}.gguf", "quantization_parts": { + "q4_0": [ + "00001-of-00002", + "00002-of-00002" + ], + "q4_k_m": [ + "00001-of-00002", + "00002-of-00002" + ], "q5_0": [ "00001-of-00002", "00002-of-00002" @@ -7581,19 +8074,14 @@ "00002-of-00002" ], "q8_0": [ - "00001-of-00002", - "00002-of-00002" - ], - "fp16": [ - "00001-of-00004", - "00002-of-00004", - "00003-of-00004", - "00004-of-00004" + "00001-of-00003", + "00002-of-00003", + "00003-of-00003" ] } } ], - "chat_template": "{%- if tools %}\n {{- '<|im_start|>system\\n' }}\n {%- if messages[0]['role'] == 'system' %}\n {{- messages[0]['content'] }}\n {%- else %}\n {{- 'You are Qwen, created by Alibaba Cloud. You are a helpful assistant.' }}\n {%- endif %}\n {{- \"\\n\\n# Tools\\n\\nYou may call one or more functions to assist with the user query.\\n\\nYou are provided with function signatures within XML tags:\\n\" }}\n {%- for tool in tools %}\n {{- \"\\n\" }}\n {{- tool | tojson }}\n {%- endfor %}\n {{- \"\\n\\n\\nFor each function call, return a json object with function name and arguments within XML tags:\\n\\n{{\\\"name\\\": , \\\"arguments\\\": }}\\n<|im_end|>\\n\" }}\n{%- else %}\n {%- if messages[0]['role'] == 'system' %}\n {{- '<|im_start|>system\\n' + messages[0]['content'] + '<|im_end|>\\n' }}\n {%- else %}\n {{- '<|im_start|>system\\nYou are Qwen, created by Alibaba Cloud. You are a helpful assistant.<|im_end|>\\n' }}\n {%- endif %}\n{%- endif %}\n{%- for message in messages %}\n {%- if (message.role == \"user\") or (message.role == \"system\" and not loop.first) or (message.role == \"assistant\" and not message.tool_calls) %}\n {{- '<|im_start|>' + message.role + '\\n' + message.content + '<|im_end|>' + '\\n' }}\n {%- elif message.role == \"assistant\" %}\n {{- '<|im_start|>' + message.role }}\n {%- if message.content %}\n {{- '\\n' + message.content }}\n {%- endif %}\n {%- for tool_call in message.tool_calls %}\n {%- if tool_call.function is defined %}\n {%- set tool_call = tool_call.function %}\n {%- endif %}\n {{- '\\n\\n{\"name\": \"' }}\n {{- tool_call.name }}\n {{- '\", \"arguments\": ' }}\n {{- tool_call.arguments | tojson }}\n {{- '}\\n' }}\n {%- endfor %}\n {{- '<|im_end|>\\n' }}\n {%- elif message.role == \"tool\" %}\n {%- if (loop.index0 == 0) or (messages[loop.index0 - 1].role != \"tool\") %}\n {{- '<|im_start|>user' }}\n {%- endif %}\n {{- '\\n\\n' }}\n {{- message.content }}\n {{- '\\n' }}\n {%- if loop.last or (messages[loop.index0 + 1].role != \"tool\") %}\n {{- '<|im_end|>\\n' }}\n {%- endif %}\n {%- endif %}\n{%- endfor %}\n{%- if add_generation_prompt %}\n {{- '<|im_start|>assistant\\n' }}\n{%- endif %}\n", + "chat_template": "{%- if tools %}\n {{- '<|im_start|>system\\n' }}\n {%- if messages[0]['role'] == 'system' %}\n {{- messages[0]['content'] }}\n {%- else %}\n {{- 'You are a helpful assistant.' 
}}\n {%- endif %}\n {{- \"\\n\\n# Tools\\n\\nYou may call one or more functions to assist with the user query.\\n\\nYou are provided with function signatures within XML tags:\\n\" }}\n {%- for tool in tools %}\n {{- \"\\n\" }}\n {{- tool | tojson }}\n {%- endfor %}\n {{- \"\\n\\n\\nFor each function call, return a json object with function name and arguments within XML tags:\\n\\n{{\\\"name\\\": , \\\"arguments\\\": }}\\n<|im_end|>\\n\" }}\n{%- else %}\n {%- if messages[0]['role'] == 'system' %}\n {{- '<|im_start|>system\\n' + messages[0]['content'] + '<|im_end|>\\n' }}\n {%- else %}\n {{- '<|im_start|>system\\nYou are a helpful assistant.<|im_end|>\\n' }}\n {%- endif %}\n{%- endif %}\n{%- for message in messages %}\n {%- if (message.role == \"user\") or (message.role == \"system\" and not loop.first) or (message.role == \"assistant\" and not message.tool_calls) %}\n {{- '<|im_start|>' + message.role + '\\n' + message.content + '<|im_end|>' + '\\n' }}\n {%- elif message.role == \"assistant\" %}\n {{- '<|im_start|>' + message.role }}\n {%- if message.content %}\n {{- '\\n' + message.content }}\n {%- endif %}\n {%- for tool_call in message.tool_calls %}\n {%- if tool_call.function is defined %}\n {%- set tool_call = tool_call.function %}\n {%- endif %}\n {{- '\\n\\n{\"name\": \"' }}\n {{- tool_call.name }}\n {{- '\", \"arguments\": ' }}\n {{- tool_call.arguments | tojson }}\n {{- '}\\n' }}\n {%- endfor %}\n {{- '<|im_end|>\\n' }}\n {%- elif message.role == \"tool\" %}\n {%- if (loop.index0 == 0) or (messages[loop.index0 - 1].role != \"tool\") %}\n {{- '<|im_start|>user' }}\n {%- endif %}\n {{- '\\n\\n' }}\n {{- message.content }}\n {{- '\\n' }}\n {%- if loop.last or (messages[loop.index0 + 1].role != \"tool\") %}\n {{- '<|im_end|>\\n' }}\n {%- endif %}\n {%- endif %}\n{%- endfor %}\n{%- if add_generation_prompt %}\n {{- '<|im_start|>assistant\\n' }}\n{%- endif %}\n", "stop_token_ids": [ 151643, 151644, diff --git a/xinference/model/llm/llm_family_modelscope.json b/xinference/model/llm/llm_family_modelscope.json index 7309ee9651..daf726e8c7 100644 --- a/xinference/model/llm/llm_family_modelscope.json +++ b/xinference/model/llm/llm_family_modelscope.json @@ -4602,6 +4602,34 @@ "model_hub": "modelscope", "model_id":"qwen/Qwen2-VL-2B-Instruct-AWQ", "model_revision":"master" + }, + { + "model_format":"pytorch", + "model_size_in_billions":72, + "quantizations":[ + "none" + ], + "model_id":"qwen/Qwen2-VL-72B-Instruct", + "model_hub": "modelscope" + }, + { + "model_format":"awq", + "model_size_in_billions":72, + "quantizations":[ + "Int4" + ], + "model_id":"qwen/Qwen2-VL-72B-Instruct-AWQ", + "model_hub": "modelscope" + }, + { + "model_format":"gptq", + "model_size_in_billions":72, + "quantizations":[ + "Int4", + "Int8" + ], + "model_id":"qwen/Qwen2-VL-72B-Instruct-GPTQ-{quantization}", + "model_hub": "modelscope" } ], "prompt_style": { @@ -4960,7 +4988,106 @@ }, { "version": 1, - "context_length": 131072, + "context_length": 32768, + "model_name": "qwen2.5", + "model_lang": [ + "en", + "zh" + ], + "model_ability": [ + "generate" + ], + "model_description": "Qwen2.5 is the latest series of Qwen large language models. 
For Qwen2.5, we release a number of base language models and instruction-tuned language models ranging from 0.5 to 72 billion parameters.", + "model_specs": [ + { + "model_format": "pytorch", + "model_size_in_billions": "0_5", + "quantizations": [ + "4-bit", + "8-bit", + "none" + ], + "model_id": "qwen/Qwen2.5-0.5B", + "model_revision": "master", + "model_hub": "modelscope" + }, + { + "model_format": "pytorch", + "model_size_in_billions": "1_5", + "quantizations": [ + "4-bit", + "8-bit", + "none" + ], + "model_id": "qwen/Qwen2.5-1.5B", + "model_revision": "master", + "model_hub": "modelscope" + }, + { + "model_format": "pytorch", + "model_size_in_billions": 3, + "quantizations": [ + "4-bit", + "8-bit", + "none" + ], + "model_id": "qwen/Qwen2.5-3B", + "model_revision": "master", + "model_hub": "modelscope" + }, + { + "model_format": "pytorch", + "model_size_in_billions": 7, + "quantizations": [ + "4-bit", + "8-bit", + "none" + ], + "model_id": "qwen/Qwen2.5-7B", + "model_revision": "master", + "model_hub": "modelscope" + }, + { + "model_format": "pytorch", + "model_size_in_billions": 14, + "quantizations": [ + "4-bit", + "8-bit", + "none" + ], + "model_id": "qwen/Qwen2.5-14B", + "model_revision": "master", + "model_hub": "modelscope" + }, + { + "model_format": "pytorch", + "model_size_in_billions": 32, + "quantizations": [ + "4-bit", + "8-bit", + "none" + ], + "model_id": "qwen/Qwen2.5-32B", + "model_revision": "master", + "model_hub": "modelscope" + }, + { + "model_format": "pytorch", + "model_size_in_billions": 72, + "quantizations": [ + "4-bit", + "8-bit", + "none" + ], + "model_id": "qwen/Qwen2.5-72B", + "model_revision": "master", + "model_hub": "modelscope" + } + ] + }, + { + "version": 1, + "context_length": 32768, "model_name": "qwen2.5-instruct", "model_lang": [ "en", @@ -5193,11 +5320,10 @@ "q5_0", "q5_k_m", "q6_k", - "q8_0", - "fp16" + "q8_0" ], "model_id": "qwen/Qwen2.5-0.5B-Instruct-GGUF", - "model_file_name_template": "qwen2_5-0_5b-instruct-{quantization}.gguf", + "model_file_name_template": "qwen2.5-0.5b-instruct-{quantization}.gguf", "model_hub": "modelscope" }, { @@ -5211,11 +5337,10 @@ "q5_0", "q5_k_m", "q6_k", - "q8_0", - "fp16" + "q8_0" ], "model_id": "qwen/Qwen2.5-1.5B-Instruct-GGUF", - "model_file_name_template": "qwen2_5-1_5b-instruct-{quantization}.gguf", + "model_file_name_template": "qwen2.5-1.5b-instruct-{quantization}.gguf", "model_hub": "modelscope" }, { @@ -5229,11 +5354,10 @@ "q5_0", "q5_k_m", "q6_k", - "q8_0", - "fp16" + "q8_0" ], "model_id": "qwen/Qwen2.5-3B-Instruct-GGUF", - "model_file_name_template": "qwen2_5-3b-instruct-{quantization}.gguf", + "model_file_name_template": "qwen2.5-3b-instruct-{quantization}.gguf", "model_hub": "modelscope" }, { @@ -5247,12 +5371,38 @@ "q5_0", "q5_k_m", "q6_k", - "q8_0", - "fp16" + "q8_0" ], "model_id": "qwen/Qwen2.5-7B-Instruct-GGUF", "model_file_name_template": "qwen2_5-7b-instruct-{quantization}.gguf", - "model_hub": "modelscope" + "model_hub": "modelscope", + "model_file_name_split_template": "qwen2.5-7b-instruct-{quantization}-{part}.gguf", + "quantization_parts": { + "q4_0": [ + "00001-of-00002", + "00002-of-00002" + ], + "q4_k_m": [ + "00001-of-00002", + "00002-of-00002" + ], + "q5_0": [ + "00001-of-00002", + "00002-of-00002" + ], + "q5_k_m": [ + "00001-of-00002", + "00002-of-00002" + ], + "q6_k": [ + "00001-of-00002", + "00002-of-00002" + ], + "q8_0": [ + "00001-of-00002", + "00002-of-00002" + ] + } }, { "model_format": "ggufv2", @@ -5265,11 +5415,53 @@ "q5_0", "q5_k_m", "q6_k", - "q8_0", - "fp16" + "q8_0" ], 
"model_id": "qwen/Qwen2.5-14B-Instruct-GGUF", - "model_file_name_template": "qwen2_5-14b-instruct-{quantization}.gguf", + "model_file_name_template": "qwen2.5-14b-instruct-{quantization}.gguf", + "model_file_name_split_template": "qwen2.5-14b-instruct-{quantization}-{part}.gguf", + "quantization_parts": { + "q2_k": [ + "00001-of-00002", + "00002-of-00002" + ], + "q3_k_m": [ + "00001-of-00002", + "00002-of-00002" + ], + "q4_0": [ + "00001-of-00003", + "00002-of-00003", + "00003-of-00003" + ], + "q4_k_m": [ + "00001-of-00003", + "00002-of-00003", + "00003-of-00003" + ], + "q5_0": [ + "00001-of-00003", + "00002-of-00003", + "00003-of-00003" + ], + "q5_k_m": [ + "00001-of-00003", + "00002-of-00003", + "00003-of-00003" + ], + "q6_k": [ + "00001-of-00004", + "00002-of-00004", + "00003-of-00004", + "00004-of-00004" + ], + "q8_0": [ + "00001-of-00004", + "00002-of-00004", + "00003-of-00004", + "00004-of-00004" + ] + }, "model_hub": "modelscope" }, { @@ -5283,11 +5475,76 @@ "q5_0", "q5_k_m", "q6_k", - "q8_0", - "fp16" + "q8_0" ], "model_id": "qwen/Qwen2.5-32B-Instruct-GGUF", "model_file_name_template": "qwen2_5-32b-instruct-{quantization}.gguf", + "model_file_name_split_template": "qwen2.5-32b-instruct-{quantization}-{part}.gguf", + "quantization_parts": { + "q2_k": [ + "00001-of-00004", + "00002-of-00004", + "00003-of-00004", + "00004-of-00004" + ], + "q3_k_m": [ + "00001-of-00005", + "00002-of-00005", + "00003-of-00005", + "00004-of-00005", + "00005-of-00005" + ], + "q4_0": [ + "00001-of-00005", + "00002-of-00005", + "00003-of-00005", + "00004-of-00005", + "00005-of-00005" + ], + "q4_k_m": [ + "00001-of-00005", + "00002-of-00005", + "00003-of-00005", + "00004-of-00005", + "00005-of-00005" + ], + "q5_0": [ + "00001-of-00006", + "00002-of-00006", + "00003-of-00006", + "00004-of-00006", + "00005-of-00006", + "00006-of-00006" + ], + "q5_k_m": [ + "00001-of-00006", + "00002-of-00006", + "00003-of-00006", + "00004-of-00006", + "00005-of-00006", + "00006-of-00006" + ], + "q6_k": [ + "00001-of-00007", + "00002-of-00007", + "00003-of-00007", + "00004-of-00007", + "00005-of-00007", + "00006-of-00007", + "00007-of-00007" + ], + "q8_0": [ + "00001-of-00009", + "00002-of-00009", + "00003-of-00009", + "00004-of-00009", + "00005-of-00009", + "00006-of-00009", + "00007-of-00009", + "00008-of-00009", + "00009-of-00009" + ] + }, "model_hub": "modelscope" }, { @@ -5301,40 +5558,288 @@ "q5_0", "q5_k_m", "q6_k", - "q8_0", - "fp16" + "q8_0" ], "model_id": "qwen/Qwen2.5-72B-Instruct-GGUF", "model_hub": "modelscope", "model_file_name_template": "qwen2_5-72b-instruct-{quantization}.gguf", - "model_file_name_split_template": "qwen2_5-72b-instruct-{quantization}-{part}.gguf", + "model_file_name_split_template": "qwen2.5-72b-instruct-{quantization}-{part}.gguf", "quantization_parts": { + "q2_k": [ + "00001-of-00007", + "00002-of-00007", + "00003-of-00007", + "00004-of-00007", + "00005-of-00007", + "00006-of-00007", + "00007-of-00007" + ], + "q3_k_m": [ + "00001-of-00009", + "00002-of-00009", + "00003-of-00009", + "00004-of-00009", + "00005-of-00009", + "00006-of-00009", + "00007-of-00009", + "00008-of-00009", + "00009-of-00009" + ], + "q4_0": [ + "00001-of-00011", + "00002-of-00011", + "00003-of-00011", + "00004-of-00011", + "00005-of-00011", + "00006-of-00011", + "00007-of-00011", + "00008-of-00011", + "00009-of-00011", + "00010-of-00011", + "00011-of-00011" + ], + "q4_k_m": [ + "00001-of-00012", + "00002-of-00012", + "00003-of-00012", + "00004-of-00012", + "00005-of-00012", + "00006-of-00012", + "00007-of-00012", + 
"00008-of-00012", + "00009-of-00012", + "00010-of-00012", + "00011-of-00012", + "00012-of-00012" + ], "q5_0": [ + "00001-of-00013", + "00002-of-00013", + "00003-of-00013", + "00004-of-00013", + "00005-of-00013", + "00006-of-00013", + "00007-of-00013", + "00008-of-00013", + "00009-of-00013", + "00010-of-00013", + "00011-of-00013", + "00012-of-00013", + "00013-of-00013" + ], + "q5_k_m": [ + "00001-of-00014", + "00002-of-00014", + "00003-of-00014", + "00004-of-00014", + "00005-of-00014", + "00006-of-00014", + "00007-of-00014", + "00008-of-00014", + "00009-of-00014", + "00010-of-00014", + "00011-of-00014", + "00012-of-00014", + "00013-of-00014", + "00014-of-00014" + ], + "q6_k": [ + "00001-of-00016", + "00002-of-00016", + "00003-of-00016", + "00004-of-00016", + "00005-of-00016", + "00006-of-00016", + "00007-of-00016", + "00008-of-00016", + "00009-of-00016", + "00010-of-00016", + "00011-of-00016", + "00012-of-00016", + "00013-of-00016", + "00014-of-00016", + "00015-of-00016", + "00016-of-00016" + ], + "q8_0": [ + "00001-of-00021", + "00002-of-00021", + "00003-of-00021", + "00004-of-00021", + "00005-of-00021", + "00006-of-00021", + "00007-of-00021", + "00008-of-00021", + "00009-of-00021", + "00010-of-00021", + "00011-of-00021", + "00012-of-00021", + "00013-of-00021", + "00014-of-00021", + "00015-of-00021", + "00016-of-00021", + "00017-of-00021", + "00018-of-00021", + "00019-of-00021", + "00020-of-00021", + "00021-of-00021" + ] + } + } + ], + "chat_template": "{%- if tools %}\n {{- '<|im_start|>system\\n' }}\n {%- if messages[0]['role'] == 'system' %}\n {{- messages[0]['content'] }}\n {%- else %}\n {{- 'You are Qwen, created by Alibaba Cloud. You are a helpful assistant.' }}\n {%- endif %}\n {{- \"\\n\\n# Tools\\n\\nYou may call one or more functions to assist with the user query.\\n\\nYou are provided with function signatures within XML tags:\\n\" }}\n {%- for tool in tools %}\n {{- \"\\n\" }}\n {{- tool | tojson }}\n {%- endfor %}\n {{- \"\\n\\n\\nFor each function call, return a json object with function name and arguments within XML tags:\\n\\n{{\\\"name\\\": , \\\"arguments\\\": }}\\n<|im_end|>\\n\" }}\n{%- else %}\n {%- if messages[0]['role'] == 'system' %}\n {{- '<|im_start|>system\\n' + messages[0]['content'] + '<|im_end|>\\n' }}\n {%- else %}\n {{- '<|im_start|>system\\nYou are Qwen, created by Alibaba Cloud. 
You are a helpful assistant.<|im_end|>\\n' }}\n {%- endif %}\n{%- endif %}\n{%- for message in messages %}\n {%- if (message.role == \"user\") or (message.role == \"system\" and not loop.first) or (message.role == \"assistant\" and not message.tool_calls) %}\n {{- '<|im_start|>' + message.role + '\\n' + message.content + '<|im_end|>' + '\\n' }}\n {%- elif message.role == \"assistant\" %}\n {{- '<|im_start|>' + message.role }}\n {%- if message.content %}\n {{- '\\n' + message.content }}\n {%- endif %}\n {%- for tool_call in message.tool_calls %}\n {%- if tool_call.function is defined %}\n {%- set tool_call = tool_call.function %}\n {%- endif %}\n {{- '\\n\\n{\"name\": \"' }}\n {{- tool_call.name }}\n {{- '\", \"arguments\": ' }}\n {{- tool_call.arguments | tojson }}\n {{- '}\\n' }}\n {%- endfor %}\n {{- '<|im_end|>\\n' }}\n {%- elif message.role == \"tool\" %}\n {%- if (loop.index0 == 0) or (messages[loop.index0 - 1].role != \"tool\") %}\n {{- '<|im_start|>user' }}\n {%- endif %}\n {{- '\\n\\n' }}\n {{- message.content }}\n {{- '\\n' }}\n {%- if loop.last or (messages[loop.index0 + 1].role != \"tool\") %}\n {{- '<|im_end|>\\n' }}\n {%- endif %}\n {%- endif %}\n{%- endfor %}\n{%- if add_generation_prompt %}\n {{- '<|im_start|>assistant\\n' }}\n{%- endif %}\n", + "stop_token_ids": [ + 151643, + 151644, + 151645 + ], + "stop": [ + "<|endoftext|>", + "<|im_start|>", + "<|im_end|>" + ] + }, + { + "version": 1, + "context_length": 32768, + "model_name": "qwen2.5-coder", + "model_lang": [ + "en", + "zh" + ], + "model_ability": [ + "generate" + ], + "model_description": "Qwen2.5-Coder is the latest series of Code-Specific Qwen large language models (formerly known as CodeQwen).", + "model_specs": [ + { + "model_format": "pytorch", + "model_size_in_billions": "1_5", + "quantizations": [ + "4-bit", + "8-bit", + "none" + ], + "model_id": "qwen/Qwen2.5-Coder-1.5B", + "model_revision": "master", + "model_hub": "modelscope" + }, + { + "model_format": "pytorch", + "model_size_in_billions": 7, + "quantizations": [ + "4-bit", + "8-bit", + "none" + ], + "model_id": "qwen/Qwen2.5-Coder-7B", + "model_revision": "master", + "model_hub": "modelscope" + } + ] + }, + { + "version": 1, + "context_length": 32768, + "model_name": "qwen2.5-coder-instruct", + "model_lang": [ + "en", + "zh" + ], + "model_ability": [ + "chat", + "tools" + ], + "model_description": "Qwen2.5-Coder is the latest series of Code-Specific Qwen large language models (formerly known as CodeQwen).", + "model_specs": [ + { + "model_format": "pytorch", + "model_size_in_billions": "1_5", + "quantizations": [ + "4-bit", + "8-bit", + "none" + ], + "model_id": "qwen/Qwen2.5-Coder-1.5B-Instruct", + "model_revision": "master", + "model_hub": "modelscope" + }, + { + "model_format": "pytorch", + "model_size_in_billions": 7, + "quantizations": [ + "4-bit", + "8-bit", + "none" + ], + "model_id": "qwen/Qwen2.5-Coder-7B-Instruct", + "model_revision": "master", + "model_hub": "modelscope" + }, + { + "model_format": "ggufv2", + "model_size_in_billions": "1_5", + "quantizations": [ + "q2_k", + "q3_k_m", + "q4_0", + "q4_k_m", + "q5_0", + "q5_k_m", + "q6_k", + "q8_0" + ], + "model_hub": "modelscope", + "model_id": "qwen/Qwen2.5-Coder-1.5B-Instruct-GGUF", + "model_file_name_template": "qwen2.5-coder-1.5b-instruct-{quantization}.gguf" + }, + { + "model_format": "ggufv2", + "model_size_in_billions": 7, + "quantizations": [ + "q2_k", + "q3_k_m", + "q4_0", + "q4_k_m", + "q5_0", + "q5_k_m", + "q6_k", + "q8_0" + ], + "model_hub": "modelscope", + "model_id": 
"qwen/Qwen2.5-Coder-7B-Instruct-GGUF", + "model_file_name_template": "qwen2.5-coder-7b-instruct-{quantization}.gguf", + "model_file_name_split_template": "qwen2.5-coder-7b-instruct-{quantization}-{part}.gguf", + "quantization_parts": { + "q4_0": [ "00001-of-00002", "00002-of-00002" ], - "q5_k_m": [ + "q4_k_m": [ "00001-of-00002", "00002-of-00002" ], - "q6_k": [ + "q5_0": [ "00001-of-00002", "00002-of-00002" ], - "q8_0": [ + "q5_k_m": [ "00001-of-00002", "00002-of-00002" ], - "fp16": [ - "00001-of-00004", - "00002-of-00004", - "00003-of-00004", - "00004-of-00004" + "q6_k": [ + "00001-of-00002", + "00002-of-00002" + ], + "q8_0": [ + "00001-of-00003", + "00002-of-00003", + "00003-of-00003" ] } } ], - "chat_template": "{%- if tools %}\n {{- '<|im_start|>system\\n' }}\n {%- if messages[0]['role'] == 'system' %}\n {{- messages[0]['content'] }}\n {%- else %}\n {{- 'You are Qwen, created by Alibaba Cloud. You are a helpful assistant.' }}\n {%- endif %}\n {{- \"\\n\\n# Tools\\n\\nYou may call one or more functions to assist with the user query.\\n\\nYou are provided with function signatures within XML tags:\\n\" }}\n {%- for tool in tools %}\n {{- \"\\n\" }}\n {{- tool | tojson }}\n {%- endfor %}\n {{- \"\\n\\n\\nFor each function call, return a json object with function name and arguments within XML tags:\\n\\n{{\\\"name\\\": , \\\"arguments\\\": }}\\n<|im_end|>\\n\" }}\n{%- else %}\n {%- if messages[0]['role'] == 'system' %}\n {{- '<|im_start|>system\\n' + messages[0]['content'] + '<|im_end|>\\n' }}\n {%- else %}\n {{- '<|im_start|>system\\nYou are Qwen, created by Alibaba Cloud. You are a helpful assistant.<|im_end|>\\n' }}\n {%- endif %}\n{%- endif %}\n{%- for message in messages %}\n {%- if (message.role == \"user\") or (message.role == \"system\" and not loop.first) or (message.role == \"assistant\" and not message.tool_calls) %}\n {{- '<|im_start|>' + message.role + '\\n' + message.content + '<|im_end|>' + '\\n' }}\n {%- elif message.role == \"assistant\" %}\n {{- '<|im_start|>' + message.role }}\n {%- if message.content %}\n {{- '\\n' + message.content }}\n {%- endif %}\n {%- for tool_call in message.tool_calls %}\n {%- if tool_call.function is defined %}\n {%- set tool_call = tool_call.function %}\n {%- endif %}\n {{- '\\n\\n{\"name\": \"' }}\n {{- tool_call.name }}\n {{- '\", \"arguments\": ' }}\n {{- tool_call.arguments | tojson }}\n {{- '}\\n' }}\n {%- endfor %}\n {{- '<|im_end|>\\n' }}\n {%- elif message.role == \"tool\" %}\n {%- if (loop.index0 == 0) or (messages[loop.index0 - 1].role != \"tool\") %}\n {{- '<|im_start|>user' }}\n {%- endif %}\n {{- '\\n\\n' }}\n {{- message.content }}\n {{- '\\n' }}\n {%- if loop.last or (messages[loop.index0 + 1].role != \"tool\") %}\n {{- '<|im_end|>\\n' }}\n {%- endif %}\n {%- endif %}\n{%- endfor %}\n{%- if add_generation_prompt %}\n {{- '<|im_start|>assistant\\n' }}\n{%- endif %}\n", + "chat_template": "{%- if tools %}\n {{- '<|im_start|>system\\n' }}\n {%- if messages[0]['role'] == 'system' %}\n {{- messages[0]['content'] }}\n {%- else %}\n {{- 'You are a helpful assistant.' 
}}\n {%- endif %}\n {{- \"\\n\\n# Tools\\n\\nYou may call one or more functions to assist with the user query.\\n\\nYou are provided with function signatures within XML tags:\\n\" }}\n {%- for tool in tools %}\n {{- \"\\n\" }}\n {{- tool | tojson }}\n {%- endfor %}\n {{- \"\\n\\n\\nFor each function call, return a json object with function name and arguments within XML tags:\\n\\n{{\\\"name\\\": , \\\"arguments\\\": }}\\n<|im_end|>\\n\" }}\n{%- else %}\n {%- if messages[0]['role'] == 'system' %}\n {{- '<|im_start|>system\\n' + messages[0]['content'] + '<|im_end|>\\n' }}\n {%- else %}\n {{- '<|im_start|>system\\nYou are a helpful assistant.<|im_end|>\\n' }}\n {%- endif %}\n{%- endif %}\n{%- for message in messages %}\n {%- if (message.role == \"user\") or (message.role == \"system\" and not loop.first) or (message.role == \"assistant\" and not message.tool_calls) %}\n {{- '<|im_start|>' + message.role + '\\n' + message.content + '<|im_end|>' + '\\n' }}\n {%- elif message.role == \"assistant\" %}\n {{- '<|im_start|>' + message.role }}\n {%- if message.content %}\n {{- '\\n' + message.content }}\n {%- endif %}\n {%- for tool_call in message.tool_calls %}\n {%- if tool_call.function is defined %}\n {%- set tool_call = tool_call.function %}\n {%- endif %}\n {{- '\\n\\n{\"name\": \"' }}\n {{- tool_call.name }}\n {{- '\", \"arguments\": ' }}\n {{- tool_call.arguments | tojson }}\n {{- '}\\n' }}\n {%- endfor %}\n {{- '<|im_end|>\\n' }}\n {%- elif message.role == \"tool\" %}\n {%- if (loop.index0 == 0) or (messages[loop.index0 - 1].role != \"tool\") %}\n {{- '<|im_start|>user' }}\n {%- endif %}\n {{- '\\n\\n' }}\n {{- message.content }}\n {{- '\\n' }}\n {%- if loop.last or (messages[loop.index0 + 1].role != \"tool\") %}\n {{- '<|im_end|>\\n' }}\n {%- endif %}\n {%- endif %}\n{%- endfor %}\n{%- if add_generation_prompt %}\n {{- '<|im_start|>assistant\\n' }}\n{%- endif %}\n", "stop_token_ids": [ 151643, 151644, diff --git a/xinference/model/llm/sglang/core.py b/xinference/model/llm/sglang/core.py index 621b9b0a59..a413f2ad0f 100644 --- a/xinference/model/llm/sglang/core.py +++ b/xinference/model/llm/sglang/core.py @@ -68,6 +68,8 @@ class SGLANGGenerateConfig(TypedDict, total=False): "llama-3.1", "mistral-v0.1", "mixtral-v0.1", + "qwen2.5", + "qwen2.5-coder", ] SGLANG_SUPPORTED_CHAT_MODELS = [ "llama-2-chat", @@ -85,6 +87,8 @@ class SGLANGGenerateConfig(TypedDict, total=False): "deepseek-v2.5", "deepseek-v2-chat", "deepseek-v2-chat-0628", + "qwen2.5-instruct", + "qwen2.5-coder-instruct", ] diff --git a/xinference/model/llm/vllm/core.py b/xinference/model/llm/vllm/core.py index 3aaee0738f..8b28701778 100644 --- a/xinference/model/llm/vllm/core.py +++ b/xinference/model/llm/vllm/core.py @@ -138,7 +138,11 @@ class VLLMGenerateConfig(TypedDict, total=False): VLLM_SUPPORTED_MODELS.append("codeqwen1.5") VLLM_SUPPORTED_CHAT_MODELS.append("codeqwen1.5-chat") VLLM_SUPPORTED_CHAT_MODELS.append("qwen2-instruct") + VLLM_SUPPORTED_MODELS.append("qwen2.5") VLLM_SUPPORTED_CHAT_MODELS.append("qwen2.5-instruct") + VLLM_SUPPORTED_MODELS.append("qwen2.5-coder") + VLLM_SUPPORTED_CHAT_MODELS.append("qwen2.5-coder-instruct") + if VLLM_INSTALLED and vllm.__version__ >= "0.3.2": VLLM_SUPPORTED_CHAT_MODELS.append("gemma-it")
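
With the registry changes above, the Qwen2.5 generate and chat entries become selectable for the vLLM and
SGLang engines in addition to Transformers. As a quick end-to-end check, the newly added
``qwen2.5-coder-instruct`` can be launched and queried through the Python client. The snippet below is a
minimal sketch, assuming a supervisor at ``http://127.0.0.1:9997``, a client version whose ``chat()`` accepts
OpenAI-style messages, and the 7B pytorch spec registered in ``llm_family.json`` above::

   from xinference.client import Client

   client = Client("http://127.0.0.1:9997")

   # "qwen2.5-coder-instruct" is now present in VLLM_SUPPORTED_CHAT_MODELS and
   # SGLANG_SUPPORTED_CHAT_MODELS, so the vLLM and SGLang engines can be chosen for it.
   uid = client.launch_model(
       model_name="qwen2.5-coder-instruct",
       model_engine="vllm",  # or "sglang" / "transformers"
       model_format="pytorch",
       model_size_in_billions=7,
       quantization="none",
   )

   model = client.get_model(uid)
   response = model.chat(
       messages=[{"role": "user", "content": "Write a binary search in Python."}],
       generate_config={"max_tokens": 256},
   )
   print(response["choices"][0]["message"]["content"])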