From 56de933d3cc03761898eff66ec83f71b024f83f6 Mon Sep 17 00:00:00 2001
From: amumu96 <128140880+amumu96@users.noreply.github.com>
Date: Fri, 13 Sep 2024 11:51:31 +0800
Subject: [PATCH 01/17] FEAT: support deepseek-v2 and 2.5 (#2292)
Co-authored-by: wuzhaoxin <15667065080@162.com>
---
xinference/model/llm/__init__.py | 6 +
xinference/model/llm/llm_family.json | 147 ++++++++
.../model/llm/llm_family_modelscope.json | 153 ++++++++
xinference/model/llm/sglang/core.py | 3 +
xinference/model/llm/transformers/core.py | 4 +
.../model/llm/transformers/deepseek_v2.py | 340 ++++++++++++++++++
xinference/model/llm/utils.py | 26 ++
xinference/model/llm/vllm/core.py | 6 +
8 files changed, 685 insertions(+)
create mode 100644 xinference/model/llm/transformers/deepseek_v2.py
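To exercise the new families end to end, here is a minimal usage sketch against the xinference client. The endpoint URL, engine, size, and quantization are illustrative assumptions rather than values fixed by this patch, and the chat call assumes the OpenAI-style messages interface.

# Hypothetical usage sketch (assumptions noted above): launch the 16B pytorch
# spec of the deepseek-v2-chat family registered below and run one chat turn.
from xinference.client import Client

client = Client("http://localhost:9997")  # assumed local supervisor endpoint
model_uid = client.launch_model(
    model_name="deepseek-v2-chat",
    model_engine="transformers",   # assumption: serve via the Transformers backend
    model_format="pytorch",
    model_size_in_billions=16,
    quantization="none",
)
model = client.get_model(model_uid)
print(model.chat([{"role": "user", "content": "Hello"}]))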
diff --git a/xinference/model/llm/__init__.py b/xinference/model/llm/__init__.py
index 1980a4b81f..5a7895eb1a 100644
--- a/xinference/model/llm/__init__.py
+++ b/xinference/model/llm/__init__.py
@@ -136,6 +136,10 @@ def _install():
from .transformers.cogvlm2 import CogVLM2Model
from .transformers.cogvlm2_video import CogVLM2VideoModel
from .transformers.core import PytorchChatModel, PytorchModel
+ from .transformers.deepseek_v2 import (
+ DeepSeekV2PytorchChatModel,
+ DeepSeekV2PytorchModel,
+ )
from .transformers.deepseek_vl import DeepSeekVLChatModel
from .transformers.glm4v import Glm4VModel
from .transformers.intern_vl import InternVLChatModel
@@ -182,6 +186,8 @@ def _install():
MiniCPMV25Model,
MiniCPMV26Model,
Glm4VModel,
+ DeepSeekV2PytorchModel,
+ DeepSeekV2PytorchChatModel,
]
)
if OmniLMMModel: # type: ignore
diff --git a/xinference/model/llm/llm_family.json b/xinference/model/llm/llm_family.json
index 7f428ee005..e997098e65 100644
--- a/xinference/model/llm/llm_family.json
+++ b/xinference/model/llm/llm_family.json
@@ -6946,5 +6946,152 @@
"",
""
]
+ },
+ {
+ "version": 1,
+ "context_length": 128000,
+ "model_name": "deepseek-v2",
+ "model_lang": [
+ "en",
+ "zh"
+ ],
+ "model_ability": [
+ "generate"
+ ],
+      "model_description": "DeepSeek-V2 is a strong Mixture-of-Experts (MoE) language model characterized by economical training and efficient inference.",
+ "model_specs": [
+ {
+ "model_format": "pytorch",
+ "model_size_in_billions": 16,
+ "quantizations": [
+ "4-bit",
+ "8-bit",
+ "none"
+ ],
+ "model_id": "deepseek-ai/DeepSeek-V2-Lite",
+ "model_revision": "604d5664dddd88a0433dbae533b7fe9472482de0"
+ },
+ {
+ "model_format": "pytorch",
+ "model_size_in_billions": 236,
+ "quantizations": [
+ "4-bit",
+ "8-bit",
+ "none"
+ ],
+ "model_id": "deepseek-ai/DeepSeek-V2",
+ "model_revision": "4461458f186c35188585855f28f77af5661ad489"
+ }
+ ]
+ },
+ {
+ "version": 1,
+ "context_length": 128000,
+ "model_name": "deepseek-v2-chat",
+ "model_lang": [
+ "en",
+ "zh"
+ ],
+ "model_ability": [
+ "chat"
+ ],
+      "model_description": "DeepSeek-V2 is a strong Mixture-of-Experts (MoE) language model characterized by economical training and efficient inference.",
+ "model_specs": [
+ {
+ "model_format": "pytorch",
+ "model_size_in_billions": 16,
+ "quantizations": [
+ "4-bit",
+ "8-bit",
+ "none"
+ ],
+ "model_id": "deepseek-ai/DeepSeek-V2-Lite-Chat",
+ "model_revision": "85864749cd611b4353ce1decdb286193298f64c7"
+ },
+ {
+ "model_format": "pytorch",
+ "model_size_in_billions": 236,
+ "quantizations": [
+ "4-bit",
+ "8-bit",
+ "none"
+ ],
+ "model_id": "deepseek-ai/DeepSeek-V2-Chat",
+ "model_revision": "8e3f5f6c2226787e41ba3e9283a06389d178c926"
+ }
+ ],
+ "chat_template": "{% if not add_generation_prompt is defined %}{% set add_generation_prompt = false %}{% endif %}{{ '<|begin▁of▁sentence|>' }}{% for message in messages %}{% if message['role'] == 'user' %}{{ 'User: ' + message['content'] + '\n\n' }}{% elif message['role'] == 'assistant' %}{{ 'Assistant: ' + message['content'] + '<|end▁of▁sentence|>' }}{% elif message['role'] == 'system' %}{{ message['content'] + '\n\n' }}{% endif %}{% endfor %}{% if add_generation_prompt %}{{ 'Assistant:' }}{% endif %}",
+ "stop_token_ids": [
+ 100001
+ ],
+ "stop": [
+ "<|end▁of▁sentence|>"
+ ]
+ },
+ {
+ "version": 1,
+ "context_length": 128000,
+ "model_name": "deepseek-v2-chat-0628",
+ "model_lang": [
+ "en",
+ "zh"
+ ],
+ "model_ability": [
+ "chat"
+ ],
+      "model_description": "DeepSeek-V2-Chat-0628 is an improved version of DeepSeek-V2-Chat.",
+ "model_specs": [
+ {
+ "model_format": "pytorch",
+ "model_size_in_billions": 236,
+ "quantizations": [
+ "4-bit",
+ "8-bit",
+ "none"
+ ],
+ "model_id": "deepseek-ai/DeepSeek-V2-Chat-0628",
+ "model_revision": "5d09e272c2b223830f4e84359cd9dd047a5d7c78"
+ }
+ ],
+ "chat_template": "{% if not add_generation_prompt is defined %}{% set add_generation_prompt = false %}{% endif %}{{ '<|begin▁of▁sentence|>' }}{% for message in messages %}{% if message['role'] == 'user' %}{{ '<|User|>' + message['content'] }}{% elif message['role'] == 'assistant' %}{{ '<|Assistant|>' + message['content'] + '<|end▁of▁sentence|>' }}{% elif message['role'] == 'system' %}{{ message['content'] + '\n\n' }}{% endif %}{% endfor %}{% if add_generation_prompt %}{{ '<|Assistant|>' }}{% endif %}",
+ "stop_token_ids": [
+ 100001
+ ],
+ "stop": [
+ "<|end▁of▁sentence|>"
+ ]
+ },
+ {
+ "version": 1,
+ "context_length": 128000,
+ "model_name": "deepseek-v2.5",
+ "model_lang": [
+ "en",
+ "zh"
+ ],
+ "model_ability": [
+ "chat"
+ ],
+ "model_description": "DeepSeek-V2.5 is an upgraded version that combines DeepSeek-V2-Chat and DeepSeek-Coder-V2-Instruct. The new model integrates the general and coding abilities of the two previous versions.",
+ "model_specs": [
+ {
+ "model_format": "pytorch",
+ "model_size_in_billions": 236,
+ "quantizations": [
+ "4-bit",
+ "8-bit",
+ "none"
+ ],
+ "model_id": "deepseek-ai/DeepSeek-V2.5",
+ "model_revision": "24b08cb750e0c2757de112d2e16327cb21ed4833"
+ }
+ ],
+ "chat_template": "{% if not add_generation_prompt is defined %}{% set add_generation_prompt = false %}{% endif %}{% set ns = namespace(is_first=false, is_tool=false, is_output_first=true, system_prompt='') %}{%- for message in messages %} {%- if message['role'] == 'system' %} {% set ns.system_prompt = message['content'] %} {%- endif %}{%- endfor %}{{'<|begin▁of▁sentence|>'}}{{ns.system_prompt}}{%- for message in messages %} {%- if message['role'] == 'user' %} {%- set ns.is_tool = false -%}{{'<|User|>' + message['content']}} {%- endif %} {%- if message['role'] == 'assistant' and message['content'] is none %} {%- set ns.is_tool = false -%} {%- for tool in message['tool_calls']%} {%- if not ns.is_first %}{{'<|Assistant|><|tool▁calls▁begin|><|tool▁call▁begin|>' + tool['type'] + '<|tool▁sep|>' + tool['function']['name'] + '\\n' + '```json' + '\\n' + tool['function']['arguments'] + '\\n' + '```' + '<|tool▁call▁end|>'}} {%- set ns.is_first = true -%} {%- else %}{{'\\n' + '<|tool▁call▁begin|>' + tool['type'] + '<|tool▁sep|>' + tool['function']['name'] + '\\n' + '```json' + '\\n' + tool['function']['arguments'] + '\\n' + '```' + '<|tool▁call▁end|>'}}{{'<|tool▁calls▁end|><|end▁of▁sentence|>'}} {%- endif %} {%- endfor %} {%- endif %} {%- if message['role'] == 'assistant' and message['content'] is not none %} {%- if ns.is_tool %}{{'<|tool▁outputs▁end|>' + message['content'] + '<|end▁of▁sentence|>'}} {%- set ns.is_tool = false -%} {%- else %}{{'<|Assistant|>' + message['content'] + '<|end▁of▁sentence|>'}} {%- endif %} {%- endif %} {%- if message['role'] == 'tool' %} {%- set ns.is_tool = true -%} {%- if ns.is_output_first %}{{'<|tool▁outputs▁begin|><|tool▁output▁begin|>' + message['content'] + '<|tool▁output▁end|>'}} {%- set ns.is_output_first = false %} {%- else %}{{'\\n<|tool▁output▁begin|>' + message['content'] + '<|tool▁output▁end|>'}} {%- endif %} {%- endif %}{%- endfor -%}{% if ns.is_tool %}{{'<|tool▁outputs▁end|>'}}{% endif %}{% if add_generation_prompt and not ns.is_tool %}{{'<|Assistant|>'}}{% endif %}",
+ "stop_token_ids": [
+ 100001
+ ],
+ "stop": [
+ "<|end▁of▁sentence|>"
+ ]
}
]
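The chat_template strings above are Jinja2 templates that xinference renders against an OpenAI-style messages list when building the prompt. Below is a standalone sketch with plain jinja2; the template is an abbreviated copy of the deepseek-v2-chat template (system/user/assistant branches only), so the full string in llm_family.json remains the source of truth.

from jinja2 import Template

# Abbreviated deepseek-v2-chat template; see the chat_template entry above for
# the complete version that xinference actually renders.
chat_template = (
    "{{ '<|begin▁of▁sentence|>' }}"
    "{% for message in messages %}"
    "{% if message['role'] == 'user' %}{{ 'User: ' + message['content'] + '\n\n' }}"
    "{% elif message['role'] == 'assistant' %}{{ 'Assistant: ' + message['content'] + '<|end▁of▁sentence|>' }}"
    "{% elif message['role'] == 'system' %}{{ message['content'] + '\n\n' }}"
    "{% endif %}"
    "{% endfor %}"
    "{% if add_generation_prompt %}{{ 'Assistant:' }}{% endif %}"
)

messages = [
    {"role": "system", "content": "You are a helpful assistant."},
    {"role": "user", "content": "What is 1 + 1?"},
]
print(Template(chat_template).render(messages=messages, add_generation_prompt=True))
# <|begin▁of▁sentence|>You are a helpful assistant.
#
# User: What is 1 + 1?
#
# Assistant:

This is effectively the prompt string that get_full_context() builds in the new transformers/deepseek_v2.py before tokenization.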
diff --git a/xinference/model/llm/llm_family_modelscope.json b/xinference/model/llm/llm_family_modelscope.json
index eb24dd8180..f4386e85fa 100644
--- a/xinference/model/llm/llm_family_modelscope.json
+++ b/xinference/model/llm/llm_family_modelscope.json
@@ -4655,5 +4655,158 @@
"",
""
]
+ },
+ {
+ "version": 1,
+ "context_length": 128000,
+ "model_name": "deepseek-v2",
+ "model_lang": [
+ "en",
+ "zh"
+ ],
+ "model_ability": [
+        "generate"
+ ],
+      "model_description": "DeepSeek-V2 is a strong Mixture-of-Experts (MoE) language model characterized by economical training and efficient inference.",
+ "model_specs": [
+ {
+ "model_format": "pytorch",
+ "model_size_in_billions": 16,
+ "quantizations": [
+ "4-bit",
+ "8-bit",
+ "none"
+ ],
+ "model_id": "deepseek-ai/DeepSeek-V2-Lite",
+ "model_hub": "modelscope",
+ "model_revision": "master"
+ },
+ {
+ "model_format": "pytorch",
+ "model_size_in_billions": 236,
+ "quantizations": [
+ "4-bit",
+ "8-bit",
+ "none"
+ ],
+ "model_id": "deepseek-ai/DeepSeek-V2",
+ "model_hub": "modelscope",
+ "model_revision": "master"
+ }
+ ]
+ },
+ {
+ "version": 1,
+ "context_length": 128000,
+ "model_name": "deepseek-v2-chat",
+ "model_lang": [
+ "en",
+ "zh"
+ ],
+ "model_ability": [
+ "chat"
+ ],
+      "model_description": "DeepSeek-V2 is a strong Mixture-of-Experts (MoE) language model characterized by economical training and efficient inference.",
+ "model_specs": [
+ {
+ "model_format": "pytorch",
+ "model_size_in_billions": 16,
+ "quantizations": [
+ "4-bit",
+ "8-bit",
+ "none"
+ ],
+ "model_id": "deepseek-ai/DeepSeek-V2-Lite-Chat",
+ "model_hub": "modelscope",
+ "model_revision": "master"
+ },
+ {
+ "model_format": "pytorch",
+ "model_size_in_billions": 236,
+ "quantizations": [
+ "4-bit",
+ "8-bit",
+ "none"
+ ],
+ "model_id": "deepseek-ai/DeepSeek-V2-Chat",
+ "model_hub": "modelscope",
+ "model_revision": "master"
+ }
+ ],
+ "chat_template": "{% if not add_generation_prompt is defined %}{% set add_generation_prompt = false %}{% endif %}{{ '<|begin▁of▁sentence|>' }}{% for message in messages %}{% if message['role'] == 'user' %}{{ 'User: ' + message['content'] + '\n\n' }}{% elif message['role'] == 'assistant' %}{{ 'Assistant: ' + message['content'] + '<|end▁of▁sentence|>' }}{% elif message['role'] == 'system' %}{{ message['content'] + '\n\n' }}{% endif %}{% endfor %}{% if add_generation_prompt %}{{ 'Assistant:' }}{% endif %}",
+ "stop_token_ids": [
+ 100001
+ ],
+ "stop": [
+ "<|end▁of▁sentence|>"
+ ]
+ },
+ {
+ "version": 1,
+ "context_length": 128000,
+ "model_name": "deepseek-v2-chat-0628",
+ "model_lang": [
+ "en",
+ "zh"
+ ],
+ "model_ability": [
+ "chat"
+ ],
+      "model_description": "DeepSeek-V2-Chat-0628 is an improved version of DeepSeek-V2-Chat.",
+ "model_specs": [
+ {
+ "model_format": "pytorch",
+ "model_size_in_billions": 236,
+ "quantizations": [
+ "4-bit",
+ "8-bit",
+ "none"
+ ],
+ "model_id": "deepseek-ai/DeepSeek-V2-Chat-0628",
+ "model_hub": "modelscope",
+ "model_revision": "master"
+ }
+ ],
+ "chat_template": "{% if not add_generation_prompt is defined %}{% set add_generation_prompt = false %}{% endif %}{{ '<|begin▁of▁sentence|>' }}{% for message in messages %}{% if message['role'] == 'user' %}{{ '<|User|>' + message['content'] }}{% elif message['role'] == 'assistant' %}{{ '<|Assistant|>' + message['content'] + '<|end▁of▁sentence|>' }}{% elif message['role'] == 'system' %}{{ message['content'] + '\n\n' }}{% endif %}{% endfor %}{% if add_generation_prompt %}{{ '<|Assistant|>' }}{% endif %}",
+ "stop_token_ids": [
+ 100001
+ ],
+ "stop": [
+ "<|end▁of▁sentence|>"
+ ]
+ },
+ {
+ "version": 1,
+ "context_length": 128000,
+ "model_name": "deepseek-v2.5",
+ "model_lang": [
+ "en",
+ "zh"
+ ],
+ "model_ability": [
+ "chat"
+ ],
+ "model_description": "DeepSeek-V2.5 is an upgraded version that combines DeepSeek-V2-Chat and DeepSeek-Coder-V2-Instruct. The new model integrates the general and coding abilities of the two previous versions.",
+ "model_specs": [
+ {
+ "model_format": "pytorch",
+ "model_size_in_billions": 236,
+ "quantizations": [
+ "4-bit",
+ "8-bit",
+ "none"
+ ],
+ "model_id": "deepseek-ai/DeepSeek-V2.5",
+ "model_hub": "modelscope",
+ "model_revision": "master"
+ }
+ ],
+ "chat_template": "{% if not add_generation_prompt is defined %}{% set add_generation_prompt = false %}{% endif %}{% set ns = namespace(is_first=false, is_tool=false, is_output_first=true, system_prompt='') %}{%- for message in messages %} {%- if message['role'] == 'system' %} {% set ns.system_prompt = message['content'] %} {%- endif %}{%- endfor %}{{'<|begin▁of▁sentence|>'}}{{ns.system_prompt}}{%- for message in messages %} {%- if message['role'] == 'user' %} {%- set ns.is_tool = false -%}{{'<|User|>' + message['content']}} {%- endif %} {%- if message['role'] == 'assistant' and message['content'] is none %} {%- set ns.is_tool = false -%} {%- for tool in message['tool_calls']%} {%- if not ns.is_first %}{{'<|Assistant|><|tool▁calls▁begin|><|tool▁call▁begin|>' + tool['type'] + '<|tool▁sep|>' + tool['function']['name'] + '\\n' + '```json' + '\\n' + tool['function']['arguments'] + '\\n' + '```' + '<|tool▁call▁end|>'}} {%- set ns.is_first = true -%} {%- else %}{{'\\n' + '<|tool▁call▁begin|>' + tool['type'] + '<|tool▁sep|>' + tool['function']['name'] + '\\n' + '```json' + '\\n' + tool['function']['arguments'] + '\\n' + '```' + '<|tool▁call▁end|>'}}{{'<|tool▁calls▁end|><|end▁of▁sentence|>'}} {%- endif %} {%- endfor %} {%- endif %} {%- if message['role'] == 'assistant' and message['content'] is not none %} {%- if ns.is_tool %}{{'<|tool▁outputs▁end|>' + message['content'] + '<|end▁of▁sentence|>'}} {%- set ns.is_tool = false -%} {%- else %}{{'<|Assistant|>' + message['content'] + '<|end▁of▁sentence|>'}} {%- endif %} {%- endif %} {%- if message['role'] == 'tool' %} {%- set ns.is_tool = true -%} {%- if ns.is_output_first %}{{'<|tool▁outputs▁begin|><|tool▁output▁begin|>' + message['content'] + '<|tool▁output▁end|>'}} {%- set ns.is_output_first = false %} {%- else %}{{'\\n<|tool▁output▁begin|>' + message['content'] + '<|tool▁output▁end|>'}} {%- endif %} {%- endif %}{%- endfor -%}{% if ns.is_tool %}{{'<|tool▁outputs▁end|>'}}{% endif %}{% if add_generation_prompt and not ns.is_tool %}{{'<|Assistant|>'}}{% endif %}",
+ "stop_token_ids": [
+ 100001
+ ],
+ "stop": [
+ "<|end▁of▁sentence|>"
+ ]
}
]
diff --git a/xinference/model/llm/sglang/core.py b/xinference/model/llm/sglang/core.py
index 578252324d..621b9b0a59 100644
--- a/xinference/model/llm/sglang/core.py
+++ b/xinference/model/llm/sglang/core.py
@@ -82,6 +82,9 @@ class SGLANGGenerateConfig(TypedDict, total=False):
"mixtral-instruct-v0.1",
"gemma-it",
"gemma-2-it",
+ "deepseek-v2.5",
+ "deepseek-v2-chat",
+ "deepseek-v2-chat-0628",
]
diff --git a/xinference/model/llm/transformers/core.py b/xinference/model/llm/transformers/core.py
index 32419a56f1..a451b7accd 100644
--- a/xinference/model/llm/transformers/core.py
+++ b/xinference/model/llm/transformers/core.py
@@ -65,6 +65,10 @@
"MiniCPM-V-2.6",
"glm-4v",
"qwen2-vl-instruct",
+ "deepseek-v2",
+ "deepseek-v2-chat",
+ "deepseek-v2.5",
+ "deepseek-v2-chat-0628",
]
diff --git a/xinference/model/llm/transformers/deepseek_v2.py b/xinference/model/llm/transformers/deepseek_v2.py
new file mode 100644
index 0000000000..b6ce2b5e04
--- /dev/null
+++ b/xinference/model/llm/transformers/deepseek_v2.py
@@ -0,0 +1,340 @@
+# Copyright 2022-2023 XProbe Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import logging
+import uuid
+from typing import Dict, Iterator, List, Optional, Union
+
+import torch
+
+from ....types import (
+ ChatCompletion,
+ ChatCompletionChunk,
+ Completion,
+ CompletionChunk,
+ PytorchGenerateConfig,
+)
+from ..llm_family import LLMFamilyV1, LLMSpecV1
+from ..utils import (
+ generate_chat_completion,
+ generate_completion,
+ generate_completion_chunk,
+)
+from .core import PytorchChatModel, PytorchModel
+
+logger = logging.getLogger(__name__)
+
+
+class DeepSeekV2PytorchModel(PytorchModel):
+ def _load_model(self, **kwargs):
+ try:
+ from transformers import (
+ AutoModelForCausalLM,
+ AutoTokenizer,
+ GenerationConfig,
+ )
+ except ImportError:
+ error_message = "Failed to import module 'transformers'"
+ installation_guide = [
+ "Please make sure 'transformers' is installed. ",
+ "You can install it by `pip install transformers`\n",
+ ]
+
+ raise ImportError(f"{error_message}\n\n{''.join(installation_guide)}")
+
+ tokenizer = AutoTokenizer.from_pretrained(
+ self.model_path,
+ trust_remote_code=kwargs["trust_remote_code"],
+ )
+ model = AutoModelForCausalLM.from_pretrained(
+ self.model_path,
+ attn_implementation="eager",
+ torch_dtype=torch.bfloat16,
+ trust_remote_code=True,
+ device_map="auto",
+ )
+ model.generation_config = GenerationConfig.from_pretrained(self.model_path)
+ model.generation_config.pad_token_id = model.generation_config.eos_token_id
+ return model, tokenizer
+
+ @classmethod
+ def match(
+ cls, llm_family: "LLMFamilyV1", llm_spec: "LLMSpecV1", quantization: str
+ ) -> bool:
+ if llm_spec.model_format != "pytorch":
+ return False
+ model_family = llm_family.model_family or llm_family.model_name
+ if "deepseek-v2" not in model_family:
+ return False
+ if "generate" not in llm_family.model_ability:
+ return False
+ return True
+
+ def generate(
+ self, prompt: str, generate_config: Optional[PytorchGenerateConfig] = None
+ ) -> Union[Completion, Iterator[CompletionChunk]]:
+ input_tensor = self._tokenizer(prompt, return_tensors="pt")
+ generate_config = self._sanitize_generate_config(generate_config)
+ default_generate_config = self._model.generation_config
+ generate_kwargs = {
+ "input_ids": input_tensor["input_ids"].cuda(),
+ "attention_mask": input_tensor["attention_mask"].cuda(),
+ "temperature": float(
+ generate_config.get("temperature", default_generate_config.temperature)
+ ),
+ "repetition_penalty": float(generate_config.get("repetition_penalty", 1.0)),
+ "top_p": float(generate_config.get("top_p", default_generate_config.top_p)),
+ "top_k": int(generate_config.get("top_k", -1)),
+ "max_new_tokens": generate_config.get("max_tokens", 512),
+ "bos_token_id": default_generate_config.bos_token_id,
+ "do_sample": default_generate_config.do_sample,
+ "eos_token_id": default_generate_config.eos_token_id,
+ }
+
+ stream = generate_config.get("stream", False)
+ if stream:
+ return self._generate_stream(generate_kwargs, input_tensor)
+ else:
+ return self._generate(generate_kwargs, input_tensor)
+
+ def _generate(self, generate_kwargs, input_ids) -> Completion:
+ prompt_tokens = len(input_ids[0])
+ logger.info(f"generate_kwargs:{generate_kwargs}")
+ generation_output = self._model.generate(**generate_kwargs)
+ completion_tokens = len(generation_output[0])
+ response = self._tokenizer.decode(
+ generation_output[0], skip_special_tokens=True
+ )
+ return generate_completion(
+ self.model_uid,
+ response,
+ prompt_tokens=prompt_tokens,
+ completion_tokens=completion_tokens,
+ total_tokens=prompt_tokens + completion_tokens,
+ )
+
+ def _generate_stream(self, generate_kwargs, input_ids):
+ from threading import Thread
+
+ from transformers import TextIteratorStreamer
+
+ # Initialize the streamer
+ streamer = TextIteratorStreamer(
+ self._tokenizer, skip_prompt=True, skip_special_tokens=True, timeout=10
+ )
+ # Define the generation configuration
+ generate_kwargs["streamer"] = streamer
+ # Start the model chat in a separate thread
+ thread = Thread(
+ target=self._model.generate,
+ kwargs=generate_kwargs,
+ )
+ thread.start()
+
+ completion_id = str(uuid.uuid1())
+ prompt_tokens = len(input_ids[0])
+ total_tokens, completion_tokens = 0, 0
+ # Loop through the streamer to get the new text as it is generated
+ for i, new_text in enumerate(streamer):
+ completion_tokens = i
+ total_tokens = prompt_tokens + completion_tokens
+ yield generate_completion_chunk(
+ chunk_text=new_text,
+ finish_reason=None,
+ chunk_id=completion_id,
+ model_uid=self.model_uid,
+ prompt_tokens=prompt_tokens,
+ completion_tokens=completion_tokens,
+ total_tokens=total_tokens,
+ )
+ yield generate_completion_chunk(
+ chunk_text=None,
+ finish_reason="stop",
+ chunk_id=completion_id,
+ model_uid=self.model_uid,
+ prompt_tokens=prompt_tokens,
+ completion_tokens=completion_tokens,
+ total_tokens=total_tokens,
+ has_choice=True,
+ has_content=False,
+ )
+
+
+class DeepSeekV2PytorchChatModel(PytorchChatModel):
+ def _load_model(self, **kwargs):
+ try:
+ from transformers import (
+ AutoModelForCausalLM,
+ AutoTokenizer,
+ GenerationConfig,
+ )
+ except ImportError:
+ error_message = "Failed to import module 'transformers'"
+ installation_guide = [
+ "Please make sure 'transformers' is installed. ",
+ "You can install it by `pip install transformers`\n",
+ ]
+
+ raise ImportError(f"{error_message}\n\n{''.join(installation_guide)}")
+
+ tokenizer = AutoTokenizer.from_pretrained(
+ self.model_path,
+ trust_remote_code=kwargs["trust_remote_code"],
+ )
+ logger.info(f"kwargs:{kwargs}")
+ model = AutoModelForCausalLM.from_pretrained(
+ self.model_path,
+ attn_implementation="eager",
+ torch_dtype=torch.bfloat16,
+ trust_remote_code=True,
+ device_map="auto",
+ )
+ model.generation_config = GenerationConfig.from_pretrained(self.model_path)
+ model.generation_config.pad_token_id = model.generation_config.eos_token_id
+ return model, tokenizer
+
+ @classmethod
+ def match(
+ cls, llm_family: "LLMFamilyV1", llm_spec: "LLMSpecV1", quantization: str
+ ) -> bool:
+ if llm_spec.model_format != "pytorch":
+ return False
+ model_family = llm_family.model_family or llm_family.model_name
+ if "deepseek-v2" not in model_family:
+ return False
+ if "chat" not in llm_family.model_ability:
+ return False
+ return True
+
+ def chat(
+ self,
+ messages: List[Dict],
+ generate_config: Optional[PytorchGenerateConfig] = None,
+ ) -> Union[ChatCompletion, Iterator[ChatCompletionChunk]]:
+ assert self.model_family.chat_template is not None
+ full_prompt = self.get_full_context(
+ messages,
+ self.model_family.chat_template,
+ tokenizer=self._tokenizer,
+ )
+ input_tensor = self._tokenizer.encode(
+ full_prompt,
+ padding=False,
+ truncation=False,
+ max_length=None,
+ add_special_tokens=False,
+ return_tensors="pt",
+ )
+
+ generate_config = self._sanitize_generate_config(generate_config)
+ default_generate_config = self._model.generation_config
+ generate_kwargs = {
+ "input_ids": input_tensor.cuda(),
+ "temperature": float(
+ generate_config.get("temperature", default_generate_config.temperature)
+ ),
+ "repetition_penalty": float(generate_config.get("repetition_penalty", 1.0)),
+ "top_p": float(generate_config.get("top_p", default_generate_config.top_p)),
+ "top_k": int(generate_config.get("top_k", -1)),
+ "max_new_tokens": generate_config.get("max_tokens", 512),
+ "bos_token_id": default_generate_config.bos_token_id,
+ "do_sample": default_generate_config.do_sample,
+ "eos_token_id": default_generate_config.eos_token_id,
+ }
+
+ stream = generate_config.get("stream", False)
+ stream_options = generate_config.get("stream_options", None)
+ include_usage = (
+ stream_options["include_usage"]
+ if isinstance(stream_options, dict)
+ else False
+ )
+ if stream:
+ chunk = self._generate_stream(generate_kwargs, input_tensor, include_usage)
+ return self._to_chat_completion_chunks(chunk)
+ else:
+ return self._generate(generate_kwargs, input_tensor)
+
+ def _generate(self, generate_kwargs, input_ids) -> ChatCompletion:
+ prompt_tokens = len(input_ids[0])
+ generation_output = self._model.generate(**generate_kwargs)
+        completion_tokens = len(generation_output[0]) - prompt_tokens
+ response = self._tokenizer.decode(
+ generation_output[0][input_ids.shape[1] :], skip_special_tokens=True
+ )
+ return generate_chat_completion(
+ self.model_uid,
+ response,
+ prompt_tokens=prompt_tokens,
+ completion_tokens=completion_tokens,
+ total_tokens=prompt_tokens + completion_tokens,
+ )
+
+ def _generate_stream(self, generate_kwargs, input_ids, include_usage):
+ from threading import Thread
+
+ from transformers import TextIteratorStreamer
+
+ # Initialize the streamer
+ streamer = TextIteratorStreamer(
+ self._tokenizer, skip_prompt=True, skip_special_tokens=True, timeout=10
+ )
+ # Define the generation configuration
+ generate_kwargs["streamer"] = streamer
+ # Start the model chat in a separate thread
+ thread = Thread(
+ target=self._model.generate,
+ kwargs=generate_kwargs,
+ )
+ thread.start()
+
+ completion_id = str(uuid.uuid1())
+ prompt_tokens = len(input_ids[0])
+ total_tokens, completion_tokens = 0, 0
+ # Loop through the streamer to get the new text as it is generated
+ for i, new_text in enumerate(streamer):
+ completion_tokens = max(completion_tokens, len(streamer.token_cache))
+ total_tokens = prompt_tokens + completion_tokens
+ yield generate_completion_chunk(
+ chunk_text=new_text,
+ finish_reason=None,
+ chunk_id=completion_id,
+ model_uid=self.model_uid,
+ prompt_tokens=prompt_tokens,
+ completion_tokens=completion_tokens,
+ total_tokens=total_tokens,
+ )
+ yield generate_completion_chunk(
+ chunk_text=None,
+ finish_reason="stop",
+ chunk_id=completion_id,
+ model_uid=self.model_uid,
+ prompt_tokens=prompt_tokens,
+ completion_tokens=completion_tokens,
+ total_tokens=total_tokens,
+ has_choice=True,
+ has_content=False,
+ )
+
+ if include_usage:
+ yield generate_completion_chunk(
+ chunk_text=None,
+ finish_reason=None,
+ chunk_id=completion_id,
+ model_uid=self.model_uid,
+ prompt_tokens=prompt_tokens,
+ completion_tokens=completion_tokens,
+ total_tokens=total_tokens,
+ has_choice=False,
+ has_content=False,
+ )
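Both new classes stream through the standard transformers TextIteratorStreamer pattern: generate() blocks, so it runs in a worker thread while the caller iterates decoded text chunks and wraps each one into a completion chunk. A condensed, self-contained sketch of that pattern follows; "gpt2" is only a small stand-in model so the sketch runs anywhere, whereas the classes above load the DeepSeek-V2 checkpoints with trust_remote_code.

from threading import Thread

from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer

tokenizer = AutoTokenizer.from_pretrained("gpt2")
model = AutoModelForCausalLM.from_pretrained("gpt2")

inputs = tokenizer("The quick brown fox", return_tensors="pt")
streamer = TextIteratorStreamer(
    tokenizer, skip_prompt=True, skip_special_tokens=True, timeout=10
)

# generate() runs in the worker thread and feeds the streamer ...
thread = Thread(
    target=model.generate,
    kwargs={**inputs, "max_new_tokens": 16, "streamer": streamer},
)
thread.start()

# ... while the caller consumes text chunks as they are produced.
for new_text in streamer:
    print(new_text, end="", flush=True)
thread.join()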
diff --git a/xinference/model/llm/utils.py b/xinference/model/llm/utils.py
index 0ae802c01c..c5b26027fb 100644
--- a/xinference/model/llm/utils.py
+++ b/xinference/model/llm/utils.py
@@ -549,6 +549,32 @@ def generate_completion_chunk(
)
+def generate_completion(
+ model_uid: str,
+ response: str,
+ prompt_tokens=-1,
+ completion_tokens=-1,
+ total_tokens=-1,
+ finish_reason="stop",
+) -> Completion:
+ return Completion(
+ id=str(uuid.uuid1()),
+ object="text_completion",
+ created=int(time.time()),
+ model=model_uid,
+ choices=[
+ CompletionChoice(
+ text=response, index=0, logprobs=None, finish_reason=finish_reason
+ )
+ ],
+ usage=CompletionUsage(
+ prompt_tokens=prompt_tokens,
+ completion_tokens=completion_tokens,
+ total_tokens=total_tokens,
+ ),
+ )
+
+
def generate_chat_completion(
model_uid: str,
response: str,
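The new generate_completion helper mirrors the existing generate_chat_completion and assembles an OpenAI-compatible text_completion payload, roughly of the following shape; the id and created values are produced at call time, and the numbers are illustrative.

# Illustrative shape of the Completion dict built by generate_completion above.
example_completion = {
    "id": "cmpl-6f9c...",  # uuid1-based, generated per call
    "object": "text_completion",
    "created": 1726200000,  # unix timestamp at call time
    "model": "my-model-uid",
    "choices": [
        {"text": "Hello!", "index": 0, "logprobs": None, "finish_reason": "stop"}
    ],
    "usage": {"prompt_tokens": 5, "completion_tokens": 2, "total_tokens": 7},
}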
diff --git a/xinference/model/llm/vllm/core.py b/xinference/model/llm/vllm/core.py
index 8869f7fb4a..e531769a18 100644
--- a/xinference/model/llm/vllm/core.py
+++ b/xinference/model/llm/vllm/core.py
@@ -149,6 +149,12 @@ class VLLMGenerateConfig(TypedDict, total=False):
VLLM_SUPPORTED_CHAT_MODELS.append("qwen2-moe-instruct")
VLLM_SUPPORTED_CHAT_MODELS.append("c4ai-command-r-v01")
+if VLLM_INSTALLED and vllm.__version__ >= "0.5.1":
+ VLLM_SUPPORTED_CHAT_MODELS.append("deepseek-v2-chat")
+ VLLM_SUPPORTED_CHAT_MODELS.append("deepseek-v2-chat-0628")
+ VLLM_SUPPORTED_CHAT_MODELS.append("deepseek-v2.5")
+
+
if VLLM_INSTALLED and vllm.__version__ >= "0.5.3":
VLLM_SUPPORTED_CHAT_MODELS.append("gemma-2-it")
VLLM_SUPPORTED_CHAT_MODELS.append("mistral-nemo-instruct")
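One caveat on the gate just added: vllm.__version__ >= "0.5.1" is a lexicographic string comparison, consistent with the surrounding gates in this file, but it misorders versions such as "0.10.0" < "0.5.1". A sketch of a semantic comparison, assuming packaging is importable in the environment:

from packaging import version

import vllm

VLLM_SUPPORTED_CHAT_MODELS: list = []  # stand-in for the module-level list

if version.parse(vllm.__version__) >= version.parse("0.5.1"):
    VLLM_SUPPORTED_CHAT_MODELS.append("deepseek-v2-chat")
    VLLM_SUPPORTED_CHAT_MODELS.append("deepseek-v2-chat-0628")
    VLLM_SUPPORTED_CHAT_MODELS.append("deepseek-v2.5")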
From 42745077b24a2b517e565235756c5ff317f98f77 Mon Sep 17 00:00:00 2001
From: Poet <42093310+LaureatePoet@users.noreply.github.com>
Date: Fri, 13 Sep 2024 11:52:47 +0800
Subject: [PATCH 02/17] FEAT: Update Qwen2-VL-Model to support
flash_attention_2 implementation (#2289)
Co-authored-by: qinxuye
---
xinference/model/llm/transformers/qwen2_vl.py | 36 ++++++++++++++++---
1 file changed, 31 insertions(+), 5 deletions(-)
diff --git a/xinference/model/llm/transformers/qwen2_vl.py b/xinference/model/llm/transformers/qwen2_vl.py
index 6b27a05139..3eccc0c736 100644
--- a/xinference/model/llm/transformers/qwen2_vl.py
+++ b/xinference/model/llm/transformers/qwen2_vl.py
@@ -11,7 +11,9 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
+import importlib.util
import logging
+import sys
import uuid
from typing import Iterator, List, Optional, Union
@@ -59,9 +61,19 @@ def load(self):
self.model_path, trust_remote_code=True
)
self._tokenizer = self._processor.tokenizer
- self._model = Qwen2VLForConditionalGeneration.from_pretrained(
- self.model_path, device_map=device, trust_remote_code=True
- ).eval()
+ flash_attn_installed = importlib.util.find_spec("flash_attn") is not None
+ if flash_attn_installed:
+ self._model = Qwen2VLForConditionalGeneration.from_pretrained(
+ self.model_path,
+ torch_dtype="bfloat16",
+ device_map=device,
+ attn_implementation="flash_attention_2",
+ trust_remote_code=True,
+ ).eval()
+ else:
+ self._model = Qwen2VLForConditionalGeneration.from_pretrained(
+ self.model_path, device_map=device, trust_remote_code=True
+ ).eval()
def _transform_messages(
self,
@@ -177,8 +189,18 @@ def _generate_stream(
"streamer": streamer,
**inputs,
}
-
- thread = Thread(target=self._model.generate, kwargs=gen_kwargs)
+ error = None
+
+ def model_generate():
+ try:
+ return self._model.generate(**gen_kwargs)
+ except Exception:
+ nonlocal error
+ error = sys.exc_info()
+ streamer.end()
+ raise
+
+ thread = Thread(target=model_generate)
thread.start()
completion_id = str(uuid.uuid1())
@@ -195,6 +217,10 @@ def _generate_stream(
has_content=True,
)
+ if error:
+ _, err, tb = error # type: ignore
+ raise err.with_traceback(tb)
+
yield generate_completion_chunk(
chunk_text=None,
finish_reason="stop",
From 8f73b0550d1a55328fe165c46ada66dee45abf27 Mon Sep 17 00:00:00 2001
From: codingl2k1 <138426806+codingl2k1@users.noreply.github.com>
Date: Fri, 13 Sep 2024 06:02:31 +0200
Subject: [PATCH 03/17] ENH: Support fish speech 1.4 (#2295)
---
setup.cfg | 2 +
xinference/deploy/docker/requirements.txt | 1 +
xinference/deploy/docker/requirements_cpu.txt | 1 +
xinference/model/audio/fish_speech.py | 14 +-
xinference/model/audio/model_spec.json | 6 +-
.../model/audio/tests/test_fish_speech.py | 2 +-
.../fish_speech/configs/firefly_gan_vq.yaml | 5 +-
.../configs/text2semantic_finetune.yaml | 2 +-
.../fish_speech/i18n/locale/en_US.json | 2 +-
.../fish_speech/i18n/locale/es_ES.json | 2 +-
.../fish_speech/i18n/locale/ja_JP.json | 2 +-
.../fish_speech/i18n/locale/pt_BR.json | 2 +-
.../fish_speech/i18n/locale/zh_CN.json | 2 +-
.../fish_speech/models/text2semantic/llama.py | 4 +-
.../fish_speech/models/vqgan/__init__.py | 3 -
.../fish_speech/models/vqgan/lit_module.py | 442 ------------------
.../models/vqgan/modules/discriminator.py | 44 --
.../models/vqgan/modules/firefly.py | 367 +++++++--------
.../fish_speech/models/vqgan/modules/fsq.py | 31 +-
.../models/vqgan/modules/reference.py | 115 -----
.../models/vqgan/modules/wavenet.py | 225 ---------
.../fish_speech/fish_speech/text/clean.py | 56 +--
.../fish_speech/fish_speech/text/spliter.py | 4 +-
.../fish_speech/fish_speech/train.py | 2 +
.../fish_speech/fish_speech/webui/manage.py | 22 +-
.../thirdparty/fish_speech/tools/api.py | 213 ++++-----
.../fish_speech/tools/auto_rerank.py | 159 -------
.../thirdparty/fish_speech/tools/commons.py | 35 ++
.../fish_speech/tools/download_models.py | 6 +-
.../thirdparty/fish_speech/tools/file.py | 17 +
.../thirdparty/fish_speech/tools/gen_ref.py | 36 --
.../fish_speech/tools/llama/build_dataset.py | 2 +-
.../fish_speech/tools/llama/generate.py | 53 ++-
.../fish_speech/tools/llama/merge_lora.py | 2 +-
.../fish_speech/tools/llama/quantize.py | 4 +-
.../fish_speech/tools/merge_asr_files.py | 55 ---
.../fish_speech/tools/msgpack_api.py | 34 ++
.../thirdparty/fish_speech/tools/post_api.py | 129 +++--
.../fish_speech/tools/sensevoice/fun_asr.py | 2 +-
.../thirdparty/fish_speech/tools/smart_pad.py | 19 +-
.../fish_speech/tools/vqgan/extract_vq.py | 4 +-
.../fish_speech/tools/vqgan/inference.py | 6 +-
.../thirdparty/fish_speech/tools/webui.py | 158 +------
43 files changed, 544 insertions(+), 1748 deletions(-)
delete mode 100644 xinference/thirdparty/fish_speech/fish_speech/models/vqgan/lit_module.py
delete mode 100644 xinference/thirdparty/fish_speech/fish_speech/models/vqgan/modules/discriminator.py
delete mode 100644 xinference/thirdparty/fish_speech/fish_speech/models/vqgan/modules/reference.py
delete mode 100644 xinference/thirdparty/fish_speech/fish_speech/models/vqgan/modules/wavenet.py
delete mode 100644 xinference/thirdparty/fish_speech/tools/auto_rerank.py
create mode 100644 xinference/thirdparty/fish_speech/tools/commons.py
delete mode 100644 xinference/thirdparty/fish_speech/tools/gen_ref.py
delete mode 100644 xinference/thirdparty/fish_speech/tools/merge_asr_files.py
create mode 100644 xinference/thirdparty/fish_speech/tools/msgpack_api.py
diff --git a/setup.cfg b/setup.cfg
index e95ba7ca3a..55f5117c14 100644
--- a/setup.cfg
+++ b/setup.cfg
@@ -127,6 +127,7 @@ all =
loguru # For Fish Speech
natsort # For Fish Speech
loralib # For Fish Speech
+ ormsgpack # For Fish Speech
qwen-vl-utils # For qwen2-vl
datamodel_code_generator # for minicpm-4B
jsonschema # for minicpm-4B
@@ -198,6 +199,7 @@ audio =
loguru # For Fish Speech
natsort # For Fish Speech
loralib # For Fish Speech
+ ormsgpack # For Fish Speech
doc =
ipython>=6.5.0
sphinx>=3.0.0
diff --git a/xinference/deploy/docker/requirements.txt b/xinference/deploy/docker/requirements.txt
index b5ac62c254..d23d72c3f9 100644
--- a/xinference/deploy/docker/requirements.txt
+++ b/xinference/deploy/docker/requirements.txt
@@ -70,6 +70,7 @@ jj-pytorchvideo # For CogVLM2-video
loguru # For Fish Speech
natsort # For Fish Speech
loralib # For Fish Speech
+ormsgpack # For Fish Speech
qwen-vl-utils # For qwen2-vl
datamodel_code_generator # for minicpm-4B
jsonschema # for minicpm-4B
diff --git a/xinference/deploy/docker/requirements_cpu.txt b/xinference/deploy/docker/requirements_cpu.txt
index cb1d27dc44..493f558da2 100644
--- a/xinference/deploy/docker/requirements_cpu.txt
+++ b/xinference/deploy/docker/requirements_cpu.txt
@@ -65,6 +65,7 @@ jj-pytorchvideo # For CogVLM2-video
loguru # For Fish Speech
natsort # For Fish Speech
loralib # For Fish Speech
+ormsgpack # For Fish Speech
qwen-vl-utils # For qwen2-vl
datamodel_code_generator # for minicpm-4B
jsonschema # for minicpm-4B
diff --git a/xinference/model/audio/fish_speech.py b/xinference/model/audio/fish_speech.py
index 96766a7d27..4a6412f04a 100644
--- a/xinference/model/audio/fish_speech.py
+++ b/xinference/model/audio/fish_speech.py
@@ -92,7 +92,7 @@ def load(self):
checkpoint_path = os.path.join(
self._model_path,
- "firefly-gan-vq-fsq-4x1024-42hz-generator.pth",
+ "firefly-gan-vq-fsq-8x1024-21hz-generator.pth",
)
self._model = load_decoder_model(
config_name="firefly_gan_vq",
@@ -213,12 +213,12 @@ def speech(
text=input,
enable_reference_audio=False,
reference_audio=None,
- reference_text="",
- max_new_tokens=0,
- chunk_length=100,
- top_p=0.7,
- repetition_penalty=1.2,
- temperature=0.7,
+ reference_text=kwargs.get("reference_text", ""),
+ max_new_tokens=kwargs.get("max_new_tokens", 1024),
+ chunk_length=kwargs.get("chunk_length", 200),
+ top_p=kwargs.get("top_p", 0.7),
+ repetition_penalty=kwargs.get("repetition_penalty", 1.2),
+ temperature=kwargs.get("temperature", 0.7),
)
)
sample_rate, audio = result[0][1]
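With the hard-coded synthesis settings replaced by kwargs lookups, callers can tune them per request. A hedged sketch through the xinference client, assuming the audio speech endpoint forwards extra keyword arguments down to FishSpeechModel.speech(); the endpoint and text are placeholders.

from xinference.client import Client

client = Client("http://localhost:9997")  # assumed local supervisor endpoint
model_uid = client.launch_model(model_name="FishSpeech-1.4", model_type="audio")
model = client.get_model(model_uid)

# Keyword names map onto the kwargs.get(...) reads in speech() above.
audio_bytes = model.speech(
    "Hello from FishSpeech 1.4",
    max_new_tokens=1024,
    chunk_length=200,
    top_p=0.7,
    repetition_penalty=1.2,
    temperature=0.7,
)
with open("output.mp3", "wb") as f:
    f.write(audio_bytes)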
diff --git a/xinference/model/audio/model_spec.json b/xinference/model/audio/model_spec.json
index 6d546a0921..6762d84a18 100644
--- a/xinference/model/audio/model_spec.json
+++ b/xinference/model/audio/model_spec.json
@@ -148,10 +148,10 @@
"multilingual": true
},
{
- "model_name": "FishSpeech-1.2-SFT",
+ "model_name": "FishSpeech-1.4",
"model_family": "FishAudio",
- "model_id": "fishaudio/fish-speech-1.2-sft",
- "model_revision": "180288e21ec5c50cfc564023a22f789e4b88a0e0",
+ "model_id": "fishaudio/fish-speech-1.4",
+ "model_revision": "3c49651b8e583b6b13f55e375432e0d57e1aa84d",
"model_ability": "text-to-audio",
"multilingual": true
}
diff --git a/xinference/model/audio/tests/test_fish_speech.py b/xinference/model/audio/tests/test_fish_speech.py
index 8b339290ad..ce57566b19 100644
--- a/xinference/model/audio/tests/test_fish_speech.py
+++ b/xinference/model/audio/tests/test_fish_speech.py
@@ -22,7 +22,7 @@ def test_fish_speech(setup):
client = Client(endpoint)
model_uid = client.launch_model(
- model_name="FishSpeech-1.2-SFT",
+ model_name="FishSpeech-1.4",
model_type="audio",
)
model = client.get_model(model_uid)
diff --git a/xinference/thirdparty/fish_speech/fish_speech/configs/firefly_gan_vq.yaml b/xinference/thirdparty/fish_speech/fish_speech/configs/firefly_gan_vq.yaml
index 7417623b03..10aa8d4a52 100644
--- a/xinference/thirdparty/fish_speech/fish_speech/configs/firefly_gan_vq.yaml
+++ b/xinference/thirdparty/fish_speech/fish_speech/configs/firefly_gan_vq.yaml
@@ -22,13 +22,12 @@ head:
resblock_dilation_sizes: [[1, 3, 5], [1, 3, 5], [1, 3, 5]]
num_mels: 512
upsample_initial_channel: 512
- use_template: false
pre_conv_kernel_size: 13
post_conv_kernel_size: 13
quantizer:
_target_: fish_speech.models.vqgan.modules.fsq.DownsampleFiniteScalarQuantize
input_dim: 512
- n_groups: 4
+ n_groups: 8
n_codebooks: 1
levels: [8, 5, 5, 5]
- downsample_factor: [2]
+ downsample_factor: [2, 2]
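For orientation, this quantizer change lines up with the new checkpoint name used in fish_speech.py above (firefly-gan-vq-fsq-8x1024-21hz-generator.pth): doubling downsample_factor from [2] to [2, 2] halves the codebook frame rate from 42 Hz to 21 Hz, and n_groups 4 -> 8 matches the 4x1024 -> 8x1024 naming. A back-of-the-envelope check, assuming the Hz figure in the checkpoint name is the post-downsample frame rate:

import math

base_frame_rate_hz = 84  # assumption: frame rate entering the quantizer

old_rate = base_frame_rate_hz / math.prod([2])     # 42.0 Hz, 4 codebook groups
new_rate = base_frame_rate_hz / math.prod([2, 2])  # 21.0 Hz, 8 codebook groups
print(old_rate, new_rate)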
diff --git a/xinference/thirdparty/fish_speech/fish_speech/configs/text2semantic_finetune.yaml b/xinference/thirdparty/fish_speech/fish_speech/configs/text2semantic_finetune.yaml
index 1bf8fd6b6d..f4c1993023 100644
--- a/xinference/thirdparty/fish_speech/fish_speech/configs/text2semantic_finetune.yaml
+++ b/xinference/thirdparty/fish_speech/fish_speech/configs/text2semantic_finetune.yaml
@@ -4,7 +4,7 @@ defaults:
project: text2semantic_finetune_dual_ar
max_length: 4096
-pretrained_ckpt_path: checkpoints/fish-speech-1.2-sft
+pretrained_ckpt_path: checkpoints/fish-speech-1.4
# Lightning Trainer
trainer:
diff --git a/xinference/thirdparty/fish_speech/fish_speech/i18n/locale/en_US.json b/xinference/thirdparty/fish_speech/fish_speech/i18n/locale/en_US.json
index cf6ad6ca1e..6e280c236e 100644
--- a/xinference/thirdparty/fish_speech/fish_speech/i18n/locale/en_US.json
+++ b/xinference/thirdparty/fish_speech/fish_speech/i18n/locale/en_US.json
@@ -72,7 +72,7 @@
"Put your text here.": "Put your text here.",
"Reference Audio": "Reference Audio",
"Reference Text": "Reference Text",
- "Related code are released under BSD-3-Clause License, and weights are released under CC BY-NC-SA 4.0 License.": "Related code are released under BSD-3-Clause License, and weights are released under CC BY-NC-SA 4.0 License.",
+ "Related code and weights are released under CC BY-NC-SA 4.0 License.": "Related code and weights are released under CC BY-NC-SA 4.0 License.",
"Remove Selected Data": "Remove Selected Data",
"Removed path successfully!": "Removed path successfully!",
"Repetition Penalty": "Repetition Penalty",
diff --git a/xinference/thirdparty/fish_speech/fish_speech/i18n/locale/es_ES.json b/xinference/thirdparty/fish_speech/fish_speech/i18n/locale/es_ES.json
index 1ea5988213..3285341f68 100644
--- a/xinference/thirdparty/fish_speech/fish_speech/i18n/locale/es_ES.json
+++ b/xinference/thirdparty/fish_speech/fish_speech/i18n/locale/es_ES.json
@@ -72,7 +72,7 @@
"Put your text here.": "Ponga su texto aquí.",
"Reference Audio": "Audio de Referencia",
"Reference Text": "Texto de Referencia",
- "Related code are released under BSD-3-Clause License, and weights are released under CC BY-NC-SA 4.0 License.": "El código relacionado se publica bajo la Licencia BSD-3-Clause, y los pesos se publican bajo la Licencia CC BY-NC-SA 4.0.",
+ "Related code and weights are released under CC BY-NC-SA 4.0 License.": "El código relacionado y los pesos se publican bajo la Licencia CC BY-NC-SA 4.0.",
"Remove Selected Data": "Eliminar Datos Seleccionados",
"Removed path successfully!": "¡Ruta eliminada exitosamente!",
"Repetition Penalty": "Penalización por Repetición",
diff --git a/xinference/thirdparty/fish_speech/fish_speech/i18n/locale/ja_JP.json b/xinference/thirdparty/fish_speech/fish_speech/i18n/locale/ja_JP.json
index e7817eb0c5..d30bac7bcd 100644
--- a/xinference/thirdparty/fish_speech/fish_speech/i18n/locale/ja_JP.json
+++ b/xinference/thirdparty/fish_speech/fish_speech/i18n/locale/ja_JP.json
@@ -72,7 +72,7 @@
"Put your text here.": "ここにテキストを入力してください。",
"Reference Audio": "リファレンスオーディオ",
"Reference Text": "リファレンステキスト",
- "Related code are released under BSD-3-Clause License, and weights are released under CC BY-NC-SA 4.0 License.": "関連コードはBSD-3-Clauseライセンスの下でリリースされ、重みはCC BY-NC-SA 4.0ライセンスの下でリリースされます。",
+ "Related code and weights are released under CC BY-NC-SA 4.0 License.": "関連コードと重みはCC BY-NC-SA 4.0ライセンスの下でリリースされます。",
"Remove Selected Data": "選択したデータを削除",
"Removed path successfully!": "パスの削除に成功しました!",
"Repetition Penalty": "反復ペナルティ",
diff --git a/xinference/thirdparty/fish_speech/fish_speech/i18n/locale/pt_BR.json b/xinference/thirdparty/fish_speech/fish_speech/i18n/locale/pt_BR.json
index c3df431a40..385f20272e 100644
--- a/xinference/thirdparty/fish_speech/fish_speech/i18n/locale/pt_BR.json
+++ b/xinference/thirdparty/fish_speech/fish_speech/i18n/locale/pt_BR.json
@@ -84,7 +84,7 @@
"Reference Text": "Texto de Referência",
"warning": "Aviso",
"Pre-processing begins...": "O pré-processamento começou!",
- "Related code are released under BSD-3-Clause License, and weights are released under CC BY-NC-SA 4.0 License.": "O código relacionado é licenciado sob a Licença BSD-3-Clause, e os pesos sob a Licença CC BY-NC-SA 4.0.",
+ "Related code and weights are released under CC BY-NC-SA 4.0 License.": "O código relacionado e os pesos são licenciados sob a Licença CC BY-NC-SA 4.0.",
"Remove Selected Data": "Remover Dados Selecionados",
"Removed path successfully!": "Caminho removido com sucesso!",
"Repetition Penalty": "Penalidade de Repetição",
diff --git a/xinference/thirdparty/fish_speech/fish_speech/i18n/locale/zh_CN.json b/xinference/thirdparty/fish_speech/fish_speech/i18n/locale/zh_CN.json
index da81eef1cf..3dd1a5cd1c 100644
--- a/xinference/thirdparty/fish_speech/fish_speech/i18n/locale/zh_CN.json
+++ b/xinference/thirdparty/fish_speech/fish_speech/i18n/locale/zh_CN.json
@@ -72,7 +72,7 @@
"Put your text here.": "在此处输入文本.",
"Reference Audio": "参考音频",
"Reference Text": "参考文本",
- "Related code are released under BSD-3-Clause License, and weights are released under CC BY-NC-SA 4.0 License.": "相关代码使用 BSD-3-Clause 许可证发布,权重使用 CC BY-NC-SA 4.0 许可证发布.",
+ "Related code and weights are released under CC BY-NC-SA 4.0 License.": "相关代码和权重使用 CC BY-NC-SA 4.0 许可证发布.",
"Remove Selected Data": "移除选中数据",
"Removed path successfully!": "移除路径成功!",
"Repetition Penalty": "重复惩罚",
diff --git a/xinference/thirdparty/fish_speech/fish_speech/models/text2semantic/llama.py b/xinference/thirdparty/fish_speech/fish_speech/models/text2semantic/llama.py
index 4eef92b0ba..0725dfb9b7 100644
--- a/xinference/thirdparty/fish_speech/fish_speech/models/text2semantic/llama.py
+++ b/xinference/thirdparty/fish_speech/fish_speech/models/text2semantic/llama.py
@@ -353,7 +353,7 @@ def from_pretrained(
if "int8" in str(Path(path)):
logger.info("Using int8 weight-only quantization!")
- from ...tools.llama.quantize import WeightOnlyInt8QuantHandler
+ from tools.llama.quantize import WeightOnlyInt8QuantHandler
simple_quantizer = WeightOnlyInt8QuantHandler(model)
model = simple_quantizer.convert_for_runtime()
@@ -363,7 +363,7 @@ def from_pretrained(
path_comps = path.name.split("-")
assert path_comps[-2].startswith("g")
groupsize = int(path_comps[-2][1:])
- from ...tools.llama.quantize import WeightOnlyInt4QuantHandler
+ from tools.llama.quantize import WeightOnlyInt4QuantHandler
simple_quantizer = WeightOnlyInt4QuantHandler(model, groupsize)
model = simple_quantizer.convert_for_runtime()
diff --git a/xinference/thirdparty/fish_speech/fish_speech/models/vqgan/__init__.py b/xinference/thirdparty/fish_speech/fish_speech/models/vqgan/__init__.py
index 401c6df468..e69de29bb2 100644
--- a/xinference/thirdparty/fish_speech/fish_speech/models/vqgan/__init__.py
+++ b/xinference/thirdparty/fish_speech/fish_speech/models/vqgan/__init__.py
@@ -1,3 +0,0 @@
-from .lit_module import VQGAN
-
-__all__ = ["VQGAN"]
diff --git a/xinference/thirdparty/fish_speech/fish_speech/models/vqgan/lit_module.py b/xinference/thirdparty/fish_speech/fish_speech/models/vqgan/lit_module.py
deleted file mode 100644
index d5fa2ccabb..0000000000
--- a/xinference/thirdparty/fish_speech/fish_speech/models/vqgan/lit_module.py
+++ /dev/null
@@ -1,442 +0,0 @@
-import itertools
-import math
-from typing import Any, Callable
-
-import lightning as L
-import torch
-import torch.nn.functional as F
-# import wandb
-from lightning.pytorch.loggers import TensorBoardLogger, WandbLogger
-from matplotlib import pyplot as plt
-from torch import nn
-
-from fish_speech.models.vqgan.modules.discriminator import Discriminator
-from fish_speech.models.vqgan.modules.wavenet import WaveNet
-from fish_speech.models.vqgan.utils import avg_with_mask, plot_mel, sequence_mask
-
-
-class VQGAN(L.LightningModule):
- def __init__(
- self,
- optimizer: Callable,
- lr_scheduler: Callable,
- encoder: WaveNet,
- quantizer: nn.Module,
- decoder: WaveNet,
- discriminator: Discriminator,
- vocoder: nn.Module,
- encode_mel_transform: nn.Module,
- gt_mel_transform: nn.Module,
- weight_adv: float = 1.0,
- weight_vq: float = 1.0,
- weight_mel: float = 1.0,
- sampling_rate: int = 44100,
- freeze_encoder: bool = False,
- ):
- super().__init__()
-
- # Model parameters
- self.optimizer_builder = optimizer
- self.lr_scheduler_builder = lr_scheduler
-
- # Modules
- self.encoder = encoder
- self.quantizer = quantizer
- self.decoder = decoder
- self.vocoder = vocoder
- self.discriminator = discriminator
- self.encode_mel_transform = encode_mel_transform
- self.gt_mel_transform = gt_mel_transform
-
- # A simple linear layer to project quality to condition channels
- self.quality_projection = nn.Linear(1, 768)
-
- # Freeze vocoder
- for param in self.vocoder.parameters():
- param.requires_grad = False
-
- # Loss weights
- self.weight_adv = weight_adv
- self.weight_vq = weight_vq
- self.weight_mel = weight_mel
-
- # Other parameters
- self.sampling_rate = sampling_rate
-
- # Disable strict loading
- self.strict_loading = False
-
- # If encoder is frozen
- if freeze_encoder:
- for param in self.encoder.parameters():
- param.requires_grad = False
-
- for param in self.quantizer.parameters():
- param.requires_grad = False
-
- self.automatic_optimization = False
-
- def on_save_checkpoint(self, checkpoint):
- # Do not save vocoder
- state_dict = checkpoint["state_dict"]
- for name in list(state_dict.keys()):
- if "vocoder" in name:
- state_dict.pop(name)
-
- def configure_optimizers(self):
- optimizer_generator = self.optimizer_builder(
- itertools.chain(
- self.encoder.parameters(),
- self.quantizer.parameters(),
- self.decoder.parameters(),
- self.quality_projection.parameters(),
- )
- )
- optimizer_discriminator = self.optimizer_builder(
- self.discriminator.parameters()
- )
-
- lr_scheduler_generator = self.lr_scheduler_builder(optimizer_generator)
- lr_scheduler_discriminator = self.lr_scheduler_builder(optimizer_discriminator)
-
- return (
- {
- "optimizer": optimizer_generator,
- "lr_scheduler": {
- "scheduler": lr_scheduler_generator,
- "interval": "step",
- "name": "optimizer/generator",
- },
- },
- {
- "optimizer": optimizer_discriminator,
- "lr_scheduler": {
- "scheduler": lr_scheduler_discriminator,
- "interval": "step",
- "name": "optimizer/discriminator",
- },
- },
- )
-
- def training_step(self, batch, batch_idx):
- optim_g, optim_d = self.optimizers()
-
- audios, audio_lengths = batch["audios"], batch["audio_lengths"]
-
- audios = audios.float()
- audios = audios[:, None, :]
-
- with torch.no_grad():
- encoded_mels = self.encode_mel_transform(audios)
- gt_mels = self.gt_mel_transform(audios)
- quality = ((gt_mels.mean(-1) > -8).sum(-1) - 90) / 10
- quality = quality.unsqueeze(-1)
-
- mel_lengths = audio_lengths // self.gt_mel_transform.hop_length
- mel_masks = sequence_mask(mel_lengths, gt_mels.shape[2])
- mel_masks_float_conv = mel_masks[:, None, :].float()
- gt_mels = gt_mels * mel_masks_float_conv
- encoded_mels = encoded_mels * mel_masks_float_conv
-
- # Encode
- encoded_features = self.encoder(encoded_mels) * mel_masks_float_conv
-
- # Quantize
- vq_result = self.quantizer(encoded_features)
- loss_vq = getattr("vq_result", "loss", 0.0)
- vq_recon_features = vq_result.z * mel_masks_float_conv
- vq_recon_features = (
- vq_recon_features + self.quality_projection(quality)[:, :, None]
- )
-
- # VQ Decode
- gen_mel = (
- self.decoder(
- torch.randn_like(vq_recon_features) * mel_masks_float_conv,
- condition=vq_recon_features,
- )
- * mel_masks_float_conv
- )
-
- # Discriminator
- real_logits = self.discriminator(gt_mels)
- fake_logits = self.discriminator(gen_mel.detach())
- d_mask = F.interpolate(
- mel_masks_float_conv, size=(real_logits.shape[2],), mode="nearest"
- )
-
- loss_real = avg_with_mask((real_logits - 1) ** 2, d_mask)
- loss_fake = avg_with_mask(fake_logits**2, d_mask)
-
- loss_d = loss_real + loss_fake
-
- self.log(
- "train/discriminator/loss",
- loss_d,
- on_step=True,
- on_epoch=False,
- prog_bar=True,
- logger=True,
- )
-
- # Discriminator backward
- optim_d.zero_grad()
- self.manual_backward(loss_d)
- self.clip_gradients(
- optim_d, gradient_clip_val=1000.0, gradient_clip_algorithm="norm"
- )
- optim_d.step()
-
- # Mel Loss, applying l1, using a weighted sum
- mel_distance = (
- gen_mel - gt_mels
- ).abs() # * 0.5 + self.ssim(gen_mel, gt_mels) * 0.5
- loss_mel_low_freq = avg_with_mask(mel_distance[:, :40, :], mel_masks_float_conv)
- loss_mel_mid_freq = avg_with_mask(
- mel_distance[:, 40:70, :], mel_masks_float_conv
- )
- loss_mel_high_freq = avg_with_mask(
- mel_distance[:, 70:, :], mel_masks_float_conv
- )
- loss_mel = (
- loss_mel_low_freq * 0.6 + loss_mel_mid_freq * 0.3 + loss_mel_high_freq * 0.1
- )
-
- # Adversarial Loss
- fake_logits = self.discriminator(gen_mel)
- loss_adv = avg_with_mask((fake_logits - 1) ** 2, d_mask)
-
- # Total loss
- loss = (
- self.weight_vq * loss_vq
- + self.weight_mel * loss_mel
- + self.weight_adv * loss_adv
- )
-
- # Log losses
- self.log(
- "train/generator/loss",
- loss,
- on_step=True,
- on_epoch=False,
- prog_bar=True,
- logger=True,
- )
- self.log(
- "train/generator/loss_vq",
- loss_vq,
- on_step=True,
- on_epoch=False,
- prog_bar=False,
- logger=True,
- )
- self.log(
- "train/generator/loss_mel",
- loss_mel,
- on_step=True,
- on_epoch=False,
- prog_bar=False,
- logger=True,
- )
- self.log(
- "train/generator/loss_adv",
- loss_adv,
- on_step=True,
- on_epoch=False,
- prog_bar=False,
- logger=True,
- )
-
- # Generator backward
- optim_g.zero_grad()
- self.manual_backward(loss)
- self.clip_gradients(
- optim_g, gradient_clip_val=1000.0, gradient_clip_algorithm="norm"
- )
- optim_g.step()
-
- scheduler_g, scheduler_d = self.lr_schedulers()
- scheduler_g.step()
- scheduler_d.step()
-
- def validation_step(self, batch: Any, batch_idx: int):
- audios, audio_lengths = batch["audios"], batch["audio_lengths"]
-
- audios = audios.float()
- audios = audios[:, None, :]
-
- encoded_mels = self.encode_mel_transform(audios)
- gt_mels = self.gt_mel_transform(audios)
-
- mel_lengths = audio_lengths // self.gt_mel_transform.hop_length
- mel_masks = sequence_mask(mel_lengths, gt_mels.shape[2])
- mel_masks_float_conv = mel_masks[:, None, :].float()
- gt_mels = gt_mels * mel_masks_float_conv
- encoded_mels = encoded_mels * mel_masks_float_conv
-
- # Encode
- encoded_features = self.encoder(encoded_mels) * mel_masks_float_conv
-
- # Quantize
- vq_recon_features = self.quantizer(encoded_features).z * mel_masks_float_conv
- vq_recon_features = (
- vq_recon_features
- + self.quality_projection(
- torch.ones(
- vq_recon_features.shape[0], 1, device=vq_recon_features.device
- )
- * 2
- )[:, :, None]
- )
-
- # VQ Decode
- gen_aux_mels = (
- self.decoder(
- torch.randn_like(vq_recon_features) * mel_masks_float_conv,
- condition=vq_recon_features,
- )
- * mel_masks_float_conv
- )
- loss_mel = avg_with_mask((gen_aux_mels - gt_mels).abs(), mel_masks_float_conv)
-
- self.log(
- "val/loss_mel",
- loss_mel,
- on_step=False,
- on_epoch=True,
- prog_bar=False,
- logger=True,
- sync_dist=True,
- )
-
- recon_audios = self.vocoder(gt_mels)
- gen_aux_audios = self.vocoder(gen_aux_mels)
-
- # only log the first batch
- if batch_idx != 0:
- return
-
- for idx, (
- gt_mel,
- gen_aux_mel,
- audio,
- gen_aux_audio,
- recon_audio,
- audio_len,
- ) in enumerate(
- zip(
- gt_mels,
- gen_aux_mels,
- audios.cpu().float(),
- gen_aux_audios.cpu().float(),
- recon_audios.cpu().float(),
- audio_lengths,
- )
- ):
- if idx > 4:
- break
-
- mel_len = audio_len // self.gt_mel_transform.hop_length
-
- image_mels = plot_mel(
- [
- gt_mel[:, :mel_len],
- gen_aux_mel[:, :mel_len],
- ],
- [
- "Ground-Truth",
- "Auxiliary",
- ],
- )
-
- if isinstance(self.logger, WandbLogger):
- self.logger.experiment.log(
- {
- "reconstruction_mel": wandb.Image(image_mels, caption="mels"),
- "wavs": [
- wandb.Audio(
- audio[0, :audio_len],
- sample_rate=self.sampling_rate,
- caption="gt",
- ),
- wandb.Audio(
- gen_aux_audio[0, :audio_len],
- sample_rate=self.sampling_rate,
- caption="aux",
- ),
- wandb.Audio(
- recon_audio[0, :audio_len],
- sample_rate=self.sampling_rate,
- caption="recon",
- ),
- ],
- },
- )
-
- if isinstance(self.logger, TensorBoardLogger):
- self.logger.experiment.add_figure(
- f"sample-{idx}/mels",
- image_mels,
- global_step=self.global_step,
- )
- self.logger.experiment.add_audio(
- f"sample-{idx}/wavs/gt",
- audio[0, :audio_len],
- self.global_step,
- sample_rate=self.sampling_rate,
- )
- self.logger.experiment.add_audio(
- f"sample-{idx}/wavs/gen",
- gen_aux_audio[0, :audio_len],
- self.global_step,
- sample_rate=self.sampling_rate,
- )
- self.logger.experiment.add_audio(
- f"sample-{idx}/wavs/recon",
- recon_audio[0, :audio_len],
- self.global_step,
- sample_rate=self.sampling_rate,
- )
-
- plt.close(image_mels)
-
- def encode(self, audios, audio_lengths):
- audios = audios.float()
-
- mels = self.encode_mel_transform(audios)
- mel_lengths = audio_lengths // self.encode_mel_transform.hop_length
- mel_masks = sequence_mask(mel_lengths, mels.shape[2])
- mel_masks_float_conv = mel_masks[:, None, :].float()
- mels = mels * mel_masks_float_conv
-
- # Encode
- encoded_features = self.encoder(mels) * mel_masks_float_conv
- feature_lengths = mel_lengths // math.prod(self.quantizer.downsample_factor)
-
- return self.quantizer.encode(encoded_features), feature_lengths
-
- def decode(self, indices, feature_lengths, return_audios=False):
- factor = math.prod(self.quantizer.downsample_factor)
- mel_masks = sequence_mask(feature_lengths * factor, indices.shape[2] * factor)
- mel_masks_float_conv = mel_masks[:, None, :].float()
-
- z = self.quantizer.decode(indices) * mel_masks_float_conv
- z = (
- z
- + self.quality_projection(torch.ones(z.shape[0], 1, device=z.device) * 2)[
- :, :, None
- ]
- )
-
- gen_mel = (
- self.decoder(
- torch.randn_like(z) * mel_masks_float_conv,
- condition=z,
- )
- * mel_masks_float_conv
- )
-
- if return_audios:
- return self.vocoder(gen_mel)
-
- return gen_mel
diff --git a/xinference/thirdparty/fish_speech/fish_speech/models/vqgan/modules/discriminator.py b/xinference/thirdparty/fish_speech/fish_speech/models/vqgan/modules/discriminator.py
deleted file mode 100644
index 69c7df4103..0000000000
--- a/xinference/thirdparty/fish_speech/fish_speech/models/vqgan/modules/discriminator.py
+++ /dev/null
@@ -1,44 +0,0 @@
-import torch
-from torch import nn
-from torch.nn.utils.parametrizations import weight_norm
-
-
-class Discriminator(nn.Module):
- def __init__(self):
- super().__init__()
-
- blocks = []
- convs = [
- (1, 64, (3, 9), 1, (1, 4)),
- (64, 128, (3, 9), (1, 2), (1, 4)),
- (128, 256, (3, 9), (1, 2), (1, 4)),
- (256, 512, (3, 9), (1, 2), (1, 4)),
- (512, 1024, (3, 3), 1, (1, 1)),
- (1024, 1, (3, 3), 1, (1, 1)),
- ]
-
- for idx, (in_channels, out_channels, kernel_size, stride, padding) in enumerate(
- convs
- ):
- blocks.append(
- weight_norm(
- nn.Conv2d(in_channels, out_channels, kernel_size, stride, padding)
- )
- )
-
- if idx != len(convs) - 1:
- blocks.append(nn.SiLU(inplace=True))
-
- self.blocks = nn.Sequential(*blocks)
-
- def forward(self, x):
- return self.blocks(x[:, None])[:, 0]
-
-
-if __name__ == "__main__":
- model = Discriminator()
- print(sum(p.numel() for p in model.parameters()) / 1_000_000)
- x = torch.randn(1, 128, 1024)
- y = model(x)
- print(y.shape)
- print(y)
diff --git a/xinference/thirdparty/fish_speech/fish_speech/models/vqgan/modules/firefly.py b/xinference/thirdparty/fish_speech/fish_speech/models/vqgan/modules/firefly.py
index 4ca0ff5882..aa21839b54 100644
--- a/xinference/thirdparty/fish_speech/fish_speech/models/vqgan/modules/firefly.py
+++ b/xinference/thirdparty/fish_speech/fish_speech/models/vqgan/modules/firefly.py
@@ -1,25 +1,26 @@
-# A inference only version of the FireflyGAN model
-
import math
from functools import partial
from math import prod
from typing import Callable
-import numpy as np
import torch
import torch.nn.functional as F
from torch import nn
-from torch.nn import Conv1d
from torch.nn.utils.parametrizations import weight_norm
from torch.nn.utils.parametrize import remove_parametrizations
from torch.utils.checkpoint import checkpoint
-from fish_speech.models.vqgan.utils import sequence_mask
+
+def sequence_mask(length, max_length=None):
+ if max_length is None:
+ max_length = length.max()
+ x = torch.arange(max_length, dtype=length.dtype, device=length.device)
+ return x.unsqueeze(0) < length.unsqueeze(1)
def init_weights(m, mean=0.0, std=0.01):
classname = m.__class__.__name__
- if classname.find("Conv") != -1:
+ if classname.find("Conv1D") != -1:
m.weight.data.normal_(mean, std)
@@ -27,78 +28,141 @@ def get_padding(kernel_size, dilation=1):
return (kernel_size * dilation - dilation) // 2
+def unpad1d(x: torch.Tensor, paddings: tuple[int, int]):
+    """Remove padding from x, properly handling zero padding. Only for 1d!"""
+ padding_left, padding_right = paddings
+ assert padding_left >= 0 and padding_right >= 0, (padding_left, padding_right)
+ assert (padding_left + padding_right) <= x.shape[-1]
+ end = x.shape[-1] - padding_right
+ return x[..., padding_left:end]
+
+
+def get_extra_padding_for_conv1d(
+ x: torch.Tensor, kernel_size: int, stride: int, padding_total: int = 0
+) -> int:
+ """See `pad_for_conv1d`."""
+ length = x.shape[-1]
+ n_frames = (length - kernel_size + padding_total) / stride + 1
+ ideal_length = (math.ceil(n_frames) - 1) * stride + (kernel_size - padding_total)
+ return ideal_length - length
+
+
+def pad1d(
+ x: torch.Tensor,
+ paddings: tuple[int, int],
+ mode: str = "zeros",
+ value: float = 0.0,
+):
+    """Tiny wrapper around F.pad, just to allow for reflect padding on small input.
+    If the input is too short, we insert extra zero padding to the right
+    before the reflection happens.
+    """
+ length = x.shape[-1]
+ padding_left, padding_right = paddings
+ assert padding_left >= 0 and padding_right >= 0, (padding_left, padding_right)
+ if mode == "reflect":
+ max_pad = max(padding_left, padding_right)
+ extra_pad = 0
+ if length <= max_pad:
+ extra_pad = max_pad - length + 1
+ x = F.pad(x, (0, extra_pad))
+ padded = F.pad(x, paddings, mode, value)
+ end = padded.shape[-1] - extra_pad
+ return padded[..., :end]
+ else:
+ return F.pad(x, paddings, mode, value)
+
+
+class FishConvNet(nn.Module):
+ def __init__(
+ self, in_channels, out_channels, kernel_size, dilation=1, stride=1, groups=1
+ ):
+ super(FishConvNet, self).__init__()
+ self.conv = nn.Conv1d(
+ in_channels,
+ out_channels,
+ kernel_size,
+ stride=stride,
+ dilation=dilation,
+ groups=groups,
+ )
+ self.stride = stride
+ self.kernel_size = (kernel_size - 1) * dilation + 1
+ self.dilation = dilation
+
+ def forward(self, x):
+ pad = self.kernel_size - self.stride
+ extra_padding = get_extra_padding_for_conv1d(
+ x, self.kernel_size, self.stride, pad
+ )
+ x = pad1d(x, (pad, extra_padding), mode="constant", value=0)
+ return self.conv(x).contiguous()
+
+ def weight_norm(self, name="weight", dim=0):
+ self.conv = weight_norm(self.conv, name=name, dim=dim)
+ return self
+
+ def remove_weight_norm(self):
+ self.conv = remove_parametrizations(self.conv)
+ return self
+
+
+class FishTransConvNet(nn.Module):
+ def __init__(self, in_channels, out_channels, kernel_size, dilation=1, stride=1):
+ super(FishTransConvNet, self).__init__()
+ self.conv = nn.ConvTranspose1d(
+ in_channels, out_channels, kernel_size, stride=stride, dilation=dilation
+ )
+ self.stride = stride
+ self.kernel_size = kernel_size
+
+ def forward(self, x):
+ x = self.conv(x)
+ pad = self.kernel_size - self.stride
+ padding_right = math.ceil(pad)
+ padding_left = pad - padding_right
+ x = unpad1d(x, (padding_left, padding_right))
+ return x.contiguous()
+
+ def weight_norm(self, name="weight", dim=0):
+ self.conv = weight_norm(self.conv, name=name, dim=dim)
+ return self
+
+ def remove_weight_norm(self):
+ self.conv = remove_parametrizations(self.conv)
+ return self
+
+
class ResBlock1(torch.nn.Module):
def __init__(self, channels, kernel_size=3, dilation=(1, 3, 5)):
super().__init__()
self.convs1 = nn.ModuleList(
[
- weight_norm(
- Conv1d(
- channels,
- channels,
- kernel_size,
- 1,
- dilation=dilation[0],
- padding=get_padding(kernel_size, dilation[0]),
- )
- ),
- weight_norm(
- Conv1d(
- channels,
- channels,
- kernel_size,
- 1,
- dilation=dilation[1],
- padding=get_padding(kernel_size, dilation[1]),
- )
- ),
- weight_norm(
- Conv1d(
- channels,
- channels,
- kernel_size,
- 1,
- dilation=dilation[2],
- padding=get_padding(kernel_size, dilation[2]),
- )
- ),
+ FishConvNet(
+ channels, channels, kernel_size, stride=1, dilation=dilation[0]
+ ).weight_norm(),
+ FishConvNet(
+ channels, channels, kernel_size, stride=1, dilation=dilation[1]
+ ).weight_norm(),
+ FishConvNet(
+ channels, channels, kernel_size, stride=1, dilation=dilation[2]
+ ).weight_norm(),
]
)
self.convs1.apply(init_weights)
self.convs2 = nn.ModuleList(
[
- weight_norm(
- Conv1d(
- channels,
- channels,
- kernel_size,
- 1,
- dilation=1,
- padding=get_padding(kernel_size, 1),
- )
- ),
- weight_norm(
- Conv1d(
- channels,
- channels,
- kernel_size,
- 1,
- dilation=1,
- padding=get_padding(kernel_size, 1),
- )
- ),
- weight_norm(
- Conv1d(
- channels,
- channels,
- kernel_size,
- 1,
- dilation=1,
- padding=get_padding(kernel_size, 1),
- )
- ),
+ FishConvNet(
+ channels, channels, kernel_size, stride=1, dilation=dilation[0]
+ ).weight_norm(),
+ FishConvNet(
+ channels, channels, kernel_size, stride=1, dilation=dilation[1]
+ ).weight_norm(),
+ FishConvNet(
+ channels, channels, kernel_size, stride=1, dilation=dilation[2]
+ ).weight_norm(),
]
)
self.convs2.apply(init_weights)
@@ -119,7 +183,7 @@ def remove_parametrizations(self):
remove_parametrizations(conv, tensor_name="weight")
-class ParralelBlock(nn.Module):
+class ParallelBlock(nn.Module):
def __init__(
self,
channels: int,
@@ -153,7 +217,6 @@ def __init__(
resblock_dilation_sizes: tuple[tuple[int]] = ((1, 3, 5), (1, 3, 5), (1, 3, 5)),
num_mels: int = 128,
upsample_initial_channel: int = 512,
- use_template: bool = True,
pre_conv_kernel_size: int = 7,
post_conv_kernel_size: int = 7,
post_activation: Callable = partial(nn.SiLU, inplace=True),
@@ -164,85 +227,51 @@ def __init__(
prod(upsample_rates) == hop_length
), f"hop_length must be {prod(upsample_rates)}"
- self.conv_pre = weight_norm(
- nn.Conv1d(
- num_mels,
- upsample_initial_channel,
- pre_conv_kernel_size,
- 1,
- padding=get_padding(pre_conv_kernel_size),
- )
- )
+ self.conv_pre = FishConvNet(
+ num_mels,
+ upsample_initial_channel,
+ pre_conv_kernel_size,
+ stride=1,
+ ).weight_norm()
self.num_upsamples = len(upsample_rates)
self.num_kernels = len(resblock_kernel_sizes)
self.noise_convs = nn.ModuleList()
- self.use_template = use_template
self.ups = nn.ModuleList()
for i, (u, k) in enumerate(zip(upsample_rates, upsample_kernel_sizes)):
- c_cur = upsample_initial_channel // (2 ** (i + 1))
self.ups.append(
- weight_norm(
- nn.ConvTranspose1d(
- upsample_initial_channel // (2**i),
- upsample_initial_channel // (2 ** (i + 1)),
- k,
- u,
- padding=(k - u) // 2,
- )
- )
+ FishTransConvNet(
+ upsample_initial_channel // (2**i),
+ upsample_initial_channel // (2 ** (i + 1)),
+ k,
+ stride=u,
+ ).weight_norm()
)
- if not use_template:
- continue
-
- if i + 1 < len(upsample_rates):
- stride_f0 = np.prod(upsample_rates[i + 1 :])
- self.noise_convs.append(
- Conv1d(
- 1,
- c_cur,
- kernel_size=stride_f0 * 2,
- stride=stride_f0,
- padding=stride_f0 // 2,
- )
- )
- else:
- self.noise_convs.append(Conv1d(1, c_cur, kernel_size=1))
-
self.resblocks = nn.ModuleList()
for i in range(len(self.ups)):
ch = upsample_initial_channel // (2 ** (i + 1))
self.resblocks.append(
- ParralelBlock(ch, resblock_kernel_sizes, resblock_dilation_sizes)
+ ParallelBlock(ch, resblock_kernel_sizes, resblock_dilation_sizes)
)
self.activation_post = post_activation()
- self.conv_post = weight_norm(
- nn.Conv1d(
- ch,
- 1,
- post_conv_kernel_size,
- 1,
- padding=get_padding(post_conv_kernel_size),
- )
- )
+ self.conv_post = FishConvNet(
+ ch, 1, post_conv_kernel_size, stride=1
+ ).weight_norm()
self.ups.apply(init_weights)
self.conv_post.apply(init_weights)
- def forward(self, x, template=None):
+ def forward(self, x):
x = self.conv_pre(x)
for i in range(self.num_upsamples):
x = F.silu(x, inplace=True)
x = self.ups[i](x)
- if self.use_template:
- x = x + self.noise_convs[i](template)
-
- if self.training:
+ if self.training and self.checkpointing:
x = checkpoint(
self.resblocks[i],
x,
@@ -364,11 +393,11 @@ def __init__(
):
super().__init__()
- self.dwconv = nn.Conv1d(
+ self.dwconv = FishConvNet(
dim,
dim,
kernel_size=kernel_size,
- padding=int(dilation * (kernel_size - 1) / 2),
+ # padding=int(dilation * (kernel_size - 1) / 2),
groups=dim,
) # depthwise conv
self.norm = LayerNorm(dim, eps=1e-6)
@@ -421,12 +450,13 @@ def __init__(
self.downsample_layers = nn.ModuleList()
stem = nn.Sequential(
- nn.Conv1d(
+ FishConvNet(
input_channels,
dims[0],
- kernel_size=kernel_size,
- padding=kernel_size // 2,
- padding_mode="zeros",
+ kernel_size=7,
+ # padding=3,
+ # padding_mode="replicate",
+ # padding_mode="zeros",
),
LayerNorm(dims[0], eps=1e-6, data_format="channels_first"),
)
@@ -491,6 +521,7 @@ def __init__(
self.head = head
self.quantizer = quantizer
self.spec_transform = spec_transform
+ self.downsample_factor = math.prod(self.quantizer.downsample_factor)
def forward(self, x: torch.Tensor, template=None, mask=None) -> torch.Tensor:
if self.spec_transform is not None:
@@ -512,7 +543,7 @@ def forward(self, x: torch.Tensor, template=None, mask=None) -> torch.Tensor:
if x.ndim == 2:
x = x[:, None, :]
- if self.quantizer is not None:
+ if self.vq is not None:
return x, vq_result
return x
@@ -528,25 +559,30 @@ def encode(self, audios, audio_lengths):
# Encode
encoded_features = self.backbone(mels) * mel_masks_float_conv
- feature_lengths = mel_lengths // math.prod(self.quantizer.downsample_factor)
+ feature_lengths = mel_lengths // self.downsample_factor
return self.quantizer.encode(encoded_features), feature_lengths
def decode(self, indices, feature_lengths) -> torch.Tensor:
- factor = math.prod(self.quantizer.downsample_factor)
- mel_masks = sequence_mask(feature_lengths * factor, indices.shape[2] * factor)
+ mel_masks = sequence_mask(
+ feature_lengths * self.downsample_factor,
+ indices.shape[2] * self.downsample_factor,
+ )
mel_masks_float_conv = mel_masks[:, None, :].float()
+ audio_lengths = (
+ feature_lengths * self.downsample_factor * self.spec_transform.hop_length
+ )
audio_masks = sequence_mask(
- feature_lengths * factor * self.spec_transform.hop_length,
- indices.shape[2] * factor * self.spec_transform.hop_length,
+ audio_lengths,
+ indices.shape[2] * self.downsample_factor * self.spec_transform.hop_length,
)
audio_masks_float_conv = audio_masks[:, None, :].float()
z = self.quantizer.decode(indices) * mel_masks_float_conv
x = self.head(z) * audio_masks_float_conv
- return x
+ return x, audio_lengths
def remove_parametrizations(self):
if hasattr(self.backbone, "remove_parametrizations"):
@@ -558,68 +594,3 @@ def remove_parametrizations(self):
@property
def device(self):
return next(self.parameters()).device
-
-
-class FireflyBase(nn.Module):
- def __init__(self, ckpt_path: str = None, pretrained: bool = True):
- super().__init__()
-
- self.backbone = ConvNeXtEncoder(
- input_channels=128,
- depths=[3, 3, 9, 3],
- dims=[128, 256, 384, 512],
- drop_path_rate=0.2,
- kernel_size=7,
- )
-
- self.head = HiFiGANGenerator(
- hop_length=512,
- upsample_rates=[8, 8, 2, 2, 2],
- upsample_kernel_sizes=[16, 16, 4, 4, 4],
- resblock_kernel_sizes=[3, 7, 11],
- resblock_dilation_sizes=[[1, 3, 5], [1, 3, 5], [1, 3, 5]],
- num_mels=512,
- upsample_initial_channel=512,
- use_template=False,
- pre_conv_kernel_size=13,
- post_conv_kernel_size=13,
- )
-
- if ckpt_path is not None:
- state_dict = torch.load(ckpt_path, map_location="cpu")
- elif pretrained:
- state_dict = torch.hub.load_state_dict_from_url(
- "https://github.com/fishaudio/vocoder/releases/download/1.0.0/firefly-gan-base-generator.ckpt",
- map_location="cpu",
- model_dir="checkpoints",
- )
-
- if "state_dict" in state_dict:
- state_dict = state_dict["state_dict"]
-
- if any("generator." in k for k in state_dict):
- state_dict = {
- k.replace("generator.", ""): v
- for k, v in state_dict.items()
- if "generator." in k
- }
-
- self.load_state_dict(state_dict, strict=True)
- self.head.remove_parametrizations()
-
- @torch.no_grad()
- def forward(self, x: torch.Tensor) -> torch.Tensor:
- x = self.backbone(x)
- x = self.head(x)
- if x.ndim == 2:
- x = x[:, None, :]
- return x
-
-
-if __name__ == "__main__":
- model = FireflyBase()
- model.eval()
- x = torch.randn(1, 128, 128)
- with torch.no_grad():
- y = model(x)
- print(y.shape)
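The new FishConvNet above replaces the padded nn.Conv1d blocks with an explicit left-pad-plus-completion scheme: the forward pass pads `kernel_size - stride` samples on the left and get_extra_padding_for_conv1d computes whatever is needed on the right so the final frame is complete. Below is a minimal standalone sketch of that length bookkeeping; the helper is re-declared here only for illustration and the tensor shapes are made up.

import math
import torch
import torch.nn.functional as F

def extra_padding_for_conv1d(x, kernel_size, stride, padding_total=0):
    # Same arithmetic as get_extra_padding_for_conv1d in the patch above.
    length = x.shape[-1]
    n_frames = (length - kernel_size + padding_total) / stride + 1
    ideal_length = (math.ceil(n_frames) - 1) * stride + (kernel_size - padding_total)
    return ideal_length - length

x = torch.randn(1, 128, 250)            # (batch, channels, time), illustrative
kernel_size, stride = 7, 1
pad = kernel_size - stride               # the left pad FishConvNet.forward uses
extra = extra_padding_for_conv1d(x, kernel_size, stride, pad)
y = torch.nn.Conv1d(128, 128, kernel_size, stride)(F.pad(x, (pad, extra)))
print(y.shape)                           # time axis preserved: torch.Size([1, 128, 250])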
diff --git a/xinference/thirdparty/fish_speech/fish_speech/models/vqgan/modules/fsq.py b/xinference/thirdparty/fish_speech/fish_speech/models/vqgan/modules/fsq.py
index c837d6aee5..7ea4853376 100644
--- a/xinference/thirdparty/fish_speech/fish_speech/models/vqgan/modules/fsq.py
+++ b/xinference/thirdparty/fish_speech/fish_speech/models/vqgan/modules/fsq.py
@@ -6,7 +6,7 @@
from einops import rearrange
from vector_quantize_pytorch import GroupedResidualFSQ
-from .firefly import ConvNeXtBlock
+from .firefly import ConvNeXtBlock, FishConvNet, FishTransConvNet
@dataclass
@@ -20,7 +20,7 @@ class DownsampleFiniteScalarQuantize(nn.Module):
def __init__(
self,
input_dim: int = 512,
- n_codebooks: int = 1,
+ n_codebooks: int = 9,
n_groups: int = 1,
levels: tuple[int] = (8, 5, 5, 5), # Approximate 2**10
downsample_factor: tuple[int] = (2, 2),
@@ -46,7 +46,7 @@ def __init__(
self.downsample = nn.Sequential(
*[
nn.Sequential(
- nn.Conv1d(
+ FishConvNet(
all_dims[idx],
all_dims[idx + 1],
kernel_size=factor,
@@ -61,7 +61,7 @@ def __init__(
self.upsample = nn.Sequential(
*[
nn.Sequential(
- nn.ConvTranspose1d(
+ FishTransConvNet(
all_dims[idx + 1],
all_dims[idx],
kernel_size=factor,
@@ -114,26 +114,3 @@ def decode(self, indices: torch.Tensor):
z_q = self.residual_fsq.get_output_from_indices(indices)
z_q = self.upsample(z_q.mT)
return z_q
-
- # def from_latents(self, latents: torch.Tensor):
- # z_q, z_p, codes = super().from_latents(latents)
- # z_q = self.upsample(z_q)
- # return z_q, z_p, codes
-
-
-if __name__ == "__main__":
- rvq = DownsampleFiniteScalarQuantize(
- n_codebooks=1,
- downsample_factor=(2, 2),
- )
- x = torch.randn(16, 512, 80)
-
- result = rvq(x)
- print(rvq)
- print(result.latents.shape, result.codes.shape, result.z.shape)
-
- # y = rvq.from_codes(result.codes)
- # print(y[0].shape)
-
- # y = rvq.from_latents(result.latents)
- # print(y[0].shape)
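For orientation, the quantizer's downsample_factor of (2, 2) means each code covers four mel frames. A rough back-of-the-envelope, assuming the 44.1 kHz / hop-512 setup used elsewhere in this patch, lands at the ~21 Hz code rate reflected in the new firefly-gan-vq-fsq-8x1024-21hz checkpoint name:

import math

downsample_factor = (2, 2)        # DownsampleFiniteScalarQuantize default above
hop_length = 512                  # assumed vocoder hop (prod of the upsample_rates)
sample_rate = 44100               # assumed, matching the API defaults in this patch

total = math.prod(downsample_factor)              # 4 mel frames per code
code_rate = sample_rate / hop_length / total      # ~21.5 codes per second

# The length bookkeeping used by encode()/decode() in firefly.py:
mel_lengths = 860                                 # illustrative
feature_lengths = mel_lengths // total            # 215 codes
audio_lengths = feature_lengths * total * hop_length
print(code_rate, feature_lengths, audio_lengths)  # ~21.53, 215, 440320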
diff --git a/xinference/thirdparty/fish_speech/fish_speech/models/vqgan/modules/reference.py b/xinference/thirdparty/fish_speech/fish_speech/models/vqgan/modules/reference.py
deleted file mode 100644
index 0d9c8c8359..0000000000
--- a/xinference/thirdparty/fish_speech/fish_speech/models/vqgan/modules/reference.py
+++ /dev/null
@@ -1,115 +0,0 @@
-from typing import Optional
-
-import torch
-import torch.nn.functional as F
-from torch import nn
-
-from fish_speech.utils import autocast_exclude_mps
-
-from .wavenet import WaveNet
-
-
-class ReferenceEncoder(WaveNet):
- def __init__(
- self,
- input_channels: Optional[int] = None,
- output_channels: Optional[int] = None,
- residual_channels: int = 512,
- residual_layers: int = 20,
- dilation_cycle: Optional[int] = 4,
- num_heads: int = 8,
- latent_len: int = 4,
- ):
- super().__init__(
- input_channels=input_channels,
- residual_channels=residual_channels,
- residual_layers=residual_layers,
- dilation_cycle=dilation_cycle,
- )
-
- self.head_dim = residual_channels // num_heads
- self.num_heads = num_heads
-
- self.latent_len = latent_len
- self.latent = nn.Parameter(torch.zeros(1, self.latent_len, residual_channels))
-
- self.q = nn.Linear(residual_channels, residual_channels, bias=True)
- self.kv = nn.Linear(residual_channels, residual_channels * 2, bias=True)
- self.q_norm = nn.LayerNorm(self.head_dim)
- self.k_norm = nn.LayerNorm(self.head_dim)
- self.proj = nn.Linear(residual_channels, residual_channels)
- self.proj_drop = nn.Dropout(0.1)
-
- self.norm = nn.LayerNorm(residual_channels)
- self.mlp = nn.Sequential(
- nn.Linear(residual_channels, residual_channels * 4),
- nn.SiLU(),
- nn.Linear(residual_channels * 4, residual_channels),
- )
- self.output_projection_attn = nn.Linear(residual_channels, output_channels)
-
- torch.nn.init.trunc_normal_(self.latent, std=0.02)
- self.apply(self.init_weights)
-
- def init_weights(self, m):
- if isinstance(m, nn.Linear):
- torch.nn.init.trunc_normal_(m.weight, std=0.02)
- if m.bias is not None:
- torch.nn.init.constant_(m.bias, 0)
-
- def forward(self, x, attn_mask=None):
- x = super().forward(x).mT
- B, N, C = x.shape
-
- # Calculate mask
- if attn_mask is not None:
- assert attn_mask.shape == (B, N) and attn_mask.dtype == torch.bool
-
- attn_mask = attn_mask[:, None, None, :].expand(
- B, self.num_heads, self.latent_len, N
- )
-
- q_latent = self.latent.expand(B, -1, -1)
- q = (
- self.q(q_latent)
- .reshape(B, self.latent_len, self.num_heads, self.head_dim)
- .transpose(1, 2)
- )
-
- kv = (
- self.kv(x)
- .reshape(B, N, 2, self.num_heads, self.head_dim)
- .permute(2, 0, 3, 1, 4)
- )
- k, v = kv.unbind(0)
-
- q, k = self.q_norm(q), self.k_norm(k)
- x = F.scaled_dot_product_attention(q, k, v, attn_mask=attn_mask)
-
- x = x.transpose(1, 2).reshape(B, self.latent_len, C)
- x = self.proj(x)
- x = self.proj_drop(x)
-
- x = x + self.mlp(self.norm(x))
- x = self.output_projection_attn(x)
- x = x.mean(1)
-
- return x
-
-
-if __name__ == "__main__":
- with autocast_exclude_mps(device_type="cpu", dtype=torch.bfloat16):
- model = ReferenceEncoder(
- input_channels=128,
- output_channels=64,
- residual_channels=384,
- residual_layers=20,
- dilation_cycle=4,
- num_heads=8,
- )
- x = torch.randn(4, 128, 64)
- mask = torch.ones(4, 64, dtype=torch.bool)
- y = model(x, mask)
- print(y.shape)
- loss = F.mse_loss(y, torch.randn(4, 64))
- loss.backward()
diff --git a/xinference/thirdparty/fish_speech/fish_speech/models/vqgan/modules/wavenet.py b/xinference/thirdparty/fish_speech/fish_speech/models/vqgan/modules/wavenet.py
deleted file mode 100644
index e7cc011c3e..0000000000
--- a/xinference/thirdparty/fish_speech/fish_speech/models/vqgan/modules/wavenet.py
+++ /dev/null
@@ -1,225 +0,0 @@
-import math
-from typing import Optional
-
-import torch
-import torch.nn.functional as F
-from torch import nn
-
-
-class Mish(nn.Module):
- def forward(self, x):
- return x * torch.tanh(F.softplus(x))
-
-
-class DiffusionEmbedding(nn.Module):
- """Diffusion Step Embedding"""
-
- def __init__(self, d_denoiser):
- super(DiffusionEmbedding, self).__init__()
- self.dim = d_denoiser
-
- def forward(self, x):
- device = x.device
- half_dim = self.dim // 2
- emb = math.log(10000) / (half_dim - 1)
- emb = torch.exp(torch.arange(half_dim, device=device) * -emb)
- emb = x[:, None] * emb[None, :]
- emb = torch.cat((emb.sin(), emb.cos()), dim=-1)
- return emb
-
-
-class LinearNorm(nn.Module):
- """LinearNorm Projection"""
-
- def __init__(self, in_features, out_features, bias=False):
- super(LinearNorm, self).__init__()
- self.linear = nn.Linear(in_features, out_features, bias)
-
- nn.init.xavier_uniform_(self.linear.weight)
- if bias:
- nn.init.constant_(self.linear.bias, 0.0)
-
- def forward(self, x):
- x = self.linear(x)
- return x
-
-
-class ConvNorm(nn.Module):
- """1D Convolution"""
-
- def __init__(
- self,
- in_channels,
- out_channels,
- kernel_size=1,
- stride=1,
- padding=None,
- dilation=1,
- bias=True,
- w_init_gain="linear",
- ):
- super(ConvNorm, self).__init__()
-
- if padding is None:
- assert kernel_size % 2 == 1
- padding = int(dilation * (kernel_size - 1) / 2)
-
- self.conv = nn.Conv1d(
- in_channels,
- out_channels,
- kernel_size=kernel_size,
- stride=stride,
- padding=padding,
- dilation=dilation,
- bias=bias,
- )
- nn.init.kaiming_normal_(self.conv.weight)
-
- def forward(self, signal):
- conv_signal = self.conv(signal)
-
- return conv_signal
-
-
-class ResidualBlock(nn.Module):
- """Residual Block"""
-
- def __init__(
- self,
- residual_channels,
- use_linear_bias=False,
- dilation=1,
- condition_channels=None,
- ):
- super(ResidualBlock, self).__init__()
- self.conv_layer = ConvNorm(
- residual_channels,
- 2 * residual_channels,
- kernel_size=3,
- stride=1,
- padding=dilation,
- dilation=dilation,
- )
-
- if condition_channels is not None:
- self.diffusion_projection = LinearNorm(
- residual_channels, residual_channels, use_linear_bias
- )
- self.condition_projection = ConvNorm(
- condition_channels, 2 * residual_channels, kernel_size=1
- )
-
- self.output_projection = ConvNorm(
- residual_channels, 2 * residual_channels, kernel_size=1
- )
-
- def forward(self, x, condition=None, diffusion_step=None):
- y = x
-
- if diffusion_step is not None:
- diffusion_step = self.diffusion_projection(diffusion_step).unsqueeze(-1)
- y = y + diffusion_step
-
- y = self.conv_layer(y)
-
- if condition is not None:
- condition = self.condition_projection(condition)
- y = y + condition
-
- gate, filter = torch.chunk(y, 2, dim=1)
- y = torch.sigmoid(gate) * torch.tanh(filter)
-
- y = self.output_projection(y)
- residual, skip = torch.chunk(y, 2, dim=1)
-
- return (x + residual) / math.sqrt(2.0), skip
-
-
-class WaveNet(nn.Module):
- def __init__(
- self,
- input_channels: Optional[int] = None,
- output_channels: Optional[int] = None,
- residual_channels: int = 512,
- residual_layers: int = 20,
- dilation_cycle: Optional[int] = 4,
- is_diffusion: bool = False,
- condition_channels: Optional[int] = None,
- ):
- super().__init__()
-
- # Input projection
- self.input_projection = None
- if input_channels is not None and input_channels != residual_channels:
- self.input_projection = ConvNorm(
- input_channels, residual_channels, kernel_size=1
- )
-
- if input_channels is None:
- input_channels = residual_channels
-
- self.input_channels = input_channels
-
- # Residual layers
- self.residual_layers = nn.ModuleList(
- [
- ResidualBlock(
- residual_channels=residual_channels,
- use_linear_bias=False,
- dilation=2 ** (i % dilation_cycle) if dilation_cycle else 1,
- condition_channels=condition_channels,
- )
- for i in range(residual_layers)
- ]
- )
-
- # Skip projection
- self.skip_projection = ConvNorm(
- residual_channels, residual_channels, kernel_size=1
- )
-
- # Output projection
- self.output_projection = None
- if output_channels is not None and output_channels != residual_channels:
- self.output_projection = ConvNorm(
- residual_channels, output_channels, kernel_size=1
- )
-
- if is_diffusion:
- self.diffusion_embedding = DiffusionEmbedding(residual_channels)
- self.mlp = nn.Sequential(
- LinearNorm(residual_channels, residual_channels * 4, False),
- Mish(),
- LinearNorm(residual_channels * 4, residual_channels, False),
- )
-
- self.apply(self._init_weights)
-
- def _init_weights(self, m):
- if isinstance(m, (nn.Conv1d, nn.Linear)):
- nn.init.trunc_normal_(m.weight, std=0.02)
- if getattr(m, "bias", None) is not None:
- nn.init.constant_(m.bias, 0)
-
- def forward(self, x, t=None, condition=None):
- if self.input_projection is not None:
- x = self.input_projection(x)
- x = F.silu(x)
-
- if t is not None:
- t = self.diffusion_embedding(t)
- t = self.mlp(t)
-
- skip = []
- for layer in self.residual_layers:
- x, skip_connection = layer(x, condition, t)
- skip.append(skip_connection)
-
- x = torch.sum(torch.stack(skip), dim=0) / math.sqrt(len(self.residual_layers))
- x = self.skip_projection(x)
-
- if self.output_projection is not None:
- x = F.silu(x)
- x = self.output_projection(x)
-
- return x
diff --git a/xinference/thirdparty/fish_speech/fish_speech/text/clean.py b/xinference/thirdparty/fish_speech/fish_speech/text/clean.py
index 76d9dc9033..c228dfcd13 100644
--- a/xinference/thirdparty/fish_speech/fish_speech/text/clean.py
+++ b/xinference/thirdparty/fish_speech/fish_speech/text/clean.py
@@ -1,61 +1,24 @@
-import itertools
import re
-LANGUAGE_UNICODE_RANGE_MAP = {
- "ZH": [(0x4E00, 0x9FFF)],
- "JP": [(0x4E00, 0x9FFF), (0x3040, 0x309F), (0x30A0, 0x30FF), (0x31F0, 0x31FF)],
- "EN": [(0x0000, 0x007F)],
-}
-
SYMBOLS_MAPPING = {
- ":": ",",
- ";": ",",
- ",": ",",
- "。": ".",
- "!": "!",
- "?": "?",
- "\n": ".",
- "·": ",",
- "、": ",",
- "...": "…",
"“": "'",
"”": "'",
"‘": "'",
"’": "'",
- "(": "'",
- ")": "'",
- "(": "'",
- ")": "'",
- "《": "'",
- "》": "'",
- "【": "'",
- "】": "'",
- "[": "'",
- "]": "'",
- "—": "-",
- "~": "-",
- "~": "-",
- "・": "-",
- "「": "'",
- "」": "'",
- ";": ",",
- ":": ",",
+ "【": "",
+ "】": "",
+ "[": "",
+ "]": "",
+ "(": "",
+ ")": "",
+ "(": "",
+ ")": "",
+ "・": "·",
}
REPLACE_SYMBOL_REGEX = re.compile(
"|".join(re.escape(p) for p in SYMBOLS_MAPPING.keys())
)
-ALL_KNOWN_UTF8_RANGE = list(
- itertools.chain.from_iterable(LANGUAGE_UNICODE_RANGE_MAP.values())
-)
-REMOVE_UNKNOWN_SYMBOL_REGEX = re.compile(
- "[^"
- + "".join(
- f"{re.escape(chr(start))}-{re.escape(chr(end))}"
- for start, end in ALL_KNOWN_UTF8_RANGE
- )
- + "]"
-)
def clean_text(text):
@@ -64,6 +27,5 @@ def clean_text(text):
# Replace all chinese symbols with their english counterparts
text = REPLACE_SYMBOL_REGEX.sub(lambda x: SYMBOLS_MAPPING[x.group()], text)
- text = REMOVE_UNKNOWN_SYMBOL_REGEX.sub("", text)
return text
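After this change clean_text no longer whitelists Unicode ranges; it only unifies quotes, drops bracket characters, and maps the Japanese middle dot. A condensed sketch of the remaining behaviour (only the symbol-replacement step is reproduced here):

import re

SYMBOLS_MAPPING = {
    "“": "'", "”": "'", "‘": "'", "’": "'",
    "【": "", "】": "", "[": "", "]": "",
    "(": "", ")": "", "(": "", ")": "",
    "・": "·",
}
REPLACE_SYMBOL_REGEX = re.compile("|".join(re.escape(p) for p in SYMBOLS_MAPPING))

def replace_symbols(text):
    # Mirrors the trimmed clean.py: unify quotes, strip brackets, keep everything else.
    return REPLACE_SYMBOL_REGEX.sub(lambda m: SYMBOLS_MAPPING[m.group()], text)

print(replace_symbols("“你好” (Genshin) 【test】"))   # -> '你好' Genshin test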
diff --git a/xinference/thirdparty/fish_speech/fish_speech/text/spliter.py b/xinference/thirdparty/fish_speech/fish_speech/text/spliter.py
index 5528cd3a63..d4bb995487 100644
--- a/xinference/thirdparty/fish_speech/fish_speech/text/spliter.py
+++ b/xinference/thirdparty/fish_speech/fish_speech/text/spliter.py
@@ -71,9 +71,9 @@ def split_text(text, length):
texts = [text]
texts = map(protect_float, texts)
- texts = break_text(texts, length, {".", "!", "?"})
+ texts = break_text(texts, length, {".", "!", "?", "。", "!", "?"})
texts = map(unprotect_float, texts)
- texts = break_text(texts, length, {","})
+ texts = break_text(texts, length, {",", ","})
texts = break_text(texts, length, {" "})
texts = list(break_text_by_length(texts, length))
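The splitter now treats full-width CJK sentence punctuation as first-class break points, so mixed Chinese/English input is chunked at sentence boundaries in both scripts before falling back to commas, spaces, and raw length. A usage sketch, assuming the fish_speech package is importable:

from fish_speech.text.spliter import split_text

text = "第一句话。第二句话!And an English sentence? Yes, a short one."
for chunk in split_text(text, 20):
    print(chunk)
# Chunks end at 。!? . ! or ? where possible, staying near the requested length.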
diff --git a/xinference/thirdparty/fish_speech/fish_speech/train.py b/xinference/thirdparty/fish_speech/fish_speech/train.py
index a6a344097a..41b3642f88 100644
--- a/xinference/thirdparty/fish_speech/fish_speech/train.py
+++ b/xinference/thirdparty/fish_speech/fish_speech/train.py
@@ -1,4 +1,6 @@
import os
+
+os.environ["USE_LIBUV"] = "0"
import sys
from typing import Optional
diff --git a/xinference/thirdparty/fish_speech/fish_speech/webui/manage.py b/xinference/thirdparty/fish_speech/fish_speech/webui/manage.py
index 9c183acd7c..4ec3fcac25 100644
--- a/xinference/thirdparty/fish_speech/fish_speech/webui/manage.py
+++ b/xinference/thirdparty/fish_speech/fish_speech/webui/manage.py
@@ -1,9 +1,11 @@
from __future__ import annotations
+import os
+
+os.environ["USE_LIBUV"] = "0"
import datetime
import html
import json
-import os
import platform
import shutil
import signal
@@ -469,7 +471,7 @@ def train_process(
"--config-name",
"firefly_gan_vq",
"--checkpoint-path",
- "checkpoints/fish-speech-1.2-sft/firefly-gan-vq-fsq-4x1024-42hz-generator.pth",
+ "checkpoints/fish-speech-1.4/firefly-gan-vq-fsq-8x1024-21hz-generator.pth",
]
)
@@ -485,7 +487,7 @@ def train_process(
"16",
]
)
- ckpt_path = "checkpoints/fish-speech-1.2-sft/model.pth"
+ ckpt_path = "checkpoints/fish-speech-1.4/model.pth"
lora_prefix = "lora_" if llama_use_lora else ""
llama_name = lora_prefix + "text2semantic_" + new_project
latest = next(
@@ -862,7 +864,7 @@ def llama_quantify(llama_weight, quantify_mode):
minimum=1,
maximum=32,
step=1,
- value=4,
+ value=2,
)
llama_data_max_length_slider = gr.Slider(
label=i18n("Maximum Length per Sample"),
@@ -870,7 +872,7 @@ def llama_quantify(llama_weight, quantify_mode):
minimum=1024,
maximum=4096,
step=128,
- value=1024,
+ value=2048,
)
with gr.Row(equal_height=False):
llama_precision_dropdown = gr.Dropdown(
@@ -925,9 +927,9 @@ def llama_quantify(llama_weight, quantify_mode):
"Type the path or select from the dropdown"
),
choices=[
- "checkpoints/fish-speech-1.2-sft/model.pth",
+ "checkpoints/fish-speech-1.4/model.pth",
],
- value="checkpoints/fish-speech-1.2-sft/model.pth",
+ value="checkpoints/fish-speech-1.4/model.pth",
allow_custom_value=True,
interactive=True,
)
@@ -979,7 +981,7 @@ def llama_quantify(llama_weight, quantify_mode):
"Type the path or select from the dropdown"
),
choices=list_llama_models(),
- value="checkpoints/fish-speech-1.2-sft",
+ value="checkpoints/fish-speech-1.4",
allow_custom_value=True,
interactive=True,
)
@@ -1042,7 +1044,7 @@ def llama_quantify(llama_weight, quantify_mode):
"Type the path or select from the dropdown"
),
choices=list_decoder_models(),
- value="checkpoints/fish-speech-1.2-sft/firefly-gan-vq-fsq-4x1024-42hz-generator.pth",
+ value="checkpoints/fish-speech-1.4/firefly-gan-vq-fsq-8x1024-21hz-generator.pth",
allow_custom_value=True,
)
infer_decoder_config = gr.Dropdown(
@@ -1060,7 +1062,7 @@ def llama_quantify(llama_weight, quantify_mode):
info=i18n(
"Type the path or select from the dropdown"
),
- value="checkpoints/fish-speech-1.2-sft",
+ value="checkpoints/fish-speech-1.4",
choices=list_llama_models(),
allow_custom_value=True,
)
diff --git a/xinference/thirdparty/fish_speech/tools/api.py b/xinference/thirdparty/fish_speech/tools/api.py
index 29869b267f..7fcc9330ae 100644
--- a/xinference/thirdparty/fish_speech/tools/api.py
+++ b/xinference/thirdparty/fish_speech/tools/api.py
@@ -9,16 +9,20 @@
from argparse import ArgumentParser
from http import HTTPStatus
from pathlib import Path
-from typing import Annotated, Literal, Optional
+from typing import Annotated, Any, Literal, Optional
import numpy as np
+import ormsgpack
# import pyrootutils
import soundfile as sf
import torch
import torchaudio
+# from baize.datastructures import ContentType
# from kui.asgi import (
# Body,
+# FactoryClass,
# HTTPException,
+# HttpRequest,
# HttpView,
# JSONResponse,
# Kui,
@@ -27,14 +31,16 @@
# )
# from kui.asgi.routing import MultimethodRoutes
from loguru import logger
-from pydantic import BaseModel, Field
+from pydantic import BaseModel, Field, conint
# pyrootutils.setup_root(__file__, indicator=".project-root", pythonpath=True)
# from fish_speech.models.vqgan.lit_module import VQGAN
from fish_speech.models.vqgan.modules.firefly import FireflyArchitecture
+from fish_speech.text.chn_text_norm.text import Text as ChnNormedText
from fish_speech.utils import autocast_exclude_mps
-# from tools.auto_rerank import batch_asr, calculate_wer, is_chinese, load_model
+from tools.commons import ServeReferenceAudio, ServeTTSRequest
+from tools.file import AUDIO_EXTENSIONS, audio_to_bytes, list_files, read_ref_text
from tools.llama.generate import (
GenerateRequest,
GenerateResponse,
@@ -82,11 +88,8 @@ async def other_exception_handler(exc: "Exception"):
def load_audio(reference_audio, sr):
if len(reference_audio) > 255 or not Path(reference_audio).exists():
- try:
- audio_data = base64.b64decode(reference_audio)
- reference_audio = io.BytesIO(audio_data)
- except base64.binascii.Error:
- raise ValueError("Invalid path or base64 string")
+ audio_data = reference_audio
+ reference_audio = io.BytesIO(audio_data)
waveform, original_sr = torchaudio.load(
reference_audio, backend="sox" if sys.platform == "linux" else "soundfile"
@@ -145,7 +148,7 @@ def decode_vq_tokens(
return decoder_model.decode(
indices=codes[None],
feature_lengths=feature_lengths,
- ).squeeze()
+ )[0].squeeze()
raise ValueError(f"Unknown model type: {type(decoder_model)}")
@@ -153,58 +156,6 @@ def decode_vq_tokens(
# routes = MultimethodRoutes(base_class=HttpView)
-def get_random_paths(base_path, data, speaker, emotion):
- if base_path and data and speaker and emotion and (Path(base_path).exists()):
- if speaker in data and emotion in data[speaker]:
- files = data[speaker][emotion]
- lab_files = [f for f in files if f.endswith(".lab")]
- wav_files = [f for f in files if f.endswith(".wav")]
-
- if lab_files and wav_files:
- selected_lab = random.choice(lab_files)
- selected_wav = random.choice(wav_files)
-
- lab_path = Path(base_path) / speaker / emotion / selected_lab
- wav_path = Path(base_path) / speaker / emotion / selected_wav
- if lab_path.exists() and wav_path.exists():
- return lab_path, wav_path
-
- return None, None
-
-
-def load_json(json_file):
- if not json_file:
- logger.info("Not using a json file")
- return None
- try:
- with open(json_file, "r", encoding="utf-8") as file:
- data = json.load(file)
- except FileNotFoundError:
- logger.warning(f"ref json not found: {json_file}")
- data = None
- except Exception as e:
- logger.warning(f"Loading json failed: {e}")
- data = None
- return data
-
-
-class InvokeRequest(BaseModel):
- text: str = "你说的对, 但是原神是一款由米哈游自主研发的开放世界手游."
- reference_text: Optional[str] = None
- reference_audio: Optional[str] = None
- max_new_tokens: int = 1024
- chunk_length: Annotated[int, Field(ge=0, le=500, strict=True)] = 100
- top_p: Annotated[float, Field(ge=0.1, le=1.0, strict=True)] = 0.7
- repetition_penalty: Annotated[float, Field(ge=0.9, le=2.0, strict=True)] = 1.2
- temperature: Annotated[float, Field(ge=0.1, le=1.0, strict=True)] = 0.7
- emotion: Optional[str] = None
- format: Literal["wav", "mp3", "flac"] = "wav"
- streaming: bool = False
- ref_json: Optional[str] = "ref_data.json"
- ref_base: Optional[str] = "ref_data"
- speaker: Optional[str] = None
-
-
def get_content_type(audio_format):
if audio_format == "wav":
return "audio/wav"
@@ -217,35 +168,52 @@ def get_content_type(audio_format):
@torch.inference_mode()
-def inference(req: InvokeRequest):
- # Parse reference audio aka prompt
- prompt_tokens = None
-
- ref_data = load_json(req.ref_json)
- ref_base = req.ref_base
-
- lab_path, wav_path = get_random_paths(ref_base, ref_data, req.speaker, req.emotion)
-
- if lab_path and wav_path:
- with open(lab_path, "r", encoding="utf-8") as lab_file:
- ref_text = lab_file.read()
- req.reference_audio = wav_path
- req.reference_text = ref_text
- logger.info("ref_path: " + str(wav_path))
- logger.info("ref_text: " + ref_text)
-
- # Parse reference audio aka prompt
- prompt_tokens = encode_reference(
- decoder_model=decoder_model,
- reference_audio=req.reference_audio,
- enable_reference_audio=req.reference_audio is not None,
- )
- logger.info(f"ref_text: {req.reference_text}")
+def inference(req: ServeTTSRequest):
+
+ idstr: str | None = req.reference_id
+ if idstr is not None:
+ ref_folder = Path("references") / idstr
+ ref_folder.mkdir(parents=True, exist_ok=True)
+ ref_audios = list_files(
+ ref_folder, AUDIO_EXTENSIONS, recursive=True, sort=False
+ )
+ prompt_tokens = [
+ encode_reference(
+ decoder_model=decoder_model,
+ reference_audio=audio_to_bytes(str(ref_audio)),
+ enable_reference_audio=True,
+ )
+ for ref_audio in ref_audios
+ ]
+ prompt_texts = [
+ read_ref_text(str(ref_audio.with_suffix(".lab")))
+ for ref_audio in ref_audios
+ ]
+
+ else:
+ # Parse reference audio aka prompt
+ refs = req.references
+ if refs is None:
+ refs = []
+ prompt_tokens = [
+ encode_reference(
+ decoder_model=decoder_model,
+ reference_audio=ref.audio,
+ enable_reference_audio=True,
+ )
+ for ref in refs
+ ]
+ prompt_texts = [ref.text for ref in refs]
+
# LLAMA Inference
request = dict(
device=decoder_model.device,
max_new_tokens=req.max_new_tokens,
- text=req.text,
+ text=(
+ req.text
+ if not req.normalize
+ else ChnNormedText(raw_text=req.text).normalize()
+ ),
top_p=req.top_p,
repetition_penalty=req.repetition_penalty,
temperature=req.temperature,
@@ -254,7 +222,7 @@ def inference(req: InvokeRequest):
chunk_length=req.chunk_length,
max_length=2048,
prompt_tokens=prompt_tokens,
- prompt_text=req.reference_text,
+ prompt_text=prompt_texts,
)
response_queue = queue.Queue()
@@ -307,40 +275,7 @@ def inference(req: InvokeRequest):
yield fake_audios
-def auto_rerank_inference(req: InvokeRequest, use_auto_rerank: bool = True):
- if not use_auto_rerank:
- # 如果不使用 auto_rerank,直接调用原始的 inference 函数
- return inference(req)
-
- zh_model, en_model = load_model()
- max_attempts = 5
- best_wer = float("inf")
- best_audio = None
-
- for attempt in range(max_attempts):
- # 调用原始的 inference 函数
- audio_generator = inference(req)
- fake_audios = next(audio_generator)
-
- asr_result = batch_asr(
- zh_model if is_chinese(req.text) else en_model, [fake_audios], 44100
- )[0]
- wer = calculate_wer(req.text, asr_result["text"])
-
- if wer <= 0.1 and not asr_result["huge_gap"]:
- return fake_audios
-
- if wer < best_wer:
- best_wer = wer
- best_audio = fake_audios
-
- if attempt == max_attempts - 1:
- break
-
- return best_audio
-
-
-async def inference_async(req: InvokeRequest):
+async def inference_async(req: ServeTTSRequest):
for chunk in inference(req):
yield chunk
@@ -349,9 +284,9 @@ async def buffer_to_async_generator(buffer):
yield buffer
-# @routes.http.post("/v1/invoke")
+# @routes.http.post("/v1/tts")
# async def api_invoke_model(
-# req: Annotated[InvokeRequest, Body(exclusive=True)],
+# req: Annotated[ServeTTSRequest, Body(exclusive=True)],
# ):
# """
# Invoke model and generate audio
@@ -410,21 +345,20 @@ def parse_args():
parser.add_argument(
"--llama-checkpoint-path",
type=str,
- default="checkpoints/fish-speech-1.2-sft",
+ default="checkpoints/fish-speech-1.4",
)
parser.add_argument(
"--decoder-checkpoint-path",
type=str,
- default="checkpoints/fish-speech-1.2-sft/firefly-gan-vq-fsq-4x1024-42hz-generator.pth",
+ default="checkpoints/fish-speech-1.4/firefly-gan-vq-fsq-8x1024-21hz-generator.pth",
)
parser.add_argument("--decoder-config-name", type=str, default="firefly_gan_vq")
parser.add_argument("--device", type=str, default="cuda")
parser.add_argument("--half", action="store_true")
parser.add_argument("--compile", action="store_true")
parser.add_argument("--max-text-length", type=int, default=0)
- parser.add_argument("--listen", type=str, default="127.0.0.1:8000")
+ parser.add_argument("--listen", type=str, default="127.0.0.1:8080")
parser.add_argument("--workers", type=int, default=1)
- parser.add_argument("--use-auto-rerank", type=bool, default=True)
return parser.parse_args()
@@ -436,18 +370,30 @@ def parse_args():
# },
# ).routes
#
+#
+# class MsgPackRequest(HttpRequest):
+# async def data(self) -> Annotated[Any, ContentType("application/msgpack")]:
+# if self.content_type == "application/msgpack":
+# return ormsgpack.unpackb(await self.body)
+#
+# raise HTTPException(
+# HTTPStatus.UNSUPPORTED_MEDIA_TYPE,
+# headers={"Accept": "application/msgpack"},
+# )
+#
+#
# app = Kui(
# routes=routes + openapi[1:], # Remove the default route
# exception_handlers={
# HTTPException: http_execption_handler,
# Exception: other_exception_handler,
# },
+# factory_class=FactoryClass(http=MsgPackRequest),
# cors_config={},
# )
if __name__ == "__main__":
- import threading
import uvicorn
@@ -474,18 +420,17 @@ def parse_args():
# Dry run to check if the model is loaded correctly and avoid the first-time latency
list(
inference(
- InvokeRequest(
+ ServeTTSRequest(
text="Hello world.",
- reference_text=None,
- reference_audio=None,
- max_new_tokens=0,
+ references=[],
+ reference_id=None,
+ max_new_tokens=1024,
+ chunk_length=200,
top_p=0.7,
repetition_penalty=1.2,
temperature=0.7,
emotion=None,
format="wav",
- ref_base=None,
- ref_json=None,
)
)
)
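The reworked inference path resolves prompts in two ways: an explicit list of ServeReferenceAudio objects, or a reference_id pointing at an on-disk folder of audio clips with sibling .lab transcripts. A sketch of the folder convention the reference_id branch expects (the id and paths are illustrative):

from pathlib import Path

ref_id = "my_speaker"                          # hypothetical reference id
ref_folder = Path("references") / ref_id
# references/my_speaker/sample1.wav            prompt audio
# references/my_speaker/sample1.lab            matching prompt text
for ref_audio in sorted(ref_folder.glob("*.wav")):
    prompt_text = ref_audio.with_suffix(".lab").read_text(encoding="utf-8")
    print(ref_audio.name, "->", prompt_text.strip())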
diff --git a/xinference/thirdparty/fish_speech/tools/auto_rerank.py b/xinference/thirdparty/fish_speech/tools/auto_rerank.py
deleted file mode 100644
index 0297d63d77..0000000000
--- a/xinference/thirdparty/fish_speech/tools/auto_rerank.py
+++ /dev/null
@@ -1,159 +0,0 @@
-import os
-
-os.environ["MODELSCOPE_CACHE"] = ".cache/"
-
-import string
-import time
-from threading import Lock
-
-import librosa
-import numpy as np
-import opencc
-import torch
-from faster_whisper import WhisperModel
-
-t2s_converter = opencc.OpenCC("t2s")
-
-
-def load_model(*, device="cuda"):
- model = WhisperModel(
- "medium",
- device=device,
- compute_type="float16",
- download_root="faster_whisper",
- )
- print("faster_whisper loaded!")
- return model
-
-
-@torch.no_grad()
-def batch_asr_internal(model: WhisperModel, audios, sr):
- resampled_audios = []
- for audio in audios:
-
- if isinstance(audio, np.ndarray):
- audio = torch.from_numpy(audio).float()
-
- if audio.dim() > 1:
- audio = audio.squeeze()
-
- assert audio.dim() == 1
- audio_np = audio.numpy()
- resampled_audio = librosa.resample(audio_np, orig_sr=sr, target_sr=16000)
- resampled_audios.append(resampled_audio)
-
- trans_results = []
-
- for resampled_audio in resampled_audios:
- segments, info = model.transcribe(
- resampled_audio,
- language=None,
- beam_size=5,
- initial_prompt="Punctuation is needed in any language.",
- )
- trans_results.append(list(segments))
-
- results = []
- for trans_res, audio in zip(trans_results, audios):
-
- duration = len(audio) / sr * 1000
- huge_gap = False
- max_gap = 0.0
-
- text = None
- last_tr = None
-
- for tr in trans_res:
- delta = tr.text.strip()
- if tr.id > 1:
- max_gap = max(tr.start - last_tr.end, max_gap)
- text += delta
- else:
- text = delta
-
- last_tr = tr
- if max_gap > 3.0:
- huge_gap = True
- break
-
- sim_text = t2s_converter.convert(text)
- results.append(
- {
- "text": sim_text,
- "duration": duration,
- "huge_gap": huge_gap,
- }
- )
-
- return results
-
-
-global_lock = Lock()
-
-
-def batch_asr(model, audios, sr):
- return batch_asr_internal(model, audios, sr)
-
-
-def is_chinese(text):
- return True
-
-
-def calculate_wer(text1, text2, debug=False):
- chars1 = remove_punctuation(text1)
- chars2 = remove_punctuation(text2)
-
- m, n = len(chars1), len(chars2)
-
- if m > n:
- chars1, chars2 = chars2, chars1
- m, n = n, m
-
- prev = list(range(m + 1)) # row 0 distance: [0, 1, 2, ...]
- curr = [0] * (m + 1)
-
- for j in range(1, n + 1):
- curr[0] = j
- for i in range(1, m + 1):
- if chars1[i - 1] == chars2[j - 1]:
- curr[i] = prev[i - 1]
- else:
- curr[i] = min(prev[i], curr[i - 1], prev[i - 1]) + 1
- prev, curr = curr, prev
-
- edits = prev[m]
- tot = max(len(chars1), len(chars2))
- wer = edits / tot
-
- if debug:
- print(" gt: ", chars1)
- print(" pred: ", chars2)
- print(" edits/tot = wer: ", edits, "/", tot, "=", wer)
-
- return wer
-
-
-def remove_punctuation(text):
- chinese_punctuation = (
- " \n\t”“!?。。"#$%&'()*+,-/:;<=>@[\]^_`{|}~⦅⦆「」、、〃《》「」『』【】〔〕〖〗〘〙〚〛〜〝〞〟〰〾〿–—"
- '‛""„‟…‧﹏'
- )
- all_punctuation = string.punctuation + chinese_punctuation
- translator = str.maketrans("", "", all_punctuation)
- text_without_punctuation = text.translate(translator)
- return text_without_punctuation
-
-
-if __name__ == "__main__":
- model = load_model()
- audios = [
- librosa.load("44100.wav", sr=44100)[0],
- librosa.load("lengyue.wav", sr=44100)[0],
- ]
- print(np.array(audios[0]))
- print(batch_asr(model, audios, 44100))
-
- start_time = time.time()
- for _ in range(10):
- print(batch_asr(model, audios, 44100))
- print("Time taken:", time.time() - start_time)
diff --git a/xinference/thirdparty/fish_speech/tools/commons.py b/xinference/thirdparty/fish_speech/tools/commons.py
new file mode 100644
index 0000000000..f81cadec1e
--- /dev/null
+++ b/xinference/thirdparty/fish_speech/tools/commons.py
@@ -0,0 +1,35 @@
+from typing import Annotated, Literal, Optional
+
+from pydantic import BaseModel, Field, conint
+
+
+class ServeReferenceAudio(BaseModel):
+ audio: bytes
+ text: str
+
+
+class ServeTTSRequest(BaseModel):
+ text: str
+ chunk_length: Annotated[int, conint(ge=100, le=300, strict=True)] = 200
+ # Audio format
+ format: Literal["wav", "pcm", "mp3"] = "wav"
+ mp3_bitrate: Literal[64, 128, 192] = 128
+    # Reference audios for in-context learning
+ references: list[ServeReferenceAudio] = []
+ # Reference id
+    # For example, if you want to use https://fish.audio/m/7f92f8afb8ec43bf81429cc1c9199cb1/
+ # Just pass 7f92f8afb8ec43bf81429cc1c9199cb1
+ reference_id: str | None = None
+    # Normalize text for en & zh; this increases stability for numbers
+ normalize: bool = True
+ mp3_bitrate: Optional[int] = 64
+ opus_bitrate: Optional[int] = -1000
+    # Balanced mode will reduce latency to 300ms, but may decrease stability
+ latency: Literal["normal", "balanced"] = "normal"
+ # not usually used below
+ streaming: bool = False
+ emotion: Optional[str] = None
+ max_new_tokens: int = 1024
+ top_p: Annotated[float, Field(ge=0.1, le=1.0, strict=True)] = 0.7
+ repetition_penalty: Annotated[float, Field(ge=0.9, le=2.0, strict=True)] = 1.2
+ temperature: Annotated[float, Field(ge=0.1, le=1.0, strict=True)] = 0.7
diff --git a/xinference/thirdparty/fish_speech/tools/download_models.py b/xinference/thirdparty/fish_speech/tools/download_models.py
index 480f3be0f4..9e79c34c43 100644
--- a/xinference/thirdparty/fish_speech/tools/download_models.py
+++ b/xinference/thirdparty/fish_speech/tools/download_models.py
@@ -22,8 +22,8 @@ def check_and_download_files(repo_id, file_list, local_dir):
# 1st
-repo_id_1 = "fishaudio/fish-speech-1.2-sft"
-local_dir_1 = "./checkpoints/fish-speech-1.2-sft"
+repo_id_1 = "fishaudio/fish-speech-1.4"
+local_dir_1 = "./checkpoints/fish-speech-1.4"
files_1 = [
"model.pth",
"README.md",
@@ -31,7 +31,7 @@ def check_and_download_files(repo_id, file_list, local_dir):
"tokenizer_config.json",
"tokenizer.json",
"config.json",
- "firefly-gan-vq-fsq-4x1024-42hz-generator.pth",
+ "firefly-gan-vq-fsq-8x1024-21hz-generator.pth",
]
# 3rd
diff --git a/xinference/thirdparty/fish_speech/tools/file.py b/xinference/thirdparty/fish_speech/tools/file.py
index b4b8051d6f..f7a0597365 100644
--- a/xinference/thirdparty/fish_speech/tools/file.py
+++ b/xinference/thirdparty/fish_speech/tools/file.py
@@ -1,3 +1,4 @@
+import base64
from pathlib import Path
from typing import Union
@@ -23,6 +24,22 @@
}
+def audio_to_bytes(file_path):
+ if not file_path or not Path(file_path).exists():
+ return None
+ with open(file_path, "rb") as wav_file:
+ wav = wav_file.read()
+ return wav
+
+
+def read_ref_text(ref_text):
+ path = Path(ref_text)
+ if path.exists() and path.is_file():
+ with path.open("r", encoding="utf-8") as file:
+ return file.read()
+ return ref_text
+
+
def list_files(
path: Union[Path, str],
extensions: set[str] = None,
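The two helpers added to tools/file.py are what api.py and post_api.py now lean on: audio_to_bytes returns the raw file contents (or None when the path is missing), and read_ref_text returns the file contents if the argument is an existing file, otherwise the string itself is used as the reference text. A small usage sketch with illustrative paths:

from tools.file import audio_to_bytes, read_ref_text

audio = audio_to_bytes("references/my_speaker/sample1.wav")   # bytes, or None if missing
text_a = read_ref_text("references/my_speaker/sample1.lab")   # contents of the .lab file
text_b = read_ref_text("A literal reference transcript.")     # returned unchanged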
diff --git a/xinference/thirdparty/fish_speech/tools/gen_ref.py b/xinference/thirdparty/fish_speech/tools/gen_ref.py
deleted file mode 100644
index a771903b02..0000000000
--- a/xinference/thirdparty/fish_speech/tools/gen_ref.py
+++ /dev/null
@@ -1,36 +0,0 @@
-import json
-from pathlib import Path
-
-
-def scan_folder(base_path):
- wav_lab_pairs = {}
-
- base = Path(base_path)
- for suf in ["wav", "lab"]:
- for f in base.rglob(f"*.{suf}"):
- relative_path = f.relative_to(base)
- parts = relative_path.parts
- print(parts)
- if len(parts) >= 3:
- character = parts[0]
- emotion = parts[1]
-
- if character not in wav_lab_pairs:
- wav_lab_pairs[character] = {}
- if emotion not in wav_lab_pairs[character]:
- wav_lab_pairs[character][emotion] = []
- wav_lab_pairs[character][emotion].append(str(f.name))
-
- return wav_lab_pairs
-
-
-def save_to_json(data, output_file):
- with open(output_file, "w", encoding="utf-8") as file:
- json.dump(data, file, ensure_ascii=False, indent=2)
-
-
-base_path = "ref_data"
-out_ref_file = "ref_data.json"
-
-wav_lab_pairs = scan_folder(base_path)
-save_to_json(wav_lab_pairs, out_ref_file)
diff --git a/xinference/thirdparty/fish_speech/tools/llama/build_dataset.py b/xinference/thirdparty/fish_speech/tools/llama/build_dataset.py
index 20e2219956..fc5ef120cc 100644
--- a/xinference/thirdparty/fish_speech/tools/llama/build_dataset.py
+++ b/xinference/thirdparty/fish_speech/tools/llama/build_dataset.py
@@ -13,7 +13,7 @@
from fish_speech.datasets.protos.text_data_pb2 import Semantics, Sentence, TextData
from fish_speech.datasets.protos.text_data_stream import pack_pb_stream
-from fish_speech.utils.file import load_filelist
+from tools.file import load_filelist
# To avoid CPU overload
os.environ["MKL_NUM_THREADS"] = "1"
diff --git a/xinference/thirdparty/fish_speech/tools/llama/generate.py b/xinference/thirdparty/fish_speech/tools/llama/generate.py
index 934c185145..ad9c549996 100644
--- a/xinference/thirdparty/fish_speech/tools/llama/generate.py
+++ b/xinference/thirdparty/fish_speech/tools/llama/generate.py
@@ -2,6 +2,7 @@
import queue
import threading
import time
+from contextlib import nullcontext
from dataclasses import dataclass
from pathlib import Path
from typing import Literal, Optional, Tuple, Union
@@ -93,15 +94,20 @@ def decode_one_token_ar(
**sampling_kwargs,
) -> torch.Tensor:
x = model.forward_generate(x, input_pos)
+
+ sampling_kwargs_main = sampling_kwargs.copy()
+ sampling_kwargs_main["temperature"] = 0.1
+ sampling_kwargs_main["top_p"] = 0.1
+ sampling_kwargs_main["repetition_penalty"] = 1.0
+
codebooks = [
sample(
x.logits,
- previous_tokens=(
- previous_tokens[0] if previous_tokens is not None else None
- ), # Disable repetition penalty for the token codebook
- **sampling_kwargs,
+ previous_tokens=None, # Disable repetition penalty for the token codebook
+ **sampling_kwargs_main,
)[0]
]
+
x = x.hidden_states
# Cleanup the cache
@@ -136,11 +142,16 @@ def decode_one_token_naive(
) -> torch.Tensor:
x = model.forward_generate(x, input_pos)
+ sampling_kwargs_main = sampling_kwargs.copy()
+ sampling_kwargs_main["temperature"] = 0.1
+ sampling_kwargs_main["top_p"] = 0.1
+ sampling_kwargs_main["repetition_penalty"] = 1.0
+
codebooks = [
sample(
- x.token_logits,
+ x.logits,
previous_tokens=None, # Disable repetition penalty for the token codebook
- **sampling_kwargs,
+ **sampling_kwargs_main,
)[0]
]
@@ -181,8 +192,12 @@ def decode_n_tokens(
else:
window = previous_tokens[:, i - win_size : i]
- with torch.backends.cuda.sdp_kernel(
- enable_flash=False, enable_mem_efficient=False, enable_math=True
+ with (
+ torch.backends.cuda.sdp_kernel(
+ enable_flash=False, enable_mem_efficient=False, enable_math=True
+ )
+ if torch.cuda.is_available()
+ else nullcontext()
): # Actually better for Inductor to codegen attention here
next_token = decode_one_token(
model=model,
@@ -222,25 +237,11 @@ def generate(
# create an empty tensor of the expected final shape and fill in the current tokens
T = prompt.size(1)
- if max_new_tokens:
- if T + max_new_tokens > model.config.max_seq_len:
- max_new_tokens = model.config.max_seq_len - T
- logger.info(f"Truncating max_new_tokens to {max_new_tokens}")
-
- T_new = T + max_new_tokens
- else:
- T_new = model.config.max_seq_len
- max_new_tokens = T_new - T
-
device, dtype = prompt.device, prompt.dtype
- with torch.device(device):
- model.setup_caches(
- max_batch_size=1, max_seq_len=T_new, dtype=next(model.parameters()).dtype
- )
codebook_dim = 1 + model.config.num_codebooks
# create an empty tensor of the expected final shape and fill in the current tokens
- empty = torch.empty((codebook_dim, T_new), dtype=dtype, device=device)
+ empty = torch.empty((codebook_dim, max_new_tokens), dtype=dtype, device=device)
empty[:, :T] = prompt
seq = empty
input_pos = torch.arange(0, T, device=device)
@@ -560,6 +561,10 @@ def worker():
model, decode_one_token = load_model(
checkpoint_path, device, precision, compile=compile
)
+ with torch.device(device):
+ model.setup_caches(
+ max_batch_size=1, max_seq_len=2048, dtype=next(model.parameters()).dtype
+ )
init_event.set()
while True:
@@ -607,7 +612,7 @@ def worker():
@click.option(
"--checkpoint-path",
type=click.Path(path_type=Path, exists=True),
- default="checkpoints/fish-speech-1.2-sft",
+ default="checkpoints/fish-speech-1.4",
)
@click.option("--device", type=str, default="cuda")
@click.option("--compile/--no-compile", default=False)
diff --git a/xinference/thirdparty/fish_speech/tools/llama/merge_lora.py b/xinference/thirdparty/fish_speech/tools/llama/merge_lora.py
index f12eece8d2..c1bd3cbd72 100644
--- a/xinference/thirdparty/fish_speech/tools/llama/merge_lora.py
+++ b/xinference/thirdparty/fish_speech/tools/llama/merge_lora.py
@@ -15,7 +15,7 @@
@click.command()
@click.option("--lora-config", type=str, default="r_8_alpha_16")
-@click.option("--base-weight", type=str, default="checkpoints/fish-speech-1.2-sft")
+@click.option("--base-weight", type=str, default="checkpoints/fish-speech-1.4")
@click.option("--lora-weight", type=str, required=True)
@click.option("--output", type=str, required=True)
def merge(lora_config, base_weight, lora_weight, output):
diff --git a/xinference/thirdparty/fish_speech/tools/llama/quantize.py b/xinference/thirdparty/fish_speech/tools/llama/quantize.py
index aae32fcce7..e629d944b5 100644
--- a/xinference/thirdparty/fish_speech/tools/llama/quantize.py
+++ b/xinference/thirdparty/fish_speech/tools/llama/quantize.py
@@ -428,7 +428,7 @@ def generate_folder_name():
@click.option(
"--checkpoint-path",
type=click.Path(path_type=Path, exists=True),
- default="checkpoints/fish-speech-1.2-sft",
+ default="checkpoints/fish-speech-1.4",
)
@click.option(
"--mode", type=str, default="int8", help="type of quantization to perform"
@@ -451,7 +451,7 @@ def quantize(checkpoint_path: Path, mode: str, groupsize: int, timestamp: str) -
precision=precision,
compile=False,
)
- vq_model = "firefly-gan-vq-fsq-4x1024-42hz-generator.pth"
+ vq_model = "firefly-gan-vq-fsq-8x1024-21hz-generator.pth"
now = timestamp if timestamp != "None" else generate_folder_name()
if mode == "int8":
diff --git a/xinference/thirdparty/fish_speech/tools/merge_asr_files.py b/xinference/thirdparty/fish_speech/tools/merge_asr_files.py
deleted file mode 100644
index cc12062095..0000000000
--- a/xinference/thirdparty/fish_speech/tools/merge_asr_files.py
+++ /dev/null
@@ -1,55 +0,0 @@
-import os
-from pathlib import Path
-
-from pydub import AudioSegment
-from tqdm import tqdm
-
-from tools.file import AUDIO_EXTENSIONS, list_files
-
-
-def merge_and_delete_files(save_dir, original_files):
- save_path = Path(save_dir)
- audio_slice_files = list_files(
- path=save_dir, extensions=AUDIO_EXTENSIONS.union([".lab"]), recursive=True
- )
- audio_files = {}
- label_files = {}
- for file_path in tqdm(audio_slice_files, desc="Merging audio files"):
- rel_path = Path(file_path).relative_to(save_path)
- (save_path / rel_path.parent).mkdir(parents=True, exist_ok=True)
- if file_path.suffix == ".wav":
- prefix = rel_path.parent / file_path.stem.rsplit("-", 1)[0]
- if prefix == rel_path.parent / file_path.stem:
- continue
- audio = AudioSegment.from_wav(file_path)
- if prefix in audio_files.keys():
- audio_files[prefix] = audio_files[prefix] + audio
- else:
- audio_files[prefix] = audio
-
- elif file_path.suffix == ".lab":
- prefix = rel_path.parent / file_path.stem.rsplit("-", 1)[0]
- if prefix == rel_path.parent / file_path.stem:
- continue
- with open(file_path, "r", encoding="utf-8") as f:
- label = f.read()
- if prefix in label_files.keys():
- label_files[prefix] = label_files[prefix] + ", " + label
- else:
- label_files[prefix] = label
-
- for prefix, audio in audio_files.items():
- output_audio_path = save_path / f"{prefix}.wav"
- audio.export(output_audio_path, format="wav")
-
- for prefix, label in label_files.items():
- output_label_path = save_path / f"{prefix}.lab"
- with open(output_label_path, "w", encoding="utf-8") as f:
- f.write(label)
-
- for file_path in original_files:
- os.remove(file_path)
-
-
-if __name__ == "__main__":
- merge_and_delete_files("/made/by/spicysama/laziman", [__file__])
diff --git a/xinference/thirdparty/fish_speech/tools/msgpack_api.py b/xinference/thirdparty/fish_speech/tools/msgpack_api.py
new file mode 100644
index 0000000000..67f907bf55
--- /dev/null
+++ b/xinference/thirdparty/fish_speech/tools/msgpack_api.py
@@ -0,0 +1,34 @@
+import httpx
+import ormsgpack
+
+from tools.commons import ServeReferenceAudio, ServeTTSRequest
+
+# priority: ref_id > references
+request = ServeTTSRequest(
+ text="你说的对, 但是原神是一款由米哈游自主研发的开放世界手游.",
+ # reference_id="114514",
+ references=[
+ ServeReferenceAudio(
+ audio=open("lengyue.wav", "rb").read(),
+ text=open("lengyue.lab", "r", encoding="utf-8").read(),
+ )
+ ],
+ streaming=True,
+)
+
+with (
+ httpx.Client() as client,
+ open("hello.wav", "wb") as f,
+):
+ with client.stream(
+ "POST",
+ "http://127.0.0.1:8080/v1/tts",
+ content=ormsgpack.packb(request, option=ormsgpack.OPT_SERIALIZE_PYDANTIC),
+ headers={
+ "authorization": "Bearer YOUR_API_KEY",
+ "content-type": "application/msgpack",
+ },
+ timeout=None,
+ ) as response:
+ for chunk in response.iter_bytes():
+ f.write(chunk)
diff --git a/xinference/thirdparty/fish_speech/tools/post_api.py b/xinference/thirdparty/fish_speech/tools/post_api.py
index 153893078e..c20dc455c3 100644
--- a/xinference/thirdparty/fish_speech/tools/post_api.py
+++ b/xinference/thirdparty/fish_speech/tools/post_api.py
@@ -1,40 +1,19 @@
import argparse
import base64
-import json
import wave
-from pathlib import Path
+import ormsgpack
import pyaudio
import requests
+from pydub import AudioSegment
+from pydub.playback import play
+from tools.commons import ServeReferenceAudio, ServeTTSRequest
+from tools.file import audio_to_bytes, read_ref_text
-def wav_to_base64(file_path):
- if not file_path or not Path(file_path).exists():
- return None
- with open(file_path, "rb") as wav_file:
- wav_content = wav_file.read()
- base64_encoded = base64.b64encode(wav_content)
- return base64_encoded.decode("utf-8")
+def parse_args():
-def read_ref_text(ref_text):
- path = Path(ref_text)
- if path.exists() and path.is_file():
- with path.open("r", encoding="utf-8") as file:
- return file.read()
- return ref_text
-
-
-def play_audio(audio_content, format, channels, rate):
- p = pyaudio.PyAudio()
- stream = p.open(format=format, channels=channels, rate=rate, output=True)
- stream.write(audio_content)
- stream.stop_stream()
- stream.close()
- p.terminate()
-
-
-if __name__ == "__main__":
parser = argparse.ArgumentParser(
description="Send a WAV file and text to a server and receive synthesized audio."
)
@@ -43,16 +22,24 @@ def play_audio(audio_content, format, channels, rate):
"--url",
"-u",
type=str,
- default="http://127.0.0.1:8080/v1/invoke",
+ default="http://127.0.0.1:8080/v1/tts",
help="URL of the server",
)
parser.add_argument(
"--text", "-t", type=str, required=True, help="Text to be synthesized"
)
+ parser.add_argument(
+ "--reference_id",
+ "-id",
+ type=str,
+ default=None,
+ help="ID of the reference model to be used for the speech",
+ )
parser.add_argument(
"--reference_audio",
"-ra",
type=str,
+ nargs="+",
default=None,
help="Path to the WAV file",
)
@@ -60,9 +47,30 @@ def play_audio(audio_content, format, channels, rate):
"--reference_text",
"-rt",
type=str,
+ nargs="+",
default=None,
help="Reference text for voice synthesis",
)
+ parser.add_argument(
+ "--output",
+ "-o",
+ type=str,
+ default="generated_audio",
+ help="Output audio file name",
+ )
+ parser.add_argument(
+ "--play",
+ type=bool,
+ default=True,
+ help="Whether to play audio after receiving data",
+ )
+ parser.add_argument("--normalize", type=bool, default=True)
+ parser.add_argument(
+ "--format", type=str, choices=["wav", "mp3", "flac"], default="wav"
+ )
+ parser.add_argument("--mp3_bitrate", type=int, default=64)
+ parser.add_argument("--opus_bitrate", type=int, default=-1000)
+ parser.add_argument("--latency", type=str, default="normal", help="延迟选项")
parser.add_argument(
"--max_new_tokens",
type=int,
@@ -88,7 +96,6 @@ def play_audio(audio_content, format, channels, rate):
"--speaker", type=str, default=None, help="Speaker ID for voice synthesis"
)
parser.add_argument("--emotion", type=str, default=None, help="Speaker's Emotion")
- parser.add_argument("--format", type=str, default="wav", help="Audio format")
parser.add_argument(
"--streaming", type=bool, default=False, help="Enable streaming response"
)
@@ -97,18 +104,42 @@ def play_audio(audio_content, format, channels, rate):
)
parser.add_argument("--rate", type=int, default=44100, help="Sample rate for audio")
- args = parser.parse_args()
+ return parser.parse_args()
- base64_audio = wav_to_base64(args.reference_audio)
- ref_text = args.reference_text
- if ref_text:
- ref_text = read_ref_text(ref_text)
+if __name__ == "__main__":
+
+ args = parse_args()
+
+ idstr: str | None = args.reference_id
+ # priority: ref_id > [{text, audio},...]
+ if idstr is None:
+ ref_audios = args.reference_audio
+ ref_texts = args.reference_text
+ if ref_audios is None:
+ byte_audios = []
+ else:
+ byte_audios = [audio_to_bytes(ref_audio) for ref_audio in ref_audios]
+ if ref_texts is None:
+ ref_texts = []
+ else:
+ ref_texts = [read_ref_text(ref_text) for ref_text in ref_texts]
+ else:
+ byte_audios = []
+ ref_texts = []
+ pass # in api.py
data = {
"text": args.text,
- "reference_text": ref_text,
- "reference_audio": base64_audio,
+ "references": [
+ ServeReferenceAudio(audio=ref_audio, text=ref_text)
+ for ref_text, ref_audio in zip(ref_texts, byte_audios)
+ ],
+ "reference_id": idstr,
+ "normalize": args.normalize,
+ "format": args.format,
+ "mp3_bitrate": args.mp3_bitrate,
+ "opus_bitrate": args.opus_bitrate,
"max_new_tokens": args.max_new_tokens,
"chunk_length": args.chunk_length,
"top_p": args.top_p,
@@ -116,22 +147,30 @@ def play_audio(audio_content, format, channels, rate):
"temperature": args.temperature,
"speaker": args.speaker,
"emotion": args.emotion,
- "format": args.format,
"streaming": args.streaming,
}
- response = requests.post(args.url, json=data, stream=args.streaming)
+ pydantic_data = ServeTTSRequest(**data)
- audio_format = pyaudio.paInt16 # Assuming 16-bit PCM format
+ response = requests.post(
+ args.url,
+ data=ormsgpack.packb(pydantic_data, option=ormsgpack.OPT_SERIALIZE_PYDANTIC),
+ stream=args.streaming,
+ headers={
+ "authorization": "Bearer YOUR_API_KEY",
+ "content-type": "application/msgpack",
+ },
+ )
if response.status_code == 200:
if args.streaming:
p = pyaudio.PyAudio()
+ audio_format = pyaudio.paInt16 # Assuming 16-bit PCM format
stream = p.open(
format=audio_format, channels=args.channels, rate=args.rate, output=True
)
- wf = wave.open("generated_audio.wav", "wb")
+ wf = wave.open(f"{args.output}.wav", "wb")
wf.setnchannels(args.channels)
wf.setsampwidth(p.get_sample_size(audio_format))
wf.setframerate(args.rate)
@@ -153,12 +192,14 @@ def play_audio(audio_content, format, channels, rate):
wf.close()
else:
audio_content = response.content
-
- with open("generated_audio.wav", "wb") as audio_file:
+ audio_path = f"{args.output}.{args.format}"
+ with open(audio_path, "wb") as audio_file:
audio_file.write(audio_content)
- play_audio(audio_content, audio_format, args.channels, args.rate)
- print("Audio has been saved to 'generated_audio.wav'.")
+ audio = AudioSegment.from_file(audio_path, format=args.format)
+ if args.play:
+ play(audio)
+ print(f"Audio has been saved to '{audio_path}'.")
else:
print(f"Request failed with status code {response.status_code}")
print(response.json())
diff --git a/xinference/thirdparty/fish_speech/tools/sensevoice/fun_asr.py b/xinference/thirdparty/fish_speech/tools/sensevoice/fun_asr.py
index 02c15a5976..6789316d51 100644
--- a/xinference/thirdparty/fish_speech/tools/sensevoice/fun_asr.py
+++ b/xinference/thirdparty/fish_speech/tools/sensevoice/fun_asr.py
@@ -26,7 +26,7 @@ def uvr5_cli(
output_folder: Path,
audio_files: list[Path] | None = None,
output_format: str = "flac",
- model: str = "BS-Roformer-Viperx-1296.ckpt",
+ model: str = "BS-Roformer-Viperx-1297.ckpt",
):
# ["BS-Roformer-Viperx-1297.ckpt", "BS-Roformer-Viperx-1296.ckpt", "BS-Roformer-Viperx-1053.ckpt", "Mel-Roformer-Viperx-1143.ckpt"]
sepr = Separator(
diff --git a/xinference/thirdparty/fish_speech/tools/smart_pad.py b/xinference/thirdparty/fish_speech/tools/smart_pad.py
index 9772168f51..de9dc154f2 100644
--- a/xinference/thirdparty/fish_speech/tools/smart_pad.py
+++ b/xinference/thirdparty/fish_speech/tools/smart_pad.py
@@ -15,21 +15,34 @@
def process(file):
waveform, sample_rate = torchaudio.load(str(file), backend="sox")
+ if waveform.size(0) > 1:
+ waveform = waveform.mean(dim=0, keepdim=True)
+
loudness = librosa.feature.rms(
y=waveform.numpy().squeeze(), frame_length=2048, hop_length=512, center=True
)[0]
+
for i in range(len(loudness) - 1, 0, -1):
if loudness[i] > threshold:
break
- silent_time = (len(loudness) - i) * 512 / sample_rate
+ end_silent_time = (len(loudness) - i) * 512 / sample_rate
- if silent_time <= 0.3:
- random_time = random.uniform(0.3, 0.7)
+ if end_silent_time <= 0.3:
+ random_time = random.uniform(0.3, 0.7) - end_silent_time
waveform = F.pad(
waveform, (0, int(random_time * sample_rate)), mode="constant", value=0
)
+ for i in range(len(loudness)):
+ if loudness[i] > threshold:
+ break
+
+ start_silent_time = i * 512 / sample_rate
+
+ if start_silent_time > 0.02:
+ waveform = waveform[:, int((start_silent_time - 0.02) * sample_rate) :]
+
torchaudio.save(uri=str(file), src=waveform, sample_rate=sample_rate)
diff --git a/xinference/thirdparty/fish_speech/tools/vqgan/extract_vq.py b/xinference/thirdparty/fish_speech/tools/vqgan/extract_vq.py
index bc6bc40830..c24eb3f46a 100644
--- a/xinference/thirdparty/fish_speech/tools/vqgan/extract_vq.py
+++ b/xinference/thirdparty/fish_speech/tools/vqgan/extract_vq.py
@@ -42,7 +42,7 @@
@lru_cache(maxsize=1)
def get_model(
config_name: str = "firefly_gan_vq",
- checkpoint_path: str = "checkpoints/fish-speech-1.2-sft/firefly-gan-vq-fsq-4x1024-42hz-generator.pth",
+ checkpoint_path: str = "checkpoints/fish-speech-1.4/firefly-gan-vq-fsq-8x1024-21hz-generator.pth",
device: str | torch.device = "cuda",
):
with initialize(version_base="1.3", config_path="../../fish_speech/configs"):
@@ -133,7 +133,7 @@ def process_batch(files: list[Path], model) -> float:
@click.option("--config-name", default="firefly_gan_vq")
@click.option(
"--checkpoint-path",
- default="checkpoints/fish-speech-1.2-sft/firefly-gan-vq-fsq-4x1024-42hz-generator.pth",
+ default="checkpoints/fish-speech-1.4/firefly-gan-vq-fsq-8x1024-21hz-generator.pth",
)
@click.option("--batch-size", default=64)
@click.option("--filelist", default=None, type=Path)
diff --git a/xinference/thirdparty/fish_speech/tools/vqgan/inference.py b/xinference/thirdparty/fish_speech/tools/vqgan/inference.py
index 17c9034d7b..b6bc7531c4 100644
--- a/xinference/thirdparty/fish_speech/tools/vqgan/inference.py
+++ b/xinference/thirdparty/fish_speech/tools/vqgan/inference.py
@@ -59,7 +59,7 @@ def load_model(config_name, checkpoint_path, device="cuda"):
@click.option("--config-name", default="firefly_gan_vq")
@click.option(
"--checkpoint-path",
- default="checkpoints/fish-speech-1.2-sft/firefly-gan-vq-fsq-4x1024-42hz-generator.pth",
+ default="checkpoints/fish-speech-1.4/firefly-gan-vq-fsq-8x1024-21hz-generator.pth",
)
@click.option(
"--device",
@@ -103,7 +103,9 @@ def main(input_path, output_path, config_name, checkpoint_path, device):
# Restore
feature_lengths = torch.tensor([indices.shape[1]], device=device)
- fake_audios = model.decode(indices=indices[None], feature_lengths=feature_lengths)
+ fake_audios, _ = model.decode(
+ indices=indices[None], feature_lengths=feature_lengths
+ )
audio_time = fake_audios.shape[-1] / model.spec_transform.sample_rate
logger.info(
diff --git a/xinference/thirdparty/fish_speech/tools/webui.py b/xinference/thirdparty/fish_speech/tools/webui.py
index f64ff923b0..a52f548cc9 100644
--- a/xinference/thirdparty/fish_speech/tools/webui.py
+++ b/xinference/thirdparty/fish_speech/tools/webui.py
@@ -23,7 +23,6 @@
from fish_speech.text.chn_text_norm.text import Text as ChnNormedText
from fish_speech.utils import autocast_exclude_mps
from tools.api import decode_vq_tokens, encode_reference
-from tools.auto_rerank import batch_asr, calculate_wer, is_chinese, load_model
from tools.llama.generate import (
GenerateRequest,
GenerateResponse,
@@ -40,9 +39,9 @@
{i18n("A text-to-speech model based on VQ-GAN and Llama developed by [Fish Audio](https://fish.audio).")}
-{i18n("You can find the source code [here](https://github.com/fishaudio/fish-speech) and models [here](https://huggingface.co/fishaudio/fish-speech-1).")}
+{i18n("You can find the source code [here](https://github.com/fishaudio/fish-speech) and models [here](https://huggingface.co/fishaudio/fish-speech-1.4).")}
-{i18n("Related code are released under BSD-3-Clause License, and weights are released under CC BY-NC-SA 4.0 License.")}
+{i18n("Related code and weights are released under CC BY-NC-SA 4.0 License.")}
{i18n("We are not responsible for any misuse of the model, please consider your local laws and regulations before using it.")}
"""
@@ -160,66 +159,6 @@ def inference(
gc.collect()
-def inference_with_auto_rerank(
- text,
- enable_reference_audio,
- reference_audio,
- reference_text,
- max_new_tokens,
- chunk_length,
- top_p,
- repetition_penalty,
- temperature,
- use_auto_rerank,
- streaming=False,
-):
-
- max_attempts = 2 if use_auto_rerank else 1
- best_wer = float("inf")
- best_audio = None
- best_sample_rate = None
-
- for attempt in range(max_attempts):
- audio_generator = inference(
- text,
- enable_reference_audio,
- reference_audio,
- reference_text,
- max_new_tokens,
- chunk_length,
- top_p,
- repetition_penalty,
- temperature,
- streaming=False,
- )
-
- # 获取音频数据
- for _ in audio_generator:
- pass
- _, (sample_rate, audio), message = _
-
- if audio is None:
- return None, None, message
-
- if not use_auto_rerank:
- return None, (sample_rate, audio), None
-
- asr_result = batch_asr(asr_model, [audio], sample_rate)[0]
- wer = calculate_wer(text, asr_result["text"])
- if wer <= 0.3 and not asr_result["huge_gap"]:
- return None, (sample_rate, audio), None
-
- if wer < best_wer:
- best_wer = wer
- best_audio = audio
- best_sample_rate = sample_rate
-
- if attempt == max_attempts - 1:
- break
-
- return None, (best_sample_rate, best_audio), None
-
-
inference_stream = partial(inference, streaming=True)
n_audios = 4
@@ -239,13 +178,12 @@ def inference_wrapper(
repetition_penalty,
temperature,
batch_infer_num,
- if_load_asr_model,
):
audios = []
errors = []
for _ in range(batch_infer_num):
- result = inference_with_auto_rerank(
+ result = inference(
text,
enable_reference_audio,
reference_audio,
@@ -255,10 +193,9 @@ def inference_wrapper(
top_p,
repetition_penalty,
temperature,
- if_load_asr_model,
)
- _, audio_data, error_message = result
+ _, audio_data, error_message = next(result)
audios.append(
gr.Audio(value=audio_data if audio_data else None, visible=True),
@@ -301,42 +238,6 @@ def normalize_text(user_input, use_normalization):
asr_model = None
-def change_if_load_asr_model(if_load):
- global asr_model
-
- if if_load:
- gr.Warning("Loading faster whisper model...")
- if asr_model is None:
- asr_model = load_model()
- return gr.Checkbox(label="Unload faster whisper model", value=if_load)
-
- if if_load is False:
- gr.Warning("Unloading faster whisper model...")
- del asr_model
- asr_model = None
- if torch.cuda.is_available():
- torch.cuda.empty_cache()
- gc.collect()
- return gr.Checkbox(label="Load faster whisper model", value=if_load)
-
-
-def change_if_auto_label(if_load, if_auto_label, enable_ref, ref_audio, ref_text):
- if if_load and asr_model is not None:
- if (
- if_auto_label
- and enable_ref
- and ref_audio is not None
- and ref_text.strip() == ""
- ):
- data, sample_rate = librosa.load(ref_audio)
- res = batch_asr(asr_model, [data], sample_rate)[0]
- ref_text = res["text"]
- else:
- gr.Warning("Whisper model not loaded!")
-
- return gr.Textbox(value=ref_text)
-
-
def build_app():
with gr.Blocks(theme=gr.themes.Base()) as app:
gr.Markdown(HEADER_MD)
@@ -367,23 +268,17 @@ def build_app():
with gr.Row():
if_refine_text = gr.Checkbox(
label=i18n("Text Normalization"),
- value=True,
- scale=1,
- )
-
- if_load_asr_model = gr.Checkbox(
- label=i18n("Load / Unload ASR model for auto-reranking"),
value=False,
- scale=3,
+ scale=1,
)
with gr.Row():
with gr.Tab(label=i18n("Advanced Config")):
chunk_length = gr.Slider(
label=i18n("Iterative Prompt Length, 0 means off"),
- minimum=0,
- maximum=500,
- value=100,
+ minimum=50,
+ maximum=300,
+ value=200,
step=8,
)
@@ -434,12 +329,6 @@ def build_app():
type="filepath",
)
with gr.Row():
- if_auto_label = gr.Checkbox(
- label=i18n("Auto Labeling"),
- min_width=100,
- scale=0,
- value=False,
- )
reference_text = gr.Textbox(
label=i18n("Reference Text"),
lines=1,
@@ -494,28 +383,6 @@ def build_app():
fn=normalize_text, inputs=[text, if_refine_text], outputs=[refined_text]
)
- if_load_asr_model.change(
- fn=change_if_load_asr_model,
- inputs=[if_load_asr_model],
- outputs=[if_load_asr_model],
- )
-
- if_auto_label.change(
- fn=lambda: gr.Textbox(value=""),
- inputs=[],
- outputs=[reference_text],
- ).then(
- fn=change_if_auto_label,
- inputs=[
- if_load_asr_model,
- if_auto_label,
- enable_reference_audio,
- reference_audio,
- reference_text,
- ],
- outputs=[reference_text],
- )
-
# # Submit
generate.click(
inference_wrapper,
@@ -530,7 +397,6 @@ def build_app():
repetition_penalty,
temperature,
batch_infer_num,
- if_load_asr_model,
],
[stream_audio, *global_audio_list, *global_error_list],
concurrency_limit=1,
@@ -560,12 +426,12 @@ def parse_args():
parser.add_argument(
"--llama-checkpoint-path",
type=Path,
- default="checkpoints/fish-speech-1.2-sft",
+ default="checkpoints/fish-speech-1.4",
)
parser.add_argument(
"--decoder-checkpoint-path",
type=Path,
- default="checkpoints/fish-speech-1.2-sft/firefly-gan-vq-fsq-4x1024-42hz-generator.pth",
+ default="checkpoints/fish-speech-1.4/firefly-gan-vq-fsq-8x1024-21hz-generator.pth",
)
parser.add_argument("--decoder-config-name", type=str, default="firefly_gan_vq")
parser.add_argument("--device", type=str, default="cuda")
@@ -605,8 +471,8 @@ def parse_args():
enable_reference_audio=False,
reference_audio=None,
reference_text="",
- max_new_tokens=0,
- chunk_length=100,
+ max_new_tokens=1024,
+ chunk_length=200,
top_p=0.7,
repetition_penalty=1.2,
temperature=0.7,
From 42d9c340c4ae084c71b215c88d30c2c9d636508e Mon Sep 17 00:00:00 2001
From: Xuye Qin
Date: Fri, 13 Sep 2024 12:16:59 +0800
Subject: [PATCH 04/17] FEAT: support sdapi/img2img (#2293)
---
xinference/api/restful_api.py | 67 ++++++++++++++++++++++++++-
xinference/core/model.py | 14 ++++++
xinference/model/image/sdapi.py | 80 +++++++++++++++++++++++++++------
xinference/types.py | 2 +-
4 files changed, 147 insertions(+), 16 deletions(-)
diff --git a/xinference/api/restful_api.py b/xinference/api/restful_api.py
index 15b5cc52d4..d12273ba13 100644
--- a/xinference/api/restful_api.py
+++ b/xinference/api/restful_api.py
@@ -63,7 +63,7 @@
CreateCompletion,
ImageList,
PeftModelConfig,
- SDAPITxt2imgResult,
+ SDAPIResult,
VideoList,
max_tokens_field,
)
@@ -138,6 +138,24 @@ class SDAPITxt2imgRequst(BaseModel):
width: Optional[int] = 512
height: Optional[int] = 512
sampler_name: Optional[str] = None
+ denoising_strength: Optional[float] = None
+ kwargs: Optional[str] = None
+ user: Optional[str] = None
+
+
+class SDAPIImg2imgRequst(BaseModel):
+ model: Optional[str]
+ init_images: Optional[list]
+ prompt: Optional[str] = ""
+ negative_prompt: Optional[str] = ""
+ steps: Optional[int] = None
+ seed: Optional[int] = -1
+ cfg_scale: Optional[float] = 7.0
+ override_settings: Optional[dict] = {}
+ width: Optional[int] = 512
+ height: Optional[int] = 512
+ sampler_name: Optional[str] = None
+ denoising_strength: Optional[float] = None
kwargs: Optional[str] = None
user: Optional[str] = None
@@ -574,7 +592,18 @@ async def internal_exception_handler(request: Request, exc: Exception):
"/sdapi/v1/txt2img",
self.sdapi_txt2img,
methods=["POST"],
- response_model=SDAPITxt2imgResult,
+ response_model=SDAPIResult,
+ dependencies=(
+ [Security(self._auth_service, scopes=["models:read"])]
+ if self.is_authenticated()
+ else None
+ ),
+ )
+ self._router.add_api_route(
+ "/sdapi/v1/img2img",
+ self.sdapi_img2img,
+ methods=["POST"],
+ response_model=SDAPIResult,
dependencies=(
[Security(self._auth_service, scopes=["models:read"])]
if self.is_authenticated()
@@ -1569,6 +1598,40 @@ async def sdapi_txt2img(self, request: Request) -> Response:
await self._report_error_event(model_uid, str(e))
raise HTTPException(status_code=500, detail=str(e))
+ async def sdapi_img2img(self, request: Request) -> Response:
+ body = SDAPIImg2imgRequst.parse_obj(await request.json())
+ model_uid = body.model or body.override_settings.get("sd_model_checkpoint")
+
+ try:
+ if not model_uid:
+ raise ValueError("Unknown model")
+ model = await (await self._get_supervisor_ref()).get_model(model_uid)
+ except ValueError as ve:
+ logger.error(str(ve), exc_info=True)
+ await self._report_error_event(model_uid, str(ve))
+ raise HTTPException(status_code=400, detail=str(ve))
+ except Exception as e:
+ logger.error(e, exc_info=True)
+ await self._report_error_event(model_uid, str(e))
+ raise HTTPException(status_code=500, detail=str(e))
+
+ try:
+ kwargs = dict(body)
+ kwargs.update(json.loads(body.kwargs) if body.kwargs else {})
+ image_list = await model.img2img(
+ **kwargs,
+ )
+ return Response(content=image_list, media_type="application/json")
+ except RuntimeError as re:
+ logger.error(re, exc_info=True)
+ await self._report_error_event(model_uid, str(re))
+ self.handle_request_limit_error(re)
+ raise HTTPException(status_code=400, detail=str(re))
+ except Exception as e:
+ logger.error(e, exc_info=True)
+ await self._report_error_event(model_uid, str(e))
+ raise HTTPException(status_code=500, detail=str(e))
+
async def create_variations(
self,
model: str = Form(...),
diff --git a/xinference/core/model.py b/xinference/core/model.py
index 327582163c..1f711fb117 100644
--- a/xinference/core/model.py
+++ b/xinference/core/model.py
@@ -793,6 +793,20 @@ async def image_to_image(
f"Model {self._model.model_spec} is not for creating image."
)
+ @request_limit
+ @log_async(logger=logger)
+ async def img2img(
+ self,
+ **kwargs,
+ ):
+ kwargs.pop("request_id", None)
+ if hasattr(self._model, "img2img"):
+ return await self._call_wrapper_json(
+ self._model.img2img,
+ **kwargs,
+ )
+ raise AttributeError(f"Model {self._model.model_spec} is not for img2img.")
+
@log_async(
logger=logger,
ignore_kwargs=["image"],
diff --git a/xinference/model/image/sdapi.py b/xinference/model/image/sdapi.py
index 10337b114d..b3af166299 100644
--- a/xinference/model/image/sdapi.py
+++ b/xinference/model/image/sdapi.py
@@ -11,30 +11,48 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
-
+import base64
+import io
import warnings
+from PIL import Image
+
class SDAPIToDiffusersConverter:
- txt2img_identical_args = [
+ txt2img_identical_args = {
"prompt",
"negative_prompt",
"seed",
"width",
"height",
"sampler_name",
- ]
+ }
txt2img_arg_mapping = {
"steps": "num_inference_steps",
"cfg_scale": "guidance_scale",
+ "denoising_strength": "strength",
+ }
+ img2img_identical_args = {
+ "prompt",
+ "negative_prompt",
+ "seed",
+ "width",
+ "height",
+ "sampler_name",
+ }
+ img2img_arg_mapping = {
+ "init_images": "image",
+ "steps": "num_inference_steps",
+ "cfg_scale": "guidance_scale",
+ "denoising_strength": "strength",
}
@staticmethod
- def convert_txt2img_to_diffusers(params: dict) -> dict:
+ def convert_to_diffusers(sd_type: str, params: dict) -> dict:
diffusers_params = {}
- identical_args = set(SDAPIToDiffusersConverter.txt2img_identical_args)
- mapping_args = SDAPIToDiffusersConverter.txt2img_arg_mapping
+ identical_args = getattr(SDAPIToDiffusersConverter, f"{sd_type}_identical_args")
+ mapping_args = getattr(SDAPIToDiffusersConverter, f"{sd_type}_arg_mapping")
for param, value in params.items():
if param in identical_args:
diffusers_params[param] = value
@@ -45,13 +63,17 @@ def convert_txt2img_to_diffusers(params: dict) -> dict:
return diffusers_params
+ @staticmethod
+ def get_available_args(sd_type: str) -> set:
+ identical_args = getattr(SDAPIToDiffusersConverter, f"{sd_type}_identical_args")
+ mapping_args = getattr(SDAPIToDiffusersConverter, f"{sd_type}_arg_mapping")
+ return identical_args.union(mapping_args)
+
class SDAPIDiffusionModelMixin:
- def txt2img(self, **kwargs):
- available_args = set(
- SDAPIToDiffusersConverter.txt2img_identical_args
- + list(SDAPIToDiffusersConverter.txt2img_arg_mapping)
- )
+ @staticmethod
+ def _check_kwargs(sd_type: str, kwargs: dict):
+ available_args = SDAPIToDiffusersConverter.get_available_args(sd_type)
unknown_args = []
available_kwargs = {}
for arg, value in kwargs.items():
@@ -64,14 +86,20 @@ def txt2img(self, **kwargs):
f"Some args are not supported for now and will be ignored: {unknown_args}"
)
- converted_kwargs = SDAPIToDiffusersConverter.convert_txt2img_to_diffusers(
- available_kwargs
+ converted_kwargs = SDAPIToDiffusersConverter.convert_to_diffusers(
+ sd_type, available_kwargs
)
+
width, height = converted_kwargs.pop("width", None), converted_kwargs.pop(
"height", None
)
if width and height:
converted_kwargs["size"] = f"{width}*{height}"
+
+ return converted_kwargs
+
+ def txt2img(self, **kwargs):
+ converted_kwargs = self._check_kwargs("txt2img", kwargs)
result = self.text_to_image(response_format="b64_json", **converted_kwargs) # type: ignore
# convert to SD API result
@@ -80,3 +108,29 @@ def txt2img(self, **kwargs):
"info": {"created": result["created"]},
"parameters": {},
}
+
+ @staticmethod
+ def _decode_b64_img(img_str: str) -> Image:
+ # img_str in a format: "data:image/png;base64," + raw_b64_img(image)
+ f, data = img_str.split(",", 1)
+ f, encode_type = f.split(";", 1)
+ assert encode_type == "base64"
+ f = f.split("/", 1)[1]
+ b = base64.b64decode(data)
+ return Image.open(io.BytesIO(b), formats=[f])
+
+ def img2img(self, **kwargs):
+ init_images = kwargs.pop("init_images", [])
+ kwargs["init_images"] = [self._decode_b64_img(i) for i in init_images]
+ clip_skip = kwargs.get("override_settings", {}).get("clip_skip")
+ converted_kwargs = self._check_kwargs("img2img", kwargs)
+ if clip_skip:
+ converted_kwargs["clip_skip"] = clip_skip
+ result = self.image_to_image(response_format="b64_json", **converted_kwargs) # type: ignore
+
+ # convert to SD API result
+ return {
+ "images": [r["b64_json"] for r in result["data"]],
+ "info": {"created": result["created"]},
+ "parameters": {},
+ }
diff --git a/xinference/types.py b/xinference/types.py
index 31c0c28635..613d8709bb 100644
--- a/xinference/types.py
+++ b/xinference/types.py
@@ -47,7 +47,7 @@ class ImageList(TypedDict):
data: List[Image]
-class SDAPITxt2imgResult(TypedDict):
+class SDAPIResult(TypedDict):
images: List[str]
parameters: dict
info: dict
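For reference, a minimal client sketch against the new /sdapi/v1/img2img route; the endpoint address, model uid, and file names are assumptions for illustration, and init_images entries use the "data:image/png;base64," framing expected by _decode_b64_img:

import base64

import requests

with open("input.png", "rb") as f:  # assumed local source image
    init_image = "data:image/png;base64," + base64.b64encode(f.read()).decode()

payload = {
    "model": "my_sd_model",            # or override_settings["sd_model_checkpoint"]
    "init_images": [init_image],
    "prompt": "a watercolor landscape",
    "steps": 30,                       # mapped to num_inference_steps
    "cfg_scale": 7.0,                  # mapped to guidance_scale
    "denoising_strength": 0.6,         # mapped to strength
    "width": 512,
    "height": 512,
}

resp = requests.post("http://127.0.0.1:9997/sdapi/v1/img2img", json=payload)
resp.raise_for_status()
result = resp.json()                   # SDAPIResult: images / parameters / info
with open("output.png", "wb") as f:
    f.write(base64.b64decode(result["images"][0]))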
From a9380becb24eec1e747a83cf7319a895c5dc3e71 Mon Sep 17 00:00:00 2001
From: Xuye Qin
Date: Fri, 13 Sep 2024 15:24:52 +0800
Subject: [PATCH 05/17] FEAT: support flux.1 image2image and inpainting (#2296)
---
.github/workflows/python.yaml | 1 +
xinference/model/image/model_spec.json | 8 ++++++--
.../model/image/model_spec_modelscope.json | 8 ++++++--
.../model/image/stable_diffusion/core.py | 20 ++++++++++++++++---
4 files changed, 30 insertions(+), 7 deletions(-)
diff --git a/.github/workflows/python.yaml b/.github/workflows/python.yaml
index 5be70aa4a0..5c75b2814c 100644
--- a/.github/workflows/python.yaml
+++ b/.github/workflows/python.yaml
@@ -171,6 +171,7 @@ jobs:
${{ env.SELF_HOST_PYTHON }} -m pip install -U "loguru"
${{ env.SELF_HOST_PYTHON }} -m pip install -U "natsort"
${{ env.SELF_HOST_PYTHON }} -m pip install -U "loralib"
+ ${{ env.SELF_HOST_PYTHON }} -m pip install -U "ormsgpack"
${{ env.SELF_HOST_PYTHON }} -m pip uninstall -y opencc
${{ env.SELF_HOST_PYTHON }} -m pip uninstall -y "faster_whisper"
${{ env.SELF_HOST_PYTHON }} -m pytest --timeout=1500 \
diff --git a/xinference/model/image/model_spec.json b/xinference/model/image/model_spec.json
index 891e9d5765..04386dd2e5 100644
--- a/xinference/model/image/model_spec.json
+++ b/xinference/model/image/model_spec.json
@@ -5,7 +5,9 @@
"model_id": "black-forest-labs/FLUX.1-schnell",
"model_revision": "768d12a373ed5cc9ef9a9dea7504dc09fcc14842",
"model_ability": [
- "text2image"
+ "text2image",
+ "image2image",
+ "inpainting"
]
},
{
@@ -14,7 +16,9 @@
"model_id": "black-forest-labs/FLUX.1-dev",
"model_revision": "01aa605f2c300568dd6515476f04565a954fcb59",
"model_ability": [
- "text2image"
+ "text2image",
+ "image2image",
+ "inpainting"
]
},
{
diff --git a/xinference/model/image/model_spec_modelscope.json b/xinference/model/image/model_spec_modelscope.json
index bbc5d57010..b39bfc543d 100644
--- a/xinference/model/image/model_spec_modelscope.json
+++ b/xinference/model/image/model_spec_modelscope.json
@@ -6,7 +6,9 @@
"model_id": "AI-ModelScope/FLUX.1-schnell",
"model_revision": "master",
"model_ability": [
- "text2image"
+ "text2image",
+ "image2image",
+ "inpainting"
]
},
{
@@ -16,7 +18,9 @@
"model_id": "AI-ModelScope/FLUX.1-dev",
"model_revision": "master",
"model_ability": [
- "text2image"
+ "text2image",
+ "image2image",
+ "inpainting"
]
},
{
diff --git a/xinference/model/image/stable_diffusion/core.py b/xinference/model/image/stable_diffusion/core.py
index 5344e62de2..eed9739b2c 100644
--- a/xinference/model/image/stable_diffusion/core.py
+++ b/xinference/model/image/stable_diffusion/core.py
@@ -14,6 +14,7 @@
import base64
import contextlib
+import inspect
import logging
import os
import re
@@ -408,12 +409,24 @@ def image_to_image(
width, height = image.size
kwargs["width"] = width
kwargs["height"] = height
-
+ else:
+ # SD3 image2image cannot accept width and height
+ parameters = inspect.signature(model.__call__).parameters # type: ignore
+ allow_width_height = False
+ for param in parameters.values():
+ if param.kind == inspect.Parameter.VAR_KEYWORD:
+ allow_width_height = True
+ break
+ if "width" in parameters or "height" in parameters:
+ allow_width_height = True
+ if allow_width_height:
+ kwargs["width"], kwargs["height"] = image.size
+
+ kwargs["negative_prompt"] = negative_prompt
self._filter_kwargs(kwargs)
return self._call_model(
image=image,
prompt=prompt,
- negative_prompt=negative_prompt,
num_images_per_prompt=n,
response_format=response_format,
model=model,
@@ -463,11 +476,12 @@ def inpainting(
# calculate actual image size after padding
width, height = image.size
+ kwargs["negative_prompt"] = negative_prompt
+ self._filter_kwargs(kwargs)
return self._call_model(
image=image,
mask_image=mask_image,
prompt=prompt,
- negative_prompt=negative_prompt,
height=height,
width=width,
num_images_per_prompt=n,
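The width/height guard above hinges on inspecting the pipeline's __call__ signature, since SD3-style image2image pipelines cannot accept those arguments while FLUX.1 can; a standalone sketch of the same check, with plain functions standing in for diffusers pipelines (the stand-ins are illustrative assumptions):

import inspect


def accepts_width_height(pipeline_call) -> bool:
    # Mirror the guard above: forward width/height if __call__ declares them
    # explicitly or accepts arbitrary keyword arguments via **kwargs.
    parameters = inspect.signature(pipeline_call).parameters
    if "width" in parameters or "height" in parameters:
        return True
    return any(
        p.kind == inspect.Parameter.VAR_KEYWORD for p in parameters.values()
    )


def flux_like(prompt, image, width=None, height=None): ...   # illustrative stand-in
def sd3_like(prompt, image, strength=0.6): ...                # illustrative stand-in

print(accepts_width_height(flux_like))  # True  -> width/height are forwarded
print(accepts_width_height(sd3_like))   # False -> width/height are dropped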
From b7c70229886ab06f4e8d0d58ddeb91093f7801bd Mon Sep 17 00:00:00 2001
From: Jun-Howie <62869005+Jun-Howie@users.noreply.github.com>
Date: Fri, 13 Sep 2024 22:22:37 +0800
Subject: [PATCH 06/17] FEAT: Support yi-coder-chat (#2302)
Co-authored-by: JunHowie
---
xinference/model/llm/llm_family.json | 77 ++++++++++++++++++
.../model/llm/llm_family_modelscope.json | 81 +++++++++++++++++++
xinference/model/llm/vllm/core.py | 2 +
3 files changed, 160 insertions(+)
diff --git a/xinference/model/llm/llm_family.json b/xinference/model/llm/llm_family.json
index e997098e65..1dfeca1fb4 100644
--- a/xinference/model/llm/llm_family.json
+++ b/xinference/model/llm/llm_family.json
@@ -7093,5 +7093,82 @@
"stop": [
"<|end▁of▁sentence|>"
]
+ },
+ {
+ "version": 1,
+ "context_length": 131072,
+ "model_name": "yi-coder-chat",
+ "model_lang": [
+ "en"
+ ],
+ "model_ability": [
+ "chat"
+ ],
+ "model_description": "Yi-Coder is a series of open-source code language models that delivers state-of-the-art coding performance with fewer than 10 billion parameters.Excelling in long-context understanding with a maximum context length of 128K tokens.Supporting 52 major programming languages, including popular ones such as Java, Python, JavaScript, and C++.",
+ "model_specs": [
+ {
+ "model_format": "pytorch",
+ "model_size_in_billions": 9,
+ "quantizations": [
+ "none"
+ ],
+ "model_id": "01ai/Yi-Coder-9B-Chat",
+ "model_revision": "356a1f8d4e4a606d0b879e54191ca809918576b8"
+ },
+ {
+ "model_format": "pytorch",
+ "model_size_in_billions": "1_5",
+ "quantizations": [
+ "none"
+ ],
+ "model_id": "01ai/Yi-Coder-1.5B-Chat",
+ "model_revision": "92fdd1b2f1539ac990e7f4a921db5601da2f0299"
+ }
+ ],
+ "chat_template": "{% if messages[0]['role'] == 'system' %}{% set system_message = messages[0]['content'] %}{% endif %}{% if system_message is defined %}{{ '<|im_start|>system\n' + system_message + '<|im_end|>\n' }}{% endif %}{% for message in messages %}{% set content = message['content'] %}{% if message['role'] == 'user' %}{{ '<|im_start|>user\n' + content + '<|im_end|>\n<|im_start|>assistant\n' }}{% elif message['role'] == 'assistant' %}{{ content + '<|im_end|>' + '\n' }}{% endif %}{% endfor %}",
+ "stop_token_ids": [
+ 1,
+ 2,
+ 6,
+ 7
+ ],
+ "stop": [
+ "<|startoftext|>",
+ "<|endoftext|>",
+ "<|im_start|>",
+ "<|im_end|>"
+ ]
+ },
+ {
+ "version": 1,
+ "context_length": 131072,
+ "model_name": "yi-coder",
+ "model_lang": [
+ "en"
+ ],
+ "model_ability": [
+ "generate"
+ ],
+ "model_description": "Yi-Coder is a series of open-source code language models that delivers state-of-the-art coding performance with fewer than 10 billion parameters.Excelling in long-context understanding with a maximum context length of 128K tokens.Supporting 52 major programming languages, including popular ones such as Java, Python, JavaScript, and C++.",
+ "model_specs": [
+ {
+ "model_format": "pytorch",
+ "model_size_in_billions": 9,
+ "quantizations": [
+ "none"
+ ],
+ "model_id": "01-ai/Yi-Coder-9B",
+ "model_revision": "e20f8087a9507ac8bce409dc5db5d0c608124238"
+ },
+ {
+ "model_format": "pytorch",
+ "model_size_in_billions": "1_5",
+ "quantizations": [
+ "none"
+ ],
+ "model_id": "01-ai/Yi-Coder-1.5B",
+ "model_revision": "00e59e64f47d3c78e4cfbdd345888479797e8109"
+ }
+ ]
}
]
diff --git a/xinference/model/llm/llm_family_modelscope.json b/xinference/model/llm/llm_family_modelscope.json
index f4386e85fa..b7b0da1b13 100644
--- a/xinference/model/llm/llm_family_modelscope.json
+++ b/xinference/model/llm/llm_family_modelscope.json
@@ -4808,5 +4808,86 @@
"stop": [
"<|end▁of▁sentence|>"
]
+ },
+ {
+ "version": 1,
+ "context_length": 131072,
+ "model_name": "yi-coder-chat",
+ "model_lang": [
+ "en"
+ ],
+ "model_ability": [
+ "chat"
+ ],
+ "model_description": "Yi-Coder is a series of open-source code language models that delivers state-of-the-art coding performance with fewer than 10 billion parameters.Excelling in long-context understanding with a maximum context length of 128K tokens.Supporting 52 major programming languages, including popular ones such as Java, Python, JavaScript, and C++.",
+ "model_specs": [
+ {
+ "model_format": "pytorch",
+ "model_size_in_billions": 9,
+ "quantizations": [
+ "none"
+ ],
+ "model_hub": "modelscope",
+ "model_id": "01ai/Yi-Coder-9B-Chat",
+ "model_revision": "master"
+ },
+ {
+ "model_format": "pytorch",
+ "model_size_in_billions": "1_5",
+ "quantizations": [
+ "none"
+ ],
+ "model_hub": "modelscope",
+ "model_id": "01ai/Yi-Coder-1.5B-Chat",
+ "model_revision": "master"
+ }
+ ],
+ "chat_template": "{% if messages[0]['role'] == 'system' %}{% set system_message = messages[0]['content'] %}{% endif %}{% if system_message is defined %}{{ '<|im_start|>system\n' + system_message + '<|im_end|>\n' }}{% endif %}{% for message in messages %}{% set content = message['content'] %}{% if message['role'] == 'user' %}{{ '<|im_start|>user\n' + content + '<|im_end|>\n<|im_start|>assistant\n' }}{% elif message['role'] == 'assistant' %}{{ content + '<|im_end|>' + '\n' }}{% endif %}{% endfor %}",
+ "stop_token_ids": [
+ 1,
+ 2,
+ 6,
+ 7
+ ],
+ "stop": [
+ "<|startoftext|>",
+ "<|endoftext|>",
+ "<|im_start|>",
+ "<|im_end|>"
+ ]
+ },
+ {
+ "version": 1,
+ "context_length": 131072,
+ "model_name": "yi-coder",
+ "model_lang": [
+ "en"
+ ],
+ "model_ability": [
+ "generate"
+ ],
+ "model_description": "Yi-Coder is a series of open-source code language models that delivers state-of-the-art coding performance with fewer than 10 billion parameters.Excelling in long-context understanding with a maximum context length of 128K tokens.Supporting 52 major programming languages, including popular ones such as Java, Python, JavaScript, and C++.",
+ "model_specs": [
+ {
+ "model_format": "pytorch",
+ "model_size_in_billions": 9,
+ "quantizations": [
+ "none"
+ ],
+ "model_hub": "modelscope",
+ "model_id": "01ai/Yi-Coder-9B",
+ "model_revision": "master"
+ },
+ {
+ "model_format": "pytorch",
+ "model_size_in_billions": "1_5",
+ "quantizations": [
+ "none"
+ ],
+ "model_hub": "modelscope",
+ "model_id": "01ai/Yi-Coder-1.5B",
+ "model_revision": "master"
+ }
+ ]
}
]
diff --git a/xinference/model/llm/vllm/core.py b/xinference/model/llm/vllm/core.py
index e531769a18..811fd5d342 100644
--- a/xinference/model/llm/vllm/core.py
+++ b/xinference/model/llm/vllm/core.py
@@ -104,6 +104,7 @@ class VLLMGenerateConfig(TypedDict, total=False):
"code-llama-python",
"deepseek",
"deepseek-coder",
+ "yi-coder",
]
VLLM_SUPPORTED_CHAT_MODELS = [
"llama-2-chat",
@@ -130,6 +131,7 @@ class VLLMGenerateConfig(TypedDict, total=False):
"codegeex4",
"deepseek-chat",
"deepseek-coder-instruct",
+ "yi-coder-chat",
]
if VLLM_INSTALLED and vllm.__version__ >= "0.3.0":
VLLM_SUPPORTED_CHAT_MODELS.append("qwen1.5-chat")
From 26666356f80f61d21cca80389f7fc47ea9c1caa7 Mon Sep 17 00:00:00 2001
From: Xuye Qin
Date: Sat, 14 Sep 2024 12:27:04 +0800
Subject: [PATCH 07/17] BUG: fix sampler_name for img2img (#2301)
---
xinference/model/image/sdapi.py | 2 +-
.../model/image/stable_diffusion/core.py | 69 +++++++++----------
2 files changed, 32 insertions(+), 39 deletions(-)
diff --git a/xinference/model/image/sdapi.py b/xinference/model/image/sdapi.py
index b3af166299..6ef21d48ab 100644
--- a/xinference/model/image/sdapi.py
+++ b/xinference/model/image/sdapi.py
@@ -30,7 +30,7 @@ class SDAPIToDiffusersConverter:
txt2img_arg_mapping = {
"steps": "num_inference_steps",
"cfg_scale": "guidance_scale",
- "denoising_strength": "strength",
+ # "denoising_strength": "strength",
}
img2img_identical_args = {
"prompt",
diff --git a/xinference/model/image/stable_diffusion/core.py b/xinference/model/image/stable_diffusion/core.py
index eed9739b2c..5a7e99fe33 100644
--- a/xinference/model/image/stable_diffusion/core.py
+++ b/xinference/model/image/stable_diffusion/core.py
@@ -24,7 +24,7 @@
from concurrent.futures import ThreadPoolExecutor
from functools import partial
from io import BytesIO
-from typing import TYPE_CHECKING, Dict, List, Optional, Union
+from typing import TYPE_CHECKING, Any, Dict, List, Optional, Union
import PIL.Image
import torch
@@ -168,7 +168,9 @@ def load(self):
self._kwargs[text_encoder_name] = text_encoder
self._kwargs["device_map"] = "balanced"
- logger.debug("Loading model %s", AutoPipelineModel)
+ logger.debug(
+ "Loading model from %s, kwargs: %s", self._model_path, self._kwargs
+ )
self._model = AutoPipelineModel.from_pretrained(
self._model_path,
**self._kwargs,
@@ -183,11 +185,12 @@ def load(self):
self._model.enable_attention_slicing()
self._apply_lora()
- def _get_scheduler(self, sampler_name: str):
+ @staticmethod
+ def _get_scheduler(model: Any, sampler_name: str):
if not sampler_name:
return
- assert self._model is not None
+ assert model is not None
import diffusers
@@ -195,80 +198,73 @@ def _get_scheduler(self, sampler_name: str):
# to get A1111 <> Diffusers Scheduler mapping
if sampler_name == "DPM++ 2M":
return diffusers.DPMSolverMultistepScheduler.from_config(
- self._model.scheduler.config
+ model.scheduler.config
)
elif sampler_name == "DPM++ 2M Karras":
return diffusers.DPMSolverMultistepScheduler.from_config(
- self._model.scheduler.config, use_karras_sigmas=True
+ model.scheduler.config, use_karras_sigmas=True
)
elif sampler_name == "DPM++ 2M SDE":
return diffusers.DPMSolverMultistepScheduler.from_config(
- self._model.scheduler.config, algorithm_type="sde-dpmsolver++"
+ model.scheduler.config, algorithm_type="sde-dpmsolver++"
)
elif sampler_name == "DPM++ 2M SDE Karras":
return diffusers.DPMSolverMultistepScheduler.from_config(
- self._model.scheduler.config,
+ model.scheduler.config,
algorithm_type="sde-dpmsolver++",
use_karras_sigmas=True,
)
elif sampler_name == "DPM++ SDE":
return diffusers.DPMSolverSinglestepScheduler.from_config(
- self._model.scheduler.config
+ model.scheduler.config
)
elif sampler_name == "DPM++ SDE Karras":
return diffusers.DPMSolverSinglestepScheduler.from_config(
- self._model.scheduler.config, use_karras_sigmas=True
+ model.scheduler.config, use_karras_sigmas=True
)
elif sampler_name == "DPM2":
- return diffusers.KDPM2DiscreteScheduler.from_config(
- self._model.scheduler.config
- )
+ return diffusers.KDPM2DiscreteScheduler.from_config(model.scheduler.config)
elif sampler_name == "DPM2 Karras":
return diffusers.KDPM2DiscreteScheduler.from_config(
- self._model.scheduler.config, use_karras_sigmas=True
+ model.scheduler.config, use_karras_sigmas=True
)
elif sampler_name == "DPM2 a":
return diffusers.KDPM2AncestralDiscreteScheduler.from_config(
- self._model.scheduler.config
+ model.scheduler.config
)
elif sampler_name == "DPM2 a Karras":
return diffusers.KDPM2AncestralDiscreteScheduler.from_config(
- self._model.scheduler.config, use_karras_sigmas=True
+ model.scheduler.config, use_karras_sigmas=True
)
elif sampler_name == "Euler":
- return diffusers.EulerDiscreteScheduler.from_config(
- self._model.scheduler.config
- )
+ return diffusers.EulerDiscreteScheduler.from_config(model.scheduler.config)
elif sampler_name == "Euler a":
return diffusers.EulerAncestralDiscreteScheduler.from_config(
- self._model.scheduler.config
+ model.scheduler.config
)
elif sampler_name == "Heun":
- return diffusers.HeunDiscreteScheduler.from_config(
- self._model.scheduler.config
- )
+ return diffusers.HeunDiscreteScheduler.from_config(model.scheduler.config)
elif sampler_name == "LMS":
- return diffusers.LMSDiscreteScheduler.from_config(
- self._model.scheduler.config
- )
+ return diffusers.LMSDiscreteScheduler.from_config(model.scheduler.config)
elif sampler_name == "LMS Karras":
return diffusers.LMSDiscreteScheduler.from_config(
- self._model.scheduler.config, use_karras_sigmas=True
+ model.scheduler.config, use_karras_sigmas=True
)
else:
raise ValueError(f"Unknown sampler: {sampler_name}")
+ @staticmethod
@contextlib.contextmanager
- def _reset_when_done(self, sampler_name: str):
- assert self._model is not None
- scheduler = self._get_scheduler(sampler_name)
+ def _reset_when_done(model: Any, sampler_name: str):
+ assert model is not None
+ scheduler = DiffusionModel._get_scheduler(model, sampler_name)
if scheduler:
- default_scheduler = self._model.scheduler
- self._model.scheduler = scheduler
+ default_scheduler = model.scheduler
+ model.scheduler = scheduler
try:
yield
finally:
- self._model.scheduler = default_scheduler
+ model.scheduler = default_scheduler
else:
yield
@@ -292,11 +288,8 @@ def _call_model(
kwargs["generator"] = generator.manual_seed(seed)
sampler_name = kwargs.pop("sampler_name", None)
assert callable(model)
- with self._reset_when_done(sampler_name):
- logger.debug(
- "stable diffusion args: %s",
- kwargs,
- )
+ with self._reset_when_done(model, sampler_name):
+ logger.debug("stable diffusion args: %s, model: %s", kwargs, model)
images = model(**kwargs).images
# revert padding if padded
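The scheduler swap above follows the A1111 <-> Diffusers naming table; outside Xinference the same mapping is a short snippet against any diffusers pipeline, as in this sketch (the checkpoint id and prompt are illustrative assumptions):

import diffusers

# Illustrative checkpoint; any pipeline exposing .scheduler behaves the same way.
pipe = diffusers.AutoPipelineForText2Image.from_pretrained(
    "runwayml/stable-diffusion-v1-5"
)

# "DPM++ 2M Karras" in SD-WebUI terms, per the mapping above.
pipe.scheduler = diffusers.DPMSolverMultistepScheduler.from_config(
    pipe.scheduler.config, use_karras_sigmas=True
)

image = pipe("a lighthouse at dusk", num_inference_steps=25).images[0]
image.save("out.png")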
From 961d355102007e3cd7963a353105b2422a31d4fd Mon Sep 17 00:00:00 2001
From: codingl2k1 <138426806+codingl2k1@users.noreply.github.com>
Date: Sat, 14 Sep 2024 07:22:13 +0200
Subject: [PATCH 08/17] FEAT: qwen2 audio (#2271)
---
xinference/core/tests/test_restful_api.py | 74 ++++++++
xinference/model/llm/__init__.py | 2 +
xinference/model/llm/llm_family.json | 74 ++++++++
xinference/model/llm/llm_family.py | 4 +-
.../model/llm/llm_family_modelscope.json | 68 +++++++
xinference/model/llm/transformers/core.py | 2 +
.../model/llm/transformers/qwen2_audio.py | 168 ++++++++++++++++++
7 files changed, 391 insertions(+), 1 deletion(-)
create mode 100644 xinference/model/llm/transformers/qwen2_audio.py
diff --git a/xinference/core/tests/test_restful_api.py b/xinference/core/tests/test_restful_api.py
index 0c50eb256d..af22ca7a8b 100644
--- a/xinference/core/tests/test_restful_api.py
+++ b/xinference/core/tests/test_restful_api.py
@@ -1240,3 +1240,77 @@ def test_launch_model_by_version(setup):
# delete again
url = f"{endpoint}/v1/models/test_qwen15"
requests.delete(url)
+
+
+@pytest.mark.skip(reason="Cost too many resources.")
+def test_restful_api_for_qwen_audio(setup):
+ model_name = "qwen2-audio-instruct"
+
+ endpoint, _ = setup
+ url = f"{endpoint}/v1/models"
+
+ # list
+ response = requests.get(url)
+ response_data = response.json()
+ assert len(response_data["data"]) == 0
+
+ # launch
+ payload = {
+ "model_uid": "test_audio",
+ "model_name": model_name,
+ "model_engine": "transformers",
+ "model_size_in_billions": 7,
+ "model_format": "pytorch",
+ "quantization": "none",
+ }
+
+ response = requests.post(url, json=payload)
+ response_data = response.json()
+ model_uid_res = response_data["model_uid"]
+ assert model_uid_res == "test_audio"
+
+ response = requests.get(url)
+ response_data = response.json()
+ assert len(response_data["data"]) == 1
+
+ url = f"{endpoint}/v1/chat/completions"
+ payload = {
+ "model": model_uid_res,
+ "messages": [
+ {"role": "system", "content": "You are a helpful assistant."},
+ {
+ "role": "user",
+ "content": [
+ {
+ "type": "audio",
+ "audio_url": "https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen2-Audio/audio/glass-breaking-151256.mp3",
+ },
+ {"type": "text", "text": "What's that sound?"},
+ ],
+ },
+ {"role": "assistant", "content": "It is the sound of glass shattering."},
+ {
+ "role": "user",
+ "content": [
+ {"type": "text", "text": "What can you do when you hear that?"},
+ ],
+ },
+ {
+ "role": "assistant",
+ "content": "Stay alert and cautious, and check if anyone is hurt or if there is any damage to property.",
+ },
+ {
+ "role": "user",
+ "content": [
+ {
+ "type": "audio",
+ "audio_url": "https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen2-Audio/audio/1272-128104-0000.flac",
+ },
+ {"type": "text", "text": "What does the person say?"},
+ ],
+ },
+ ],
+ }
+ response = requests.post(url, json=payload)
+ completion = response.json()
+ assert len(completion["choices"][0]["message"]) > 0
diff --git a/xinference/model/llm/__init__.py b/xinference/model/llm/__init__.py
index 5a7895eb1a..f971e65661 100644
--- a/xinference/model/llm/__init__.py
+++ b/xinference/model/llm/__init__.py
@@ -146,6 +146,7 @@ def _install():
from .transformers.internlm2 import Internlm2PytorchChatModel
from .transformers.minicpmv25 import MiniCPMV25Model
from .transformers.minicpmv26 import MiniCPMV26Model
+ from .transformers.qwen2_audio import Qwen2AudioChatModel
from .transformers.qwen2_vl import Qwen2VLChatModel
from .transformers.qwen_vl import QwenVLChatModel
from .transformers.yi_vl import YiVLChatModel
@@ -177,6 +178,7 @@ def _install():
Internlm2PytorchChatModel,
QwenVLChatModel,
Qwen2VLChatModel,
+ Qwen2AudioChatModel,
YiVLChatModel,
DeepSeekVLChatModel,
InternVLChatModel,
diff --git a/xinference/model/llm/llm_family.json b/xinference/model/llm/llm_family.json
index 1dfeca1fb4..77dda1a84d 100644
--- a/xinference/model/llm/llm_family.json
+++ b/xinference/model/llm/llm_family.json
@@ -6947,6 +6947,80 @@
""
]
},
+ {
+ "version":1,
+ "context_length":32768,
+ "model_name":"qwen2-audio-instruct",
+ "model_lang":[
+ "en",
+ "zh"
+ ],
+ "model_ability":[
+ "chat",
+ "audio"
+ ],
+ "model_description":"Qwen2-Audio: A large-scale audio-language model which is capable of accepting various audio signal inputs and performing audio analysis or direct textual responses with regard to speech instructions.",
+ "model_specs":[
+ {
+ "model_format":"pytorch",
+ "model_size_in_billions":7,
+ "quantizations":[
+ "none"
+ ],
+ "model_id":"Qwen/Qwen2-Audio-7B-Instruct",
+ "model_revision":"bac62d2c6808845904c709c17a0402d817558c64"
+ }
+ ],
+ "prompt_style":{
+ "style_name":"QWEN",
+ "system_prompt":"You are a helpful assistant",
+ "roles":[
+ "user",
+ "assistant"
+ ],
+ "stop": [
+ "<|im_end|>",
+ "<|endoftext|>"
+ ]
+ }
+ },
+ {
+ "version":1,
+ "context_length":32768,
+ "model_name":"qwen2-audio",
+ "model_lang":[
+ "en",
+ "zh"
+ ],
+ "model_ability":[
+ "chat",
+ "audio"
+ ],
+ "model_description":"Qwen2-Audio: A large-scale audio-language model which is capable of accepting various audio signal inputs and performing audio analysis or direct textual responses with regard to speech instructions.",
+ "model_specs":[
+ {
+ "model_format":"pytorch",
+ "model_size_in_billions":7,
+ "quantizations":[
+ "none"
+ ],
+ "model_id":"Qwen/Qwen2-Audio-7B",
+ "model_revision":"8577bc71d330c8fa32ffe9f8a1374100759f2466"
+ }
+ ],
+ "prompt_style":{
+ "style_name":"QWEN",
+ "system_prompt":"You are a helpful assistant",
+ "roles":[
+ "user",
+ "assistant"
+ ],
+ "stop": [
+ "<|im_end|>",
+ "<|endoftext|>"
+ ]
+ }
+ },
{
"version": 1,
"context_length": 128000,
diff --git a/xinference/model/llm/llm_family.py b/xinference/model/llm/llm_family.py
index 555921f18f..413b4229ae 100644
--- a/xinference/model/llm/llm_family.py
+++ b/xinference/model/llm/llm_family.py
@@ -132,7 +132,9 @@ class LLMFamilyV1(BaseModel):
context_length: Optional[int] = DEFAULT_CONTEXT_LENGTH
model_name: str
model_lang: List[str]
- model_ability: List[Literal["embed", "generate", "chat", "tools", "vision"]]
+ model_ability: List[
+ Literal["embed", "generate", "chat", "tools", "vision", "audio"]
+ ]
model_description: Optional[str]
# reason for not required str here: legacy registration
model_family: Optional[str]
diff --git a/xinference/model/llm/llm_family_modelscope.json b/xinference/model/llm/llm_family_modelscope.json
index b7b0da1b13..fdaab458aa 100644
--- a/xinference/model/llm/llm_family_modelscope.json
+++ b/xinference/model/llm/llm_family_modelscope.json
@@ -4656,6 +4656,74 @@
""
]
},
+ {
+ "version": 1,
+ "context_length": 32768,
+ "model_name": "qwen2-audio-instruct",
+ "model_lang": [
+ "en",
+ "zh"
+ ],
+ "model_ability": [
+ "chat",
+ "audio"
+ ],
+ "model_description": "Qwen2-Audio: A large-scale audio-language model which is capable of accepting various audio signal inputs and performing audio analysis or direct textual responses with regard to speech instructions.",
+ "model_specs": [
+ {
+ "model_format": "pytorch",
+ "model_size_in_billions": 7,
+ "quantizations": [
+ "none"
+ ],
+ "model_hub": "modelscope",
+ "model_id": "qwen/Qwen2-Audio-7B-Instruct",
+ "model_revision": "master"
+ }
+ ],
+ "prompt_style": {
+ "style_name": "QWEN",
+ "system_prompt": "You are a helpful assistant",
+ "roles": [
+ "user",
+ "assistant"
+ ]
+ }
+ },
+ {
+ "version": 1,
+ "context_length": 32768,
+ "model_name": "qwen2-audio",
+ "model_lang": [
+ "en",
+ "zh"
+ ],
+ "model_ability": [
+ "chat",
+ "audio"
+ ],
+ "model_description": "Qwen2-Audio: A large-scale audio-language model which is capable of accepting various audio signal inputs and performing audio analysis or direct textual responses with regard to speech instructions.",
+ "model_specs": [
+ {
+ "model_format": "pytorch",
+ "model_size_in_billions": 7,
+ "quantizations": [
+ "none"
+ ],
+ "model_hub": "modelscope",
+ "model_id": "qwen/Qwen2-Audio-7B",
+ "model_revision": "master"
+ }
+ ],
+ "prompt_style": {
+ "style_name": "QWEN",
+ "system_prompt": "You are a helpful assistant",
+ "roles": [
+ "user",
+ "assistant"
+ ]
+ }
+ },
{
"version": 1,
"context_length": 128000,
diff --git a/xinference/model/llm/transformers/core.py b/xinference/model/llm/transformers/core.py
index a451b7accd..e42ca6d513 100644
--- a/xinference/model/llm/transformers/core.py
+++ b/xinference/model/llm/transformers/core.py
@@ -65,6 +65,8 @@
"MiniCPM-V-2.6",
"glm-4v",
"qwen2-vl-instruct",
+ "qwen2-audio",
+ "qwen2-audio-instruct",
"deepseek-v2",
"deepseek-v2-chat",
"deepseek-v2.5",
diff --git a/xinference/model/llm/transformers/qwen2_audio.py b/xinference/model/llm/transformers/qwen2_audio.py
new file mode 100644
index 0000000000..653f7217f8
--- /dev/null
+++ b/xinference/model/llm/transformers/qwen2_audio.py
@@ -0,0 +1,168 @@
+# Copyright 2022-2023 XProbe Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import logging
+import uuid
+from io import BytesIO
+from typing import Dict, Iterator, List, Optional, Union
+from urllib.request import urlopen
+
+import numpy as np
+
+from ....model.utils import select_device
+from ....types import ChatCompletion, ChatCompletionChunk, CompletionChunk
+from ..llm_family import LLMFamilyV1, LLMSpecV1
+from ..utils import generate_chat_completion, generate_completion_chunk
+from .core import PytorchChatModel, PytorchGenerateConfig
+
+logger = logging.getLogger(__name__)
+
+
+class Qwen2AudioChatModel(PytorchChatModel):
+ def __init__(self, *args, **kwargs):
+ super().__init__(*args, **kwargs)
+ self._processor = None
+ self._model = None
+ self._device = None
+
+ @classmethod
+ def match(
+ cls, model_family: "LLMFamilyV1", model_spec: "LLMSpecV1", quantization: str
+ ) -> bool:
+ llm_family = model_family.model_family or model_family.model_name
+ if "qwen2-audio".lower() in llm_family.lower():
+ return True
+ return False
+
+ def load(self):
+ from transformers import AutoProcessor, Qwen2AudioForConditionalGeneration
+
+ device = self._pytorch_model_config.get("device", "auto")
+ device = select_device(device)
+ self._device = device
+ # for multiple GPU, set back to auto to make multiple devices work
+ device = "auto" if device == "cuda" else device
+
+ self._processor = AutoProcessor.from_pretrained(
+ self.model_path,
+ device_map=device,
+ # trust_remote_code=True,
+ code_revision=self.model_spec.model_revision,
+ )
+ self._model = Qwen2AudioForConditionalGeneration.from_pretrained(
+ self.model_path,
+ device_map=device,
+ # trust_remote_code=True,
+ revision=self.model_spec.model_revision,
+ )
+
+ def _transform_messages(
+ self,
+ messages: List[Dict],
+ ):
+ import librosa
+
+ text = self._processor.apply_chat_template(
+ messages, add_generation_prompt=True, tokenize=False
+ )
+ audios: List[np.ndarray] = []
+ for msg in messages:
+ content = msg["content"]
+ if isinstance(content, List):
+ for item in content: # type: ignore
+ if item.get("type") == "audio" and "audio_url" in item:
+ audio = librosa.load(
+ BytesIO(urlopen(item["audio_url"]).read()),
+ sr=self._processor.feature_extractor.sampling_rate,
+ )[0]
+ audios.append(audio)
+
+ return text, audios
+
+ def chat(
+ self,
+ messages: List[Dict],
+ generate_config: Optional[PytorchGenerateConfig] = None,
+ ) -> Union[ChatCompletion, Iterator[ChatCompletionChunk]]:
+ text, audios = self._transform_messages(messages)
+ inputs = self._processor(
+ text=text, audios=audios, return_tensors="pt", padding=True
+ )
+ inputs.input_ids = inputs.input_ids.to(self._device)
+ generate_config = generate_config if generate_config else {}
+ stream = generate_config.get("stream", False) if generate_config else False
+
+ if stream:
+ it = self._generate_stream(inputs, generate_config)
+ return self._to_chat_completion_chunks(it)
+ else:
+ c = self._generate(inputs, generate_config)
+ return c
+
+ def _generate(self, inputs, config: PytorchGenerateConfig = {}) -> ChatCompletion:
+ generate_ids = self._model.generate(
+ **inputs,
+ max_length=config.get("max_tokens", 512),
+ )
+ generate_ids = generate_ids[:, inputs.input_ids.size(1) :]
+ response = self._processor.batch_decode(
+ generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False
+ )[0]
+ return generate_chat_completion(self.model_uid, response)
+
+ def _generate_stream(
+ self, inputs, config: PytorchGenerateConfig = {}
+ ) -> Iterator[CompletionChunk]:
+ from threading import Thread
+
+ from transformers import TextIteratorStreamer
+
+ tokenizer = self._processor.tokenizer
+ streamer = TextIteratorStreamer(
+ tokenizer, timeout=60.0, skip_prompt=True, skip_special_tokens=True
+ )
+
+ gen_kwargs = {
+ "max_new_tokens": config.get("max_tokens", 512),
+ "streamer": streamer,
+ **inputs,
+ }
+
+ thread = Thread(target=self._model.generate, kwargs=gen_kwargs)
+ thread.start()
+
+ completion_id = str(uuid.uuid1())
+ for new_text in streamer:
+ yield generate_completion_chunk(
+ chunk_text=new_text,
+ finish_reason=None,
+ chunk_id=completion_id,
+ model_uid=self.model_uid,
+ prompt_tokens=-1,
+ completion_tokens=-1,
+ total_tokens=-1,
+ has_choice=True,
+ has_content=True,
+ )
+
+ yield generate_completion_chunk(
+ chunk_text=None,
+ finish_reason="stop",
+ chunk_id=completion_id,
+ model_uid=self.model_uid,
+ prompt_tokens=-1,
+ completion_tokens=-1,
+ total_tokens=-1,
+ has_choice=True,
+ has_content=False,
+ )
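Since Qwen2AudioChatModel implements _generate_stream, the OpenAI-compatible chat endpoint can also stream the reply; a hedged sketch, where the endpoint, the model uid (launched as in the test above), and the SSE framing follow the usual Xinference/OpenAI conventions rather than anything shown in this patch:

import json

import requests

endpoint = "http://127.0.0.1:9997"   # assumed local Xinference server
payload = {
    "model": "test_audio",           # uid used when launching qwen2-audio-instruct
    "stream": True,
    "messages": [
        {
            "role": "user",
            "content": [
                {
                    "type": "audio",
                    "audio_url": "https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen2-Audio/audio/glass-breaking-151256.mp3",
                },
                {"type": "text", "text": "What's that sound?"},
            ],
        }
    ],
}

with requests.post(f"{endpoint}/v1/chat/completions", json=payload, stream=True) as r:
    for line in r.iter_lines():
        # Each SSE data line carries one ChatCompletionChunk with a delta.
        if line.startswith(b"data:"):
            data = line[len(b"data:"):].strip()
            if data == b"[DONE]":
                break
            delta = json.loads(data)["choices"][0]["delta"]
            print(delta.get("content", ""), end="", flush=True)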
From 4aa58615ae4fd4dc3313411f6b485274f7d31c18 Mon Sep 17 00:00:00 2001
From: amumu96 <128140880+amumu96@users.noreply.github.com>
Date: Sat, 14 Sep 2024 15:23:21 +0800
Subject: [PATCH 09/17] BUG: modify vllm image version (#2311)
---
xinference/deploy/docker/Dockerfile | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/xinference/deploy/docker/Dockerfile b/xinference/deploy/docker/Dockerfile
index 5ee3f11771..810a440ecd 100644
--- a/xinference/deploy/docker/Dockerfile
+++ b/xinference/deploy/docker/Dockerfile
@@ -1,4 +1,4 @@
-FROM vllm/vllm-openai:latest
+FROM vllm/vllm-openai:0.6.0
COPY . /opt/inference
WORKDIR /opt/inference
From 4c5e752920fba416432cad7debd3722a75c3e8a2 Mon Sep 17 00:00:00 2001
From: amumu96 <128140880+amumu96@users.noreply.github.com>
Date: Sat, 14 Sep 2024 15:37:10 +0800
Subject: [PATCH 10/17] Bug: modify vllm image version (#2312)
Co-authored-by: wuzhaoxin <15667065080@162.com>
---
xinference/deploy/docker/Dockerfile | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/xinference/deploy/docker/Dockerfile b/xinference/deploy/docker/Dockerfile
index 810a440ecd..3d6afc44c3 100644
--- a/xinference/deploy/docker/Dockerfile
+++ b/xinference/deploy/docker/Dockerfile
@@ -1,4 +1,4 @@
-FROM vllm/vllm-openai:0.6.0
+FROM vllm/vllm-openai:v0.6.0
COPY . /opt/inference
WORKDIR /opt/inference
From 91c0fe85cd153158780e717d41bb3fd8036e43ff Mon Sep 17 00:00:00 2001
From: yiboyasss <143868051+yiboyasss@users.noreply.github.com>
Date: Sun, 15 Sep 2024 17:19:21 +0800
Subject: [PATCH 11/17] BUG: [UI] Fix registration page bug. (#2315)
---
xinference/web/ui/src/scenes/register_model/registerModel.js | 4 ++--
1 file changed, 2 insertions(+), 2 deletions(-)
diff --git a/xinference/web/ui/src/scenes/register_model/registerModel.js b/xinference/web/ui/src/scenes/register_model/registerModel.js
index ca02e781b4..f35196b3b5 100644
--- a/xinference/web/ui/src/scenes/register_model/registerModel.js
+++ b/xinference/web/ui/src/scenes/register_model/registerModel.js
@@ -686,12 +686,12 @@ const RegisterModelComponent = ({ modelType, customData }) => {
const handleFamilyAlert = () => {
if (
- formData.model_ability.includes('vision') &&
+ formData.model_ability?.includes('vision') &&
!family?.vision?.includes(formData.model_family)
) {
return true
} else if (
- formData.model_ability.includes('tools') &&
+ formData.model_ability?.includes('tools') &&
!family?.tools?.includes(formData.model_family)
) {
return true
From 065686edc64e0af418ca0071f2f5d541dcdbe440 Mon Sep 17 00:00:00 2001
From: codingl2k1 <138426806+codingl2k1@users.noreply.github.com>
Date: Wed, 18 Sep 2024 11:54:00 +0200
Subject: [PATCH 12/17] BUG: Fix CosyVoice missing output (#2320)
---
xinference/model/audio/cosyvoice.py | 6 +++---
1 file changed, 3 insertions(+), 3 deletions(-)
diff --git a/xinference/model/audio/cosyvoice.py b/xinference/model/audio/cosyvoice.py
index 39bcb7aa6c..9be452f473 100644
--- a/xinference/model/audio/cosyvoice.py
+++ b/xinference/model/audio/cosyvoice.py
@@ -122,10 +122,10 @@ def _generator_stream():
last_pos = new_last_pos
def _generator_block():
- chunk = next(output)
- assert isinstance(chunk, dict), "Expected data to be of type dict"
+ chunks = [o["tts_speech"] for o in output]
+ t = torch.cat(chunks, dim=1)
with BytesIO() as out:
- torchaudio.save(out, chunk["tts_speech"], 22050, format=response_format)
+ torchaudio.save(out, t, 22050, format=response_format)
return out.getvalue()
return _generator_stream() if stream else _generator_block()
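For reference, the repaired non-streaming branch now concatenates every chunk CosyVoice yields instead of encoding only the first one. A stand-alone sketch of that concatenate-then-encode step; the tensor shapes are assumptions used purely for illustration:

# Stand-alone sketch of the fixed _generator_block logic: gather all "tts_speech"
# tensors (assumed shape [1, num_samples]) and join them along the time axis.
from io import BytesIO

import torch
import torchaudio

chunks = [torch.zeros(1, 2205), torch.zeros(1, 4410)]  # stand-ins for o["tts_speech"]
waveform = torch.cat(chunks, dim=1)

with BytesIO() as out:
    # torchaudio can write to a file-like object when `format` is given explicitly.
    torchaudio.save(out, waveform, 22050, format="wav")
    audio_bytes = out.getvalue()
print(len(audio_bytes))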
From a461ad926fa088d567cd7c96a6aba3468b0a0779 Mon Sep 17 00:00:00 2001
From: Jun-Howie <62869005+Jun-Howie@users.noreply.github.com>
Date: Thu, 19 Sep 2024 05:40:53 +0800
Subject: [PATCH 13/17] FEAT: Support Qwen 2.5 (#2325)
---
xinference/model/llm/llm_family.json | 360 ++++++++++++++++
.../model/llm/llm_family_modelscope.json | 388 ++++++++++++++++++
xinference/model/llm/vllm/core.py | 1 +
3 files changed, 749 insertions(+)
diff --git a/xinference/model/llm/llm_family.json b/xinference/model/llm/llm_family.json
index 77dda1a84d..70b17daa61 100644
--- a/xinference/model/llm/llm_family.json
+++ b/xinference/model/llm/llm_family.json
@@ -7244,5 +7244,365 @@
"model_revision": "00e59e64f47d3c78e4cfbdd345888479797e8109"
}
]
+ },
+ {
+ "version": 1,
+ "context_length": 131072,
+ "model_name": "qwen2.5-instruct",
+ "model_lang": [
+ "en",
+ "zh"
+ ],
+ "model_ability": [
+ "chat",
+ "tools"
+ ],
+ "model_description": "Qwen2.5 is the latest series of Qwen large language models. For Qwen2.5, we release a number of base language models and instruction-tuned language models ranging from 0.5 to 72 billion parameters.",
+ "model_specs": [
+ {
+ "model_format": "pytorch",
+ "model_size_in_billions": "0_5",
+ "quantizations": [
+ "4-bit",
+ "8-bit",
+ "none"
+ ],
+ "model_id": "Qwen/Qwen2.5-0.5B-Instruct"
+ },
+ {
+ "model_format": "pytorch",
+ "model_size_in_billions": "1_5",
+ "quantizations": [
+ "4-bit",
+ "8-bit",
+ "none"
+ ],
+ "model_id": "Qwen/Qwen2.5-1.5B-Instruct"
+ },
+ {
+ "model_format": "pytorch",
+ "model_size_in_billions": 3,
+ "quantizations": [
+ "4-bit",
+ "8-bit",
+ "none"
+ ],
+ "model_id": "Qwen/Qwen2.5-3B-Instruct"
+ },
+ {
+ "model_format": "pytorch",
+ "model_size_in_billions": 7,
+ "quantizations": [
+ "4-bit",
+ "8-bit",
+ "none"
+ ],
+ "model_id": "Qwen/Qwen2.5-7B-Instruct"
+ },
+ {
+ "model_format": "pytorch",
+ "model_size_in_billions": 14,
+ "quantizations": [
+ "4-bit",
+ "8-bit",
+ "none"
+ ],
+ "model_id": "Qwen/Qwen2.5-14B-Instruct"
+ },
+ {
+ "model_format": "pytorch",
+ "model_size_in_billions": 32,
+ "quantizations": [
+ "4-bit",
+ "8-bit",
+ "none"
+ ],
+ "model_id": "Qwen/Qwen2.5-32B-Instruct"
+ },
+ {
+ "model_format": "pytorch",
+ "model_size_in_billions": 72,
+ "quantizations": [
+ "4-bit",
+ "8-bit",
+ "none"
+ ],
+ "model_id": "Qwen/Qwen2.5-72B-Instruct"
+ },
+ {
+ "model_format": "gptq",
+ "model_size_in_billions": "0_5",
+ "quantizations": [
+ "Int4",
+ "Int8"
+ ],
+ "model_id": "Qwen/Qwen2.5-0.5B-Instruct-GPTQ-{quantization}"
+ },
+ {
+ "model_format": "gptq",
+ "model_size_in_billions": "1_5",
+ "quantizations": [
+ "Int4",
+ "Int8"
+ ],
+ "model_id": "Qwen/Qwen2.5-1.5B-Instruct-GPTQ-{quantization}"
+ },
+ {
+ "model_format": "gptq",
+ "model_size_in_billions": 3,
+ "quantizations": [
+ "Int4",
+ "Int8"
+ ],
+ "model_id": "Qwen/Qwen2.5-3B-Instruct-GPTQ-{quantization}"
+ },
+ {
+ "model_format": "gptq",
+ "model_size_in_billions": 7,
+ "quantizations": [
+ "Int4",
+ "Int8"
+ ],
+ "model_id": "Qwen/Qwen2.5-7B-Instruct-GPTQ-{quantization}"
+ },
+ {
+ "model_format": "gptq",
+ "model_size_in_billions": 14,
+ "quantizations": [
+ "Int4",
+ "Int8"
+ ],
+ "model_id": "Qwen/Qwen2.5-14B-Instruct-GPTQ-{quantization}"
+ },
+ {
+ "model_format": "gptq",
+ "model_size_in_billions": 32,
+ "quantizations": [
+ "Int4",
+ "Int8"
+ ],
+ "model_id": "Qwen/Qwen2.5-32B-Instruct-GPTQ-{quantization}"
+ },
+ {
+ "model_format": "gptq",
+ "model_size_in_billions": 72,
+ "quantizations": [
+ "Int4",
+ "Int8"
+ ],
+ "model_id": "Qwen/Qwen2.5-72B-Instruct-GPTQ-{quantization}"
+ },
+ {
+ "model_format": "awq",
+ "model_size_in_billions": "0_5",
+ "quantizations": [
+ "Int4"
+ ],
+ "model_id": "Qwen/Qwen2.5-0.5B-Instruct-AWQ"
+ },
+ {
+ "model_format": "awq",
+ "model_size_in_billions": "1_5",
+ "quantizations": [
+ "Int4"
+ ],
+ "model_id": "Qwen/Qwen2.5-1.5B-Instruct-AWQ"
+ },
+ {
+ "model_format": "awq",
+ "model_size_in_billions": 3,
+ "quantizations": [
+ "Int4"
+ ],
+ "model_id": "Qwen/Qwen2.5-3B-Instruct-AWQ"
+ },
+ {
+ "model_format": "awq",
+ "model_size_in_billions": 7,
+ "quantizations": [
+ "Int4"
+ ],
+ "model_id": "Qwen/Qwen2.5-7B-Instruct-AWQ"
+ },
+ {
+ "model_format": "awq",
+ "model_size_in_billions": 14,
+ "quantizations": [
+ "Int4"
+ ],
+ "model_id": "Qwen/Qwen2.5-14B-Instruct-AWQ"
+ },
+ {
+ "model_format": "awq",
+ "model_size_in_billions": 32,
+ "quantizations": [
+ "Int4"
+ ],
+ "model_id": "Qwen/Qwen2.5-32B-Instruct-AWQ"
+ },
+ {
+ "model_format": "awq",
+ "model_size_in_billions": 72,
+ "quantizations": [
+ "Int4"
+ ],
+ "model_id": "Qwen/Qwen2.5-72B-Instruct-AWQ"
+ },
+ {
+ "model_format": "ggufv2",
+ "model_size_in_billions": "0_5",
+ "quantizations": [
+ "q2_k",
+ "q3_k_m",
+ "q4_0",
+ "q4_k_m",
+ "q5_0",
+ "q5_k_m",
+ "q6_k",
+ "q8_0",
+ "fp16"
+ ],
+ "model_id": "Qwen/Qwen2.5-0.5B-Instruct-GGUF",
+ "model_file_name_template": "qwen2_5-0_5b-instruct-{quantization}.gguf"
+ },
+ {
+ "model_format": "ggufv2",
+ "model_size_in_billions": "1_5",
+ "quantizations": [
+ "q2_k",
+ "q3_k_m",
+ "q4_0",
+ "q4_k_m",
+ "q5_0",
+ "q5_k_m",
+ "q6_k",
+ "q8_0",
+ "fp16"
+ ],
+ "model_id": "Qwen/Qwen2.5-1.5B-Instruct-GGUF",
+ "model_file_name_template": "qwen2_5-1_5b-instruct-{quantization}.gguf"
+ },
+ {
+ "model_format": "ggufv2",
+ "model_size_in_billions": 3,
+ "quantizations": [
+ "q2_k",
+ "q3_k_m",
+ "q4_0",
+ "q4_k_m",
+ "q5_0",
+ "q5_k_m",
+ "q6_k",
+ "q8_0",
+ "fp16"
+ ],
+ "model_id": "Qwen/Qwen2.5-3B-Instruct-GGUF",
+ "model_file_name_template": "qwen2_5-3b-instruct-{quantization}.gguf"
+ },
+ {
+ "model_format": "ggufv2",
+ "model_size_in_billions": 7,
+ "quantizations": [
+ "q2_k",
+ "q3_k_m",
+ "q4_0",
+ "q4_k_m",
+ "q5_0",
+ "q5_k_m",
+ "q6_k",
+ "q8_0",
+ "fp16"
+ ],
+ "model_id": "Qwen/Qwen2.5-7B-Instruct-GGUF",
+ "model_file_name_template": "qwen2_5-7b-instruct-{quantization}.gguf"
+ },
+ {
+ "model_format": "ggufv2",
+ "model_size_in_billions": 14,
+ "quantizations": [
+ "q2_k",
+ "q3_k_m",
+ "q4_0",
+ "q4_k_m",
+ "q5_0",
+ "q5_k_m",
+ "q6_k",
+ "q8_0",
+ "fp16"
+ ],
+ "model_id": "Qwen/Qwen2.5-14B-Instruct-GGUF",
+ "model_file_name_template": "qwen2_5-14b-instruct-{quantization}.gguf"
+ },
+ {
+ "model_format": "ggufv2",
+ "model_size_in_billions": 32,
+ "quantizations": [
+ "q2_k",
+ "q3_k_m",
+ "q4_0",
+ "q4_k_m",
+ "q5_0",
+ "q5_k_m",
+ "q6_k",
+ "q8_0",
+ "fp16"
+ ],
+ "model_id": "Qwen/Qwen2.5-32B-Instruct-GGUF",
+ "model_file_name_template": "qwen2_5-32b-instruct-{quantization}.gguf"
+ },
+ {
+ "model_format": "ggufv2",
+ "model_size_in_billions": 72,
+ "quantizations": [
+ "q2_k",
+ "q3_k_m",
+ "q4_0",
+ "q4_k_m",
+ "q5_0",
+ "q5_k_m",
+ "q6_k",
+ "q8_0",
+ "fp16"
+ ],
+ "model_id": "Qwen/Qwen2.5-72B-Instruct-GGUF",
+ "model_file_name_template": "qwen2_5-72b-instruct-{quantization}.gguf",
+ "model_file_name_split_template": "qwen2_5-72b-instruct-{quantization}-{part}.gguf",
+ "quantization_parts": {
+ "q5_0": [
+ "00001-of-00002",
+ "00002-of-00002"
+ ],
+ "q5_k_m": [
+ "00001-of-00002",
+ "00002-of-00002"
+ ],
+ "q6_k": [
+ "00001-of-00002",
+ "00002-of-00002"
+ ],
+ "q8_0": [
+ "00001-of-00002",
+ "00002-of-00002"
+ ],
+ "fp16": [
+ "00001-of-00004",
+ "00002-of-00004",
+ "00003-of-00004",
+ "00004-of-00004"
+ ]
+ }
+ }
+ ],
+ "chat_template": "{%- if tools %}\n {{- '<|im_start|>system\\n' }}\n {%- if messages[0]['role'] == 'system' %}\n {{- messages[0]['content'] }}\n {%- else %}\n {{- 'You are Qwen, created by Alibaba Cloud. You are a helpful assistant.' }}\n {%- endif %}\n {{- \"\\n\\n# Tools\\n\\nYou may call one or more functions to assist with the user query.\\n\\nYou are provided with function signatures within XML tags:\\n\" }}\n {%- for tool in tools %}\n {{- \"\\n\" }}\n {{- tool | tojson }}\n {%- endfor %}\n {{- \"\\n\\n\\nFor each function call, return a json object with function name and arguments within XML tags:\\n\\n{{\\\"name\\\": , \\\"arguments\\\": }}\\n<|im_end|>\\n\" }}\n{%- else %}\n {%- if messages[0]['role'] == 'system' %}\n {{- '<|im_start|>system\\n' + messages[0]['content'] + '<|im_end|>\\n' }}\n {%- else %}\n {{- '<|im_start|>system\\nYou are Qwen, created by Alibaba Cloud. You are a helpful assistant.<|im_end|>\\n' }}\n {%- endif %}\n{%- endif %}\n{%- for message in messages %}\n {%- if (message.role == \"user\") or (message.role == \"system\" and not loop.first) or (message.role == \"assistant\" and not message.tool_calls) %}\n {{- '<|im_start|>' + message.role + '\\n' + message.content + '<|im_end|>' + '\\n' }}\n {%- elif message.role == \"assistant\" %}\n {{- '<|im_start|>' + message.role }}\n {%- if message.content %}\n {{- '\\n' + message.content }}\n {%- endif %}\n {%- for tool_call in message.tool_calls %}\n {%- if tool_call.function is defined %}\n {%- set tool_call = tool_call.function %}\n {%- endif %}\n {{- '\\n\\n{\"name\": \"' }}\n {{- tool_call.name }}\n {{- '\", \"arguments\": ' }}\n {{- tool_call.arguments | tojson }}\n {{- '}\\n' }}\n {%- endfor %}\n {{- '<|im_end|>\\n' }}\n {%- elif message.role == \"tool\" %}\n {%- if (loop.index0 == 0) or (messages[loop.index0 - 1].role != \"tool\") %}\n {{- '<|im_start|>user' }}\n {%- endif %}\n {{- '\\n\\n' }}\n {{- message.content }}\n {{- '\\n' }}\n {%- if loop.last or (messages[loop.index0 + 1].role != \"tool\") %}\n {{- '<|im_end|>\\n' }}\n {%- endif %}\n {%- endif %}\n{%- endfor %}\n{%- if add_generation_prompt %}\n {{- '<|im_start|>assistant\\n' }}\n{%- endif %}\n",
+ "stop_token_ids": [
+ 151643,
+ 151644,
+ 151645
+ ],
+ "stop": [
+ "<|endoftext|>",
+ "<|im_start|>",
+ "<|im_end|>"
+ ]
}
]
diff --git a/xinference/model/llm/llm_family_modelscope.json b/xinference/model/llm/llm_family_modelscope.json
index fdaab458aa..7309ee9651 100644
--- a/xinference/model/llm/llm_family_modelscope.json
+++ b/xinference/model/llm/llm_family_modelscope.json
@@ -4957,5 +4957,393 @@
"model_revision": "master"
}
]
+ },
+ {
+ "version": 1,
+ "context_length": 131072,
+ "model_name": "qwen2.5-instruct",
+ "model_lang": [
+ "en",
+ "zh"
+ ],
+ "model_ability": [
+ "chat",
+ "tools"
+ ],
+ "model_description": "Qwen2.5 is the latest series of Qwen large language models. For Qwen2.5, we release a number of base language models and instruction-tuned language models ranging from 0.5 to 72 billion parameters.",
+ "model_specs": [
+ {
+ "model_format": "pytorch",
+ "model_size_in_billions": "0_5",
+ "quantizations": [
+ "4-bit",
+ "8-bit",
+ "none"
+ ],
+ "model_id": "qwen/Qwen2.5-0.5B-Instruct",
+ "model_hub": "modelscope"
+ },
+ {
+ "model_format": "pytorch",
+ "model_size_in_billions": "1_5",
+ "quantizations": [
+ "4-bit",
+ "8-bit",
+ "none"
+ ],
+ "model_id": "qwen/Qwen2.5-1.5B-Instruct",
+ "model_hub": "modelscope"
+ },
+ {
+ "model_format": "pytorch",
+ "model_size_in_billions": 3,
+ "quantizations": [
+ "4-bit",
+ "8-bit",
+ "none"
+ ],
+ "model_id": "qwen/Qwen2.5-3B-Instruct",
+ "model_hub": "modelscope"
+ },
+ {
+ "model_format": "pytorch",
+ "model_size_in_billions": 7,
+ "quantizations": [
+ "4-bit",
+ "8-bit",
+ "none"
+ ],
+ "model_id": "qwen/Qwen2.5-7B-Instruct",
+ "model_hub": "modelscope"
+ },
+ {
+ "model_format": "pytorch",
+ "model_size_in_billions": 14,
+ "quantizations": [
+ "4-bit",
+ "8-bit",
+ "none"
+ ],
+ "model_id": "qwen/Qwen2.5-14B-Instruct",
+ "model_hub": "modelscope"
+ },
+ {
+ "model_format": "pytorch",
+ "model_size_in_billions": 32,
+ "quantizations": [
+ "4-bit",
+ "8-bit",
+ "none"
+ ],
+ "model_id": "qwen/Qwen2.5-32B-Instruct",
+ "model_hub": "modelscope"
+ },
+ {
+ "model_format": "pytorch",
+ "model_size_in_billions": 72,
+ "quantizations": [
+ "4-bit",
+ "8-bit",
+ "none"
+ ],
+ "model_id": "qwen/Qwen2.5-72B-Instruct",
+ "model_hub": "modelscope"
+ },
+ {
+ "model_format": "gptq",
+ "model_size_in_billions": "0_5",
+ "quantizations": [
+ "Int4",
+ "Int8"
+ ],
+ "model_id": "qwen/Qwen2.5-0.5B-Instruct-GPTQ-{quantization}",
+ "model_hub": "modelscope"
+ },
+ {
+ "model_format": "gptq",
+ "model_size_in_billions": "1_5",
+ "quantizations": [
+ "Int4",
+ "Int8"
+ ],
+ "model_id": "qwen/Qwen2.5-1.5B-Instruct-GPTQ-{quantization}",
+ "model_hub": "modelscope"
+ },
+ {
+ "model_format": "gptq",
+ "model_size_in_billions": 3,
+ "quantizations": [
+ "Int4",
+ "Int8"
+ ],
+ "model_id": "qwen/Qwen2.5-3B-Instruct-GPTQ-{quantization}",
+ "model_hub": "modelscope"
+ },
+ {
+ "model_format": "gptq",
+ "model_size_in_billions": 7,
+ "quantizations": [
+ "Int4",
+ "Int8"
+ ],
+ "model_id": "qwen/Qwen2.5-7B-Instruct-GPTQ-{quantization}",
+ "model_hub": "modelscope"
+ },
+ {
+ "model_format": "gptq",
+ "model_size_in_billions": 14,
+ "quantizations": [
+ "Int4",
+ "Int8"
+ ],
+ "model_id": "qwen/Qwen2.5-14B-Instruct-GPTQ-{quantization}",
+ "model_hub": "modelscope"
+ },
+ {
+ "model_format": "gptq",
+ "model_size_in_billions": 32,
+ "quantizations": [
+ "Int4",
+ "Int8"
+ ],
+ "model_id": "qwen/Qwen2.5-32B-Instruct-GPTQ-{quantization}",
+ "model_hub": "modelscope"
+ },
+ {
+ "model_format": "gptq",
+ "model_size_in_billions": 72,
+ "quantizations": [
+ "Int4",
+ "Int8"
+ ],
+ "model_id": "qwen/Qwen2.5-72B-Instruct-GPTQ-{quantization}",
+ "model_hub": "modelscope"
+ },
+ {
+ "model_format": "awq",
+ "model_size_in_billions": "0_5",
+ "quantizations": [
+ "Int4"
+ ],
+ "model_id": "qwen/Qwen2-0.5B-Instruct-AWQ",
+ "model_hub": "modelscope"
+ },
+ {
+ "model_format": "awq",
+ "model_size_in_billions": "1_5",
+ "quantizations": [
+ "Int4"
+ ],
+ "model_id": "qwen/Qwen2-1.5B-Instruct-AWQ",
+ "model_hub": "modelscope"
+ },
+ {
+ "model_format": "awq",
+ "model_size_in_billions": 3,
+ "quantizations": [
+ "Int4"
+ ],
+ "model_id": "qwen/Qwen2.5-3B-Instruct-AWQ",
+ "model_hub": "modelscope"
+ },
+ {
+ "model_format": "awq",
+ "model_size_in_billions": 7,
+ "quantizations": [
+ "Int4"
+ ],
+ "model_id": "qwen/Qwen2.5-7B-Instruct-AWQ",
+ "model_hub": "modelscope"
+ },
+ {
+ "model_format": "awq",
+ "model_size_in_billions":14,
+ "quantizations": [
+ "Int4"
+ ],
+ "model_id": "qwen/Qwen2.5-14B-Instruct-AWQ",
+ "model_hub": "modelscope"
+ },
+ {
+ "model_format": "awq",
+ "model_size_in_billions": 32,
+ "quantizations": [
+ "Int4"
+ ],
+ "model_id": "qwen/Qwen2.5-32B-Instruct-AWQ",
+ "model_hub": "modelscope"
+ },
+ {
+ "model_format": "awq",
+ "model_size_in_billions": 72,
+ "quantizations": [
+ "Int4"
+ ],
+ "model_id": "qwen/Qwen2.5-72B-Instruct-AWQ",
+ "model_hub": "modelscope"
+ },
+ {
+ "model_format": "ggufv2",
+ "model_size_in_billions": "0_5",
+ "quantizations": [
+ "q2_k",
+ "q3_k_m",
+ "q4_0",
+ "q4_k_m",
+ "q5_0",
+ "q5_k_m",
+ "q6_k",
+ "q8_0",
+ "fp16"
+ ],
+ "model_id": "qwen/Qwen2.5-0.5B-Instruct-GGUF",
+ "model_file_name_template": "qwen2_5-0_5b-instruct-{quantization}.gguf",
+ "model_hub": "modelscope"
+ },
+ {
+ "model_format": "ggufv2",
+ "model_size_in_billions": "1_5",
+ "quantizations": [
+ "q2_k",
+ "q3_k_m",
+ "q4_0",
+ "q4_k_m",
+ "q5_0",
+ "q5_k_m",
+ "q6_k",
+ "q8_0",
+ "fp16"
+ ],
+ "model_id": "qwen/Qwen2.5-1.5B-Instruct-GGUF",
+ "model_file_name_template": "qwen2_5-1_5b-instruct-{quantization}.gguf",
+ "model_hub": "modelscope"
+ },
+ {
+ "model_format": "ggufv2",
+ "model_size_in_billions": 3,
+ "quantizations": [
+ "q2_k",
+ "q3_k_m",
+ "q4_0",
+ "q4_k_m",
+ "q5_0",
+ "q5_k_m",
+ "q6_k",
+ "q8_0",
+ "fp16"
+ ],
+ "model_id": "qwen/Qwen2.5-3B-Instruct-GGUF",
+ "model_file_name_template": "qwen2_5-3b-instruct-{quantization}.gguf",
+ "model_hub": "modelscope"
+ },
+ {
+ "model_format": "ggufv2",
+ "model_size_in_billions": 7,
+ "quantizations": [
+ "q2_k",
+ "q3_k_m",
+ "q4_0",
+ "q4_k_m",
+ "q5_0",
+ "q5_k_m",
+ "q6_k",
+ "q8_0",
+ "fp16"
+ ],
+ "model_id": "qwen/Qwen2.5-7B-Instruct-GGUF",
+ "model_file_name_template": "qwen2_5-7b-instruct-{quantization}.gguf",
+ "model_hub": "modelscope"
+ },
+ {
+ "model_format": "ggufv2",
+ "model_size_in_billions": 14,
+ "quantizations": [
+ "q2_k",
+ "q3_k_m",
+ "q4_0",
+ "q4_k_m",
+ "q5_0",
+ "q5_k_m",
+ "q6_k",
+ "q8_0",
+ "fp16"
+ ],
+ "model_id": "qwen/Qwen2.5-14B-Instruct-GGUF",
+ "model_file_name_template": "qwen2_5-14b-instruct-{quantization}.gguf",
+ "model_hub": "modelscope"
+ },
+ {
+ "model_format": "ggufv2",
+ "model_size_in_billions": 32,
+ "quantizations": [
+ "q2_k",
+ "q3_k_m",
+ "q4_0",
+ "q4_k_m",
+ "q5_0",
+ "q5_k_m",
+ "q6_k",
+ "q8_0",
+ "fp16"
+ ],
+ "model_id": "qwen/Qwen2.5-32B-Instruct-GGUF",
+ "model_file_name_template": "qwen2_5-32b-instruct-{quantization}.gguf",
+ "model_hub": "modelscope"
+ },
+ {
+ "model_format": "ggufv2",
+ "model_size_in_billions": 72,
+ "quantizations": [
+ "q2_k",
+ "q3_k_m",
+ "q4_0",
+ "q4_k_m",
+ "q5_0",
+ "q5_k_m",
+ "q6_k",
+ "q8_0",
+ "fp16"
+ ],
+ "model_id": "qwen/Qwen2.5-72B-Instruct-GGUF",
+ "model_hub": "modelscope",
+ "model_file_name_template": "qwen2_5-72b-instruct-{quantization}.gguf",
+ "model_file_name_split_template": "qwen2_5-72b-instruct-{quantization}-{part}.gguf",
+ "quantization_parts": {
+ "q5_0": [
+ "00001-of-00002",
+ "00002-of-00002"
+ ],
+ "q5_k_m": [
+ "00001-of-00002",
+ "00002-of-00002"
+ ],
+ "q6_k": [
+ "00001-of-00002",
+ "00002-of-00002"
+ ],
+ "q8_0": [
+ "00001-of-00002",
+ "00002-of-00002"
+ ],
+ "fp16": [
+ "00001-of-00004",
+ "00002-of-00004",
+ "00003-of-00004",
+ "00004-of-00004"
+ ]
+ }
+ }
+ ],
+ "chat_template": "{%- if tools %}\n {{- '<|im_start|>system\\n' }}\n {%- if messages[0]['role'] == 'system' %}\n {{- messages[0]['content'] }}\n {%- else %}\n {{- 'You are Qwen, created by Alibaba Cloud. You are a helpful assistant.' }}\n {%- endif %}\n {{- \"\\n\\n# Tools\\n\\nYou may call one or more functions to assist with the user query.\\n\\nYou are provided with function signatures within XML tags:\\n\" }}\n {%- for tool in tools %}\n {{- \"\\n\" }}\n {{- tool | tojson }}\n {%- endfor %}\n {{- \"\\n\\n\\nFor each function call, return a json object with function name and arguments within XML tags:\\n\\n{{\\\"name\\\": , \\\"arguments\\\": }}\\n<|im_end|>\\n\" }}\n{%- else %}\n {%- if messages[0]['role'] == 'system' %}\n {{- '<|im_start|>system\\n' + messages[0]['content'] + '<|im_end|>\\n' }}\n {%- else %}\n {{- '<|im_start|>system\\nYou are Qwen, created by Alibaba Cloud. You are a helpful assistant.<|im_end|>\\n' }}\n {%- endif %}\n{%- endif %}\n{%- for message in messages %}\n {%- if (message.role == \"user\") or (message.role == \"system\" and not loop.first) or (message.role == \"assistant\" and not message.tool_calls) %}\n {{- '<|im_start|>' + message.role + '\\n' + message.content + '<|im_end|>' + '\\n' }}\n {%- elif message.role == \"assistant\" %}\n {{- '<|im_start|>' + message.role }}\n {%- if message.content %}\n {{- '\\n' + message.content }}\n {%- endif %}\n {%- for tool_call in message.tool_calls %}\n {%- if tool_call.function is defined %}\n {%- set tool_call = tool_call.function %}\n {%- endif %}\n {{- '\\n\\n{\"name\": \"' }}\n {{- tool_call.name }}\n {{- '\", \"arguments\": ' }}\n {{- tool_call.arguments | tojson }}\n {{- '}\\n' }}\n {%- endfor %}\n {{- '<|im_end|>\\n' }}\n {%- elif message.role == \"tool\" %}\n {%- if (loop.index0 == 0) or (messages[loop.index0 - 1].role != \"tool\") %}\n {{- '<|im_start|>user' }}\n {%- endif %}\n {{- '\\n\\n' }}\n {{- message.content }}\n {{- '\\n' }}\n {%- if loop.last or (messages[loop.index0 + 1].role != \"tool\") %}\n {{- '<|im_end|>\\n' }}\n {%- endif %}\n {%- endif %}\n{%- endfor %}\n{%- if add_generation_prompt %}\n {{- '<|im_start|>assistant\\n' }}\n{%- endif %}\n",
+ "stop_token_ids": [
+ 151643,
+ 151644,
+ 151645
+ ],
+ "stop": [
+ "<|endoftext|>",
+ "<|im_start|>",
+ "<|im_end|>"
+ ]
}
]
diff --git a/xinference/model/llm/vllm/core.py b/xinference/model/llm/vllm/core.py
index 811fd5d342..3aaee0738f 100644
--- a/xinference/model/llm/vllm/core.py
+++ b/xinference/model/llm/vllm/core.py
@@ -138,6 +138,7 @@ class VLLMGenerateConfig(TypedDict, total=False):
VLLM_SUPPORTED_MODELS.append("codeqwen1.5")
VLLM_SUPPORTED_CHAT_MODELS.append("codeqwen1.5-chat")
VLLM_SUPPORTED_CHAT_MODELS.append("qwen2-instruct")
+ VLLM_SUPPORTED_CHAT_MODELS.append("qwen2.5-instruct")
if VLLM_INSTALLED and vllm.__version__ >= "0.3.2":
VLLM_SUPPORTED_CHAT_MODELS.append("gemma-it")
From 9820786a56ade6af783c69e96a8f30319b30f1f2 Mon Sep 17 00:00:00 2001
From: amumu96 <128140880+amumu96@users.noreply.github.com>
Date: Fri, 20 Sep 2024 14:32:00 +0800
Subject: [PATCH 14/17] BUG: support old register llm format (#2335)
---
xinference/model/llm/__init__.py | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/xinference/model/llm/__init__.py b/xinference/model/llm/__init__.py
index f971e65661..a9f05a9b25 100644
--- a/xinference/model/llm/__init__.py
+++ b/xinference/model/llm/__init__.py
@@ -121,7 +121,7 @@ def register_custom_model():
with codecs.open(
os.path.join(user_defined_llm_dir, f), encoding="utf-8"
) as fd:
- user_defined_llm_family = CustomLLMFamilyV1.parse_obj(json.load(fd))
+ user_defined_llm_family = CustomLLMFamilyV1.parse_raw(fd.read())
register_llm(user_defined_llm_family, persist=False)
except Exception as e:
warnings.warn(f"{user_defined_llm_dir}/{f} has error, {e}")
From 3cc9bc525667e2161ae072d3d892d33b2723b2a9 Mon Sep 17 00:00:00 2001
From: Xuye Qin
Date: Fri, 20 Sep 2024 15:06:33 +0800
Subject: [PATCH 15/17] BUG: fix stable diffusion from dify tool (#2336)
---
xinference/core/model.py | 4 +-
.../model/image/stable_diffusion/core.py | 49 ++++++++++++-------
2 files changed, 32 insertions(+), 21 deletions(-)
diff --git a/xinference/core/model.py b/xinference/core/model.py
index 1f711fb117..2274f422c0 100644
--- a/xinference/core/model.py
+++ b/xinference/core/model.py
@@ -769,7 +769,7 @@ async def image_to_image(
self,
image: "PIL.Image",
prompt: str,
- negative_prompt: str,
+ negative_prompt: Optional[str] = None,
n: int = 1,
size: Optional[str] = None,
response_format: str = "url",
@@ -777,12 +777,12 @@ async def image_to_image(
**kwargs,
):
kwargs.pop("request_id", None)
+ kwargs["negative_prompt"] = negative_prompt
if hasattr(self._model, "image_to_image"):
return await self._call_wrapper_json(
self._model.image_to_image,
image,
prompt,
- negative_prompt,
n,
size,
response_format,
diff --git a/xinference/model/image/stable_diffusion/core.py b/xinference/model/image/stable_diffusion/core.py
index 5a7e99fe33..53151b2c19 100644
--- a/xinference/model/image/stable_diffusion/core.py
+++ b/xinference/model/image/stable_diffusion/core.py
@@ -21,6 +21,7 @@
import sys
import time
import uuid
+import warnings
from concurrent.futures import ThreadPoolExecutor
from functools import partial
from io import BytesIO
@@ -31,7 +32,7 @@
from PIL import ImageOps
from ....constants import XINFERENCE_IMAGE_DIR
-from ....device_utils import move_model_to_available_device
+from ....device_utils import get_available_device, move_model_to_available_device
from ....types import Image, ImageList, LoRA
from ..sdapi import SDAPIDiffusionModelMixin
@@ -60,6 +61,23 @@
]
+def model_accept_param(params: Union[str, List[str]], model: Any) -> bool:
+ params = [params] if isinstance(params, str) else params
+ # model is diffusers Pipeline
+ parameters = inspect.signature(model.__call__).parameters # type: ignore
+ allow_params = False
+ for param in parameters.values():
+ if param.kind == inspect.Parameter.VAR_KEYWORD:
+ # the __call__ can accept **kwargs,
+ # we treat it as it can accept any parameters
+ allow_params = True
+ break
+ if not allow_params:
+ if all(param in parameters for param in params):
+ allow_params = True
+ return allow_params
+
+
class DiffusionModel(SDAPIDiffusionModelMixin):
def __init__(
self,
@@ -187,7 +205,7 @@ def load(self):
@staticmethod
def _get_scheduler(model: Any, sampler_name: str):
- if not sampler_name:
+ if not sampler_name or sampler_name == "default":
return
assert model is not None
@@ -283,13 +301,14 @@ def _call_model(
origin_size = kwargs.pop("origin_size", None)
seed = kwargs.pop("seed", None)
if seed is not None:
- kwargs["generator"] = generator = torch.Generator(device=self._model.device) # type: ignore
+ kwargs["generator"] = generator = torch.Generator(device=get_available_device()) # type: ignore
if seed != -1:
kwargs["generator"] = generator.manual_seed(seed)
sampler_name = kwargs.pop("sampler_name", None)
assert callable(model)
with self._reset_when_done(model, sampler_name):
logger.debug("stable diffusion args: %s, model: %s", kwargs, model)
+ self._filter_kwargs(model, kwargs)
images = model(**kwargs).images
# revert padding if padded
@@ -328,11 +347,17 @@ def _gen_base64_image(_img):
raise ValueError(f"Unsupported response format: {response_format}")
@classmethod
- def _filter_kwargs(cls, kwargs: dict):
+ def _filter_kwargs(cls, model, kwargs: dict):
for arg in ["negative_prompt", "num_inference_steps"]:
if not kwargs.get(arg):
kwargs.pop(arg, None)
+ for key in list(kwargs):
+ allow_key = model_accept_param(key, model)
+ if not allow_key:
+ warnings.warn(f"{type(model)} cannot accept `{key}`, will ignore it")
+ kwargs.pop(key)
+
def text_to_image(
self,
prompt: str,
@@ -346,7 +371,6 @@ def text_to_image(
width, height = map(int, re.split(r"[^\d]+", size))
generate_kwargs = self._model_spec.default_generate_config.copy() # type: ignore
generate_kwargs.update({k: v for k, v in kwargs.items() if v is not None})
- self._filter_kwargs(generate_kwargs)
return self._call_model(
prompt=prompt,
height=height,
@@ -368,7 +392,6 @@ def image_to_image(
self,
image: PIL.Image,
prompt: Optional[Union[str, List[str]]] = None,
- negative_prompt: Optional[Union[str, List[str]]] = None,
n: int = 1,
size: Optional[str] = None,
response_format: str = "url",
@@ -404,19 +427,10 @@ def image_to_image(
kwargs["height"] = height
else:
# SD3 image2image cannot accept width and height
- parameters = inspect.signature(model.__call__).parameters # type: ignore
- allow_width_height = False
- for param in parameters.values():
- if param.kind == inspect.Parameter.VAR_KEYWORD:
- allow_width_height = True
- break
- if "width" in parameters or "height" in parameters:
- allow_width_height = True
+ allow_width_height = model_accept_param(["width", "height"], model)
if allow_width_height:
kwargs["width"], kwargs["height"] = image.size
- kwargs["negative_prompt"] = negative_prompt
- self._filter_kwargs(kwargs)
return self._call_model(
image=image,
prompt=prompt,
@@ -431,7 +445,6 @@ def inpainting(
image: PIL.Image,
mask_image: PIL.Image,
prompt: Optional[Union[str, List[str]]] = None,
- negative_prompt: Optional[Union[str, List[str]]] = None,
n: int = 1,
size: str = "1024*1024",
response_format: str = "url",
@@ -469,8 +482,6 @@ def inpainting(
# calculate actual image size after padding
width, height = image.size
- kwargs["negative_prompt"] = negative_prompt
- self._filter_kwargs(kwargs)
return self._call_model(
image=image,
mask_image=mask_image,
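The refactor above centralizes a signature check: a keyword argument is forwarded to a diffusers pipeline only when the pipeline's __call__ either declares that parameter or accepts **kwargs. A stand-alone sketch of the same check; the sample pipeline class is a hypothetical stand-in, not a diffusers API:

# Stand-alone sketch of the check that model_accept_param performs on __call__.
import inspect
from typing import Any, List, Union


def accepts(params: Union[str, List[str]], fn: Any) -> bool:
    params = [params] if isinstance(params, str) else params
    parameters = inspect.signature(fn).parameters
    if any(p.kind == inspect.Parameter.VAR_KEYWORD for p in parameters.values()):
        return True  # **kwargs swallows any keyword
    return all(param in parameters for param in params)


class FakePipeline:  # hypothetical stand-in for a diffusers pipeline
    def __call__(self, prompt, width=512, height=512):
        return prompt


print(accepts(["width", "height"], FakePipeline()))  # True
print(accepts("negative_prompt", FakePipeline()))    # False -> dropped with a warning by _filter_kwargs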
From 67bd4db700b060948a3b77d6f53920b90c947a6d Mon Sep 17 00:00:00 2001
From: Xuye Qin
Date: Fri, 20 Sep 2024 16:11:16 +0800
Subject: [PATCH 16/17] DOC: update models for doc and readme (#2330)
---
README.md | 8 +-
README_zh_CN.md | 8 +-
doc/source/getting_started/installation.rst | 4 +-
.../builtin/audio/fishspeech-1.2-sft.rst | 19 -
.../models/builtin/audio/fishspeech-1.4.rst | 19 +
doc/source/models/builtin/audio/index.rst | 2 +-
.../models/builtin/image/flux.1-dev.rst | 2 +-
.../models/builtin/image/flux.1-schnell.rst | 2 +-
.../builtin/llm/deepseek-v2-chat-0628.rst | 31 ++
.../models/builtin/llm/deepseek-v2-chat.rst | 47 ++
.../models/builtin/llm/deepseek-v2.5.rst | 31 ++
doc/source/models/builtin/llm/deepseek-v2.rst | 47 ++
doc/source/models/builtin/llm/index.rst | 63 +++
.../builtin/llm/qwen2-audio-instruct.rst | 31 ++
doc/source/models/builtin/llm/qwen2-audio.rst | 31 ++
.../models/builtin/llm/qwen2.5-instruct.rst | 463 ++++++++++++++++++
.../models/builtin/llm/yi-coder-chat.rst | 47 ++
doc/source/models/builtin/llm/yi-coder.rst | 47 ++
doc/source/user_guide/backends.rst | 4 +-
19 files changed, 874 insertions(+), 32 deletions(-)
delete mode 100644 doc/source/models/builtin/audio/fishspeech-1.2-sft.rst
create mode 100644 doc/source/models/builtin/audio/fishspeech-1.4.rst
create mode 100644 doc/source/models/builtin/llm/deepseek-v2-chat-0628.rst
create mode 100644 doc/source/models/builtin/llm/deepseek-v2-chat.rst
create mode 100644 doc/source/models/builtin/llm/deepseek-v2.5.rst
create mode 100644 doc/source/models/builtin/llm/deepseek-v2.rst
create mode 100644 doc/source/models/builtin/llm/qwen2-audio-instruct.rst
create mode 100644 doc/source/models/builtin/llm/qwen2-audio.rst
create mode 100644 doc/source/models/builtin/llm/qwen2.5-instruct.rst
create mode 100644 doc/source/models/builtin/llm/yi-coder-chat.rst
create mode 100644 doc/source/models/builtin/llm/yi-coder.rst
diff --git a/README.md b/README.md
index 576dff498e..f478bfd37b 100644
--- a/README.md
+++ b/README.md
@@ -34,14 +34,14 @@ potential of cutting-edge AI models.
- Support speech recognition model: [#929](https://github.com/xorbitsai/inference/pull/929)
- Metrics support: [#906](https://github.com/xorbitsai/inference/pull/906)
### New Models
+- Built-in support for [Qwen 2.5 Series](https://qwenlm.github.io/blog/qwen2.5/): [#2325](https://github.com/xorbitsai/inference/pull/2325)
+- Built-in support for [Fish Speech V1.4](https://huggingface.co/fishaudio/fish-speech-1.4): [#2295](https://github.com/xorbitsai/inference/pull/2295)
+- Built-in support for [DeepSeek-V2.5](https://huggingface.co/deepseek-ai/DeepSeek-V2.5): [#2292](https://github.com/xorbitsai/inference/pull/2292)
+- Built-in support for [Qwen2-Audio](https://github.com/QwenLM/Qwen2-Audio): [#2271](https://github.com/xorbitsai/inference/pull/2271)
- Built-in support for [Qwen2-vl-instruct](https://github.com/QwenLM/Qwen2-VL): [#2205](https://github.com/xorbitsai/inference/pull/2205)
- Built-in support for [MiniCPM3-4B](https://huggingface.co/openbmb/MiniCPM3-4B): [#2263](https://github.com/xorbitsai/inference/pull/2263)
- Built-in support for [CogVideoX](https://github.com/THUDM/CogVideo): [#2049](https://github.com/xorbitsai/inference/pull/2049)
- Built-in support for [flux.1-schnell & flux.1-dev](https://www.basedlabs.ai/tools/flux1): [#2007](https://github.com/xorbitsai/inference/pull/2007)
-- Built-in support for [MiniCPM-V 2.6](https://github.com/OpenBMB/MiniCPM-V): [#2031](https://github.com/xorbitsai/inference/pull/2031)
-- Built-in support for [Kolors](https://huggingface.co/Kwai-Kolors/Kolors): [#2028](https://github.com/xorbitsai/inference/pull/2028)
-- Built-in support for [SenseVoice](https://github.com/FunAudioLLM/SenseVoice): [#2008](https://github.com/xorbitsai/inference/pull/2008)
-- Built-in support for [Mistral Large 2](https://mistral.ai/news/mistral-large-2407/): [#1944](https://github.com/xorbitsai/inference/pull/1944)
### Integrations
- [Dify](https://docs.dify.ai/advanced/model-configuration/xinference): an LLMOps platform that enables developers (and even non-developers) to quickly build useful applications based on large language models, ensuring they are visual, operable, and improvable.
- [FastGPT](https://github.com/labring/FastGPT): a knowledge-based platform built on the LLM, offers out-of-the-box data processing and model invocation capabilities, allows for workflow orchestration through Flow visualization.
diff --git a/README_zh_CN.md b/README_zh_CN.md
index 08a1f80b27..cd155e3997 100644
--- a/README_zh_CN.md
+++ b/README_zh_CN.md
@@ -31,14 +31,14 @@ Xorbits Inference(Xinference)是一个性能强大且功能全面的分布
- 支持语音识别模型: [#929](https://github.com/xorbitsai/inference/pull/929)
- 增加 Metrics 统计信息: [#906](https://github.com/xorbitsai/inference/pull/906)
### 新模型
+- 内置 [Qwen 2.5 Series](https://qwenlm.github.io/blog/qwen2.5/): [#2325](https://github.com/xorbitsai/inference/pull/2325)
+- 内置 [Fish Speech V1.4](https://huggingface.co/fishaudio/fish-speech-1.4): [#2295](https://github.com/xorbitsai/inference/pull/2295)
+- 内置 [DeepSeek-V2.5](https://huggingface.co/deepseek-ai/DeepSeek-V2.5): [#2292](https://github.com/xorbitsai/inference/pull/2292)
+- 内置 [Qwen2-Audio](https://github.com/QwenLM/Qwen2-Audio): [#2271](https://github.com/xorbitsai/inference/pull/2271)
- 内置 [Qwen2-vl-instruct](https://github.com/QwenLM/Qwen2-VL): [#2205](https://github.com/xorbitsai/inference/pull/2205)
- 内置 [MiniCPM3-4B](https://huggingface.co/openbmb/MiniCPM3-4B): [#2263](https://github.com/xorbitsai/inference/pull/2263)
- 内置 [CogVideoX](https://github.com/THUDM/CogVideo): [#2049](https://github.com/xorbitsai/inference/pull/2049)
- 内置 [flux.1-schnell & flux.1-dev](https://www.basedlabs.ai/tools/flux1): [#2007](https://github.com/xorbitsai/inference/pull/2007)
-- 内置 [MiniCPM-V 2.6](https://github.com/OpenBMB/MiniCPM-V): [#2031](https://github.com/xorbitsai/inference/pull/2031)
-- 内置 [Kolors](https://huggingface.co/Kwai-Kolors/Kolors): [#2028](https://github.com/xorbitsai/inference/pull/2028)
-- 内置 [SenseVoice](https://github.com/FunAudioLLM/SenseVoice): [#2008](https://github.com/xorbitsai/inference/pull/2008)
-- 内置 [Mistral Large 2](https://mistral.ai/news/mistral-large-2407/): [#1944](https://github.com/xorbitsai/inference/pull/1944)
### 集成
- [FastGPT](https://doc.fastai.site/docs/development/custom-models/xinference/):一个基于 LLM 大模型的开源 AI 知识库构建平台。提供了开箱即用的数据处理、模型调用、RAG 检索、可视化 AI 工作流编排等能力,帮助您轻松实现复杂的问答场景。
- [Dify](https://docs.dify.ai/advanced/model-configuration/xinference): 一个涵盖了大型语言模型开发、部署、维护和优化的 LLMOps 平台。
diff --git a/doc/source/getting_started/installation.rst b/doc/source/getting_started/installation.rst
index e52384bee7..8490f93439 100644
--- a/doc/source/getting_started/installation.rst
+++ b/doc/source/getting_started/installation.rst
@@ -44,7 +44,8 @@ Currently, supported models include:
- ``codestral-v0.1``
- ``Yi``, ``Yi-1.5``, ``Yi-chat``, ``Yi-1.5-chat``, ``Yi-1.5-chat-16k``
- ``code-llama``, ``code-llama-python``, ``code-llama-instruct``
-- ``deepseek``, ``deepseek-coder``, ``deepseek-chat``, ``deepseek-coder-instruct``
+- ``deepseek``, ``deepseek-coder``, ``deepseek-chat``, ``deepseek-coder-instruct``, ``deepseek-v2-chat``, ``deepseek-v2-chat-0628``, ``deepseek-v2.5``
+- ``yi-coder``, ``yi-coder-chat``
- ``codeqwen1.5``, ``codeqwen1.5-chat``
- ``baichuan-2-chat``
- ``internlm2-chat``
@@ -56,6 +57,7 @@ Currently, supported models include:
- ``codegeex4``
- ``qwen1.5-chat``, ``qwen1.5-moe-chat``
- ``qwen2-instruct``, ``qwen2-moe-instruct``
+- ``qwen2.5-instruct``
- ``gemma-it``, ``gemma-2-it``
- ``orion-chat``, ``orion-chat-rag``
- ``c4ai-command-r-v01``
diff --git a/doc/source/models/builtin/audio/fishspeech-1.2-sft.rst b/doc/source/models/builtin/audio/fishspeech-1.2-sft.rst
deleted file mode 100644
index 3afac1f7e3..0000000000
--- a/doc/source/models/builtin/audio/fishspeech-1.2-sft.rst
+++ /dev/null
@@ -1,19 +0,0 @@
-.. _models_builtin_fishspeech-1.2-sft:
-
-==================
-FishSpeech-1.2-SFT
-==================
-
-- **Model Name:** FishSpeech-1.2-SFT
-- **Model Family:** FishAudio
-- **Abilities:** text-to-audio
-- **Multilingual:** True
-
-Specifications
-^^^^^^^^^^^^^^
-
-- **Model ID:** fishaudio/fish-speech-1.2-sft
-
-Execute the following command to launch the model::
-
- xinference launch --model-name FishSpeech-1.2-SFT --model-type audio
\ No newline at end of file
diff --git a/doc/source/models/builtin/audio/fishspeech-1.4.rst b/doc/source/models/builtin/audio/fishspeech-1.4.rst
new file mode 100644
index 0000000000..c256495d67
--- /dev/null
+++ b/doc/source/models/builtin/audio/fishspeech-1.4.rst
@@ -0,0 +1,19 @@
+.. _models_builtin_fishspeech-1.4:
+
+==============
+FishSpeech-1.4
+==============
+
+- **Model Name:** FishSpeech-1.4
+- **Model Family:** FishAudio
+- **Abilities:** text-to-audio
+- **Multilingual:** True
+
+Specifications
+^^^^^^^^^^^^^^
+
+- **Model ID:** fishaudio/fish-speech-1.4
+
+Execute the following command to launch the model::
+
+ xinference launch --model-name FishSpeech-1.4 --model-type audio
\ No newline at end of file
diff --git a/doc/source/models/builtin/audio/index.rst b/doc/source/models/builtin/audio/index.rst
index 8959b2b94f..d4b6b886ac 100644
--- a/doc/source/models/builtin/audio/index.rst
+++ b/doc/source/models/builtin/audio/index.rst
@@ -25,7 +25,7 @@ The following is a list of built-in audio models in Xinference:
cosyvoice-300m-sft
- fishspeech-1.2-sft
+ fishspeech-1.4
sensevoicesmall
diff --git a/doc/source/models/builtin/image/flux.1-dev.rst b/doc/source/models/builtin/image/flux.1-dev.rst
index 829bcbfd75..3a16cfe0a7 100644
--- a/doc/source/models/builtin/image/flux.1-dev.rst
+++ b/doc/source/models/builtin/image/flux.1-dev.rst
@@ -6,7 +6,7 @@ FLUX.1-dev
- **Model Name:** FLUX.1-dev
- **Model Family:** stable_diffusion
-- **Abilities:** text2image
+- **Abilities:** text2image, image2image, inpainting
- **Available ControlNet:** None
Specifications
diff --git a/doc/source/models/builtin/image/flux.1-schnell.rst b/doc/source/models/builtin/image/flux.1-schnell.rst
index 268f5a1720..df82d2069f 100644
--- a/doc/source/models/builtin/image/flux.1-schnell.rst
+++ b/doc/source/models/builtin/image/flux.1-schnell.rst
@@ -6,7 +6,7 @@ FLUX.1-schnell
- **Model Name:** FLUX.1-schnell
- **Model Family:** stable_diffusion
-- **Abilities:** text2image
+- **Abilities:** text2image, image2image, inpainting
- **Available ControlNet:** None
Specifications
diff --git a/doc/source/models/builtin/llm/deepseek-v2-chat-0628.rst b/doc/source/models/builtin/llm/deepseek-v2-chat-0628.rst
new file mode 100644
index 0000000000..d6e91cb248
--- /dev/null
+++ b/doc/source/models/builtin/llm/deepseek-v2-chat-0628.rst
@@ -0,0 +1,31 @@
+.. _models_llm_deepseek-v2-chat-0628:
+
+========================================
+deepseek-v2-chat-0628
+========================================
+
+- **Context Length:** 128000
+- **Model Name:** deepseek-v2-chat-0628
+- **Languages:** en, zh
+- **Abilities:** chat
+- **Description:** DeepSeek-V2-Chat-0628 is an improved version of DeepSeek-V2-Chat.
+
+Specifications
+^^^^^^^^^^^^^^
+
+
+Model Spec 1 (pytorch, 236 Billion)
+++++++++++++++++++++++++++++++++++++++++
+
+- **Model Format:** pytorch
+- **Model Size (in billions):** 236
+- **Quantizations:** 4-bit, 8-bit, none
+- **Engines**: vLLM, Transformers, SGLang (vLLM and SGLang only available for quantization none)
+- **Model ID:** deepseek-ai/DeepSeek-V2-Chat-0628
+- **Model Hubs**: `Hugging Face `__, `ModelScope `__
+
+Execute the following command to launch the model, remember to replace ``${quantization}`` with your
+chosen quantization method from the options listed above::
+
+ xinference launch --model-engine ${engine} --model-name deepseek-v2-chat-0628 --size-in-billions 236 --model-format pytorch --quantization ${quantization}
+
diff --git a/doc/source/models/builtin/llm/deepseek-v2-chat.rst b/doc/source/models/builtin/llm/deepseek-v2-chat.rst
new file mode 100644
index 0000000000..84595c2bbb
--- /dev/null
+++ b/doc/source/models/builtin/llm/deepseek-v2-chat.rst
@@ -0,0 +1,47 @@
+.. _models_llm_deepseek-v2-chat:
+
+========================================
+deepseek-v2-chat
+========================================
+
+- **Context Length:** 128000
+- **Model Name:** deepseek-v2-chat
+- **Languages:** en, zh
+- **Abilities:** chat
+- **Description:** DeepSeek-V2, a strong Mixture-of-Experts (MoE) language model characterized by economical training and efficient inference.
+
+Specifications
+^^^^^^^^^^^^^^
+
+
+Model Spec 1 (pytorch, 16 Billion)
+++++++++++++++++++++++++++++++++++++++++
+
+- **Model Format:** pytorch
+- **Model Size (in billions):** 16
+- **Quantizations:** 4-bit, 8-bit, none
+- **Engines**: vLLM, Transformers, SGLang (vLLM and SGLang only available for quantization none)
+- **Model ID:** deepseek-ai/DeepSeek-V2-Lite-Chat
+- **Model Hubs**: `Hugging Face `__, `ModelScope `__
+
+Execute the following command to launch the model, remember to replace ``${quantization}`` with your
+chosen quantization method from the options listed above::
+
+ xinference launch --model-engine ${engine} --model-name deepseek-v2-chat --size-in-billions 16 --model-format pytorch --quantization ${quantization}
+
+
+Model Spec 2 (pytorch, 236 Billion)
+++++++++++++++++++++++++++++++++++++++++
+
+- **Model Format:** pytorch
+- **Model Size (in billions):** 236
+- **Quantizations:** 4-bit, 8-bit, none
+- **Engines**: vLLM, Transformers, SGLang (vLLM and SGLang only available for quantization none)
+- **Model ID:** deepseek-ai/DeepSeek-V2-Chat
+- **Model Hubs**: `Hugging Face `__, `ModelScope `__
+
+Execute the following command to launch the model, remember to replace ``${quantization}`` with your
+chosen quantization method from the options listed above::
+
+ xinference launch --model-engine ${engine} --model-name deepseek-v2-chat --size-in-billions 236 --model-format pytorch --quantization ${quantization}
+
diff --git a/doc/source/models/builtin/llm/deepseek-v2.5.rst b/doc/source/models/builtin/llm/deepseek-v2.5.rst
new file mode 100644
index 0000000000..5f5b9475d4
--- /dev/null
+++ b/doc/source/models/builtin/llm/deepseek-v2.5.rst
@@ -0,0 +1,31 @@
+.. _models_llm_deepseek-v2.5:
+
+========================================
+deepseek-v2.5
+========================================
+
+- **Context Length:** 128000
+- **Model Name:** deepseek-v2.5
+- **Languages:** en, zh
+- **Abilities:** chat
+- **Description:** DeepSeek-V2.5 is an upgraded version that combines DeepSeek-V2-Chat and DeepSeek-Coder-V2-Instruct. The new model integrates the general and coding abilities of the two previous versions.
+
+Specifications
+^^^^^^^^^^^^^^
+
+
+Model Spec 1 (pytorch, 236 Billion)
+++++++++++++++++++++++++++++++++++++++++
+
+- **Model Format:** pytorch
+- **Model Size (in billions):** 236
+- **Quantizations:** 4-bit, 8-bit, none
+- **Engines**: vLLM, Transformers, SGLang (vLLM and SGLang only available for quantization none)
+- **Model ID:** deepseek-ai/DeepSeek-V2.5
+- **Model Hubs**: `Hugging Face `__, `ModelScope `__
+
+Execute the following command to launch the model, remember to replace ``${quantization}`` with your
+chosen quantization method from the options listed above::
+
+ xinference launch --model-engine ${engine} --model-name deepseek-v2.5 --size-in-billions 236 --model-format pytorch --quantization ${quantization}
+
diff --git a/doc/source/models/builtin/llm/deepseek-v2.rst b/doc/source/models/builtin/llm/deepseek-v2.rst
new file mode 100644
index 0000000000..4102b9568c
--- /dev/null
+++ b/doc/source/models/builtin/llm/deepseek-v2.rst
@@ -0,0 +1,47 @@
+.. _models_llm_deepseek-v2:
+
+========================================
+deepseek-v2
+========================================
+
+- **Context Length:** 128000
+- **Model Name:** deepseek-v2
+- **Languages:** en, zh
+- **Abilities:** generate
+- **Description:** DeepSeek-V2, a strong Mixture-of-Experts (MoE) language model characterized by economical training and efficient inference.
+
+Specifications
+^^^^^^^^^^^^^^
+
+
+Model Spec 1 (pytorch, 16 Billion)
+++++++++++++++++++++++++++++++++++++++++
+
+- **Model Format:** pytorch
+- **Model Size (in billions):** 16
+- **Quantizations:** 4-bit, 8-bit, none
+- **Engines**: Transformers
+- **Model ID:** deepseek-ai/DeepSeek-V2-Lite
+- **Model Hubs**: `Hugging Face `__, `ModelScope `__
+
+Execute the following command to launch the model, remember to replace ``${quantization}`` with your
+chosen quantization method from the options listed above::
+
+ xinference launch --model-engine ${engine} --model-name deepseek-v2 --size-in-billions 16 --model-format pytorch --quantization ${quantization}
+
+
+Model Spec 2 (pytorch, 236 Billion)
+++++++++++++++++++++++++++++++++++++++++
+
+- **Model Format:** pytorch
+- **Model Size (in billions):** 236
+- **Quantizations:** 4-bit, 8-bit, none
+- **Engines**: Transformers
+- **Model ID:** deepseek-ai/DeepSeek-V2
+- **Model Hubs**: `Hugging Face `__, `ModelScope `__
+
+Execute the following command to launch the model, remember to replace ``${quantization}`` with your
+chosen quantization method from the options listed above::
+
+ xinference launch --model-engine ${engine} --model-name deepseek-v2 --size-in-billions 236 --model-format pytorch --quantization ${quantization}
+
diff --git a/doc/source/models/builtin/llm/index.rst b/doc/source/models/builtin/llm/index.rst
index bab4b1093d..73bd2b9894 100644
--- a/doc/source/models/builtin/llm/index.rst
+++ b/doc/source/models/builtin/llm/index.rst
@@ -126,6 +126,26 @@ The following is a list of built-in LLM in Xinference:
- 16384
- deepseek-coder-instruct is a model initialized from deepseek-coder-base and fine-tuned on 2B tokens of instruction data.
+ * - :ref:`deepseek-v2 `
+ - generate
+ - 128000
+ - DeepSeek-V2, a strong Mixture-of-Experts (MoE) language model characterized by economical training and efficient inference.
+
+ * - :ref:`deepseek-v2-chat `
+ - chat
+ - 128000
+ - DeepSeek-V2, a strong Mixture-of-Experts (MoE) language model characterized by economical training and efficient inference.
+
+ * - :ref:`deepseek-v2-chat-0628 `
+ - chat
+ - 128000
+ - DeepSeek-V2-Chat-0628 is an improved version of DeepSeek-V2-Chat.
+
+ * - :ref:`deepseek-v2.5 `
+ - chat
+ - 128000
+ - DeepSeek-V2.5 is an upgraded version that combines DeepSeek-V2-Chat and DeepSeek-Coder-V2-Instruct. The new model integrates the general and coding abilities of the two previous versions.
+
* - :ref:`deepseek-vl-chat `
- chat, vision
- 4096
@@ -371,6 +391,16 @@ The following is a list of built-in LLM in Xinference:
- 32768
- Qwen1.5-MoE is a transformer-based MoE decoder-only language model pretrained on a large amount of data.
+ * - :ref:`qwen2-audio `
+ - chat, audio
+ - 32768
+ - Qwen2-Audio: A large-scale audio-language model which is capable of accepting various audio signal inputs and performing audio analysis or direct textual responses with regard to speech instructions.
+
+ * - :ref:`qwen2-audio-instruct `
+ - chat, audio
+ - 32768
+ - Qwen2-Audio: A large-scale audio-language model which is capable of accepting various audio signal inputs and performing audio analysis or direct textual responses with regard to speech instructions.
+
* - :ref:`qwen2-instruct `
- chat, tools
- 32768
@@ -386,6 +416,11 @@ The following is a list of built-in LLM in Xinference:
- 32768
      - Qwen2-VL: To See the World More Clearly. Qwen2-VL is the latest version of the vision language models in the Qwen model families.
+ * - :ref:`qwen2.5-instruct `
+ - chat, tools
+ - 131072
+ - Qwen2.5 is the latest series of Qwen large language models. For Qwen2.5, we release a number of base language models and instruction-tuned language models ranging from 0.5 to 72 billion parameters.
+
* - :ref:`seallm_v2 `
- generate
- 8192
@@ -471,6 +506,16 @@ The following is a list of built-in LLM in Xinference:
- 4096
- The Yi series models are large language models trained from scratch by developers at 01.AI.
+ * - :ref:`yi-coder `
+ - generate
+ - 131072
+     - Yi-Coder is a series of open-source code language models that delivers state-of-the-art coding performance with fewer than 10 billion parameters. Excelling in long-context understanding with a maximum context length of 128K tokens. Supporting 52 major programming languages, including popular ones such as Java, Python, JavaScript, and C++.
+
+ * - :ref:`yi-coder-chat `
+ - chat
+ - 131072
+     - Yi-Coder is a series of open-source code language models that delivers state-of-the-art coding performance with fewer than 10 billion parameters. Excelling in long-context understanding with a maximum context length of 128K tokens. Supporting 52 major programming languages, including popular ones such as Java, Python, JavaScript, and C++.
+
* - :ref:`yi-vl-chat `
- chat, vision
- 4096
@@ -525,6 +570,14 @@ The following is a list of built-in LLM in Xinference:
deepseek-coder-instruct
+ deepseek-v2
+
+ deepseek-v2-chat
+
+ deepseek-v2-chat-0628
+
+ deepseek-v2.5
+
deepseek-vl-chat
gemma-2-it
@@ -623,12 +676,18 @@ The following is a list of built-in LLM in Xinference:
qwen1.5-moe-chat
+ qwen2-audio
+
+ qwen2-audio-instruct
+
qwen2-instruct
qwen2-moe-instruct
qwen2-vl-instruct
+ qwen2.5-instruct
+
seallm_v2
seallm_v2.5
@@ -663,6 +722,10 @@ The following is a list of built-in LLM in Xinference:
yi-chat
+ yi-coder
+
+ yi-coder-chat
+
yi-vl-chat
diff --git a/doc/source/models/builtin/llm/qwen2-audio-instruct.rst b/doc/source/models/builtin/llm/qwen2-audio-instruct.rst
new file mode 100644
index 0000000000..2d126a387e
--- /dev/null
+++ b/doc/source/models/builtin/llm/qwen2-audio-instruct.rst
@@ -0,0 +1,31 @@
+.. _models_llm_qwen2-audio-instruct:
+
+========================================
+qwen2-audio-instruct
+========================================
+
+- **Context Length:** 32768
+- **Model Name:** qwen2-audio-instruct
+- **Languages:** en, zh
+- **Abilities:** chat, audio
+- **Description:** Qwen2-Audio: A large-scale audio-language model which is capable of accepting various audio signal inputs and performing audio analysis or direct textual responses with regard to speech instructions.
+
+Specifications
+^^^^^^^^^^^^^^
+
+
+Model Spec 1 (pytorch, 7 Billion)
+++++++++++++++++++++++++++++++++++++++++
+
+- **Model Format:** pytorch
+- **Model Size (in billions):** 7
+- **Quantizations:** none
+- **Engines**: Transformers
+- **Model ID:** Qwen/Qwen2-Audio-7B-Instruct
+- **Model Hubs**: `Hugging Face `__, `ModelScope `__
+
+Execute the following command to launch the model, remember to replace ``${quantization}`` with your
+chosen quantization method from the options listed above::
+
+ xinference launch --model-engine ${engine} --model-name qwen2-audio-instruct --size-in-billions 7 --model-format pytorch --quantization ${quantization}
+
diff --git a/doc/source/models/builtin/llm/qwen2-audio.rst b/doc/source/models/builtin/llm/qwen2-audio.rst
new file mode 100644
index 0000000000..2973390c44
--- /dev/null
+++ b/doc/source/models/builtin/llm/qwen2-audio.rst
@@ -0,0 +1,31 @@
+.. _models_llm_qwen2-audio:
+
+========================================
+qwen2-audio
+========================================
+
+- **Context Length:** 32768
+- **Model Name:** qwen2-audio
+- **Languages:** en, zh
+- **Abilities:** chat, audio
+- **Description:** Qwen2-Audio: A large-scale audio-language model which is capable of accepting various audio signal inputs and performing audio analysis or direct textual responses with regard to speech instructions.
+
+Specifications
+^^^^^^^^^^^^^^
+
+
+Model Spec 1 (pytorch, 7 Billion)
+++++++++++++++++++++++++++++++++++++++++
+
+- **Model Format:** pytorch
+- **Model Size (in billions):** 7
+- **Quantizations:** none
+- **Engines**: Transformers
+- **Model ID:** Qwen/Qwen2-Audio-7B
+- **Model Hubs**: `Hugging Face `__, `ModelScope `__
+
+Execute the following command to launch the model, remember to replace ``${quantization}`` with your
+chosen quantization method from the options listed above::
+
+ xinference launch --model-engine ${engine} --model-name qwen2-audio --size-in-billions 7 --model-format pytorch --quantization ${quantization}
+
diff --git a/doc/source/models/builtin/llm/qwen2.5-instruct.rst b/doc/source/models/builtin/llm/qwen2.5-instruct.rst
new file mode 100644
index 0000000000..6e6b4db35e
--- /dev/null
+++ b/doc/source/models/builtin/llm/qwen2.5-instruct.rst
@@ -0,0 +1,463 @@
+.. _models_llm_qwen2.5-instruct:
+
+========================================
+qwen2.5-instruct
+========================================
+
+- **Context Length:** 131072
+- **Model Name:** qwen2.5-instruct
+- **Languages:** en, zh
+- **Abilities:** chat, tools
+- **Description:** Qwen2.5 is the latest series of Qwen large language models. For Qwen2.5, we release a number of base language models and instruction-tuned language models ranging from 0.5 to 72 billion parameters.
+
+Specifications
+^^^^^^^^^^^^^^
+
+
+Model Spec 1 (pytorch, 0_5 Billion)
+++++++++++++++++++++++++++++++++++++++++
+
+- **Model Format:** pytorch
+- **Model Size (in billions):** 0_5
+- **Quantizations:** 4-bit, 8-bit, none
+- **Engines**: vLLM, Transformers (vLLM only available for quantization none)
+- **Model ID:** Qwen/Qwen2.5-0.5B-Instruct
+- **Model Hubs**: `Hugging Face `__, `ModelScope `__
+
+Execute the following command to launch the model, remember to replace ``${quantization}`` with your
+chosen quantization method from the options listed above::
+
+ xinference launch --model-engine ${engine} --model-name qwen2.5-instruct --size-in-billions 0_5 --model-format pytorch --quantization ${quantization}
+
+
+Model Spec 2 (pytorch, 1_5 Billion)
+++++++++++++++++++++++++++++++++++++++++
+
+- **Model Format:** pytorch
+- **Model Size (in billions):** 1_5
+- **Quantizations:** 4-bit, 8-bit, none
+- **Engines**: vLLM, Transformers (vLLM only available for quantization none)
+- **Model ID:** Qwen/Qwen2.5-1.5B-Instruct
+- **Model Hubs**: `Hugging Face `__, `ModelScope `__
+
+Execute the following command to launch the model, remember to replace ``${quantization}`` with your
+chosen quantization method from the options listed above::
+
+ xinference launch --model-engine ${engine} --model-name qwen2.5-instruct --size-in-billions 1_5 --model-format pytorch --quantization ${quantization}
+
+
+Model Spec 3 (pytorch, 3 Billion)
+++++++++++++++++++++++++++++++++++++++++
+
+- **Model Format:** pytorch
+- **Model Size (in billions):** 3
+- **Quantizations:** 4-bit, 8-bit, none
+- **Engines**: vLLM, Transformers (vLLM only available for quantization none)
+- **Model ID:** Qwen/Qwen2.5-3B-Instruct
+- **Model Hubs**: `Hugging Face `__, `ModelScope `__
+
+Execute the following command to launch the model, remember to replace ``${quantization}`` with your
+chosen quantization method from the options listed above::
+
+ xinference launch --model-engine ${engine} --model-name qwen2.5-instruct --size-in-billions 3 --model-format pytorch --quantization ${quantization}
+
+
+Model Spec 4 (pytorch, 7 Billion)
+++++++++++++++++++++++++++++++++++++++++
+
+- **Model Format:** pytorch
+- **Model Size (in billions):** 7
+- **Quantizations:** 4-bit, 8-bit, none
+- **Engines**: vLLM, Transformers (vLLM only available for quantization none)
+- **Model ID:** Qwen/Qwen2.5-7B-Instruct
+- **Model Hubs**: `Hugging Face `__, `ModelScope `__
+
+Execute the following command to launch the model, remember to replace ``${quantization}`` with your
+chosen quantization method from the options listed above::
+
+ xinference launch --model-engine ${engine} --model-name qwen2.5-instruct --size-in-billions 7 --model-format pytorch --quantization ${quantization}
+
+
+Model Spec 5 (pytorch, 14 Billion)
+++++++++++++++++++++++++++++++++++++++++
+
+- **Model Format:** pytorch
+- **Model Size (in billions):** 14
+- **Quantizations:** 4-bit, 8-bit, none
+- **Engines**: vLLM, Transformers (vLLM only available for quantization none)
+- **Model ID:** Qwen/Qwen2.5-14B-Instruct
+- **Model Hubs**: `Hugging Face `__, `ModelScope `__
+
+Execute the following command to launch the model, remember to replace ``${quantization}`` with your
+chosen quantization method from the options listed above::
+
+ xinference launch --model-engine ${engine} --model-name qwen2.5-instruct --size-in-billions 14 --model-format pytorch --quantization ${quantization}
+
+
+Model Spec 6 (pytorch, 32 Billion)
+++++++++++++++++++++++++++++++++++++++++
+
+- **Model Format:** pytorch
+- **Model Size (in billions):** 32
+- **Quantizations:** 4-bit, 8-bit, none
+- **Engines**: vLLM, Transformers (vLLM only available for quantization none)
+- **Model ID:** Qwen/Qwen2.5-32B-Instruct
+- **Model Hubs**: `Hugging Face `__, `ModelScope `__
+
+Execute the following command to launch the model, remember to replace ``${quantization}`` with your
+chosen quantization method from the options listed above::
+
+ xinference launch --model-engine ${engine} --model-name qwen2.5-instruct --size-in-billions 32 --model-format pytorch --quantization ${quantization}
+
+
+Model Spec 7 (pytorch, 72 Billion)
+++++++++++++++++++++++++++++++++++++++++
+
+- **Model Format:** pytorch
+- **Model Size (in billions):** 72
+- **Quantizations:** 4-bit, 8-bit, none
+- **Engines**: vLLM, Transformers (vLLM only available for quantization none)
+- **Model ID:** Qwen/Qwen2.5-72B-Instruct
+- **Model Hubs**: `Hugging Face `__, `ModelScope `__
+
+Execute the following command to launch the model, remember to replace ``${quantization}`` with your
+chosen quantization method from the options listed above::
+
+ xinference launch --model-engine ${engine} --model-name qwen2.5-instruct --size-in-billions 72 --model-format pytorch --quantization ${quantization}
+
+
+Model Spec 8 (gptq, 0_5 Billion)
+++++++++++++++++++++++++++++++++++++++++
+
+- **Model Format:** gptq
+- **Model Size (in billions):** 0_5
+- **Quantizations:** Int4, Int8
+- **Engines**: vLLM, Transformers
+- **Model ID:** Qwen/Qwen2.5-0.5B-Instruct-GPTQ-{quantization}
+- **Model Hubs**: `Hugging Face `__, `ModelScope `__
+
+Execute the following command to launch the model, remember to replace ``${quantization}`` with your
+chosen quantization method from the options listed above::
+
+ xinference launch --model-engine ${engine} --model-name qwen2.5-instruct --size-in-billions 0_5 --model-format gptq --quantization ${quantization}
+
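+The same launch can also be issued programmatically. The following is a hedged sketch using
+the RESTful Python client, assuming a local supervisor on the default port ``9997``; the
+keyword arguments mirror the CLI flags above, and the chosen quantization is only an example::
+
+   # Sketch only: endpoint and quantization choice are illustrative assumptions.
+   from xinference.client import Client
+
+   client = Client("http://127.0.0.1:9997")
+   model_uid = client.launch_model(
+       model_name="qwen2.5-instruct",
+       model_engine="vLLM",               # or "Transformers"
+       model_format="gptq",
+       model_size_in_billions="0_5",
+       quantization="Int4",               # resolves the {quantization} part of the Model ID
+   )
+   print(model_uid)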
+
+Model Spec 9 (gptq, 1_5 Billion)
+++++++++++++++++++++++++++++++++++++++++
+
+- **Model Format:** gptq
+- **Model Size (in billions):** 1_5
+- **Quantizations:** Int4, Int8
+- **Engines**: vLLM, Transformers
+- **Model ID:** Qwen/Qwen2.5-1.5B-Instruct-GPTQ-{quantization}
+- **Model Hubs**: `Hugging Face `__, `ModelScope `__
+
+Execute the following command to launch the model, remember to replace ``${quantization}`` with your
+chosen quantization method from the options listed above::
+
+ xinference launch --model-engine ${engine} --model-name qwen2.5-instruct --size-in-billions 1_5 --model-format gptq --quantization ${quantization}
+
+
+Model Spec 10 (gptq, 3 Billion)
+++++++++++++++++++++++++++++++++++++++++
+
+- **Model Format:** gptq
+- **Model Size (in billions):** 3
+- **Quantizations:** Int4, Int8
+- **Engines**: vLLM, Transformers
+- **Model ID:** Qwen/Qwen2.5-3B-Instruct-GPTQ-{quantization}
+- **Model Hubs**: `Hugging Face `__, `ModelScope `__
+
+Execute the following command to launch the model, remember to replace ``${quantization}`` with your
+chosen quantization method from the options listed above::
+
+ xinference launch --model-engine ${engine} --model-name qwen2.5-instruct --size-in-billions 3 --model-format gptq --quantization ${quantization}
+
+
+Model Spec 11 (gptq, 7 Billion)
+++++++++++++++++++++++++++++++++++++++++
+
+- **Model Format:** gptq
+- **Model Size (in billions):** 7
+- **Quantizations:** Int4, Int8
+- **Engines**: vLLM, Transformers
+- **Model ID:** Qwen/Qwen2.5-7B-Instruct-GPTQ-{quantization}
+- **Model Hubs**: `Hugging Face `__, `ModelScope `__
+
+Execute the following command to launch the model, remember to replace ``${quantization}`` with your
+chosen quantization method from the options listed above::
+
+ xinference launch --model-engine ${engine} --model-name qwen2.5-instruct --size-in-billions 7 --model-format gptq --quantization ${quantization}
+
+
+Model Spec 12 (gptq, 14 Billion)
+++++++++++++++++++++++++++++++++++++++++
+
+- **Model Format:** gptq
+- **Model Size (in billions):** 14
+- **Quantizations:** Int4, Int8
+- **Engines**: vLLM, Transformers
+- **Model ID:** Qwen/Qwen2.5-14B-Instruct-GPTQ-{quantization}
+- **Model Hubs**: `Hugging Face `__, `ModelScope `__
+
+Execute the following command to launch the model, remember to replace ``${quantization}`` with your
+chosen quantization method from the options listed above::
+
+ xinference launch --model-engine ${engine} --model-name qwen2.5-instruct --size-in-billions 14 --model-format gptq --quantization ${quantization}
+
+
+Model Spec 13 (gptq, 32 Billion)
+++++++++++++++++++++++++++++++++++++++++
+
+- **Model Format:** gptq
+- **Model Size (in billions):** 32
+- **Quantizations:** Int4, Int8
+- **Engines**: vLLM, Transformers
+- **Model ID:** Qwen/Qwen2.5-32B-Instruct-GPTQ-{quantization}
+- **Model Hubs**: `Hugging Face `__, `ModelScope `__
+
+Execute the following command to launch the model, remember to replace ``${quantization}`` with your
+chosen quantization method from the options listed above::
+
+ xinference launch --model-engine ${engine} --model-name qwen2.5-instruct --size-in-billions 32 --model-format gptq --quantization ${quantization}
+
+
+Model Spec 14 (gptq, 72 Billion)
+++++++++++++++++++++++++++++++++++++++++
+
+- **Model Format:** gptq
+- **Model Size (in billions):** 72
+- **Quantizations:** Int4, Int8
+- **Engines**: vLLM, Transformers
+- **Model ID:** Qwen/Qwen2.5-72B-Instruct-GPTQ-{quantization}
+- **Model Hubs**: `Hugging Face `__, `ModelScope `__
+
+Execute the following command to launch the model, remember to replace ``${quantization}`` with your
+chosen quantization method from the options listed above::
+
+ xinference launch --model-engine ${engine} --model-name qwen2.5-instruct --size-in-billions 72 --model-format gptq --quantization ${quantization}
+
+
+Model Spec 15 (awq, 0_5 Billion)
+++++++++++++++++++++++++++++++++++++++++
+
+- **Model Format:** awq
+- **Model Size (in billions):** 0_5
+- **Quantizations:** Int4
+- **Engines**: vLLM, Transformers
+- **Model ID:** Qwen/Qwen2.5-0.5B-Instruct-AWQ
+- **Model Hubs**: `Hugging Face `__, `ModelScope `__
+
+Execute the following command to launch the model, remember to replace ``${quantization}`` with your
+chosen quantization method from the options listed above::
+
+ xinference launch --model-engine ${engine} --model-name qwen2.5-instruct --size-in-billions 0_5 --model-format awq --quantization ${quantization}
+
+
+Model Spec 16 (awq, 1_5 Billion)
+++++++++++++++++++++++++++++++++++++++++
+
+- **Model Format:** awq
+- **Model Size (in billions):** 1_5
+- **Quantizations:** Int4
+- **Engines**: vLLM, Transformers
+- **Model ID:** Qwen/Qwen2.5-1.5B-Instruct-AWQ
+- **Model Hubs**: `Hugging Face `__, `ModelScope `__
+
+Execute the following command to launch the model, remember to replace ``${quantization}`` with your
+chosen quantization method from the options listed above::
+
+ xinference launch --model-engine ${engine} --model-name qwen2.5-instruct --size-in-billions 1_5 --model-format awq --quantization ${quantization}
+
+
+Model Spec 17 (awq, 3 Billion)
+++++++++++++++++++++++++++++++++++++++++
+
+- **Model Format:** awq
+- **Model Size (in billions):** 3
+- **Quantizations:** Int4
+- **Engines**: vLLM, Transformers
+- **Model ID:** Qwen/Qwen2.5-3B-Instruct-AWQ
+- **Model Hubs**: `Hugging Face `__, `ModelScope `__
+
+Execute the following command to launch the model, remember to replace ``${quantization}`` with your
+chosen quantization method from the options listed above::
+
+ xinference launch --model-engine ${engine} --model-name qwen2.5-instruct --size-in-billions 3 --model-format awq --quantization ${quantization}
+
+
+Model Spec 18 (awq, 7 Billion)
+++++++++++++++++++++++++++++++++++++++++
+
+- **Model Format:** awq
+- **Model Size (in billions):** 7
+- **Quantizations:** Int4
+- **Engines**: vLLM, Transformers
+- **Model ID:** Qwen/Qwen2.5-7B-Instruct-AWQ
+- **Model Hubs**: `Hugging Face `__, `ModelScope `__
+
+Execute the following command to launch the model, remember to replace ``${quantization}`` with your
+chosen quantization method from the options listed above::
+
+ xinference launch --model-engine ${engine} --model-name qwen2.5-instruct --size-in-billions 7 --model-format awq --quantization ${quantization}
+
+
+Model Spec 19 (awq, 14 Billion)
+++++++++++++++++++++++++++++++++++++++++
+
+- **Model Format:** awq
+- **Model Size (in billions):** 14
+- **Quantizations:** Int4
+- **Engines**: vLLM, Transformers
+- **Model ID:** Qwen/Qwen2.5-14B-Instruct-AWQ
+- **Model Hubs**: `Hugging Face `__, `ModelScope `__
+
+Execute the following command to launch the model, remember to replace ``${quantization}`` with your
+chosen quantization method from the options listed above::
+
+ xinference launch --model-engine ${engine} --model-name qwen2.5-instruct --size-in-billions 14 --model-format awq --quantization ${quantization}
+
+
+Model Spec 20 (awq, 32 Billion)
+++++++++++++++++++++++++++++++++++++++++
+
+- **Model Format:** awq
+- **Model Size (in billions):** 32
+- **Quantizations:** Int4
+- **Engines**: vLLM, Transformers
+- **Model ID:** Qwen/Qwen2.5-32B-Instruct-AWQ
+- **Model Hubs**: `Hugging Face `__, `ModelScope `__
+
+Execute the following command to launch the model, remember to replace ``${quantization}`` with your
+chosen quantization method from the options listed above::
+
+ xinference launch --model-engine ${engine} --model-name qwen2.5-instruct --size-in-billions 32 --model-format awq --quantization ${quantization}
+
+
+Model Spec 21 (awq, 72 Billion)
+++++++++++++++++++++++++++++++++++++++++
+
+- **Model Format:** awq
+- **Model Size (in billions):** 72
+- **Quantizations:** Int4
+- **Engines**: vLLM, Transformers
+- **Model ID:** Qwen/Qwen2.5-72B-Instruct-AWQ
+- **Model Hubs**: `Hugging Face `__, `ModelScope `__
+
+Execute the following command to launch the model, remember to replace ``${quantization}`` with your
+chosen quantization method from the options listed above::
+
+ xinference launch --model-engine ${engine} --model-name qwen2.5-instruct --size-in-billions 72 --model-format awq --quantization ${quantization}
+
+
+Model Spec 22 (ggufv2, 0_5 Billion)
+++++++++++++++++++++++++++++++++++++++++
+
+- **Model Format:** ggufv2
+- **Model Size (in billions):** 0_5
+- **Quantizations:** q2_k, q3_k_m, q4_0, q4_k_m, q5_0, q5_k_m, q6_k, q8_0, fp16
+- **Engines**: llama.cpp
+- **Model ID:** Qwen/Qwen2.5-0.5B-Instruct-GGUF
+- **Model Hubs**: `Hugging Face `__, `ModelScope `__
+
+Execute the following command to launch the model, remember to replace ``${quantization}`` with your
+chosen quantization method from the options listed above::
+
+ xinference launch --model-engine ${engine} --model-name qwen2.5-instruct --size-in-billions 0_5 --model-format ggufv2 --quantization ${quantization}
+
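+For the GGUF builds the engine is llama.cpp. Below is a hedged sketch of launching this spec
+through the Python client with one concrete quantization picked from the list above; the
+endpoint and the quantization choice are illustrative assumptions::
+
+   # Sketch only: any quantization from the list above can be substituted.
+   from xinference.client import Client
+
+   client = Client("http://127.0.0.1:9997")
+   model_uid = client.launch_model(
+       model_name="qwen2.5-instruct",
+       model_engine="llama.cpp",
+       model_format="ggufv2",
+       model_size_in_billions="0_5",
+       quantization="q4_k_m",
+   )
+   print(model_uid)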
+
+Model Spec 23 (ggufv2, 1_5 Billion)
+++++++++++++++++++++++++++++++++++++++++
+
+- **Model Format:** ggufv2
+- **Model Size (in billions):** 1_5
+- **Quantizations:** q2_k, q3_k_m, q4_0, q4_k_m, q5_0, q5_k_m, q6_k, q8_0, fp16
+- **Engines**: llama.cpp
+- **Model ID:** Qwen/Qwen2.5-1.5B-Instruct-GGUF
+- **Model Hubs**: `Hugging Face `__, `ModelScope `__
+
+Execute the following command to launch the model, remember to replace ``${quantization}`` with your
+chosen quantization method from the options listed above::
+
+ xinference launch --model-engine ${engine} --model-name qwen2.5-instruct --size-in-billions 1_5 --model-format ggufv2 --quantization ${quantization}
+
+
+Model Spec 24 (ggufv2, 3 Billion)
+++++++++++++++++++++++++++++++++++++++++
+
+- **Model Format:** ggufv2
+- **Model Size (in billions):** 3
+- **Quantizations:** q2_k, q3_k_m, q4_0, q4_k_m, q5_0, q5_k_m, q6_k, q8_0, fp16
+- **Engines**: llama.cpp
+- **Model ID:** Qwen/Qwen2.5-3B-Instruct-GGUF
+- **Model Hubs**: `Hugging Face `__, `ModelScope `__
+
+Execute the following command to launch the model, remember to replace ``${quantization}`` with your
+chosen quantization method from the options listed above::
+
+ xinference launch --model-engine ${engine} --model-name qwen2.5-instruct --size-in-billions 3 --model-format ggufv2 --quantization ${quantization}
+
+
+Model Spec 25 (ggufv2, 7 Billion)
+++++++++++++++++++++++++++++++++++++++++
+
+- **Model Format:** ggufv2
+- **Model Size (in billions):** 7
+- **Quantizations:** q2_k, q3_k_m, q4_0, q4_k_m, q5_0, q5_k_m, q6_k, q8_0, fp16
+- **Engines**: llama.cpp
+- **Model ID:** Qwen/Qwen2.5-7B-Instruct-GGUF
+- **Model Hubs**: `Hugging Face `__, `ModelScope `__
+
+Execute the following command to launch the model, remember to replace ``${quantization}`` with your
+chosen quantization method from the options listed above::
+
+ xinference launch --model-engine ${engine} --model-name qwen2.5-instruct --size-in-billions 7 --model-format ggufv2 --quantization ${quantization}
+
+
+Model Spec 26 (ggufv2, 14 Billion)
+++++++++++++++++++++++++++++++++++++++++
+
+- **Model Format:** ggufv2
+- **Model Size (in billions):** 14
+- **Quantizations:** q2_k, q3_k_m, q4_0, q4_k_m, q5_0, q5_k_m, q6_k, q8_0, fp16
+- **Engines**: llama.cpp
+- **Model ID:** Qwen/Qwen2.5-14B-Instruct-GGUF
+- **Model Hubs**: `Hugging Face `__, `ModelScope `__
+
+Execute the following command to launch the model, remember to replace ``${quantization}`` with your
+chosen quantization method from the options listed above::
+
+ xinference launch --model-engine ${engine} --model-name qwen2.5-instruct --size-in-billions 14 --model-format ggufv2 --quantization ${quantization}
+
+
+Model Spec 27 (ggufv2, 32 Billion)
+++++++++++++++++++++++++++++++++++++++++
+
+- **Model Format:** ggufv2
+- **Model Size (in billions):** 32
+- **Quantizations:** q2_k, q3_k_m, q4_0, q4_k_m, q5_0, q5_k_m, q6_k, q8_0, fp16
+- **Engines**: llama.cpp
+- **Model ID:** Qwen/Qwen2.5-32B-Instruct-GGUF
+- **Model Hubs**: `Hugging Face `__, `ModelScope `__
+
+Execute the following command to launch the model, remember to replace ``${quantization}`` with your
+chosen quantization method from the options listed above::
+
+ xinference launch --model-engine ${engine} --model-name qwen2.5-instruct --size-in-billions 32 --model-format ggufv2 --quantization ${quantization}
+
+
+Model Spec 28 (ggufv2, 72 Billion)
+++++++++++++++++++++++++++++++++++++++++
+
+- **Model Format:** ggufv2
+- **Model Size (in billions):** 72
+- **Quantizations:** q2_k, q3_k_m, q4_0, q4_k_m, q5_0, q5_k_m, q6_k, q8_0, fp16
+- **Engines**: llama.cpp
+- **Model ID:** Qwen/Qwen2.5-72B-Instruct-GGUF
+- **Model Hubs**: `Hugging Face `__, `ModelScope `__
+
+Execute the following command to launch the model, remember to replace ``${quantization}`` with your
+chosen quantization method from the options listed above::
+
+ xinference launch --model-engine ${engine} --model-name qwen2.5-instruct --size-in-billions 72 --model-format ggufv2 --quantization ${quantization}
+
diff --git a/doc/source/models/builtin/llm/yi-coder-chat.rst b/doc/source/models/builtin/llm/yi-coder-chat.rst
new file mode 100644
index 0000000000..af4368ae98
--- /dev/null
+++ b/doc/source/models/builtin/llm/yi-coder-chat.rst
@@ -0,0 +1,47 @@
+.. _models_llm_yi-coder-chat:
+
+========================================
+yi-coder-chat
+========================================
+
+- **Context Length:** 131072
+- **Model Name:** yi-coder-chat
+- **Languages:** en
+- **Abilities:** chat
+- **Description:** Yi-Coder is a series of open-source code language models that delivers state-of-the-art coding performance with fewer than 10 billion parameters. It excels in long-context understanding with a maximum context length of 128K tokens and supports 52 major programming languages, including popular ones such as Java, Python, JavaScript, and C++.
+
+Specifications
+^^^^^^^^^^^^^^
+
+
+Model Spec 1 (pytorch, 9 Billion)
+++++++++++++++++++++++++++++++++++++++++
+
+- **Model Format:** pytorch
+- **Model Size (in billions):** 9
+- **Quantizations:** none
+- **Engines**: vLLM, Transformers
+- **Model ID:** 01ai/Yi-Coder-9B-Chat
+- **Model Hubs**: `Hugging Face `__, `ModelScope `__
+
+Execute the following command to launch the model, remember to replace ``${quantization}`` with your
+chosen quantization method from the options listed above::
+
+ xinference launch --model-engine ${engine} --model-name yi-coder-chat --size-in-billions 9 --model-format pytorch --quantization ${quantization}
+
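+After launching, the chat model can be queried through the OpenAI-compatible endpoint. A
+minimal, illustrative sketch follows; the endpoint and model UID are assumptions::
+
+   from openai import OpenAI
+
+   client = OpenAI(base_url="http://127.0.0.1:9997/v1", api_key="not-used")
+   response = client.chat.completions.create(
+       model="yi-coder-chat",  # replace with the UID returned by `xinference launch`
+       messages=[
+           {"role": "user", "content": "Write a Python function that reverses a linked list."}
+       ],
+       max_tokens=512,
+   )
+   print(response.choices[0].message.content)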
+
+Model Spec 2 (pytorch, 1_5 Billion)
+++++++++++++++++++++++++++++++++++++++++
+
+- **Model Format:** pytorch
+- **Model Size (in billions):** 1_5
+- **Quantizations:** none
+- **Engines**: vLLM, Transformers
+- **Model ID:** 01-ai/Yi-Coder-1.5B-Chat
+- **Model Hubs**: `Hugging Face `__, `ModelScope `__
+
+Execute the following command to launch the model, remember to replace ``${quantization}`` with your
+chosen quantization method from the options listed above::
+
+ xinference launch --model-engine ${engine} --model-name yi-coder-chat --size-in-billions 1_5 --model-format pytorch --quantization ${quantization}
+
diff --git a/doc/source/models/builtin/llm/yi-coder.rst b/doc/source/models/builtin/llm/yi-coder.rst
new file mode 100644
index 0000000000..347a3bc9d1
--- /dev/null
+++ b/doc/source/models/builtin/llm/yi-coder.rst
@@ -0,0 +1,47 @@
+.. _models_llm_yi-coder:
+
+========================================
+yi-coder
+========================================
+
+- **Context Length:** 131072
+- **Model Name:** yi-coder
+- **Languages:** en
+- **Abilities:** generate
+- **Description:** Yi-Coder is a series of open-source code language models that delivers state-of-the-art coding performance with fewer than 10 billion parameters. It excels in long-context understanding with a maximum context length of 128K tokens and supports 52 major programming languages, including popular ones such as Java, Python, JavaScript, and C++.
+
+Specifications
+^^^^^^^^^^^^^^
+
+
+Model Spec 1 (pytorch, 9 Billion)
+++++++++++++++++++++++++++++++++++++++++
+
+- **Model Format:** pytorch
+- **Model Size (in billions):** 9
+- **Quantizations:** none
+- **Engines**: vLLM, Transformers
+- **Model ID:** 01-ai/Yi-Coder-9B
+- **Model Hubs**: `Hugging Face `__, `ModelScope `__
+
+Execute the following command to launch the model, remember to replace ``${quantization}`` with your
+chosen quantization method from the options listed above::
+
+ xinference launch --model-engine ${engine} --model-name yi-coder --size-in-billions 9 --model-format pytorch --quantization ${quantization}
+
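+Since this model only has the ``generate`` ability, use plain text completion rather than the
+chat API. A hedged sketch with the Python client follows; the endpoint, UID, and generate
+options are illustrative assumptions::
+
+   from xinference.client import Client
+
+   client = Client("http://127.0.0.1:9997")
+   model = client.get_model("yi-coder")  # replace with the UID returned by `xinference launch`
+   completion = model.generate(
+       "def quicksort(arr):\n    ",
+       generate_config={"max_tokens": 256, "temperature": 0.2},
+   )
+   print(completion["choices"][0]["text"])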
+
+Model Spec 2 (pytorch, 1_5 Billion)
+++++++++++++++++++++++++++++++++++++++++
+
+- **Model Format:** pytorch
+- **Model Size (in billions):** 1_5
+- **Quantizations:** none
+- **Engines**: vLLM, Transformers
+- **Model ID:** 01-ai/Yi-Coder-1.5B
+- **Model Hubs**: `Hugging Face `__, `ModelScope `__
+
+Execute the following command to launch the model, remember to replace ``${quantization}`` with your
+chosen quantization method from the options listed above::
+
+ xinference launch --model-engine ${engine} --model-name yi-coder --size-in-billions 1_5 --model-format pytorch --quantization ${quantization}
+
diff --git a/doc/source/user_guide/backends.rst b/doc/source/user_guide/backends.rst
index 57126871e8..2cbb924e03 100644
--- a/doc/source/user_guide/backends.rst
+++ b/doc/source/user_guide/backends.rst
@@ -51,7 +51,8 @@ Currently, supported model includes:
- ``codestral-v0.1``
- ``Yi``, ``Yi-1.5``, ``Yi-chat``, ``Yi-1.5-chat``, ``Yi-1.5-chat-16k``
- ``code-llama``, ``code-llama-python``, ``code-llama-instruct``
-- ``deepseek``, ``deepseek-coder``, ``deepseek-chat``, ``deepseek-coder-instruct``
+- ``deepseek``, ``deepseek-coder``, ``deepseek-chat``, ``deepseek-coder-instruct``, ``deepseek-v2-chat``, ``deepseek-v2-chat-0628``, ``deepseek-v2.5``
+- ``yi-coder``, ``yi-coder-chat``
- ``codeqwen1.5``, ``codeqwen1.5-chat``
- ``baichuan-2-chat``
- ``internlm2-chat``
@@ -63,6 +64,7 @@ Currently, supported model includes:
- ``codegeex4``
- ``qwen1.5-chat``, ``qwen1.5-moe-chat``
- ``qwen2-instruct``, ``qwen2-moe-instruct``
+- ``qwen2.5-instruct``
- ``gemma-it``, ``gemma-2-it``
- ``orion-chat``, ``orion-chat-rag``
- ``c4ai-command-r-v01``
From 5de46e94c23785fa7e17e3e1d00c3afb6cb1c919 Mon Sep 17 00:00:00 2001
From: amumu96 <128140880+amumu96@users.noreply.github.com>
Date: Fri, 20 Sep 2024 16:58:06 +0800
Subject: [PATCH 17/17] FEAT: support qwen2.5-coder-instruct and qwen2.5 sglang
(#2332)
Co-authored-by: wuzhaoxin <15667065080@162.com>
---
xinference/model/llm/llm_family.json | 550 ++++++++++++++++-
.../model/llm/llm_family_modelscope.json | 565 +++++++++++++++++-
xinference/model/llm/sglang/core.py | 4 +
xinference/model/llm/vllm/core.py | 4 +
4 files changed, 1062 insertions(+), 61 deletions(-)
diff --git a/xinference/model/llm/llm_family.json b/xinference/model/llm/llm_family.json
index 70b17daa61..471b4febc3 100644
--- a/xinference/model/llm/llm_family.json
+++ b/xinference/model/llm/llm_family.json
@@ -6874,7 +6874,7 @@
"model_id":"Qwen/Qwen2-VL-7B-Instruct-GPTQ-Int8",
"model_revision":"3d152a77eaccfd72d59baedb0b183a1b8fd56e48"
},
- {
+ {
"model_format":"gptq",
"model_size_in_billions":7,
"quantizations":[
@@ -6883,7 +6883,7 @@
"model_id":"Qwen/Qwen2-VL-7B-Instruct-GPTQ-Int4",
"model_revision":"5ab897112fa83b9699826be8753ef9184585c77d"
},
- {
+ {
"model_format":"awq",
"model_size_in_billions":7,
"quantizations":[
@@ -6891,6 +6891,31 @@
],
"model_id":"Qwen/Qwen2-VL-7B-Instruct-AWQ",
"model_revision":"f94216e8b513933bccd567bcd9b7350199f32538"
+ },
+ {
+ "model_format":"pytorch",
+ "model_size_in_billions":72,
+ "quantizations":[
+ "none"
+ ],
+ "model_id":"Qwen/Qwen2-VL-72B-Instruct"
+ },
+ {
+ "model_format":"awq",
+ "model_size_in_billions":72,
+ "quantizations":[
+ "Int4"
+ ],
+ "model_id":"Qwen/Qwen2-VL-72B-Instruct-AWQ"
+ },
+ {
+ "model_format":"gptq",
+ "model_size_in_billions":72,
+ "quantizations":[
+ "Int4",
+ "Int8"
+ ],
+ "model_id":"Qwen/Qwen2-VL-72B-Instruct-GPTQ-{quantization}"
}
],
"prompt_style":{
@@ -7247,7 +7272,99 @@
},
{
"version": 1,
- "context_length": 131072,
+ "context_length": 32768,
+ "model_name": "qwen2.5",
+ "model_lang": [
+ "en",
+ "zh"
+ ],
+ "model_ability": [
+ "generate"
+ ],
+ "model_description": "Qwen2.5 is the latest series of Qwen large language models. For Qwen2.5, we release a number of base language models and instruction-tuned language models ranging from 0.5 to 72 billion parameters.",
+ "model_specs": [
+ {
+ "model_format": "pytorch",
+ "model_size_in_billions": "0_5",
+ "quantizations": [
+ "4-bit",
+ "8-bit",
+ "none"
+ ],
+ "model_id": "Qwen/Qwen2.5-0.5B",
+ "model_revision": "2630d3d2321bc1f1878f702166d1b2af019a7310"
+ },
+ {
+ "model_format": "pytorch",
+ "model_size_in_billions": "1_5",
+ "quantizations": [
+ "4-bit",
+ "8-bit",
+ "none"
+ ],
+ "model_id": "Qwen/Qwen2.5-1.5B",
+ "model_revision": "e5dfabbcffd9b0c7b31d89b82c5a6b72e663f32c"
+ },
+ {
+ "model_format": "pytorch",
+ "model_size_in_billions": 3,
+ "quantizations": [
+ "4-bit",
+ "8-bit",
+ "none"
+ ],
+ "model_id": "Qwen/Qwen2.5-3B",
+ "model_revision": "e4aa5ac50aa507415cda96cc99eb77ad0a3d2d34"
+ },
+ {
+ "model_format": "pytorch",
+ "model_size_in_billions": 7,
+ "quantizations": [
+ "4-bit",
+ "8-bit",
+ "none"
+ ],
+ "model_id": "Qwen/Qwen2.5-7B",
+ "model_revision": "09a0bac5707b43ec44508eab308b0846320c1ed4"
+ },
+ {
+ "model_format": "pytorch",
+ "model_size_in_billions": 14,
+ "quantizations": [
+ "4-bit",
+ "8-bit",
+ "none"
+ ],
+ "model_id": "Qwen/Qwen2.5-14B",
+ "model_revision": "d02b64ba1ce86bf9948668a13f82709600431ccc"
+ },
+ {
+ "model_format": "pytorch",
+ "model_size_in_billions": 32,
+ "quantizations": [
+ "4-bit",
+ "8-bit",
+ "none"
+ ],
+ "model_id": "Qwen/Qwen2.5-32B",
+ "model_revision": "ff23665d01c3665be5fdb271d18a62090b65c06d"
+ },
+ {
+ "model_format": "pytorch",
+ "model_size_in_billions": 72,
+ "quantizations": [
+ "4-bit",
+ "8-bit",
+ "none"
+ ],
+ "model_id": "Qwen/Qwen2.5-72B",
+ "model_revision": "587cc4061cf6a7cc0d429d05c109447e5cf063af"
+ }
+ ]
+ },
+ {
+ "version": 1,
+ "context_length": 32768,
"model_name": "qwen2.5-instruct",
"model_lang": [
"en",
@@ -7459,11 +7576,10 @@
"q5_0",
"q5_k_m",
"q6_k",
- "q8_0",
- "fp16"
+ "q8_0"
],
"model_id": "Qwen/Qwen2.5-0.5B-Instruct-GGUF",
- "model_file_name_template": "qwen2_5-0_5b-instruct-{quantization}.gguf"
+ "model_file_name_template": "qwen2.5-0.5b-instruct-{quantization}.gguf"
},
{
"model_format": "ggufv2",
@@ -7476,11 +7592,10 @@
"q5_0",
"q5_k_m",
"q6_k",
- "q8_0",
- "fp16"
+ "q8_0"
],
"model_id": "Qwen/Qwen2.5-1.5B-Instruct-GGUF",
- "model_file_name_template": "qwen2_5-1_5b-instruct-{quantization}.gguf"
+ "model_file_name_template": "qwen2.5-1.5b-instruct-{quantization}.gguf"
},
{
"model_format": "ggufv2",
@@ -7493,11 +7608,10 @@
"q5_0",
"q5_k_m",
"q6_k",
- "q8_0",
- "fp16"
+ "q8_0"
],
"model_id": "Qwen/Qwen2.5-3B-Instruct-GGUF",
- "model_file_name_template": "qwen2_5-3b-instruct-{quantization}.gguf"
+ "model_file_name_template": "qwen2.5-3b-instruct-{quantization}.gguf"
},
{
"model_format": "ggufv2",
@@ -7510,11 +7624,37 @@
"q5_0",
"q5_k_m",
"q6_k",
- "q8_0",
- "fp16"
+ "q8_0"
],
"model_id": "Qwen/Qwen2.5-7B-Instruct-GGUF",
- "model_file_name_template": "qwen2_5-7b-instruct-{quantization}.gguf"
+ "model_file_name_template": "qwen2.5-7b-instruct-{quantization}.gguf",
+ "model_file_name_split_template": "qwen2.5-7b-instruct-{quantization}-{part}.gguf",
+ "quantization_parts": {
+ "q4_0": [
+ "00001-of-00002",
+ "00002-of-00002"
+ ],
+ "q4_k_m": [
+ "00001-of-00002",
+ "00002-of-00002"
+ ],
+ "q5_0": [
+ "00001-of-00002",
+ "00002-of-00002"
+ ],
+ "q5_k_m": [
+ "00001-of-00002",
+ "00002-of-00002"
+ ],
+ "q6_k": [
+ "00001-of-00002",
+ "00002-of-00002"
+ ],
+ "q8_0": [
+ "00001-of-00002",
+ "00002-of-00002"
+ ]
+ }
},
{
"model_format": "ggufv2",
@@ -7527,11 +7667,53 @@
"q5_0",
"q5_k_m",
"q6_k",
- "q8_0",
- "fp16"
+ "q8_0"
],
"model_id": "Qwen/Qwen2.5-14B-Instruct-GGUF",
- "model_file_name_template": "qwen2_5-14b-instruct-{quantization}.gguf"
+ "model_file_name_template": "qwen2.5-14b-instruct-{quantization}.gguf",
+ "model_file_name_split_template": "qwen2.5-14b-instruct-{quantization}-{part}.gguf",
+ "quantization_parts": {
+ "q2_k": [
+ "00001-of-00002",
+ "00002-of-00002"
+ ],
+ "q3_k_m": [
+ "00001-of-00002",
+ "00002-of-00002"
+ ],
+ "q4_0": [
+ "00001-of-00003",
+ "00002-of-00003",
+ "00003-of-00003"
+ ],
+ "q4_k_m": [
+ "00001-of-00003",
+ "00002-of-00003",
+ "00003-of-00003"
+ ],
+ "q5_0": [
+ "00001-of-00003",
+ "00002-of-00003",
+ "00003-of-00003"
+ ],
+ "q5_k_m": [
+ "00001-of-00003",
+ "00002-of-00003",
+ "00003-of-00003"
+ ],
+ "q6_k": [
+ "00001-of-00004",
+ "00002-of-00004",
+ "00003-of-00004",
+ "00004-of-00004"
+ ],
+ "q8_0": [
+ "00001-of-00004",
+ "00002-of-00004",
+ "00003-of-00004",
+ "00004-of-00004"
+ ]
+ }
},
{
"model_format": "ggufv2",
@@ -7544,11 +7726,76 @@
"q5_0",
"q5_k_m",
"q6_k",
- "q8_0",
- "fp16"
+ "q8_0"
],
"model_id": "Qwen/Qwen2.5-32B-Instruct-GGUF",
- "model_file_name_template": "qwen2_5-32b-instruct-{quantization}.gguf"
+ "model_file_name_template": "qwen2_5-32b-instruct-{quantization}.gguf",
+ "model_file_name_split_template": "qwen2.5-32b-instruct-{quantization}-{part}.gguf",
+ "quantization_parts": {
+ "q2_k": [
+ "00001-of-00004",
+ "00002-of-00004",
+ "00003-of-00004",
+ "00004-of-00004"
+ ],
+ "q3_k_m": [
+ "00001-of-00005",
+ "00002-of-00005",
+ "00003-of-00005",
+ "00004-of-00005",
+ "00005-of-00005"
+ ],
+ "q4_0": [
+ "00001-of-00005",
+ "00002-of-00005",
+ "00003-of-00005",
+ "00004-of-00005",
+ "00005-of-00005"
+ ],
+ "q4_k_m": [
+ "00001-of-00005",
+ "00002-of-00005",
+ "00003-of-00005",
+ "00004-of-00005",
+ "00005-of-00005"
+ ],
+ "q5_0": [
+ "00001-of-00006",
+ "00002-of-00006",
+ "00003-of-00006",
+ "00004-of-00006",
+ "00005-of-00006",
+ "00006-of-00006"
+ ],
+ "q5_k_m": [
+ "00001-of-00006",
+ "00002-of-00006",
+ "00003-of-00006",
+ "00004-of-00006",
+ "00005-of-00006",
+ "00006-of-00006"
+ ],
+ "q6_k": [
+ "00001-of-00007",
+ "00002-of-00007",
+ "00003-of-00007",
+ "00004-of-00007",
+ "00005-of-00007",
+ "00006-of-00007",
+ "00007-of-00007"
+ ],
+ "q8_0": [
+ "00001-of-00009",
+ "00002-of-00009",
+ "00003-of-00009",
+ "00004-of-00009",
+ "00005-of-00009",
+ "00006-of-00009",
+ "00007-of-00009",
+ "00008-of-00009",
+ "00009-of-00009"
+ ]
+ }
},
{
"model_format": "ggufv2",
@@ -7566,8 +7813,254 @@
],
"model_id": "Qwen/Qwen2.5-72B-Instruct-GGUF",
"model_file_name_template": "qwen2_5-72b-instruct-{quantization}.gguf",
- "model_file_name_split_template": "qwen2_5-72b-instruct-{quantization}-{part}.gguf",
+ "model_file_name_split_template": "qwen2.5-72b-instruct-{quantization}-{part}.gguf",
+ "quantization_parts": {
+ "q2_k": [
+ "00001-of-00007",
+ "00002-of-00007",
+ "00003-of-00007",
+ "00004-of-00007",
+ "00005-of-00007",
+ "00006-of-00007",
+ "00007-of-00007"
+ ],
+ "q3_k_m": [
+ "00001-of-00009",
+ "00002-of-00009",
+ "00003-of-00009",
+ "00004-of-00009",
+ "00005-of-00009",
+ "00006-of-00009",
+ "00007-of-00009",
+ "00008-of-00009",
+ "00009-of-00009"
+ ],
+ "q4_0": [
+ "00001-of-00011",
+ "00002-of-00011",
+ "00003-of-00011",
+ "00004-of-00011",
+ "00005-of-00011",
+ "00006-of-00011",
+ "00007-of-00011",
+ "00008-of-00011",
+ "00009-of-00011",
+ "00010-of-00011",
+ "00011-of-00011"
+ ],
+ "q4_k_m": [
+ "00001-of-00012",
+ "00002-of-00012",
+ "00003-of-00012",
+ "00004-of-00012",
+ "00005-of-00012",
+ "00006-of-00012",
+ "00007-of-00012",
+ "00008-of-00012",
+ "00009-of-00012",
+ "00010-of-00012",
+ "00011-of-00012",
+ "00012-of-00012"
+ ],
+ "q5_0": [
+ "00001-of-00013",
+ "00002-of-00013",
+ "00003-of-00013",
+ "00004-of-00013",
+ "00005-of-00013",
+ "00006-of-00013",
+ "00007-of-00013",
+ "00008-of-00013",
+ "00009-of-00013",
+ "00010-of-00013",
+ "00011-of-00013",
+ "00012-of-00013",
+ "00013-of-00013"
+ ],
+ "q5_k_m": [
+ "00001-of-00014",
+ "00002-of-00014",
+ "00003-of-00014",
+ "00004-of-00014",
+ "00005-of-00014",
+ "00006-of-00014",
+ "00007-of-00014",
+ "00008-of-00014",
+ "00009-of-00014",
+ "00010-of-00014",
+ "00011-of-00014",
+ "00012-of-00014",
+ "00013-of-00014",
+ "00014-of-00014"
+ ],
+ "q6_k": [
+ "00001-of-00016",
+ "00002-of-00016",
+ "00003-of-00016",
+ "00004-of-00016",
+ "00005-of-00016",
+ "00006-of-00016",
+ "00007-of-00016",
+ "00008-of-00016",
+ "00009-of-00016",
+ "00010-of-00016",
+ "00011-of-00016",
+ "00012-of-00016",
+ "00013-of-00016",
+ "00014-of-00016",
+ "00015-of-00016",
+ "00016-of-00016"
+ ],
+ "q8_0": [
+ "00001-of-00021",
+ "00002-of-00021",
+ "00003-of-00021",
+ "00004-of-00021",
+ "00005-of-00021",
+ "00006-of-00021",
+ "00007-of-00021",
+ "00008-of-00021",
+ "00009-of-00021",
+ "00010-of-00021",
+ "00011-of-00021",
+ "00012-of-00021",
+ "00013-of-00021",
+ "00014-of-00021",
+ "00015-of-00021",
+ "00016-of-00021",
+ "00017-of-00021",
+ "00018-of-00021",
+ "00019-of-00021",
+ "00020-of-00021",
+ "00021-of-00021"
+ ]
+ }
+ }
+ ],
+ "chat_template": "{%- if tools %}\n {{- '<|im_start|>system\\n' }}\n {%- if messages[0]['role'] == 'system' %}\n {{- messages[0]['content'] }}\n {%- else %}\n {{- 'You are Qwen, created by Alibaba Cloud. You are a helpful assistant.' }}\n {%- endif %}\n {{- \"\\n\\n# Tools\\n\\nYou may call one or more functions to assist with the user query.\\n\\nYou are provided with function signatures within XML tags:\\n\" }}\n {%- for tool in tools %}\n {{- \"\\n\" }}\n {{- tool | tojson }}\n {%- endfor %}\n {{- \"\\n\\n\\nFor each function call, return a json object with function name and arguments within XML tags:\\n\\n{{\\\"name\\\": , \\\"arguments\\\": }}\\n<|im_end|>\\n\" }}\n{%- else %}\n {%- if messages[0]['role'] == 'system' %}\n {{- '<|im_start|>system\\n' + messages[0]['content'] + '<|im_end|>\\n' }}\n {%- else %}\n {{- '<|im_start|>system\\nYou are Qwen, created by Alibaba Cloud. You are a helpful assistant.<|im_end|>\\n' }}\n {%- endif %}\n{%- endif %}\n{%- for message in messages %}\n {%- if (message.role == \"user\") or (message.role == \"system\" and not loop.first) or (message.role == \"assistant\" and not message.tool_calls) %}\n {{- '<|im_start|>' + message.role + '\\n' + message.content + '<|im_end|>' + '\\n' }}\n {%- elif message.role == \"assistant\" %}\n {{- '<|im_start|>' + message.role }}\n {%- if message.content %}\n {{- '\\n' + message.content }}\n {%- endif %}\n {%- for tool_call in message.tool_calls %}\n {%- if tool_call.function is defined %}\n {%- set tool_call = tool_call.function %}\n {%- endif %}\n {{- '\\n\\n{\"name\": \"' }}\n {{- tool_call.name }}\n {{- '\", \"arguments\": ' }}\n {{- tool_call.arguments | tojson }}\n {{- '}\\n' }}\n {%- endfor %}\n {{- '<|im_end|>\\n' }}\n {%- elif message.role == \"tool\" %}\n {%- if (loop.index0 == 0) or (messages[loop.index0 - 1].role != \"tool\") %}\n {{- '<|im_start|>user' }}\n {%- endif %}\n {{- '\\n\\n' }}\n {{- message.content }}\n {{- '\\n' }}\n {%- if loop.last or (messages[loop.index0 + 1].role != \"tool\") %}\n {{- '<|im_end|>\\n' }}\n {%- endif %}\n {%- endif %}\n{%- endfor %}\n{%- if add_generation_prompt %}\n {{- '<|im_start|>assistant\\n' }}\n{%- endif %}\n",
+ "stop_token_ids": [
+ 151643,
+ 151644,
+ 151645
+ ],
+ "stop": [
+ "<|endoftext|>",
+ "<|im_start|>",
+ "<|im_end|>"
+ ]
+ },
+ {
+ "version": 1,
+ "context_length": 32768,
+ "model_name": "qwen2.5-coder",
+ "model_lang": [
+ "en",
+ "zh"
+ ],
+ "model_ability": [
+ "generate"
+ ],
+ "model_description": "Qwen2.5-Coder is the latest series of Code-Specific Qwen large language models (formerly known as CodeQwen).",
+ "model_specs": [
+ {
+ "model_format": "pytorch",
+ "model_size_in_billions": "1_5",
+ "quantizations": [
+ "4-bit",
+ "8-bit",
+ "none"
+ ],
+ "model_id": "Qwen/Qwen2.5-Coder-1.5B",
+ "model_revision": "d3586cfe793730945f8e4d7ef31032a3ee50247d"
+ },
+ {
+ "model_format": "pytorch",
+ "model_size_in_billions": 7,
+ "quantizations": [
+ "4-bit",
+ "8-bit",
+ "none"
+ ],
+ "model_id": "Qwen/Qwen2.5-Coder-7B",
+ "model_revision": "30b6a7e874a78d46b80fa1db3194ea427dd41b08"
+ }
+ ]
+ },
+ {
+ "version": 1,
+ "context_length": 32768,
+ "model_name": "qwen2.5-coder-instruct",
+ "model_lang": [
+ "en",
+ "zh"
+ ],
+ "model_ability": [
+ "chat",
+ "tools"
+ ],
+ "model_description": "Qwen2.5-Coder is the latest series of Code-Specific Qwen large language models (formerly known as CodeQwen).",
+ "model_specs": [
+ {
+ "model_format": "pytorch",
+ "model_size_in_billions": "1_5",
+ "quantizations": [
+ "4-bit",
+ "8-bit",
+ "none"
+ ],
+ "model_id": "Qwen/Qwen2.5-Coder-1.5B-Instruct"
+ },
+ {
+ "model_format": "pytorch",
+ "model_size_in_billions": 7,
+ "quantizations": [
+ "4-bit",
+ "8-bit",
+ "none"
+ ],
+ "model_id": "Qwen/Qwen2.5-Coder-7B-Instruct"
+ },
+ {
+ "model_format": "ggufv2",
+ "model_size_in_billions": "1_5",
+ "quantizations": [
+ "q2_k",
+ "q3_k_m",
+ "q4_0",
+ "q4_k_m",
+ "q5_0",
+ "q5_k_m",
+ "q6_k",
+ "q8_0"
+ ],
+ "model_id": "Qwen/Qwen2.5-Coder-1.5B-Instruct-GGUF",
+ "model_file_name_template": "qwen2.5-coder-1.5b-instruct-{quantization}.gguf"
+ },
+ {
+ "model_format": "ggufv2",
+ "model_size_in_billions": 7,
+ "quantizations": [
+ "q2_k",
+ "q3_k_m",
+ "q4_0",
+ "q4_k_m",
+ "q5_0",
+ "q5_k_m",
+ "q6_k",
+ "q8_0"
+ ],
+ "model_id": "Qwen/Qwen2.5-Coder-7B-Instruct-GGUF",
+ "model_file_name_template": "qwen2.5-coder-7b-instruct-{quantization}.gguf",
+ "model_file_name_split_template": "qwen2.5-coder-7b-instruct-{quantization}-{part}.gguf",
"quantization_parts": {
+ "q4_0": [
+ "00001-of-00002",
+ "00002-of-00002"
+ ],
+ "q4_k_m": [
+ "00001-of-00002",
+ "00002-of-00002"
+ ],
"q5_0": [
"00001-of-00002",
"00002-of-00002"
@@ -7581,19 +8074,14 @@
"00002-of-00002"
],
"q8_0": [
- "00001-of-00002",
- "00002-of-00002"
- ],
- "fp16": [
- "00001-of-00004",
- "00002-of-00004",
- "00003-of-00004",
- "00004-of-00004"
+ "00001-of-00003",
+ "00002-of-00003",
+ "00003-of-00003"
]
}
}
],
- "chat_template": "{%- if tools %}\n {{- '<|im_start|>system\\n' }}\n {%- if messages[0]['role'] == 'system' %}\n {{- messages[0]['content'] }}\n {%- else %}\n {{- 'You are Qwen, created by Alibaba Cloud. You are a helpful assistant.' }}\n {%- endif %}\n {{- \"\\n\\n# Tools\\n\\nYou may call one or more functions to assist with the user query.\\n\\nYou are provided with function signatures within XML tags:\\n\" }}\n {%- for tool in tools %}\n {{- \"\\n\" }}\n {{- tool | tojson }}\n {%- endfor %}\n {{- \"\\n\\n\\nFor each function call, return a json object with function name and arguments within XML tags:\\n\\n{{\\\"name\\\": , \\\"arguments\\\": }}\\n<|im_end|>\\n\" }}\n{%- else %}\n {%- if messages[0]['role'] == 'system' %}\n {{- '<|im_start|>system\\n' + messages[0]['content'] + '<|im_end|>\\n' }}\n {%- else %}\n {{- '<|im_start|>system\\nYou are Qwen, created by Alibaba Cloud. You are a helpful assistant.<|im_end|>\\n' }}\n {%- endif %}\n{%- endif %}\n{%- for message in messages %}\n {%- if (message.role == \"user\") or (message.role == \"system\" and not loop.first) or (message.role == \"assistant\" and not message.tool_calls) %}\n {{- '<|im_start|>' + message.role + '\\n' + message.content + '<|im_end|>' + '\\n' }}\n {%- elif message.role == \"assistant\" %}\n {{- '<|im_start|>' + message.role }}\n {%- if message.content %}\n {{- '\\n' + message.content }}\n {%- endif %}\n {%- for tool_call in message.tool_calls %}\n {%- if tool_call.function is defined %}\n {%- set tool_call = tool_call.function %}\n {%- endif %}\n {{- '\\n\\n{\"name\": \"' }}\n {{- tool_call.name }}\n {{- '\", \"arguments\": ' }}\n {{- tool_call.arguments | tojson }}\n {{- '}\\n' }}\n {%- endfor %}\n {{- '<|im_end|>\\n' }}\n {%- elif message.role == \"tool\" %}\n {%- if (loop.index0 == 0) or (messages[loop.index0 - 1].role != \"tool\") %}\n {{- '<|im_start|>user' }}\n {%- endif %}\n {{- '\\n\\n' }}\n {{- message.content }}\n {{- '\\n' }}\n {%- if loop.last or (messages[loop.index0 + 1].role != \"tool\") %}\n {{- '<|im_end|>\\n' }}\n {%- endif %}\n {%- endif %}\n{%- endfor %}\n{%- if add_generation_prompt %}\n {{- '<|im_start|>assistant\\n' }}\n{%- endif %}\n",
+ "chat_template": "{%- if tools %}\n {{- '<|im_start|>system\\n' }}\n {%- if messages[0]['role'] == 'system' %}\n {{- messages[0]['content'] }}\n {%- else %}\n {{- 'You are a helpful assistant.' }}\n {%- endif %}\n {{- \"\\n\\n# Tools\\n\\nYou may call one or more functions to assist with the user query.\\n\\nYou are provided with function signatures within XML tags:\\n\" }}\n {%- for tool in tools %}\n {{- \"\\n\" }}\n {{- tool | tojson }}\n {%- endfor %}\n {{- \"\\n\\n\\nFor each function call, return a json object with function name and arguments within XML tags:\\n\\n{{\\\"name\\\": , \\\"arguments\\\": }}\\n<|im_end|>\\n\" }}\n{%- else %}\n {%- if messages[0]['role'] == 'system' %}\n {{- '<|im_start|>system\\n' + messages[0]['content'] + '<|im_end|>\\n' }}\n {%- else %}\n {{- '<|im_start|>system\\nYou are a helpful assistant.<|im_end|>\\n' }}\n {%- endif %}\n{%- endif %}\n{%- for message in messages %}\n {%- if (message.role == \"user\") or (message.role == \"system\" and not loop.first) or (message.role == \"assistant\" and not message.tool_calls) %}\n {{- '<|im_start|>' + message.role + '\\n' + message.content + '<|im_end|>' + '\\n' }}\n {%- elif message.role == \"assistant\" %}\n {{- '<|im_start|>' + message.role }}\n {%- if message.content %}\n {{- '\\n' + message.content }}\n {%- endif %}\n {%- for tool_call in message.tool_calls %}\n {%- if tool_call.function is defined %}\n {%- set tool_call = tool_call.function %}\n {%- endif %}\n {{- '\\n\\n{\"name\": \"' }}\n {{- tool_call.name }}\n {{- '\", \"arguments\": ' }}\n {{- tool_call.arguments | tojson }}\n {{- '}\\n' }}\n {%- endfor %}\n {{- '<|im_end|>\\n' }}\n {%- elif message.role == \"tool\" %}\n {%- if (loop.index0 == 0) or (messages[loop.index0 - 1].role != \"tool\") %}\n {{- '<|im_start|>user' }}\n {%- endif %}\n {{- '\\n\\n' }}\n {{- message.content }}\n {{- '\\n' }}\n {%- if loop.last or (messages[loop.index0 + 1].role != \"tool\") %}\n {{- '<|im_end|>\\n' }}\n {%- endif %}\n {%- endif %}\n{%- endfor %}\n{%- if add_generation_prompt %}\n {{- '<|im_start|>assistant\\n' }}\n{%- endif %}\n",
"stop_token_ids": [
151643,
151644,
diff --git a/xinference/model/llm/llm_family_modelscope.json b/xinference/model/llm/llm_family_modelscope.json
index 7309ee9651..daf726e8c7 100644
--- a/xinference/model/llm/llm_family_modelscope.json
+++ b/xinference/model/llm/llm_family_modelscope.json
@@ -4602,6 +4602,34 @@
"model_hub": "modelscope",
"model_id":"qwen/Qwen2-VL-2B-Instruct-AWQ",
"model_revision":"master"
+ },
+ {
+ "model_format":"pytorch",
+ "model_size_in_billions":72,
+ "quantizations":[
+ "none"
+ ],
+ "model_id":"qwen/Qwen2-VL-72B-Instruct",
+ "model_hub": "modelscope"
+ },
+ {
+ "model_format":"awq",
+ "model_size_in_billions":72,
+ "quantizations":[
+ "Int4"
+ ],
+ "model_id":"qwen/Qwen2-VL-72B-Instruct-AWQ",
+ "model_hub": "modelscope"
+ },
+ {
+ "model_format":"gptq",
+ "model_size_in_billions":72,
+ "quantizations":[
+ "Int4",
+ "Int8"
+ ],
+ "model_id":"qwen/Qwen2-VL-72B-Instruct-GPTQ-{quantization}",
+ "model_hub": "modelscope"
}
],
"prompt_style": {
@@ -4960,7 +4988,106 @@
},
{
"version": 1,
- "context_length": 131072,
+ "context_length": 32768,
+ "model_name": "qwen2.5",
+ "model_lang": [
+ "en",
+ "zh"
+ ],
+ "model_ability": [
+ "generate"
+ ],
+ "model_description": "Qwen2.5 is the latest series of Qwen large language models. For Qwen2.5, we release a number of base language models and instruction-tuned language models ranging from 0.5 to 72 billion parameters.",
+ "model_specs": [
+ {
+ "model_format": "pytorch",
+ "model_size_in_billions": "0_5",
+ "quantizations": [
+ "4-bit",
+ "8-bit",
+ "none"
+ ],
+ "model_id": "qwen/Qwen2.5-0.5B",
+ "model_revision": "master",
+ "model_hub": "modelscope"
+ },
+ {
+ "model_format": "pytorch",
+ "model_size_in_billions": "1_5",
+ "quantizations": [
+ "4-bit",
+ "8-bit",
+ "none"
+ ],
+ "model_id": "qwen/Qwen2.5-1.5B",
+ "model_revision": "master",
+ "model_hub": "modelscope"
+ },
+ {
+ "model_format": "pytorch",
+ "model_size_in_billions": 3,
+ "quantizations": [
+ "4-bit",
+ "8-bit",
+ "none"
+ ],
+ "model_id": "qwen/Qwen2.5-3B",
+ "model_revision": "master",
+ "model_hub": "modelscope"
+ },
+ {
+ "model_format": "pytorch",
+ "model_size_in_billions": 7,
+ "quantizations": [
+ "4-bit",
+ "8-bit",
+ "none"
+ ],
+ "model_id": "qwen/Qwen2.5-7B",
+ "model_revision": "master",
+ "model_hub": "modelscope"
+ },
+ {
+ "model_format": "pytorch",
+ "model_size_in_billions": 14,
+ "quantizations": [
+ "4-bit",
+ "8-bit",
+ "none"
+ ],
+ "model_id": "qwen/Qwen2.5-14B",
+ "model_revision": "master",
+ "model_hub": "modelscope"
+ },
+ {
+ "model_format": "pytorch",
+ "model_size_in_billions": 32,
+ "quantizations": [
+ "4-bit",
+ "8-bit",
+ "none"
+ ],
+ "model_id": "qwen/Qwen2.5-32B",
+ "model_revision": "master",
+ "model_hub": "modelscope"
+ },
+ {
+ "model_format": "pytorch",
+ "model_size_in_billions": 72,
+ "quantizations": [
+ "4-bit",
+ "8-bit",
+ "none"
+ ],
+ "model_id": "qwen/Qwen2.5-72B",
+ "model_revision": "master",
+ "model_hub": "modelscope"
+ }
+ ]
+ },
+ {
+ "version": 1,
+ "context_length": 32768,
"model_name": "qwen2.5-instruct",
"model_lang": [
"en",
@@ -5193,11 +5320,10 @@
"q5_0",
"q5_k_m",
"q6_k",
- "q8_0",
- "fp16"
+ "q8_0"
],
"model_id": "qwen/Qwen2.5-0.5B-Instruct-GGUF",
- "model_file_name_template": "qwen2_5-0_5b-instruct-{quantization}.gguf",
+ "model_file_name_template": "qwen2.5-0.5b-instruct-{quantization}.gguf",
"model_hub": "modelscope"
},
{
@@ -5211,11 +5337,10 @@
"q5_0",
"q5_k_m",
"q6_k",
- "q8_0",
- "fp16"
+ "q8_0"
],
"model_id": "qwen/Qwen2.5-1.5B-Instruct-GGUF",
- "model_file_name_template": "qwen2_5-1_5b-instruct-{quantization}.gguf",
+ "model_file_name_template": "qwen2.5-1.5b-instruct-{quantization}.gguf",
"model_hub": "modelscope"
},
{
@@ -5229,11 +5354,10 @@
"q5_0",
"q5_k_m",
"q6_k",
- "q8_0",
- "fp16"
+ "q8_0"
],
"model_id": "qwen/Qwen2.5-3B-Instruct-GGUF",
- "model_file_name_template": "qwen2_5-3b-instruct-{quantization}.gguf",
+ "model_file_name_template": "qwen2.5-3b-instruct-{quantization}.gguf",
"model_hub": "modelscope"
},
{
@@ -5247,12 +5371,38 @@
"q5_0",
"q5_k_m",
"q6_k",
- "q8_0",
- "fp16"
+ "q8_0"
],
"model_id": "qwen/Qwen2.5-7B-Instruct-GGUF",
"model_file_name_template": "qwen2_5-7b-instruct-{quantization}.gguf",
- "model_hub": "modelscope"
+ "model_hub": "modelscope",
+ "model_file_name_split_template": "qwen2.5-7b-instruct-{quantization}-{part}.gguf",
+ "quantization_parts": {
+ "q4_0": [
+ "00001-of-00002",
+ "00002-of-00002"
+ ],
+ "q4_k_m": [
+ "00001-of-00002",
+ "00002-of-00002"
+ ],
+ "q5_0": [
+ "00001-of-00002",
+ "00002-of-00002"
+ ],
+ "q5_k_m": [
+ "00001-of-00002",
+ "00002-of-00002"
+ ],
+ "q6_k": [
+ "00001-of-00002",
+ "00002-of-00002"
+ ],
+ "q8_0": [
+ "00001-of-00002",
+ "00002-of-00002"
+ ]
+ }
},
{
"model_format": "ggufv2",
@@ -5265,11 +5415,53 @@
"q5_0",
"q5_k_m",
"q6_k",
- "q8_0",
- "fp16"
+ "q8_0"
],
"model_id": "qwen/Qwen2.5-14B-Instruct-GGUF",
- "model_file_name_template": "qwen2_5-14b-instruct-{quantization}.gguf",
+ "model_file_name_template": "qwen2.5-14b-instruct-{quantization}.gguf",
+ "model_file_name_split_template": "qwen2.5-14b-instruct-{quantization}-{part}.gguf",
+ "quantization_parts": {
+ "q2_k": [
+ "00001-of-00002",
+ "00002-of-00002"
+ ],
+ "q3_k_m": [
+ "00001-of-00002",
+ "00002-of-00002"
+ ],
+ "q4_0": [
+ "00001-of-00003",
+ "00002-of-00003",
+ "00003-of-00003"
+ ],
+ "q4_k_m": [
+ "00001-of-00003",
+ "00002-of-00003",
+ "00003-of-00003"
+ ],
+ "q5_0": [
+ "00001-of-00003",
+ "00002-of-00003",
+ "00003-of-00003"
+ ],
+ "q5_k_m": [
+ "00001-of-00003",
+ "00002-of-00003",
+ "00003-of-00003"
+ ],
+ "q6_k": [
+ "00001-of-00004",
+ "00002-of-00004",
+ "00003-of-00004",
+ "00004-of-00004"
+ ],
+ "q8_0": [
+ "00001-of-00004",
+ "00002-of-00004",
+ "00003-of-00004",
+ "00004-of-00004"
+ ]
+ },
"model_hub": "modelscope"
},
{
@@ -5283,11 +5475,76 @@
"q5_0",
"q5_k_m",
"q6_k",
- "q8_0",
- "fp16"
+ "q8_0"
],
"model_id": "qwen/Qwen2.5-32B-Instruct-GGUF",
"model_file_name_template": "qwen2_5-32b-instruct-{quantization}.gguf",
+ "model_file_name_split_template": "qwen2.5-32b-instruct-{quantization}-{part}.gguf",
+ "quantization_parts": {
+ "q2_k": [
+ "00001-of-00004",
+ "00002-of-00004",
+ "00003-of-00004",
+ "00004-of-00004"
+ ],
+ "q3_k_m": [
+ "00001-of-00005",
+ "00002-of-00005",
+ "00003-of-00005",
+ "00004-of-00005",
+ "00005-of-00005"
+ ],
+ "q4_0": [
+ "00001-of-00005",
+ "00002-of-00005",
+ "00003-of-00005",
+ "00004-of-00005",
+ "00005-of-00005"
+ ],
+ "q4_k_m": [
+ "00001-of-00005",
+ "00002-of-00005",
+ "00003-of-00005",
+ "00004-of-00005",
+ "00005-of-00005"
+ ],
+ "q5_0": [
+ "00001-of-00006",
+ "00002-of-00006",
+ "00003-of-00006",
+ "00004-of-00006",
+ "00005-of-00006",
+ "00006-of-00006"
+ ],
+ "q5_k_m": [
+ "00001-of-00006",
+ "00002-of-00006",
+ "00003-of-00006",
+ "00004-of-00006",
+ "00005-of-00006",
+ "00006-of-00006"
+ ],
+ "q6_k": [
+ "00001-of-00007",
+ "00002-of-00007",
+ "00003-of-00007",
+ "00004-of-00007",
+ "00005-of-00007",
+ "00006-of-00007",
+ "00007-of-00007"
+ ],
+ "q8_0": [
+ "00001-of-00009",
+ "00002-of-00009",
+ "00003-of-00009",
+ "00004-of-00009",
+ "00005-of-00009",
+ "00006-of-00009",
+ "00007-of-00009",
+ "00008-of-00009",
+ "00009-of-00009"
+ ]
+ },
"model_hub": "modelscope"
},
{
@@ -5301,40 +5558,288 @@
"q5_0",
"q5_k_m",
"q6_k",
- "q8_0",
- "fp16"
+ "q8_0"
],
"model_id": "qwen/Qwen2.5-72B-Instruct-GGUF",
"model_hub": "modelscope",
"model_file_name_template": "qwen2_5-72b-instruct-{quantization}.gguf",
- "model_file_name_split_template": "qwen2_5-72b-instruct-{quantization}-{part}.gguf",
+ "model_file_name_split_template": "qwen2.5-72b-instruct-{quantization}-{part}.gguf",
"quantization_parts": {
+ "q2_k": [
+ "00001-of-00007",
+ "00002-of-00007",
+ "00003-of-00007",
+ "00004-of-00007",
+ "00005-of-00007",
+ "00006-of-00007",
+ "00007-of-00007"
+ ],
+ "q3_k_m": [
+ "00001-of-00009",
+ "00002-of-00009",
+ "00003-of-00009",
+ "00004-of-00009",
+ "00005-of-00009",
+ "00006-of-00009",
+ "00007-of-00009",
+ "00008-of-00009",
+ "00009-of-00009"
+ ],
+ "q4_0": [
+ "00001-of-00011",
+ "00002-of-00011",
+ "00003-of-00011",
+ "00004-of-00011",
+ "00005-of-00011",
+ "00006-of-00011",
+ "00007-of-00011",
+ "00008-of-00011",
+ "00009-of-00011",
+ "00010-of-00011",
+ "00011-of-00011"
+ ],
+ "q4_k_m": [
+ "00001-of-00012",
+ "00002-of-00012",
+ "00003-of-00012",
+ "00004-of-00012",
+ "00005-of-00012",
+ "00006-of-00012",
+ "00007-of-00012",
+ "00008-of-00012",
+ "00009-of-00012",
+ "00010-of-00012",
+ "00011-of-00012",
+ "00012-of-00012"
+ ],
"q5_0": [
+ "00001-of-00013",
+ "00002-of-00013",
+ "00003-of-00013",
+ "00004-of-00013",
+ "00005-of-00013",
+ "00006-of-00013",
+ "00007-of-00013",
+ "00008-of-00013",
+ "00009-of-00013",
+ "00010-of-00013",
+ "00011-of-00013",
+ "00012-of-00013",
+ "00013-of-00013"
+ ],
+ "q5_k_m": [
+ "00001-of-00014",
+ "00002-of-00014",
+ "00003-of-00014",
+ "00004-of-00014",
+ "00005-of-00014",
+ "00006-of-00014",
+ "00007-of-00014",
+ "00008-of-00014",
+ "00009-of-00014",
+ "00010-of-00014",
+ "00011-of-00014",
+ "00012-of-00014",
+ "00013-of-00014",
+ "00014-of-00014"
+ ],
+ "q6_k": [
+ "00001-of-00016",
+ "00002-of-00016",
+ "00003-of-00016",
+ "00004-of-00016",
+ "00005-of-00016",
+ "00006-of-00016",
+ "00007-of-00016",
+ "00008-of-00016",
+ "00009-of-00016",
+ "00010-of-00016",
+ "00011-of-00016",
+ "00012-of-00016",
+ "00013-of-00016",
+ "00014-of-00016",
+ "00015-of-00016",
+ "00016-of-00016"
+ ],
+ "q8_0": [
+ "00001-of-00021",
+ "00002-of-00021",
+ "00003-of-00021",
+ "00004-of-00021",
+ "00005-of-00021",
+ "00006-of-00021",
+ "00007-of-00021",
+ "00008-of-00021",
+ "00009-of-00021",
+ "00010-of-00021",
+ "00011-of-00021",
+ "00012-of-00021",
+ "00013-of-00021",
+ "00014-of-00021",
+ "00015-of-00021",
+ "00016-of-00021",
+ "00017-of-00021",
+ "00018-of-00021",
+ "00019-of-00021",
+ "00020-of-00021",
+ "00021-of-00021"
+ ]
+ }
+ }
+ ],
+ "chat_template": "{%- if tools %}\n {{- '<|im_start|>system\\n' }}\n {%- if messages[0]['role'] == 'system' %}\n {{- messages[0]['content'] }}\n {%- else %}\n {{- 'You are Qwen, created by Alibaba Cloud. You are a helpful assistant.' }}\n {%- endif %}\n {{- \"\\n\\n# Tools\\n\\nYou may call one or more functions to assist with the user query.\\n\\nYou are provided with function signatures within XML tags:\\n\" }}\n {%- for tool in tools %}\n {{- \"\\n\" }}\n {{- tool | tojson }}\n {%- endfor %}\n {{- \"\\n\\n\\nFor each function call, return a json object with function name and arguments within XML tags:\\n\\n{{\\\"name\\\": , \\\"arguments\\\": }}\\n<|im_end|>\\n\" }}\n{%- else %}\n {%- if messages[0]['role'] == 'system' %}\n {{- '<|im_start|>system\\n' + messages[0]['content'] + '<|im_end|>\\n' }}\n {%- else %}\n {{- '<|im_start|>system\\nYou are Qwen, created by Alibaba Cloud. You are a helpful assistant.<|im_end|>\\n' }}\n {%- endif %}\n{%- endif %}\n{%- for message in messages %}\n {%- if (message.role == \"user\") or (message.role == \"system\" and not loop.first) or (message.role == \"assistant\" and not message.tool_calls) %}\n {{- '<|im_start|>' + message.role + '\\n' + message.content + '<|im_end|>' + '\\n' }}\n {%- elif message.role == \"assistant\" %}\n {{- '<|im_start|>' + message.role }}\n {%- if message.content %}\n {{- '\\n' + message.content }}\n {%- endif %}\n {%- for tool_call in message.tool_calls %}\n {%- if tool_call.function is defined %}\n {%- set tool_call = tool_call.function %}\n {%- endif %}\n {{- '\\n\\n{\"name\": \"' }}\n {{- tool_call.name }}\n {{- '\", \"arguments\": ' }}\n {{- tool_call.arguments | tojson }}\n {{- '}\\n' }}\n {%- endfor %}\n {{- '<|im_end|>\\n' }}\n {%- elif message.role == \"tool\" %}\n {%- if (loop.index0 == 0) or (messages[loop.index0 - 1].role != \"tool\") %}\n {{- '<|im_start|>user' }}\n {%- endif %}\n {{- '\\n\\n' }}\n {{- message.content }}\n {{- '\\n' }}\n {%- if loop.last or (messages[loop.index0 + 1].role != \"tool\") %}\n {{- '<|im_end|>\\n' }}\n {%- endif %}\n {%- endif %}\n{%- endfor %}\n{%- if add_generation_prompt %}\n {{- '<|im_start|>assistant\\n' }}\n{%- endif %}\n",
+ "stop_token_ids": [
+ 151643,
+ 151644,
+ 151645
+ ],
+ "stop": [
+ "<|endoftext|>",
+ "<|im_start|>",
+ "<|im_end|>"
+ ]
+ },
+ {
+ "version": 1,
+ "context_length": 32768,
+ "model_name": "qwen2.5-coder",
+ "model_lang": [
+ "en",
+ "zh"
+ ],
+ "model_ability": [
+ "generate"
+ ],
+ "model_description": "Qwen2.5-Coder is the latest series of Code-Specific Qwen large language models (formerly known as CodeQwen).",
+ "model_specs": [
+ {
+ "model_format": "pytorch",
+ "model_size_in_billions": "1_5",
+ "quantizations": [
+ "4-bit",
+ "8-bit",
+ "none"
+ ],
+ "model_id": "qwen/Qwen2.5-Coder-1.5B",
+ "model_revision": "master",
+ "model_hub": "modelscope"
+ },
+ {
+ "model_format": "pytorch",
+ "model_size_in_billions": 7,
+ "quantizations": [
+ "4-bit",
+ "8-bit",
+ "none"
+ ],
+ "model_id": "qwen/Qwen2.5-Coder-7B",
+ "model_revision": "master",
+ "model_hub": "modelscope"
+ }
+ ]
+ },
+ {
+ "version": 1,
+ "context_length": 32768,
+ "model_name": "qwen2.5-coder-instruct",
+ "model_lang": [
+ "en",
+ "zh"
+ ],
+ "model_ability": [
+ "chat",
+ "tools"
+ ],
+ "model_description": "Qwen2.5-Coder is the latest series of Code-Specific Qwen large language models (formerly known as CodeQwen).",
+ "model_specs": [
+ {
+ "model_format": "pytorch",
+ "model_size_in_billions": "1_5",
+ "quantizations": [
+ "4-bit",
+ "8-bit",
+ "none"
+ ],
+ "model_id": "qwen/Qwen2.5-Coder-1.5B-Instruct",
+ "model_revision": "master",
+ "model_hub": "modelscope"
+ },
+ {
+ "model_format": "pytorch",
+ "model_size_in_billions": 7,
+ "quantizations": [
+ "4-bit",
+ "8-bit",
+ "none"
+ ],
+ "model_id": "qwen/Qwen2.5-Coder-7B-Instruct",
+ "model_revision": "master",
+ "model_hub": "modelscope"
+ },
+ {
+ "model_format": "ggufv2",
+ "model_size_in_billions": "1_5",
+ "quantizations": [
+ "q2_k",
+ "q3_k_m",
+ "q4_0",
+ "q4_k_m",
+ "q5_0",
+ "q5_k_m",
+ "q6_k",
+ "q8_0"
+ ],
+ "model_hub": "modelscope",
+ "model_id": "qwen/Qwen2.5-Coder-1.5B-Instruct-GGUF",
+ "model_file_name_template": "qwen2.5-coder-1.5b-instruct-{quantization}.gguf"
+ },
+ {
+ "model_format": "ggufv2",
+ "model_size_in_billions": 7,
+ "quantizations": [
+ "q2_k",
+ "q3_k_m",
+ "q4_0",
+ "q4_k_m",
+ "q5_0",
+ "q5_k_m",
+ "q6_k",
+ "q8_0"
+ ],
+ "model_hub": "modelscope",
+ "model_id": "qwen/Qwen2.5-Coder-7B-Instruct-GGUF",
+ "model_file_name_template": "qwen2.5-coder-7b-instruct-{quantization}.gguf",
+ "model_file_name_split_template": "qwen2.5-coder-7b-instruct-{quantization}-{part}.gguf",
+ "quantization_parts": {
+ "q4_0": [
"00001-of-00002",
"00002-of-00002"
],
- "q5_k_m": [
+ "q4_k_m": [
"00001-of-00002",
"00002-of-00002"
],
- "q6_k": [
+ "q5_0": [
"00001-of-00002",
"00002-of-00002"
],
- "q8_0": [
+ "q5_k_m": [
"00001-of-00002",
"00002-of-00002"
],
- "fp16": [
- "00001-of-00004",
- "00002-of-00004",
- "00003-of-00004",
- "00004-of-00004"
+ "q6_k": [
+ "00001-of-00002",
+ "00002-of-00002"
+ ],
+ "q8_0": [
+ "00001-of-00003",
+ "00002-of-00003",
+ "00003-of-00003"
]
}
}
],
- "chat_template": "{%- if tools %}\n {{- '<|im_start|>system\\n' }}\n {%- if messages[0]['role'] == 'system' %}\n {{- messages[0]['content'] }}\n {%- else %}\n {{- 'You are Qwen, created by Alibaba Cloud. You are a helpful assistant.' }}\n {%- endif %}\n {{- \"\\n\\n# Tools\\n\\nYou may call one or more functions to assist with the user query.\\n\\nYou are provided with function signatures within XML tags:\\n\" }}\n {%- for tool in tools %}\n {{- \"\\n\" }}\n {{- tool | tojson }}\n {%- endfor %}\n {{- \"\\n\\n\\nFor each function call, return a json object with function name and arguments within XML tags:\\n\\n{{\\\"name\\\": , \\\"arguments\\\": }}\\n<|im_end|>\\n\" }}\n{%- else %}\n {%- if messages[0]['role'] == 'system' %}\n {{- '<|im_start|>system\\n' + messages[0]['content'] + '<|im_end|>\\n' }}\n {%- else %}\n {{- '<|im_start|>system\\nYou are Qwen, created by Alibaba Cloud. You are a helpful assistant.<|im_end|>\\n' }}\n {%- endif %}\n{%- endif %}\n{%- for message in messages %}\n {%- if (message.role == \"user\") or (message.role == \"system\" and not loop.first) or (message.role == \"assistant\" and not message.tool_calls) %}\n {{- '<|im_start|>' + message.role + '\\n' + message.content + '<|im_end|>' + '\\n' }}\n {%- elif message.role == \"assistant\" %}\n {{- '<|im_start|>' + message.role }}\n {%- if message.content %}\n {{- '\\n' + message.content }}\n {%- endif %}\n {%- for tool_call in message.tool_calls %}\n {%- if tool_call.function is defined %}\n {%- set tool_call = tool_call.function %}\n {%- endif %}\n {{- '\\n\\n{\"name\": \"' }}\n {{- tool_call.name }}\n {{- '\", \"arguments\": ' }}\n {{- tool_call.arguments | tojson }}\n {{- '}\\n' }}\n {%- endfor %}\n {{- '<|im_end|>\\n' }}\n {%- elif message.role == \"tool\" %}\n {%- if (loop.index0 == 0) or (messages[loop.index0 - 1].role != \"tool\") %}\n {{- '<|im_start|>user' }}\n {%- endif %}\n {{- '\\n\\n' }}\n {{- message.content }}\n {{- '\\n' }}\n {%- if loop.last or (messages[loop.index0 + 1].role != \"tool\") %}\n {{- '<|im_end|>\\n' }}\n {%- endif %}\n {%- endif %}\n{%- endfor %}\n{%- if add_generation_prompt %}\n {{- '<|im_start|>assistant\\n' }}\n{%- endif %}\n",
+ "chat_template": "{%- if tools %}\n {{- '<|im_start|>system\\n' }}\n {%- if messages[0]['role'] == 'system' %}\n {{- messages[0]['content'] }}\n {%- else %}\n {{- 'You are a helpful assistant.' }}\n {%- endif %}\n {{- \"\\n\\n# Tools\\n\\nYou may call one or more functions to assist with the user query.\\n\\nYou are provided with function signatures within XML tags:\\n\" }}\n {%- for tool in tools %}\n {{- \"\\n\" }}\n {{- tool | tojson }}\n {%- endfor %}\n {{- \"\\n\\n\\nFor each function call, return a json object with function name and arguments within XML tags:\\n\\n{{\\\"name\\\": , \\\"arguments\\\": }}\\n<|im_end|>\\n\" }}\n{%- else %}\n {%- if messages[0]['role'] == 'system' %}\n {{- '<|im_start|>system\\n' + messages[0]['content'] + '<|im_end|>\\n' }}\n {%- else %}\n {{- '<|im_start|>system\\nYou are a helpful assistant.<|im_end|>\\n' }}\n {%- endif %}\n{%- endif %}\n{%- for message in messages %}\n {%- if (message.role == \"user\") or (message.role == \"system\" and not loop.first) or (message.role == \"assistant\" and not message.tool_calls) %}\n {{- '<|im_start|>' + message.role + '\\n' + message.content + '<|im_end|>' + '\\n' }}\n {%- elif message.role == \"assistant\" %}\n {{- '<|im_start|>' + message.role }}\n {%- if message.content %}\n {{- '\\n' + message.content }}\n {%- endif %}\n {%- for tool_call in message.tool_calls %}\n {%- if tool_call.function is defined %}\n {%- set tool_call = tool_call.function %}\n {%- endif %}\n {{- '\\n\\n{\"name\": \"' }}\n {{- tool_call.name }}\n {{- '\", \"arguments\": ' }}\n {{- tool_call.arguments | tojson }}\n {{- '}\\n' }}\n {%- endfor %}\n {{- '<|im_end|>\\n' }}\n {%- elif message.role == \"tool\" %}\n {%- if (loop.index0 == 0) or (messages[loop.index0 - 1].role != \"tool\") %}\n {{- '<|im_start|>user' }}\n {%- endif %}\n {{- '\\n\\n' }}\n {{- message.content }}\n {{- '\\n' }}\n {%- if loop.last or (messages[loop.index0 + 1].role != \"tool\") %}\n {{- '<|im_end|>\\n' }}\n {%- endif %}\n {%- endif %}\n{%- endfor %}\n{%- if add_generation_prompt %}\n {{- '<|im_start|>assistant\\n' }}\n{%- endif %}\n",
"stop_token_ids": [
151643,
151644,
diff --git a/xinference/model/llm/sglang/core.py b/xinference/model/llm/sglang/core.py
index 621b9b0a59..a413f2ad0f 100644
--- a/xinference/model/llm/sglang/core.py
+++ b/xinference/model/llm/sglang/core.py
@@ -68,6 +68,8 @@ class SGLANGGenerateConfig(TypedDict, total=False):
"llama-3.1",
"mistral-v0.1",
"mixtral-v0.1",
+ "qwen2.5",
+ "qwen2.5-coder",
]
SGLANG_SUPPORTED_CHAT_MODELS = [
"llama-2-chat",
@@ -85,6 +87,8 @@ class SGLANGGenerateConfig(TypedDict, total=False):
"deepseek-v2.5",
"deepseek-v2-chat",
"deepseek-v2-chat-0628",
+ "qwen2.5-instruct",
+ "qwen2.5-coder-instruct",
]
diff --git a/xinference/model/llm/vllm/core.py b/xinference/model/llm/vllm/core.py
index 3aaee0738f..8b28701778 100644
--- a/xinference/model/llm/vllm/core.py
+++ b/xinference/model/llm/vllm/core.py
@@ -138,7 +138,11 @@ class VLLMGenerateConfig(TypedDict, total=False):
VLLM_SUPPORTED_MODELS.append("codeqwen1.5")
VLLM_SUPPORTED_CHAT_MODELS.append("codeqwen1.5-chat")
VLLM_SUPPORTED_CHAT_MODELS.append("qwen2-instruct")
+ VLLM_SUPPORTED_MODELS.append("qwen2.5")
VLLM_SUPPORTED_CHAT_MODELS.append("qwen2.5-instruct")
+ VLLM_SUPPORTED_MODELS.append("qwen2.5-coder")
+ VLLM_SUPPORTED_CHAT_MODELS.append("qwen2.5-coder-instruct")
+
if VLLM_INSTALLED and vllm.__version__ >= "0.3.2":
VLLM_SUPPORTED_CHAT_MODELS.append("gemma-it")