From 56de933d3cc03761898eff66ec83f71b024f83f6 Mon Sep 17 00:00:00 2001 From: amumu96 <128140880+amumu96@users.noreply.github.com> Date: Fri, 13 Sep 2024 11:51:31 +0800 Subject: [PATCH 01/17] FEAT: support deepseek-v2 and 2.5 (#2292) Co-authored-by: wuzhaoxin <15667065080@162.com> --- xinference/model/llm/__init__.py | 6 + xinference/model/llm/llm_family.json | 147 ++++++++ .../model/llm/llm_family_modelscope.json | 153 ++++++++ xinference/model/llm/sglang/core.py | 3 + xinference/model/llm/transformers/core.py | 4 + .../model/llm/transformers/deepseek_v2.py | 340 ++++++++++++++++++ xinference/model/llm/utils.py | 26 ++ xinference/model/llm/vllm/core.py | 6 + 8 files changed, 685 insertions(+) create mode 100644 xinference/model/llm/transformers/deepseek_v2.py diff --git a/xinference/model/llm/__init__.py b/xinference/model/llm/__init__.py index 1980a4b81f..5a7895eb1a 100644 --- a/xinference/model/llm/__init__.py +++ b/xinference/model/llm/__init__.py @@ -136,6 +136,10 @@ def _install(): from .transformers.cogvlm2 import CogVLM2Model from .transformers.cogvlm2_video import CogVLM2VideoModel from .transformers.core import PytorchChatModel, PytorchModel + from .transformers.deepseek_v2 import ( + DeepSeekV2PytorchChatModel, + DeepSeekV2PytorchModel, + ) from .transformers.deepseek_vl import DeepSeekVLChatModel from .transformers.glm4v import Glm4VModel from .transformers.intern_vl import InternVLChatModel @@ -182,6 +186,8 @@ def _install(): MiniCPMV25Model, MiniCPMV26Model, Glm4VModel, + DeepSeekV2PytorchModel, + DeepSeekV2PytorchChatModel, ] ) if OmniLMMModel: # type: ignore diff --git a/xinference/model/llm/llm_family.json b/xinference/model/llm/llm_family.json index 7f428ee005..e997098e65 100644 --- a/xinference/model/llm/llm_family.json +++ b/xinference/model/llm/llm_family.json @@ -6946,5 +6946,152 @@ "", "" ] + }, + { + "version": 1, + "context_length": 128000, + "model_name": "deepseek-v2", + "model_lang": [ + "en", + "zh" + ], + "model_ability": [ + "generate" + ], + "model_description": "DeepSeek-V2, a strong Mixture-of-Experts (MoE) language model characterized by economical training and efficient inference. ", + "model_specs": [ + { + "model_format": "pytorch", + "model_size_in_billions": 16, + "quantizations": [ + "4-bit", + "8-bit", + "none" + ], + "model_id": "deepseek-ai/DeepSeek-V2-Lite", + "model_revision": "604d5664dddd88a0433dbae533b7fe9472482de0" + }, + { + "model_format": "pytorch", + "model_size_in_billions": 236, + "quantizations": [ + "4-bit", + "8-bit", + "none" + ], + "model_id": "deepseek-ai/DeepSeek-V2", + "model_revision": "4461458f186c35188585855f28f77af5661ad489" + } + ] + }, + { + "version": 1, + "context_length": 128000, + "model_name": "deepseek-v2-chat", + "model_lang": [ + "en", + "zh" + ], + "model_ability": [ + "chat" + ], + "model_description": "DeepSeek-V2, a strong Mixture-of-Experts (MoE) language model characterized by economical training and efficient inference. 
", + "model_specs": [ + { + "model_format": "pytorch", + "model_size_in_billions": 16, + "quantizations": [ + "4-bit", + "8-bit", + "none" + ], + "model_id": "deepseek-ai/DeepSeek-V2-Lite-Chat", + "model_revision": "85864749cd611b4353ce1decdb286193298f64c7" + }, + { + "model_format": "pytorch", + "model_size_in_billions": 236, + "quantizations": [ + "4-bit", + "8-bit", + "none" + ], + "model_id": "deepseek-ai/DeepSeek-V2-Chat", + "model_revision": "8e3f5f6c2226787e41ba3e9283a06389d178c926" + } + ], + "chat_template": "{% if not add_generation_prompt is defined %}{% set add_generation_prompt = false %}{% endif %}{{ '<|begin▁of▁sentence|>' }}{% for message in messages %}{% if message['role'] == 'user' %}{{ 'User: ' + message['content'] + '\n\n' }}{% elif message['role'] == 'assistant' %}{{ 'Assistant: ' + message['content'] + '<|end▁of▁sentence|>' }}{% elif message['role'] == 'system' %}{{ message['content'] + '\n\n' }}{% endif %}{% endfor %}{% if add_generation_prompt %}{{ 'Assistant:' }}{% endif %}", + "stop_token_ids": [ + 100001 + ], + "stop": [ + "<|end▁of▁sentence|>" + ] + }, + { + "version": 1, + "context_length": 128000, + "model_name": "deepseek-v2-chat-0628", + "model_lang": [ + "en", + "zh" + ], + "model_ability": [ + "chat" + ], + "model_description": "DeepSeek-V2-Chat-0628 is an improved version of DeepSeek-V2-Chat. ", + "model_specs": [ + { + "model_format": "pytorch", + "model_size_in_billions": 236, + "quantizations": [ + "4-bit", + "8-bit", + "none" + ], + "model_id": "deepseek-ai/DeepSeek-V2-Chat-0628", + "model_revision": "5d09e272c2b223830f4e84359cd9dd047a5d7c78" + } + ], + "chat_template": "{% if not add_generation_prompt is defined %}{% set add_generation_prompt = false %}{% endif %}{{ '<|begin▁of▁sentence|>' }}{% for message in messages %}{% if message['role'] == 'user' %}{{ '<|User|>' + message['content'] }}{% elif message['role'] == 'assistant' %}{{ '<|Assistant|>' + message['content'] + '<|end▁of▁sentence|>' }}{% elif message['role'] == 'system' %}{{ message['content'] + '\n\n' }}{% endif %}{% endfor %}{% if add_generation_prompt %}{{ '<|Assistant|>' }}{% endif %}", + "stop_token_ids": [ + 100001 + ], + "stop": [ + "<|end▁of▁sentence|>" + ] + }, + { + "version": 1, + "context_length": 128000, + "model_name": "deepseek-v2.5", + "model_lang": [ + "en", + "zh" + ], + "model_ability": [ + "chat" + ], + "model_description": "DeepSeek-V2.5 is an upgraded version that combines DeepSeek-V2-Chat and DeepSeek-Coder-V2-Instruct. 
The new model integrates the general and coding abilities of the two previous versions.", + "model_specs": [ + { + "model_format": "pytorch", + "model_size_in_billions": 236, + "quantizations": [ + "4-bit", + "8-bit", + "none" + ], + "model_id": "deepseek-ai/DeepSeek-V2.5", + "model_revision": "24b08cb750e0c2757de112d2e16327cb21ed4833" + } + ], + "chat_template": "{% if not add_generation_prompt is defined %}{% set add_generation_prompt = false %}{% endif %}{% set ns = namespace(is_first=false, is_tool=false, is_output_first=true, system_prompt='') %}{%- for message in messages %} {%- if message['role'] == 'system' %} {% set ns.system_prompt = message['content'] %} {%- endif %}{%- endfor %}{{'<|begin▁of▁sentence|>'}}{{ns.system_prompt}}{%- for message in messages %} {%- if message['role'] == 'user' %} {%- set ns.is_tool = false -%}{{'<|User|>' + message['content']}} {%- endif %} {%- if message['role'] == 'assistant' and message['content'] is none %} {%- set ns.is_tool = false -%} {%- for tool in message['tool_calls']%} {%- if not ns.is_first %}{{'<|Assistant|><|tool▁calls▁begin|><|tool▁call▁begin|>' + tool['type'] + '<|tool▁sep|>' + tool['function']['name'] + '\\n' + '```json' + '\\n' + tool['function']['arguments'] + '\\n' + '```' + '<|tool▁call▁end|>'}} {%- set ns.is_first = true -%} {%- else %}{{'\\n' + '<|tool▁call▁begin|>' + tool['type'] + '<|tool▁sep|>' + tool['function']['name'] + '\\n' + '```json' + '\\n' + tool['function']['arguments'] + '\\n' + '```' + '<|tool▁call▁end|>'}}{{'<|tool▁calls▁end|><|end▁of▁sentence|>'}} {%- endif %} {%- endfor %} {%- endif %} {%- if message['role'] == 'assistant' and message['content'] is not none %} {%- if ns.is_tool %}{{'<|tool▁outputs▁end|>' + message['content'] + '<|end▁of▁sentence|>'}} {%- set ns.is_tool = false -%} {%- else %}{{'<|Assistant|>' + message['content'] + '<|end▁of▁sentence|>'}} {%- endif %} {%- endif %} {%- if message['role'] == 'tool' %} {%- set ns.is_tool = true -%} {%- if ns.is_output_first %}{{'<|tool▁outputs▁begin|><|tool▁output▁begin|>' + message['content'] + '<|tool▁output▁end|>'}} {%- set ns.is_output_first = false %} {%- else %}{{'\\n<|tool▁output▁begin|>' + message['content'] + '<|tool▁output▁end|>'}} {%- endif %} {%- endif %}{%- endfor -%}{% if ns.is_tool %}{{'<|tool▁outputs▁end|>'}}{% endif %}{% if add_generation_prompt and not ns.is_tool %}{{'<|Assistant|>'}}{% endif %}", + "stop_token_ids": [ + 100001 + ], + "stop": [ + "<|end▁of▁sentence|>" + ] } ] diff --git a/xinference/model/llm/llm_family_modelscope.json b/xinference/model/llm/llm_family_modelscope.json index eb24dd8180..f4386e85fa 100644 --- a/xinference/model/llm/llm_family_modelscope.json +++ b/xinference/model/llm/llm_family_modelscope.json @@ -4655,5 +4655,158 @@ "", "" ] + }, + { + "version": 1, + "context_length": 128000, + "model_name": "deepseek-v2", + "model_lang": [ + "en", + "zh" + ], + "model_ability": [ + "chat" + ], + "model_description": "DeepSeek-V2, a strong Mixture-of-Experts (MoE) language model characterized by economical training and efficient inference. 
", + "model_specs": [ + { + "model_format": "pytorch", + "model_size_in_billions": 16, + "quantizations": [ + "4-bit", + "8-bit", + "none" + ], + "model_id": "deepseek-ai/DeepSeek-V2-Lite", + "model_hub": "modelscope", + "model_revision": "master" + }, + { + "model_format": "pytorch", + "model_size_in_billions": 236, + "quantizations": [ + "4-bit", + "8-bit", + "none" + ], + "model_id": "deepseek-ai/DeepSeek-V2", + "model_hub": "modelscope", + "model_revision": "master" + } + ] + }, + { + "version": 1, + "context_length": 128000, + "model_name": "deepseek-v2-chat", + "model_lang": [ + "en", + "zh" + ], + "model_ability": [ + "chat" + ], + "model_description": "DeepSeek-V2, a strong Mixture-of-Experts (MoE) language model characterized by economical training and efficient inference. ", + "model_specs": [ + { + "model_format": "pytorch", + "model_size_in_billions": 16, + "quantizations": [ + "4-bit", + "8-bit", + "none" + ], + "model_id": "deepseek-ai/DeepSeek-V2-Lite-Chat", + "model_hub": "modelscope", + "model_revision": "master" + }, + { + "model_format": "pytorch", + "model_size_in_billions": 236, + "quantizations": [ + "4-bit", + "8-bit", + "none" + ], + "model_id": "deepseek-ai/DeepSeek-V2-Chat", + "model_hub": "modelscope", + "model_revision": "master" + } + ], + "chat_template": "{% if not add_generation_prompt is defined %}{% set add_generation_prompt = false %}{% endif %}{{ '<|begin▁of▁sentence|>' }}{% for message in messages %}{% if message['role'] == 'user' %}{{ 'User: ' + message['content'] + '\n\n' }}{% elif message['role'] == 'assistant' %}{{ 'Assistant: ' + message['content'] + '<|end▁of▁sentence|>' }}{% elif message['role'] == 'system' %}{{ message['content'] + '\n\n' }}{% endif %}{% endfor %}{% if add_generation_prompt %}{{ 'Assistant:' }}{% endif %}", + "stop_token_ids": [ + 100001 + ], + "stop": [ + "<|end▁of▁sentence|>" + ] + }, + { + "version": 1, + "context_length": 128000, + "model_name": "deepseek-v2-chat-0628", + "model_lang": [ + "en", + "zh" + ], + "model_ability": [ + "chat" + ], + "model_description": "DeepSeek-V2-Chat-0628 is an improved version of DeepSeek-V2-Chat. ", + "model_specs": [ + { + "model_format": "pytorch", + "model_size_in_billions": 236, + "quantizations": [ + "4-bit", + "8-bit", + "none" + ], + "model_id": "deepseek-ai/DeepSeek-V2-Chat-0628", + "model_hub": "modelscope", + "model_revision": "master" + } + ], + "chat_template": "{% if not add_generation_prompt is defined %}{% set add_generation_prompt = false %}{% endif %}{{ '<|begin▁of▁sentence|>' }}{% for message in messages %}{% if message['role'] == 'user' %}{{ '<|User|>' + message['content'] }}{% elif message['role'] == 'assistant' %}{{ '<|Assistant|>' + message['content'] + '<|end▁of▁sentence|>' }}{% elif message['role'] == 'system' %}{{ message['content'] + '\n\n' }}{% endif %}{% endfor %}{% if add_generation_prompt %}{{ '<|Assistant|>' }}{% endif %}", + "stop_token_ids": [ + 100001 + ], + "stop": [ + "<|end▁of▁sentence|>" + ] + }, + { + "version": 1, + "context_length": 128000, + "model_name": "deepseek-v2.5", + "model_lang": [ + "en", + "zh" + ], + "model_ability": [ + "chat" + ], + "model_description": "DeepSeek-V2.5 is an upgraded version that combines DeepSeek-V2-Chat and DeepSeek-Coder-V2-Instruct. 
The new model integrates the general and coding abilities of the two previous versions.", + "model_specs": [ + { + "model_format": "pytorch", + "model_size_in_billions": 236, + "quantizations": [ + "4-bit", + "8-bit", + "none" + ], + "model_id": "deepseek-ai/DeepSeek-V2.5", + "model_hub": "modelscope", + "model_revision": "master" + } + ], + "chat_template": "{% if not add_generation_prompt is defined %}{% set add_generation_prompt = false %}{% endif %}{% set ns = namespace(is_first=false, is_tool=false, is_output_first=true, system_prompt='') %}{%- for message in messages %} {%- if message['role'] == 'system' %} {% set ns.system_prompt = message['content'] %} {%- endif %}{%- endfor %}{{'<|begin▁of▁sentence|>'}}{{ns.system_prompt}}{%- for message in messages %} {%- if message['role'] == 'user' %} {%- set ns.is_tool = false -%}{{'<|User|>' + message['content']}} {%- endif %} {%- if message['role'] == 'assistant' and message['content'] is none %} {%- set ns.is_tool = false -%} {%- for tool in message['tool_calls']%} {%- if not ns.is_first %}{{'<|Assistant|><|tool▁calls▁begin|><|tool▁call▁begin|>' + tool['type'] + '<|tool▁sep|>' + tool['function']['name'] + '\\n' + '```json' + '\\n' + tool['function']['arguments'] + '\\n' + '```' + '<|tool▁call▁end|>'}} {%- set ns.is_first = true -%} {%- else %}{{'\\n' + '<|tool▁call▁begin|>' + tool['type'] + '<|tool▁sep|>' + tool['function']['name'] + '\\n' + '```json' + '\\n' + tool['function']['arguments'] + '\\n' + '```' + '<|tool▁call▁end|>'}}{{'<|tool▁calls▁end|><|end▁of▁sentence|>'}} {%- endif %} {%- endfor %} {%- endif %} {%- if message['role'] == 'assistant' and message['content'] is not none %} {%- if ns.is_tool %}{{'<|tool▁outputs▁end|>' + message['content'] + '<|end▁of▁sentence|>'}} {%- set ns.is_tool = false -%} {%- else %}{{'<|Assistant|>' + message['content'] + '<|end▁of▁sentence|>'}} {%- endif %} {%- endif %} {%- if message['role'] == 'tool' %} {%- set ns.is_tool = true -%} {%- if ns.is_output_first %}{{'<|tool▁outputs▁begin|><|tool▁output▁begin|>' + message['content'] + '<|tool▁output▁end|>'}} {%- set ns.is_output_first = false %} {%- else %}{{'\\n<|tool▁output▁begin|>' + message['content'] + '<|tool▁output▁end|>'}} {%- endif %} {%- endif %}{%- endfor -%}{% if ns.is_tool %}{{'<|tool▁outputs▁end|>'}}{% endif %}{% if add_generation_prompt and not ns.is_tool %}{{'<|Assistant|>'}}{% endif %}", + "stop_token_ids": [ + 100001 + ], + "stop": [ + "<|end▁of▁sentence|>" + ] } ] diff --git a/xinference/model/llm/sglang/core.py b/xinference/model/llm/sglang/core.py index 578252324d..621b9b0a59 100644 --- a/xinference/model/llm/sglang/core.py +++ b/xinference/model/llm/sglang/core.py @@ -82,6 +82,9 @@ class SGLANGGenerateConfig(TypedDict, total=False): "mixtral-instruct-v0.1", "gemma-it", "gemma-2-it", + "deepseek-v2.5", + "deepseek-v2-chat", + "deepseek-v2-chat-0628", ] diff --git a/xinference/model/llm/transformers/core.py b/xinference/model/llm/transformers/core.py index 32419a56f1..a451b7accd 100644 --- a/xinference/model/llm/transformers/core.py +++ b/xinference/model/llm/transformers/core.py @@ -65,6 +65,10 @@ "MiniCPM-V-2.6", "glm-4v", "qwen2-vl-instruct", + "deepseek-v2", + "deepseek-v2-chat", + "deepseek-v2.5", + "deepseek-v2-chat-0628", ] diff --git a/xinference/model/llm/transformers/deepseek_v2.py b/xinference/model/llm/transformers/deepseek_v2.py new file mode 100644 index 0000000000..b6ce2b5e04 --- /dev/null +++ b/xinference/model/llm/transformers/deepseek_v2.py @@ -0,0 +1,340 @@ +# Copyright 2022-2023 XProbe Inc. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import logging +import uuid +from typing import Dict, Iterator, List, Optional, Union + +import torch + +from ....types import ( + ChatCompletion, + ChatCompletionChunk, + Completion, + CompletionChunk, + PytorchGenerateConfig, +) +from ..llm_family import LLMFamilyV1, LLMSpecV1 +from ..utils import ( + generate_chat_completion, + generate_completion, + generate_completion_chunk, +) +from .core import PytorchChatModel, PytorchModel + +logger = logging.getLogger(__name__) + + +class DeepSeekV2PytorchModel(PytorchModel): + def _load_model(self, **kwargs): + try: + from transformers import ( + AutoModelForCausalLM, + AutoTokenizer, + GenerationConfig, + ) + except ImportError: + error_message = "Failed to import module 'transformers'" + installation_guide = [ + "Please make sure 'transformers' is installed. ", + "You can install it by `pip install transformers`\n", + ] + + raise ImportError(f"{error_message}\n\n{''.join(installation_guide)}") + + tokenizer = AutoTokenizer.from_pretrained( + self.model_path, + trust_remote_code=kwargs["trust_remote_code"], + ) + model = AutoModelForCausalLM.from_pretrained( + self.model_path, + attn_implementation="eager", + torch_dtype=torch.bfloat16, + trust_remote_code=True, + device_map="auto", + ) + model.generation_config = GenerationConfig.from_pretrained(self.model_path) + model.generation_config.pad_token_id = model.generation_config.eos_token_id + return model, tokenizer + + @classmethod + def match( + cls, llm_family: "LLMFamilyV1", llm_spec: "LLMSpecV1", quantization: str + ) -> bool: + if llm_spec.model_format != "pytorch": + return False + model_family = llm_family.model_family or llm_family.model_name + if "deepseek-v2" not in model_family: + return False + if "generate" not in llm_family.model_ability: + return False + return True + + def generate( + self, prompt: str, generate_config: Optional[PytorchGenerateConfig] = None + ) -> Union[Completion, Iterator[CompletionChunk]]: + input_tensor = self._tokenizer(prompt, return_tensors="pt") + generate_config = self._sanitize_generate_config(generate_config) + default_generate_config = self._model.generation_config + generate_kwargs = { + "input_ids": input_tensor["input_ids"].cuda(), + "attention_mask": input_tensor["attention_mask"].cuda(), + "temperature": float( + generate_config.get("temperature", default_generate_config.temperature) + ), + "repetition_penalty": float(generate_config.get("repetition_penalty", 1.0)), + "top_p": float(generate_config.get("top_p", default_generate_config.top_p)), + "top_k": int(generate_config.get("top_k", -1)), + "max_new_tokens": generate_config.get("max_tokens", 512), + "bos_token_id": default_generate_config.bos_token_id, + "do_sample": default_generate_config.do_sample, + "eos_token_id": default_generate_config.eos_token_id, + } + + stream = generate_config.get("stream", False) + if stream: + return self._generate_stream(generate_kwargs, input_tensor) + else: + return self._generate(generate_kwargs, input_tensor) + 
+ def _generate(self, generate_kwargs, input_ids) -> Completion: + prompt_tokens = len(input_ids[0]) + logger.info(f"generate_kwargs:{generate_kwargs}") + generation_output = self._model.generate(**generate_kwargs) + completion_tokens = len(generation_output[0]) + response = self._tokenizer.decode( + generation_output[0], skip_special_tokens=True + ) + return generate_completion( + self.model_uid, + response, + prompt_tokens=prompt_tokens, + completion_tokens=completion_tokens, + total_tokens=prompt_tokens + completion_tokens, + ) + + def _generate_stream(self, generate_kwargs, input_ids): + from threading import Thread + + from transformers import TextIteratorStreamer + + # Initialize the streamer + streamer = TextIteratorStreamer( + self._tokenizer, skip_prompt=True, skip_special_tokens=True, timeout=10 + ) + # Define the generation configuration + generate_kwargs["streamer"] = streamer + # Start the model chat in a separate thread + thread = Thread( + target=self._model.generate, + kwargs=generate_kwargs, + ) + thread.start() + + completion_id = str(uuid.uuid1()) + prompt_tokens = len(input_ids[0]) + total_tokens, completion_tokens = 0, 0 + # Loop through the streamer to get the new text as it is generated + for i, new_text in enumerate(streamer): + completion_tokens = i + total_tokens = prompt_tokens + completion_tokens + yield generate_completion_chunk( + chunk_text=new_text, + finish_reason=None, + chunk_id=completion_id, + model_uid=self.model_uid, + prompt_tokens=prompt_tokens, + completion_tokens=completion_tokens, + total_tokens=total_tokens, + ) + yield generate_completion_chunk( + chunk_text=None, + finish_reason="stop", + chunk_id=completion_id, + model_uid=self.model_uid, + prompt_tokens=prompt_tokens, + completion_tokens=completion_tokens, + total_tokens=total_tokens, + has_choice=True, + has_content=False, + ) + + +class DeepSeekV2PytorchChatModel(PytorchChatModel): + def _load_model(self, **kwargs): + try: + from transformers import ( + AutoModelForCausalLM, + AutoTokenizer, + GenerationConfig, + ) + except ImportError: + error_message = "Failed to import module 'transformers'" + installation_guide = [ + "Please make sure 'transformers' is installed. 
", + "You can install it by `pip install transformers`\n", + ] + + raise ImportError(f"{error_message}\n\n{''.join(installation_guide)}") + + tokenizer = AutoTokenizer.from_pretrained( + self.model_path, + trust_remote_code=kwargs["trust_remote_code"], + ) + logger.info(f"kwargs:{kwargs}") + model = AutoModelForCausalLM.from_pretrained( + self.model_path, + attn_implementation="eager", + torch_dtype=torch.bfloat16, + trust_remote_code=True, + device_map="auto", + ) + model.generation_config = GenerationConfig.from_pretrained(self.model_path) + model.generation_config.pad_token_id = model.generation_config.eos_token_id + return model, tokenizer + + @classmethod + def match( + cls, llm_family: "LLMFamilyV1", llm_spec: "LLMSpecV1", quantization: str + ) -> bool: + if llm_spec.model_format != "pytorch": + return False + model_family = llm_family.model_family or llm_family.model_name + if "deepseek-v2" not in model_family: + return False + if "chat" not in llm_family.model_ability: + return False + return True + + def chat( + self, + messages: List[Dict], + generate_config: Optional[PytorchGenerateConfig] = None, + ) -> Union[ChatCompletion, Iterator[ChatCompletionChunk]]: + assert self.model_family.chat_template is not None + full_prompt = self.get_full_context( + messages, + self.model_family.chat_template, + tokenizer=self._tokenizer, + ) + input_tensor = self._tokenizer.encode( + full_prompt, + padding=False, + truncation=False, + max_length=None, + add_special_tokens=False, + return_tensors="pt", + ) + + generate_config = self._sanitize_generate_config(generate_config) + default_generate_config = self._model.generation_config + generate_kwargs = { + "input_ids": input_tensor.cuda(), + "temperature": float( + generate_config.get("temperature", default_generate_config.temperature) + ), + "repetition_penalty": float(generate_config.get("repetition_penalty", 1.0)), + "top_p": float(generate_config.get("top_p", default_generate_config.top_p)), + "top_k": int(generate_config.get("top_k", -1)), + "max_new_tokens": generate_config.get("max_tokens", 512), + "bos_token_id": default_generate_config.bos_token_id, + "do_sample": default_generate_config.do_sample, + "eos_token_id": default_generate_config.eos_token_id, + } + + stream = generate_config.get("stream", False) + stream_options = generate_config.get("stream_options", None) + include_usage = ( + stream_options["include_usage"] + if isinstance(stream_options, dict) + else False + ) + if stream: + chunk = self._generate_stream(generate_kwargs, input_tensor, include_usage) + return self._to_chat_completion_chunks(chunk) + else: + return self._generate(generate_kwargs, input_tensor) + + def _generate(self, generate_kwargs, input_ids) -> ChatCompletion: + prompt_tokens = len(input_ids[0]) + generation_output = self._model.generate(**generate_kwargs) + completion_tokens = len(generation_output[0]) + response = self._tokenizer.decode( + generation_output[0][input_ids.shape[1] :], skip_special_tokens=True + ) + return generate_chat_completion( + self.model_uid, + response, + prompt_tokens=prompt_tokens, + completion_tokens=completion_tokens, + total_tokens=prompt_tokens + completion_tokens, + ) + + def _generate_stream(self, generate_kwargs, input_ids, include_usage): + from threading import Thread + + from transformers import TextIteratorStreamer + + # Initialize the streamer + streamer = TextIteratorStreamer( + self._tokenizer, skip_prompt=True, skip_special_tokens=True, timeout=10 + ) + # Define the generation configuration + 
generate_kwargs["streamer"] = streamer + # Start the model chat in a separate thread + thread = Thread( + target=self._model.generate, + kwargs=generate_kwargs, + ) + thread.start() + + completion_id = str(uuid.uuid1()) + prompt_tokens = len(input_ids[0]) + total_tokens, completion_tokens = 0, 0 + # Loop through the streamer to get the new text as it is generated + for i, new_text in enumerate(streamer): + completion_tokens = max(completion_tokens, len(streamer.token_cache)) + total_tokens = prompt_tokens + completion_tokens + yield generate_completion_chunk( + chunk_text=new_text, + finish_reason=None, + chunk_id=completion_id, + model_uid=self.model_uid, + prompt_tokens=prompt_tokens, + completion_tokens=completion_tokens, + total_tokens=total_tokens, + ) + yield generate_completion_chunk( + chunk_text=None, + finish_reason="stop", + chunk_id=completion_id, + model_uid=self.model_uid, + prompt_tokens=prompt_tokens, + completion_tokens=completion_tokens, + total_tokens=total_tokens, + has_choice=True, + has_content=False, + ) + + if include_usage: + yield generate_completion_chunk( + chunk_text=None, + finish_reason=None, + chunk_id=completion_id, + model_uid=self.model_uid, + prompt_tokens=prompt_tokens, + completion_tokens=completion_tokens, + total_tokens=total_tokens, + has_choice=False, + has_content=False, + ) diff --git a/xinference/model/llm/utils.py b/xinference/model/llm/utils.py index 0ae802c01c..c5b26027fb 100644 --- a/xinference/model/llm/utils.py +++ b/xinference/model/llm/utils.py @@ -549,6 +549,32 @@ def generate_completion_chunk( ) +def generate_completion( + model_uid: str, + response: str, + prompt_tokens=-1, + completion_tokens=-1, + total_tokens=-1, + finish_reason="stop", +) -> Completion: + return Completion( + id=str(uuid.uuid1()), + object="text_completion", + created=int(time.time()), + model=model_uid, + choices=[ + CompletionChoice( + text=response, index=0, logprobs=None, finish_reason=finish_reason + ) + ], + usage=CompletionUsage( + prompt_tokens=prompt_tokens, + completion_tokens=completion_tokens, + total_tokens=total_tokens, + ), + ) + + def generate_chat_completion( model_uid: str, response: str, diff --git a/xinference/model/llm/vllm/core.py b/xinference/model/llm/vllm/core.py index 8869f7fb4a..e531769a18 100644 --- a/xinference/model/llm/vllm/core.py +++ b/xinference/model/llm/vllm/core.py @@ -149,6 +149,12 @@ class VLLMGenerateConfig(TypedDict, total=False): VLLM_SUPPORTED_CHAT_MODELS.append("qwen2-moe-instruct") VLLM_SUPPORTED_CHAT_MODELS.append("c4ai-command-r-v01") +if VLLM_INSTALLED and vllm.__version__ >= "0.5.1": + VLLM_SUPPORTED_CHAT_MODELS.append("deepseek-v2-chat") + VLLM_SUPPORTED_CHAT_MODELS.append("deepseek-v2-chat-0628") + VLLM_SUPPORTED_CHAT_MODELS.append("deepseek-v2.5") + + if VLLM_INSTALLED and vllm.__version__ >= "0.5.3": VLLM_SUPPORTED_CHAT_MODELS.append("gemma-2-it") VLLM_SUPPORTED_CHAT_MODELS.append("mistral-nemo-instruct") From 42745077b24a2b517e565235756c5ff317f98f77 Mon Sep 17 00:00:00 2001 From: Poet <42093310+LaureatePoet@users.noreply.github.com> Date: Fri, 13 Sep 2024 11:52:47 +0800 Subject: [PATCH 02/17] FEAT: Update Qwen2-VL-Model to support flash_attention_2 implementation (#2289) Co-authored-by: qinxuye --- xinference/model/llm/transformers/qwen2_vl.py | 36 ++++++++++++++++--- 1 file changed, 31 insertions(+), 5 deletions(-) diff --git a/xinference/model/llm/transformers/qwen2_vl.py b/xinference/model/llm/transformers/qwen2_vl.py index 6b27a05139..3eccc0c736 100644 --- a/xinference/model/llm/transformers/qwen2_vl.py 
+++ b/xinference/model/llm/transformers/qwen2_vl.py @@ -11,7 +11,9 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. +import importlib.util import logging +import sys import uuid from typing import Iterator, List, Optional, Union @@ -59,9 +61,19 @@ def load(self): self.model_path, trust_remote_code=True ) self._tokenizer = self._processor.tokenizer - self._model = Qwen2VLForConditionalGeneration.from_pretrained( - self.model_path, device_map=device, trust_remote_code=True - ).eval() + flash_attn_installed = importlib.util.find_spec("flash_attn") is not None + if flash_attn_installed: + self._model = Qwen2VLForConditionalGeneration.from_pretrained( + self.model_path, + torch_dtype="bfloat16", + device_map=device, + attn_implementation="flash_attention_2", + trust_remote_code=True, + ).eval() + else: + self._model = Qwen2VLForConditionalGeneration.from_pretrained( + self.model_path, device_map=device, trust_remote_code=True + ).eval() def _transform_messages( self, @@ -177,8 +189,18 @@ def _generate_stream( "streamer": streamer, **inputs, } - - thread = Thread(target=self._model.generate, kwargs=gen_kwargs) + error = None + + def model_generate(): + try: + return self._model.generate(**gen_kwargs) + except Exception: + nonlocal error + error = sys.exc_info() + streamer.end() + raise + + thread = Thread(target=model_generate) thread.start() completion_id = str(uuid.uuid1()) @@ -195,6 +217,10 @@ def _generate_stream( has_content=True, ) + if error: + _, err, tb = error # type: ignore + raise err.with_traceback(tb) + yield generate_completion_chunk( chunk_text=None, finish_reason="stop", From 8f73b0550d1a55328fe165c46ada66dee45abf27 Mon Sep 17 00:00:00 2001 From: codingl2k1 <138426806+codingl2k1@users.noreply.github.com> Date: Fri, 13 Sep 2024 06:02:31 +0200 Subject: [PATCH 03/17] ENH: Support fish speech 1.4 (#2295) --- setup.cfg | 2 + xinference/deploy/docker/requirements.txt | 1 + xinference/deploy/docker/requirements_cpu.txt | 1 + xinference/model/audio/fish_speech.py | 14 +- xinference/model/audio/model_spec.json | 6 +- .../model/audio/tests/test_fish_speech.py | 2 +- .../fish_speech/configs/firefly_gan_vq.yaml | 5 +- .../configs/text2semantic_finetune.yaml | 2 +- .../fish_speech/i18n/locale/en_US.json | 2 +- .../fish_speech/i18n/locale/es_ES.json | 2 +- .../fish_speech/i18n/locale/ja_JP.json | 2 +- .../fish_speech/i18n/locale/pt_BR.json | 2 +- .../fish_speech/i18n/locale/zh_CN.json | 2 +- .../fish_speech/models/text2semantic/llama.py | 4 +- .../fish_speech/models/vqgan/__init__.py | 3 - .../fish_speech/models/vqgan/lit_module.py | 442 ------------------ .../models/vqgan/modules/discriminator.py | 44 -- .../models/vqgan/modules/firefly.py | 367 +++++++-------- .../fish_speech/models/vqgan/modules/fsq.py | 31 +- .../models/vqgan/modules/reference.py | 115 ----- .../models/vqgan/modules/wavenet.py | 225 --------- .../fish_speech/fish_speech/text/clean.py | 56 +-- .../fish_speech/fish_speech/text/spliter.py | 4 +- .../fish_speech/fish_speech/train.py | 2 + .../fish_speech/fish_speech/webui/manage.py | 22 +- .../thirdparty/fish_speech/tools/api.py | 213 ++++----- .../fish_speech/tools/auto_rerank.py | 159 ------- .../thirdparty/fish_speech/tools/commons.py | 35 ++ .../fish_speech/tools/download_models.py | 6 +- .../thirdparty/fish_speech/tools/file.py | 17 + .../thirdparty/fish_speech/tools/gen_ref.py | 36 -- .../fish_speech/tools/llama/build_dataset.py | 2 +- 
.../fish_speech/tools/llama/generate.py | 53 ++- .../fish_speech/tools/llama/merge_lora.py | 2 +- .../fish_speech/tools/llama/quantize.py | 4 +- .../fish_speech/tools/merge_asr_files.py | 55 --- .../fish_speech/tools/msgpack_api.py | 34 ++ .../thirdparty/fish_speech/tools/post_api.py | 129 +++-- .../fish_speech/tools/sensevoice/fun_asr.py | 2 +- .../thirdparty/fish_speech/tools/smart_pad.py | 19 +- .../fish_speech/tools/vqgan/extract_vq.py | 4 +- .../fish_speech/tools/vqgan/inference.py | 6 +- .../thirdparty/fish_speech/tools/webui.py | 158 +------ 43 files changed, 544 insertions(+), 1748 deletions(-) delete mode 100644 xinference/thirdparty/fish_speech/fish_speech/models/vqgan/lit_module.py delete mode 100644 xinference/thirdparty/fish_speech/fish_speech/models/vqgan/modules/discriminator.py delete mode 100644 xinference/thirdparty/fish_speech/fish_speech/models/vqgan/modules/reference.py delete mode 100644 xinference/thirdparty/fish_speech/fish_speech/models/vqgan/modules/wavenet.py delete mode 100644 xinference/thirdparty/fish_speech/tools/auto_rerank.py create mode 100644 xinference/thirdparty/fish_speech/tools/commons.py delete mode 100644 xinference/thirdparty/fish_speech/tools/gen_ref.py delete mode 100644 xinference/thirdparty/fish_speech/tools/merge_asr_files.py create mode 100644 xinference/thirdparty/fish_speech/tools/msgpack_api.py diff --git a/setup.cfg b/setup.cfg index e95ba7ca3a..55f5117c14 100644 --- a/setup.cfg +++ b/setup.cfg @@ -127,6 +127,7 @@ all = loguru # For Fish Speech natsort # For Fish Speech loralib # For Fish Speech + ormsgpack # For Fish Speech qwen-vl-utils # For qwen2-vl datamodel_code_generator # for minicpm-4B jsonschema # for minicpm-4B @@ -198,6 +199,7 @@ audio = loguru # For Fish Speech natsort # For Fish Speech loralib # For Fish Speech + ormsgpack # For Fish Speech doc = ipython>=6.5.0 sphinx>=3.0.0 diff --git a/xinference/deploy/docker/requirements.txt b/xinference/deploy/docker/requirements.txt index b5ac62c254..d23d72c3f9 100644 --- a/xinference/deploy/docker/requirements.txt +++ b/xinference/deploy/docker/requirements.txt @@ -70,6 +70,7 @@ jj-pytorchvideo # For CogVLM2-video loguru # For Fish Speech natsort # For Fish Speech loralib # For Fish Speech +ormsgpack # For Fish Speech qwen-vl-utils # For qwen2-vl datamodel_code_generator # for minicpm-4B jsonschema # for minicpm-4B diff --git a/xinference/deploy/docker/requirements_cpu.txt b/xinference/deploy/docker/requirements_cpu.txt index cb1d27dc44..493f558da2 100644 --- a/xinference/deploy/docker/requirements_cpu.txt +++ b/xinference/deploy/docker/requirements_cpu.txt @@ -65,6 +65,7 @@ jj-pytorchvideo # For CogVLM2-video loguru # For Fish Speech natsort # For Fish Speech loralib # For Fish Speech +ormsgpack # For Fish Speech qwen-vl-utils # For qwen2-vl datamodel_code_generator # for minicpm-4B jsonschema # for minicpm-4B diff --git a/xinference/model/audio/fish_speech.py b/xinference/model/audio/fish_speech.py index 96766a7d27..4a6412f04a 100644 --- a/xinference/model/audio/fish_speech.py +++ b/xinference/model/audio/fish_speech.py @@ -92,7 +92,7 @@ def load(self): checkpoint_path = os.path.join( self._model_path, - "firefly-gan-vq-fsq-4x1024-42hz-generator.pth", + "firefly-gan-vq-fsq-8x1024-21hz-generator.pth", ) self._model = load_decoder_model( config_name="firefly_gan_vq", @@ -213,12 +213,12 @@ def speech( text=input, enable_reference_audio=False, reference_audio=None, - reference_text="", - max_new_tokens=0, - chunk_length=100, - top_p=0.7, - repetition_penalty=1.2, - temperature=0.7, + 
reference_text=kwargs.get("reference_text", ""), + max_new_tokens=kwargs.get("max_new_tokens", 1024), + chunk_length=kwargs.get("chunk_length", 200), + top_p=kwargs.get("top_p", 0.7), + repetition_penalty=kwargs.get("repetition_penalty", 1.2), + temperature=kwargs.get("temperature", 0.7), ) ) sample_rate, audio = result[0][1] diff --git a/xinference/model/audio/model_spec.json b/xinference/model/audio/model_spec.json index 6d546a0921..6762d84a18 100644 --- a/xinference/model/audio/model_spec.json +++ b/xinference/model/audio/model_spec.json @@ -148,10 +148,10 @@ "multilingual": true }, { - "model_name": "FishSpeech-1.2-SFT", + "model_name": "FishSpeech-1.4", "model_family": "FishAudio", - "model_id": "fishaudio/fish-speech-1.2-sft", - "model_revision": "180288e21ec5c50cfc564023a22f789e4b88a0e0", + "model_id": "fishaudio/fish-speech-1.4", + "model_revision": "3c49651b8e583b6b13f55e375432e0d57e1aa84d", "model_ability": "text-to-audio", "multilingual": true } diff --git a/xinference/model/audio/tests/test_fish_speech.py b/xinference/model/audio/tests/test_fish_speech.py index 8b339290ad..ce57566b19 100644 --- a/xinference/model/audio/tests/test_fish_speech.py +++ b/xinference/model/audio/tests/test_fish_speech.py @@ -22,7 +22,7 @@ def test_fish_speech(setup): client = Client(endpoint) model_uid = client.launch_model( - model_name="FishSpeech-1.2-SFT", + model_name="FishSpeech-1.4", model_type="audio", ) model = client.get_model(model_uid) diff --git a/xinference/thirdparty/fish_speech/fish_speech/configs/firefly_gan_vq.yaml b/xinference/thirdparty/fish_speech/fish_speech/configs/firefly_gan_vq.yaml index 7417623b03..10aa8d4a52 100644 --- a/xinference/thirdparty/fish_speech/fish_speech/configs/firefly_gan_vq.yaml +++ b/xinference/thirdparty/fish_speech/fish_speech/configs/firefly_gan_vq.yaml @@ -22,13 +22,12 @@ head: resblock_dilation_sizes: [[1, 3, 5], [1, 3, 5], [1, 3, 5]] num_mels: 512 upsample_initial_channel: 512 - use_template: false pre_conv_kernel_size: 13 post_conv_kernel_size: 13 quantizer: _target_: fish_speech.models.vqgan.modules.fsq.DownsampleFiniteScalarQuantize input_dim: 512 - n_groups: 4 + n_groups: 8 n_codebooks: 1 levels: [8, 5, 5, 5] - downsample_factor: [2] + downsample_factor: [2, 2] diff --git a/xinference/thirdparty/fish_speech/fish_speech/configs/text2semantic_finetune.yaml b/xinference/thirdparty/fish_speech/fish_speech/configs/text2semantic_finetune.yaml index 1bf8fd6b6d..f4c1993023 100644 --- a/xinference/thirdparty/fish_speech/fish_speech/configs/text2semantic_finetune.yaml +++ b/xinference/thirdparty/fish_speech/fish_speech/configs/text2semantic_finetune.yaml @@ -4,7 +4,7 @@ defaults: project: text2semantic_finetune_dual_ar max_length: 4096 -pretrained_ckpt_path: checkpoints/fish-speech-1.2-sft +pretrained_ckpt_path: checkpoints/fish-speech-1.4 # Lightning Trainer trainer: diff --git a/xinference/thirdparty/fish_speech/fish_speech/i18n/locale/en_US.json b/xinference/thirdparty/fish_speech/fish_speech/i18n/locale/en_US.json index cf6ad6ca1e..6e280c236e 100644 --- a/xinference/thirdparty/fish_speech/fish_speech/i18n/locale/en_US.json +++ b/xinference/thirdparty/fish_speech/fish_speech/i18n/locale/en_US.json @@ -72,7 +72,7 @@ "Put your text here.": "Put your text here.", "Reference Audio": "Reference Audio", "Reference Text": "Reference Text", - "Related code are released under BSD-3-Clause License, and weights are released under CC BY-NC-SA 4.0 License.": "Related code are released under BSD-3-Clause License, and weights are released under CC BY-NC-SA 4.0 
License.", + "Related code and weights are released under CC BY-NC-SA 4.0 License.": "Related code and weights are released under CC BY-NC-SA 4.0 License.", "Remove Selected Data": "Remove Selected Data", "Removed path successfully!": "Removed path successfully!", "Repetition Penalty": "Repetition Penalty", diff --git a/xinference/thirdparty/fish_speech/fish_speech/i18n/locale/es_ES.json b/xinference/thirdparty/fish_speech/fish_speech/i18n/locale/es_ES.json index 1ea5988213..3285341f68 100644 --- a/xinference/thirdparty/fish_speech/fish_speech/i18n/locale/es_ES.json +++ b/xinference/thirdparty/fish_speech/fish_speech/i18n/locale/es_ES.json @@ -72,7 +72,7 @@ "Put your text here.": "Ponga su texto aquí.", "Reference Audio": "Audio de Referencia", "Reference Text": "Texto de Referencia", - "Related code are released under BSD-3-Clause License, and weights are released under CC BY-NC-SA 4.0 License.": "El código relacionado se publica bajo la Licencia BSD-3-Clause, y los pesos se publican bajo la Licencia CC BY-NC-SA 4.0.", + "Related code and weights are released under CC BY-NC-SA 4.0 License.": "El código relacionado y los pesos se publican bajo la Licencia CC BY-NC-SA 4.0.", "Remove Selected Data": "Eliminar Datos Seleccionados", "Removed path successfully!": "¡Ruta eliminada exitosamente!", "Repetition Penalty": "Penalización por Repetición", diff --git a/xinference/thirdparty/fish_speech/fish_speech/i18n/locale/ja_JP.json b/xinference/thirdparty/fish_speech/fish_speech/i18n/locale/ja_JP.json index e7817eb0c5..d30bac7bcd 100644 --- a/xinference/thirdparty/fish_speech/fish_speech/i18n/locale/ja_JP.json +++ b/xinference/thirdparty/fish_speech/fish_speech/i18n/locale/ja_JP.json @@ -72,7 +72,7 @@ "Put your text here.": "ここにテキストを入力してください。", "Reference Audio": "リファレンスオーディオ", "Reference Text": "リファレンステキスト", - "Related code are released under BSD-3-Clause License, and weights are released under CC BY-NC-SA 4.0 License.": "関連コードはBSD-3-Clauseライセンスの下でリリースされ、重みはCC BY-NC-SA 4.0ライセンスの下でリリースされます。", + "Related code and weights are released under CC BY-NC-SA 4.0 License.": "関連コードと重みはCC BY-NC-SA 4.0ライセンスの下でリリースされます。", "Remove Selected Data": "選択したデータを削除", "Removed path successfully!": "パスの削除に成功しました!", "Repetition Penalty": "反復ペナルティ", diff --git a/xinference/thirdparty/fish_speech/fish_speech/i18n/locale/pt_BR.json b/xinference/thirdparty/fish_speech/fish_speech/i18n/locale/pt_BR.json index c3df431a40..385f20272e 100644 --- a/xinference/thirdparty/fish_speech/fish_speech/i18n/locale/pt_BR.json +++ b/xinference/thirdparty/fish_speech/fish_speech/i18n/locale/pt_BR.json @@ -84,7 +84,7 @@ "Reference Text": "Texto de Referência", "warning": "Aviso", "Pre-processing begins...": "O pré-processamento começou!", - "Related code are released under BSD-3-Clause License, and weights are released under CC BY-NC-SA 4.0 License.": "O código relacionado é licenciado sob a Licença BSD-3-Clause, e os pesos sob a Licença CC BY-NC-SA 4.0.", + "Related code and weights are released under CC BY-NC-SA 4.0 License.": "O código relacionado e os pesos são licenciados sob a Licença CC BY-NC-SA 4.0.", "Remove Selected Data": "Remover Dados Selecionados", "Removed path successfully!": "Caminho removido com sucesso!", "Repetition Penalty": "Penalidade de Repetição", diff --git a/xinference/thirdparty/fish_speech/fish_speech/i18n/locale/zh_CN.json b/xinference/thirdparty/fish_speech/fish_speech/i18n/locale/zh_CN.json index da81eef1cf..3dd1a5cd1c 100644 --- a/xinference/thirdparty/fish_speech/fish_speech/i18n/locale/zh_CN.json +++ 
b/xinference/thirdparty/fish_speech/fish_speech/i18n/locale/zh_CN.json @@ -72,7 +72,7 @@ "Put your text here.": "在此处输入文本.", "Reference Audio": "参考音频", "Reference Text": "参考文本", - "Related code are released under BSD-3-Clause License, and weights are released under CC BY-NC-SA 4.0 License.": "相关代码使用 BSD-3-Clause 许可证发布,权重使用 CC BY-NC-SA 4.0 许可证发布.", + "Related code and weights are released under CC BY-NC-SA 4.0 License.": "相关代码和权重使用 CC BY-NC-SA 4.0 许可证发布.", "Remove Selected Data": "移除选中数据", "Removed path successfully!": "移除路径成功!", "Repetition Penalty": "重复惩罚", diff --git a/xinference/thirdparty/fish_speech/fish_speech/models/text2semantic/llama.py b/xinference/thirdparty/fish_speech/fish_speech/models/text2semantic/llama.py index 4eef92b0ba..0725dfb9b7 100644 --- a/xinference/thirdparty/fish_speech/fish_speech/models/text2semantic/llama.py +++ b/xinference/thirdparty/fish_speech/fish_speech/models/text2semantic/llama.py @@ -353,7 +353,7 @@ def from_pretrained( if "int8" in str(Path(path)): logger.info("Using int8 weight-only quantization!") - from ...tools.llama.quantize import WeightOnlyInt8QuantHandler + from tools.llama.quantize import WeightOnlyInt8QuantHandler simple_quantizer = WeightOnlyInt8QuantHandler(model) model = simple_quantizer.convert_for_runtime() @@ -363,7 +363,7 @@ def from_pretrained( path_comps = path.name.split("-") assert path_comps[-2].startswith("g") groupsize = int(path_comps[-2][1:]) - from ...tools.llama.quantize import WeightOnlyInt4QuantHandler + from tools.llama.quantize import WeightOnlyInt4QuantHandler simple_quantizer = WeightOnlyInt4QuantHandler(model, groupsize) model = simple_quantizer.convert_for_runtime() diff --git a/xinference/thirdparty/fish_speech/fish_speech/models/vqgan/__init__.py b/xinference/thirdparty/fish_speech/fish_speech/models/vqgan/__init__.py index 401c6df468..e69de29bb2 100644 --- a/xinference/thirdparty/fish_speech/fish_speech/models/vqgan/__init__.py +++ b/xinference/thirdparty/fish_speech/fish_speech/models/vqgan/__init__.py @@ -1,3 +0,0 @@ -from .lit_module import VQGAN - -__all__ = ["VQGAN"] diff --git a/xinference/thirdparty/fish_speech/fish_speech/models/vqgan/lit_module.py b/xinference/thirdparty/fish_speech/fish_speech/models/vqgan/lit_module.py deleted file mode 100644 index d5fa2ccabb..0000000000 --- a/xinference/thirdparty/fish_speech/fish_speech/models/vqgan/lit_module.py +++ /dev/null @@ -1,442 +0,0 @@ -import itertools -import math -from typing import Any, Callable - -import lightning as L -import torch -import torch.nn.functional as F -# import wandb -from lightning.pytorch.loggers import TensorBoardLogger, WandbLogger -from matplotlib import pyplot as plt -from torch import nn - -from fish_speech.models.vqgan.modules.discriminator import Discriminator -from fish_speech.models.vqgan.modules.wavenet import WaveNet -from fish_speech.models.vqgan.utils import avg_with_mask, plot_mel, sequence_mask - - -class VQGAN(L.LightningModule): - def __init__( - self, - optimizer: Callable, - lr_scheduler: Callable, - encoder: WaveNet, - quantizer: nn.Module, - decoder: WaveNet, - discriminator: Discriminator, - vocoder: nn.Module, - encode_mel_transform: nn.Module, - gt_mel_transform: nn.Module, - weight_adv: float = 1.0, - weight_vq: float = 1.0, - weight_mel: float = 1.0, - sampling_rate: int = 44100, - freeze_encoder: bool = False, - ): - super().__init__() - - # Model parameters - self.optimizer_builder = optimizer - self.lr_scheduler_builder = lr_scheduler - - # Modules - self.encoder = encoder - self.quantizer = quantizer - 
self.decoder = decoder - self.vocoder = vocoder - self.discriminator = discriminator - self.encode_mel_transform = encode_mel_transform - self.gt_mel_transform = gt_mel_transform - - # A simple linear layer to project quality to condition channels - self.quality_projection = nn.Linear(1, 768) - - # Freeze vocoder - for param in self.vocoder.parameters(): - param.requires_grad = False - - # Loss weights - self.weight_adv = weight_adv - self.weight_vq = weight_vq - self.weight_mel = weight_mel - - # Other parameters - self.sampling_rate = sampling_rate - - # Disable strict loading - self.strict_loading = False - - # If encoder is frozen - if freeze_encoder: - for param in self.encoder.parameters(): - param.requires_grad = False - - for param in self.quantizer.parameters(): - param.requires_grad = False - - self.automatic_optimization = False - - def on_save_checkpoint(self, checkpoint): - # Do not save vocoder - state_dict = checkpoint["state_dict"] - for name in list(state_dict.keys()): - if "vocoder" in name: - state_dict.pop(name) - - def configure_optimizers(self): - optimizer_generator = self.optimizer_builder( - itertools.chain( - self.encoder.parameters(), - self.quantizer.parameters(), - self.decoder.parameters(), - self.quality_projection.parameters(), - ) - ) - optimizer_discriminator = self.optimizer_builder( - self.discriminator.parameters() - ) - - lr_scheduler_generator = self.lr_scheduler_builder(optimizer_generator) - lr_scheduler_discriminator = self.lr_scheduler_builder(optimizer_discriminator) - - return ( - { - "optimizer": optimizer_generator, - "lr_scheduler": { - "scheduler": lr_scheduler_generator, - "interval": "step", - "name": "optimizer/generator", - }, - }, - { - "optimizer": optimizer_discriminator, - "lr_scheduler": { - "scheduler": lr_scheduler_discriminator, - "interval": "step", - "name": "optimizer/discriminator", - }, - }, - ) - - def training_step(self, batch, batch_idx): - optim_g, optim_d = self.optimizers() - - audios, audio_lengths = batch["audios"], batch["audio_lengths"] - - audios = audios.float() - audios = audios[:, None, :] - - with torch.no_grad(): - encoded_mels = self.encode_mel_transform(audios) - gt_mels = self.gt_mel_transform(audios) - quality = ((gt_mels.mean(-1) > -8).sum(-1) - 90) / 10 - quality = quality.unsqueeze(-1) - - mel_lengths = audio_lengths // self.gt_mel_transform.hop_length - mel_masks = sequence_mask(mel_lengths, gt_mels.shape[2]) - mel_masks_float_conv = mel_masks[:, None, :].float() - gt_mels = gt_mels * mel_masks_float_conv - encoded_mels = encoded_mels * mel_masks_float_conv - - # Encode - encoded_features = self.encoder(encoded_mels) * mel_masks_float_conv - - # Quantize - vq_result = self.quantizer(encoded_features) - loss_vq = getattr("vq_result", "loss", 0.0) - vq_recon_features = vq_result.z * mel_masks_float_conv - vq_recon_features = ( - vq_recon_features + self.quality_projection(quality)[:, :, None] - ) - - # VQ Decode - gen_mel = ( - self.decoder( - torch.randn_like(vq_recon_features) * mel_masks_float_conv, - condition=vq_recon_features, - ) - * mel_masks_float_conv - ) - - # Discriminator - real_logits = self.discriminator(gt_mels) - fake_logits = self.discriminator(gen_mel.detach()) - d_mask = F.interpolate( - mel_masks_float_conv, size=(real_logits.shape[2],), mode="nearest" - ) - - loss_real = avg_with_mask((real_logits - 1) ** 2, d_mask) - loss_fake = avg_with_mask(fake_logits**2, d_mask) - - loss_d = loss_real + loss_fake - - self.log( - "train/discriminator/loss", - loss_d, - on_step=True, - 
on_epoch=False, - prog_bar=True, - logger=True, - ) - - # Discriminator backward - optim_d.zero_grad() - self.manual_backward(loss_d) - self.clip_gradients( - optim_d, gradient_clip_val=1000.0, gradient_clip_algorithm="norm" - ) - optim_d.step() - - # Mel Loss, applying l1, using a weighted sum - mel_distance = ( - gen_mel - gt_mels - ).abs() # * 0.5 + self.ssim(gen_mel, gt_mels) * 0.5 - loss_mel_low_freq = avg_with_mask(mel_distance[:, :40, :], mel_masks_float_conv) - loss_mel_mid_freq = avg_with_mask( - mel_distance[:, 40:70, :], mel_masks_float_conv - ) - loss_mel_high_freq = avg_with_mask( - mel_distance[:, 70:, :], mel_masks_float_conv - ) - loss_mel = ( - loss_mel_low_freq * 0.6 + loss_mel_mid_freq * 0.3 + loss_mel_high_freq * 0.1 - ) - - # Adversarial Loss - fake_logits = self.discriminator(gen_mel) - loss_adv = avg_with_mask((fake_logits - 1) ** 2, d_mask) - - # Total loss - loss = ( - self.weight_vq * loss_vq - + self.weight_mel * loss_mel - + self.weight_adv * loss_adv - ) - - # Log losses - self.log( - "train/generator/loss", - loss, - on_step=True, - on_epoch=False, - prog_bar=True, - logger=True, - ) - self.log( - "train/generator/loss_vq", - loss_vq, - on_step=True, - on_epoch=False, - prog_bar=False, - logger=True, - ) - self.log( - "train/generator/loss_mel", - loss_mel, - on_step=True, - on_epoch=False, - prog_bar=False, - logger=True, - ) - self.log( - "train/generator/loss_adv", - loss_adv, - on_step=True, - on_epoch=False, - prog_bar=False, - logger=True, - ) - - # Generator backward - optim_g.zero_grad() - self.manual_backward(loss) - self.clip_gradients( - optim_g, gradient_clip_val=1000.0, gradient_clip_algorithm="norm" - ) - optim_g.step() - - scheduler_g, scheduler_d = self.lr_schedulers() - scheduler_g.step() - scheduler_d.step() - - def validation_step(self, batch: Any, batch_idx: int): - audios, audio_lengths = batch["audios"], batch["audio_lengths"] - - audios = audios.float() - audios = audios[:, None, :] - - encoded_mels = self.encode_mel_transform(audios) - gt_mels = self.gt_mel_transform(audios) - - mel_lengths = audio_lengths // self.gt_mel_transform.hop_length - mel_masks = sequence_mask(mel_lengths, gt_mels.shape[2]) - mel_masks_float_conv = mel_masks[:, None, :].float() - gt_mels = gt_mels * mel_masks_float_conv - encoded_mels = encoded_mels * mel_masks_float_conv - - # Encode - encoded_features = self.encoder(encoded_mels) * mel_masks_float_conv - - # Quantize - vq_recon_features = self.quantizer(encoded_features).z * mel_masks_float_conv - vq_recon_features = ( - vq_recon_features - + self.quality_projection( - torch.ones( - vq_recon_features.shape[0], 1, device=vq_recon_features.device - ) - * 2 - )[:, :, None] - ) - - # VQ Decode - gen_aux_mels = ( - self.decoder( - torch.randn_like(vq_recon_features) * mel_masks_float_conv, - condition=vq_recon_features, - ) - * mel_masks_float_conv - ) - loss_mel = avg_with_mask((gen_aux_mels - gt_mels).abs(), mel_masks_float_conv) - - self.log( - "val/loss_mel", - loss_mel, - on_step=False, - on_epoch=True, - prog_bar=False, - logger=True, - sync_dist=True, - ) - - recon_audios = self.vocoder(gt_mels) - gen_aux_audios = self.vocoder(gen_aux_mels) - - # only log the first batch - if batch_idx != 0: - return - - for idx, ( - gt_mel, - gen_aux_mel, - audio, - gen_aux_audio, - recon_audio, - audio_len, - ) in enumerate( - zip( - gt_mels, - gen_aux_mels, - audios.cpu().float(), - gen_aux_audios.cpu().float(), - recon_audios.cpu().float(), - audio_lengths, - ) - ): - if idx > 4: - break - - mel_len = audio_len // 
self.gt_mel_transform.hop_length - - image_mels = plot_mel( - [ - gt_mel[:, :mel_len], - gen_aux_mel[:, :mel_len], - ], - [ - "Ground-Truth", - "Auxiliary", - ], - ) - - if isinstance(self.logger, WandbLogger): - self.logger.experiment.log( - { - "reconstruction_mel": wandb.Image(image_mels, caption="mels"), - "wavs": [ - wandb.Audio( - audio[0, :audio_len], - sample_rate=self.sampling_rate, - caption="gt", - ), - wandb.Audio( - gen_aux_audio[0, :audio_len], - sample_rate=self.sampling_rate, - caption="aux", - ), - wandb.Audio( - recon_audio[0, :audio_len], - sample_rate=self.sampling_rate, - caption="recon", - ), - ], - }, - ) - - if isinstance(self.logger, TensorBoardLogger): - self.logger.experiment.add_figure( - f"sample-{idx}/mels", - image_mels, - global_step=self.global_step, - ) - self.logger.experiment.add_audio( - f"sample-{idx}/wavs/gt", - audio[0, :audio_len], - self.global_step, - sample_rate=self.sampling_rate, - ) - self.logger.experiment.add_audio( - f"sample-{idx}/wavs/gen", - gen_aux_audio[0, :audio_len], - self.global_step, - sample_rate=self.sampling_rate, - ) - self.logger.experiment.add_audio( - f"sample-{idx}/wavs/recon", - recon_audio[0, :audio_len], - self.global_step, - sample_rate=self.sampling_rate, - ) - - plt.close(image_mels) - - def encode(self, audios, audio_lengths): - audios = audios.float() - - mels = self.encode_mel_transform(audios) - mel_lengths = audio_lengths // self.encode_mel_transform.hop_length - mel_masks = sequence_mask(mel_lengths, mels.shape[2]) - mel_masks_float_conv = mel_masks[:, None, :].float() - mels = mels * mel_masks_float_conv - - # Encode - encoded_features = self.encoder(mels) * mel_masks_float_conv - feature_lengths = mel_lengths // math.prod(self.quantizer.downsample_factor) - - return self.quantizer.encode(encoded_features), feature_lengths - - def decode(self, indices, feature_lengths, return_audios=False): - factor = math.prod(self.quantizer.downsample_factor) - mel_masks = sequence_mask(feature_lengths * factor, indices.shape[2] * factor) - mel_masks_float_conv = mel_masks[:, None, :].float() - - z = self.quantizer.decode(indices) * mel_masks_float_conv - z = ( - z - + self.quality_projection(torch.ones(z.shape[0], 1, device=z.device) * 2)[ - :, :, None - ] - ) - - gen_mel = ( - self.decoder( - torch.randn_like(z) * mel_masks_float_conv, - condition=z, - ) - * mel_masks_float_conv - ) - - if return_audios: - return self.vocoder(gen_mel) - - return gen_mel diff --git a/xinference/thirdparty/fish_speech/fish_speech/models/vqgan/modules/discriminator.py b/xinference/thirdparty/fish_speech/fish_speech/models/vqgan/modules/discriminator.py deleted file mode 100644 index 69c7df4103..0000000000 --- a/xinference/thirdparty/fish_speech/fish_speech/models/vqgan/modules/discriminator.py +++ /dev/null @@ -1,44 +0,0 @@ -import torch -from torch import nn -from torch.nn.utils.parametrizations import weight_norm - - -class Discriminator(nn.Module): - def __init__(self): - super().__init__() - - blocks = [] - convs = [ - (1, 64, (3, 9), 1, (1, 4)), - (64, 128, (3, 9), (1, 2), (1, 4)), - (128, 256, (3, 9), (1, 2), (1, 4)), - (256, 512, (3, 9), (1, 2), (1, 4)), - (512, 1024, (3, 3), 1, (1, 1)), - (1024, 1, (3, 3), 1, (1, 1)), - ] - - for idx, (in_channels, out_channels, kernel_size, stride, padding) in enumerate( - convs - ): - blocks.append( - weight_norm( - nn.Conv2d(in_channels, out_channels, kernel_size, stride, padding) - ) - ) - - if idx != len(convs) - 1: - blocks.append(nn.SiLU(inplace=True)) - - self.blocks = nn.Sequential(*blocks) - 
- def forward(self, x): - return self.blocks(x[:, None])[:, 0] - - -if __name__ == "__main__": - model = Discriminator() - print(sum(p.numel() for p in model.parameters()) / 1_000_000) - x = torch.randn(1, 128, 1024) - y = model(x) - print(y.shape) - print(y) diff --git a/xinference/thirdparty/fish_speech/fish_speech/models/vqgan/modules/firefly.py b/xinference/thirdparty/fish_speech/fish_speech/models/vqgan/modules/firefly.py index 4ca0ff5882..aa21839b54 100644 --- a/xinference/thirdparty/fish_speech/fish_speech/models/vqgan/modules/firefly.py +++ b/xinference/thirdparty/fish_speech/fish_speech/models/vqgan/modules/firefly.py @@ -1,25 +1,26 @@ -# A inference only version of the FireflyGAN model - import math from functools import partial from math import prod from typing import Callable -import numpy as np import torch import torch.nn.functional as F from torch import nn -from torch.nn import Conv1d from torch.nn.utils.parametrizations import weight_norm from torch.nn.utils.parametrize import remove_parametrizations from torch.utils.checkpoint import checkpoint -from fish_speech.models.vqgan.utils import sequence_mask + +def sequence_mask(length, max_length=None): + if max_length is None: + max_length = length.max() + x = torch.arange(max_length, dtype=length.dtype, device=length.device) + return x.unsqueeze(0) < length.unsqueeze(1) def init_weights(m, mean=0.0, std=0.01): classname = m.__class__.__name__ - if classname.find("Conv") != -1: + if classname.find("Conv1D") != -1: m.weight.data.normal_(mean, std) @@ -27,78 +28,141 @@ def get_padding(kernel_size, dilation=1): return (kernel_size * dilation - dilation) // 2 +def unpad1d(x: torch.Tensor, paddings: tuple[int, int]): + """Remove padding from x, handling properly zero padding. Only for 1d!""" + padding_left, padding_right = paddings + assert padding_left >= 0 and padding_right >= 0, (padding_left, padding_right) + assert (padding_left + padding_right) <= x.shape[-1] + end = x.shape[-1] - padding_right + return x[..., padding_left:end] + + +def get_extra_padding_for_conv1d( + x: torch.Tensor, kernel_size: int, stride: int, padding_total: int = 0 +) -> int: + """See `pad_for_conv1d`.""" + length = x.shape[-1] + n_frames = (length - kernel_size + padding_total) / stride + 1 + ideal_length = (math.ceil(n_frames) - 1) * stride + (kernel_size - padding_total) + return ideal_length - length + + +def pad1d( + x: torch.Tensor, + paddings: tuple[int, int], + mode: str = "zeros", + value: float = 0.0, +): + """Tiny wrapper around F.pad, just to allow for reflect padding on small input. + If this is the case, we insert extra 0 padding to the right + before the reflection happen. 
+ """ + length = x.shape[-1] + padding_left, padding_right = paddings + assert padding_left >= 0 and padding_right >= 0, (padding_left, padding_right) + if mode == "reflect": + max_pad = max(padding_left, padding_right) + extra_pad = 0 + if length <= max_pad: + extra_pad = max_pad - length + 1 + x = F.pad(x, (0, extra_pad)) + padded = F.pad(x, paddings, mode, value) + end = padded.shape[-1] - extra_pad + return padded[..., :end] + else: + return F.pad(x, paddings, mode, value) + + +class FishConvNet(nn.Module): + def __init__( + self, in_channels, out_channels, kernel_size, dilation=1, stride=1, groups=1 + ): + super(FishConvNet, self).__init__() + self.conv = nn.Conv1d( + in_channels, + out_channels, + kernel_size, + stride=stride, + dilation=dilation, + groups=groups, + ) + self.stride = stride + self.kernel_size = (kernel_size - 1) * dilation + 1 + self.dilation = dilation + + def forward(self, x): + pad = self.kernel_size - self.stride + extra_padding = get_extra_padding_for_conv1d( + x, self.kernel_size, self.stride, pad + ) + x = pad1d(x, (pad, extra_padding), mode="constant", value=0) + return self.conv(x).contiguous() + + def weight_norm(self, name="weight", dim=0): + self.conv = weight_norm(self.conv, name=name, dim=dim) + return self + + def remove_weight_norm(self): + self.conv = remove_parametrizations(self.conv) + return self + + +class FishTransConvNet(nn.Module): + def __init__(self, in_channels, out_channels, kernel_size, dilation=1, stride=1): + super(FishTransConvNet, self).__init__() + self.conv = nn.ConvTranspose1d( + in_channels, out_channels, kernel_size, stride=stride, dilation=dilation + ) + self.stride = stride + self.kernel_size = kernel_size + + def forward(self, x): + x = self.conv(x) + pad = self.kernel_size - self.stride + padding_right = math.ceil(pad) + padding_left = pad - padding_right + x = unpad1d(x, (padding_left, padding_right)) + return x.contiguous() + + def weight_norm(self, name="weight", dim=0): + self.conv = weight_norm(self.conv, name=name, dim=dim) + return self + + def remove_weight_norm(self): + self.conv = remove_parametrizations(self.conv) + return self + + class ResBlock1(torch.nn.Module): def __init__(self, channels, kernel_size=3, dilation=(1, 3, 5)): super().__init__() self.convs1 = nn.ModuleList( [ - weight_norm( - Conv1d( - channels, - channels, - kernel_size, - 1, - dilation=dilation[0], - padding=get_padding(kernel_size, dilation[0]), - ) - ), - weight_norm( - Conv1d( - channels, - channels, - kernel_size, - 1, - dilation=dilation[1], - padding=get_padding(kernel_size, dilation[1]), - ) - ), - weight_norm( - Conv1d( - channels, - channels, - kernel_size, - 1, - dilation=dilation[2], - padding=get_padding(kernel_size, dilation[2]), - ) - ), + FishConvNet( + channels, channels, kernel_size, stride=1, dilation=dilation[0] + ).weight_norm(), + FishConvNet( + channels, channels, kernel_size, stride=1, dilation=dilation[1] + ).weight_norm(), + FishConvNet( + channels, channels, kernel_size, stride=1, dilation=dilation[2] + ).weight_norm(), ] ) self.convs1.apply(init_weights) self.convs2 = nn.ModuleList( [ - weight_norm( - Conv1d( - channels, - channels, - kernel_size, - 1, - dilation=1, - padding=get_padding(kernel_size, 1), - ) - ), - weight_norm( - Conv1d( - channels, - channels, - kernel_size, - 1, - dilation=1, - padding=get_padding(kernel_size, 1), - ) - ), - weight_norm( - Conv1d( - channels, - channels, - kernel_size, - 1, - dilation=1, - padding=get_padding(kernel_size, 1), - ) - ), + FishConvNet( + channels, channels, 
kernel_size, stride=1, dilation=dilation[0] + ).weight_norm(), + FishConvNet( + channels, channels, kernel_size, stride=1, dilation=dilation[1] + ).weight_norm(), + FishConvNet( + channels, channels, kernel_size, stride=1, dilation=dilation[2] + ).weight_norm(), ] ) self.convs2.apply(init_weights) @@ -119,7 +183,7 @@ def remove_parametrizations(self): remove_parametrizations(conv, tensor_name="weight") -class ParralelBlock(nn.Module): +class ParallelBlock(nn.Module): def __init__( self, channels: int, @@ -153,7 +217,6 @@ def __init__( resblock_dilation_sizes: tuple[tuple[int]] = ((1, 3, 5), (1, 3, 5), (1, 3, 5)), num_mels: int = 128, upsample_initial_channel: int = 512, - use_template: bool = True, pre_conv_kernel_size: int = 7, post_conv_kernel_size: int = 7, post_activation: Callable = partial(nn.SiLU, inplace=True), @@ -164,85 +227,51 @@ def __init__( prod(upsample_rates) == hop_length ), f"hop_length must be {prod(upsample_rates)}" - self.conv_pre = weight_norm( - nn.Conv1d( - num_mels, - upsample_initial_channel, - pre_conv_kernel_size, - 1, - padding=get_padding(pre_conv_kernel_size), - ) - ) + self.conv_pre = FishConvNet( + num_mels, + upsample_initial_channel, + pre_conv_kernel_size, + stride=1, + ).weight_norm() self.num_upsamples = len(upsample_rates) self.num_kernels = len(resblock_kernel_sizes) self.noise_convs = nn.ModuleList() - self.use_template = use_template self.ups = nn.ModuleList() for i, (u, k) in enumerate(zip(upsample_rates, upsample_kernel_sizes)): - c_cur = upsample_initial_channel // (2 ** (i + 1)) self.ups.append( - weight_norm( - nn.ConvTranspose1d( - upsample_initial_channel // (2**i), - upsample_initial_channel // (2 ** (i + 1)), - k, - u, - padding=(k - u) // 2, - ) - ) + FishTransConvNet( + upsample_initial_channel // (2**i), + upsample_initial_channel // (2 ** (i + 1)), + k, + stride=u, + ).weight_norm() ) - if not use_template: - continue - - if i + 1 < len(upsample_rates): - stride_f0 = np.prod(upsample_rates[i + 1 :]) - self.noise_convs.append( - Conv1d( - 1, - c_cur, - kernel_size=stride_f0 * 2, - stride=stride_f0, - padding=stride_f0 // 2, - ) - ) - else: - self.noise_convs.append(Conv1d(1, c_cur, kernel_size=1)) - self.resblocks = nn.ModuleList() for i in range(len(self.ups)): ch = upsample_initial_channel // (2 ** (i + 1)) self.resblocks.append( - ParralelBlock(ch, resblock_kernel_sizes, resblock_dilation_sizes) + ParallelBlock(ch, resblock_kernel_sizes, resblock_dilation_sizes) ) self.activation_post = post_activation() - self.conv_post = weight_norm( - nn.Conv1d( - ch, - 1, - post_conv_kernel_size, - 1, - padding=get_padding(post_conv_kernel_size), - ) - ) + self.conv_post = FishConvNet( + ch, 1, post_conv_kernel_size, stride=1 + ).weight_norm() self.ups.apply(init_weights) self.conv_post.apply(init_weights) - def forward(self, x, template=None): + def forward(self, x): x = self.conv_pre(x) for i in range(self.num_upsamples): x = F.silu(x, inplace=True) x = self.ups[i](x) - if self.use_template: - x = x + self.noise_convs[i](template) - - if self.training: + if self.training and self.checkpointing: x = checkpoint( self.resblocks[i], x, @@ -364,11 +393,11 @@ def __init__( ): super().__init__() - self.dwconv = nn.Conv1d( + self.dwconv = FishConvNet( dim, dim, kernel_size=kernel_size, - padding=int(dilation * (kernel_size - 1) / 2), + # padding=int(dilation * (kernel_size - 1) / 2), groups=dim, ) # depthwise conv self.norm = LayerNorm(dim, eps=1e-6) @@ -421,12 +450,13 @@ def __init__( self.downsample_layers = nn.ModuleList() stem = nn.Sequential( - 
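# Editorial sketch, not part of the patch: the invariant asserted in
# HiFiGANGenerator.__init__ above -- the upsample rates must multiply out to the
# mel hop length, so each mel frame expands to exactly hop_length audio samples.
# The (8, 8, 2, 2, 2) / 512 pairing is the combination that appears elsewhere in
# this file and is used here only as an illustrative assumption.
from math import prod

upsample_rates = (8, 8, 2, 2, 2)
hop_length = 512
assert prod(upsample_rates) == hop_length  # 8 * 8 * 2 * 2 * 2 == 512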
nn.Conv1d( + FishConvNet( input_channels, dims[0], - kernel_size=kernel_size, - padding=kernel_size // 2, - padding_mode="zeros", + kernel_size=7, + # padding=3, + # padding_mode="replicate", + # padding_mode="zeros", ), LayerNorm(dims[0], eps=1e-6, data_format="channels_first"), ) @@ -491,6 +521,7 @@ def __init__( self.head = head self.quantizer = quantizer self.spec_transform = spec_transform + self.downsample_factor = math.prod(self.quantizer.downsample_factor) def forward(self, x: torch.Tensor, template=None, mask=None) -> torch.Tensor: if self.spec_transform is not None: @@ -512,7 +543,7 @@ def forward(self, x: torch.Tensor, template=None, mask=None) -> torch.Tensor: if x.ndim == 2: x = x[:, None, :] - if self.quantizer is not None: + if self.vq is not None: return x, vq_result return x @@ -528,25 +559,30 @@ def encode(self, audios, audio_lengths): # Encode encoded_features = self.backbone(mels) * mel_masks_float_conv - feature_lengths = mel_lengths // math.prod(self.quantizer.downsample_factor) + feature_lengths = mel_lengths // self.downsample_factor return self.quantizer.encode(encoded_features), feature_lengths def decode(self, indices, feature_lengths) -> torch.Tensor: - factor = math.prod(self.quantizer.downsample_factor) - mel_masks = sequence_mask(feature_lengths * factor, indices.shape[2] * factor) + mel_masks = sequence_mask( + feature_lengths * self.downsample_factor, + indices.shape[2] * self.downsample_factor, + ) mel_masks_float_conv = mel_masks[:, None, :].float() + audio_lengths = ( + feature_lengths * self.downsample_factor * self.spec_transform.hop_length + ) audio_masks = sequence_mask( - feature_lengths * factor * self.spec_transform.hop_length, - indices.shape[2] * factor * self.spec_transform.hop_length, + audio_lengths, + indices.shape[2] * self.downsample_factor * self.spec_transform.hop_length, ) audio_masks_float_conv = audio_masks[:, None, :].float() z = self.quantizer.decode(indices) * mel_masks_float_conv x = self.head(z) * audio_masks_float_conv - return x + return x, audio_lengths def remove_parametrizations(self): if hasattr(self.backbone, "remove_parametrizations"): @@ -558,68 +594,3 @@ def remove_parametrizations(self): @property def device(self): return next(self.parameters()).device - - -class FireflyBase(nn.Module): - def __init__(self, ckpt_path: str = None, pretrained: bool = True): - super().__init__() - - self.backbone = ConvNeXtEncoder( - input_channels=128, - depths=[3, 3, 9, 3], - dims=[128, 256, 384, 512], - drop_path_rate=0.2, - kernel_size=7, - ) - - self.head = HiFiGANGenerator( - hop_length=512, - upsample_rates=[8, 8, 2, 2, 2], - upsample_kernel_sizes=[16, 16, 4, 4, 4], - resblock_kernel_sizes=[3, 7, 11], - resblock_dilation_sizes=[[1, 3, 5], [1, 3, 5], [1, 3, 5]], - num_mels=512, - upsample_initial_channel=512, - use_template=False, - pre_conv_kernel_size=13, - post_conv_kernel_size=13, - ) - - if ckpt_path is not None: - state_dict = torch.load(ckpt_path, map_location="cpu") - elif pretrained: - state_dict = torch.hub.load_state_dict_from_url( - "https://github.com/fishaudio/vocoder/releases/download/1.0.0/firefly-gan-base-generator.ckpt", - map_location="cpu", - model_dir="checkpoints", - ) - - if "state_dict" in state_dict: - state_dict = state_dict["state_dict"] - - if any("generator." in k for k in state_dict): - state_dict = { - k.replace("generator.", ""): v - for k, v in state_dict.items() - if "generator." 
in k - } - - self.load_state_dict(state_dict, strict=True) - self.head.remove_parametrizations() - - @torch.no_grad() - def forward(self, x: torch.Tensor) -> torch.Tensor: - x = self.backbone(x) - x = self.head(x) - if x.ndim == 2: - x = x[:, None, :] - return x - - -if __name__ == "__main__": - model = FireflyBase() - model.eval() - x = torch.randn(1, 128, 128) - with torch.no_grad(): - y = model(x) - print(y.shape) diff --git a/xinference/thirdparty/fish_speech/fish_speech/models/vqgan/modules/fsq.py b/xinference/thirdparty/fish_speech/fish_speech/models/vqgan/modules/fsq.py index c837d6aee5..7ea4853376 100644 --- a/xinference/thirdparty/fish_speech/fish_speech/models/vqgan/modules/fsq.py +++ b/xinference/thirdparty/fish_speech/fish_speech/models/vqgan/modules/fsq.py @@ -6,7 +6,7 @@ from einops import rearrange from vector_quantize_pytorch import GroupedResidualFSQ -from .firefly import ConvNeXtBlock +from .firefly import ConvNeXtBlock, FishConvNet, FishTransConvNet @dataclass @@ -20,7 +20,7 @@ class DownsampleFiniteScalarQuantize(nn.Module): def __init__( self, input_dim: int = 512, - n_codebooks: int = 1, + n_codebooks: int = 9, n_groups: int = 1, levels: tuple[int] = (8, 5, 5, 5), # Approximate 2**10 downsample_factor: tuple[int] = (2, 2), @@ -46,7 +46,7 @@ def __init__( self.downsample = nn.Sequential( *[ nn.Sequential( - nn.Conv1d( + FishConvNet( all_dims[idx], all_dims[idx + 1], kernel_size=factor, @@ -61,7 +61,7 @@ def __init__( self.upsample = nn.Sequential( *[ nn.Sequential( - nn.ConvTranspose1d( + FishTransConvNet( all_dims[idx + 1], all_dims[idx], kernel_size=factor, @@ -114,26 +114,3 @@ def decode(self, indices: torch.Tensor): z_q = self.residual_fsq.get_output_from_indices(indices) z_q = self.upsample(z_q.mT) return z_q - - # def from_latents(self, latents: torch.Tensor): - # z_q, z_p, codes = super().from_latents(latents) - # z_q = self.upsample(z_q) - # return z_q, z_p, codes - - -if __name__ == "__main__": - rvq = DownsampleFiniteScalarQuantize( - n_codebooks=1, - downsample_factor=(2, 2), - ) - x = torch.randn(16, 512, 80) - - result = rvq(x) - print(rvq) - print(result.latents.shape, result.codes.shape, result.z.shape) - - # y = rvq.from_codes(result.codes) - # print(y[0].shape) - - # y = rvq.from_latents(result.latents) - # print(y[0].shape) diff --git a/xinference/thirdparty/fish_speech/fish_speech/models/vqgan/modules/reference.py b/xinference/thirdparty/fish_speech/fish_speech/models/vqgan/modules/reference.py deleted file mode 100644 index 0d9c8c8359..0000000000 --- a/xinference/thirdparty/fish_speech/fish_speech/models/vqgan/modules/reference.py +++ /dev/null @@ -1,115 +0,0 @@ -from typing import Optional - -import torch -import torch.nn.functional as F -from torch import nn - -from fish_speech.utils import autocast_exclude_mps - -from .wavenet import WaveNet - - -class ReferenceEncoder(WaveNet): - def __init__( - self, - input_channels: Optional[int] = None, - output_channels: Optional[int] = None, - residual_channels: int = 512, - residual_layers: int = 20, - dilation_cycle: Optional[int] = 4, - num_heads: int = 8, - latent_len: int = 4, - ): - super().__init__( - input_channels=input_channels, - residual_channels=residual_channels, - residual_layers=residual_layers, - dilation_cycle=dilation_cycle, - ) - - self.head_dim = residual_channels // num_heads - self.num_heads = num_heads - - self.latent_len = latent_len - self.latent = nn.Parameter(torch.zeros(1, self.latent_len, residual_channels)) - - self.q = nn.Linear(residual_channels, residual_channels, 
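# Editorial sketch, not part of the patch: the frame-rate arithmetic behind the
# new "firefly-gan-vq-fsq-8x1024-21hz" checkpoint name this patch switches to.
# The 44.1 kHz sample rate, the hop_length of 512 and the quantizer's default
# downsample_factor of (2, 2) are taken from other hunks in this diff and
# should be read as assumptions for this illustration.
import math

sample_rate = 44_100
hop_length = 512
downsample_factor = math.prod((2, 2))

codes_per_second = sample_rate / (hop_length * downsample_factor)
print(round(codes_per_second, 1))  # ~21.5, i.e. the "21hz" in the checkpoint name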
bias=True) - self.kv = nn.Linear(residual_channels, residual_channels * 2, bias=True) - self.q_norm = nn.LayerNorm(self.head_dim) - self.k_norm = nn.LayerNorm(self.head_dim) - self.proj = nn.Linear(residual_channels, residual_channels) - self.proj_drop = nn.Dropout(0.1) - - self.norm = nn.LayerNorm(residual_channels) - self.mlp = nn.Sequential( - nn.Linear(residual_channels, residual_channels * 4), - nn.SiLU(), - nn.Linear(residual_channels * 4, residual_channels), - ) - self.output_projection_attn = nn.Linear(residual_channels, output_channels) - - torch.nn.init.trunc_normal_(self.latent, std=0.02) - self.apply(self.init_weights) - - def init_weights(self, m): - if isinstance(m, nn.Linear): - torch.nn.init.trunc_normal_(m.weight, std=0.02) - if m.bias is not None: - torch.nn.init.constant_(m.bias, 0) - - def forward(self, x, attn_mask=None): - x = super().forward(x).mT - B, N, C = x.shape - - # Calculate mask - if attn_mask is not None: - assert attn_mask.shape == (B, N) and attn_mask.dtype == torch.bool - - attn_mask = attn_mask[:, None, None, :].expand( - B, self.num_heads, self.latent_len, N - ) - - q_latent = self.latent.expand(B, -1, -1) - q = ( - self.q(q_latent) - .reshape(B, self.latent_len, self.num_heads, self.head_dim) - .transpose(1, 2) - ) - - kv = ( - self.kv(x) - .reshape(B, N, 2, self.num_heads, self.head_dim) - .permute(2, 0, 3, 1, 4) - ) - k, v = kv.unbind(0) - - q, k = self.q_norm(q), self.k_norm(k) - x = F.scaled_dot_product_attention(q, k, v, attn_mask=attn_mask) - - x = x.transpose(1, 2).reshape(B, self.latent_len, C) - x = self.proj(x) - x = self.proj_drop(x) - - x = x + self.mlp(self.norm(x)) - x = self.output_projection_attn(x) - x = x.mean(1) - - return x - - -if __name__ == "__main__": - with autocast_exclude_mps(device_type="cpu", dtype=torch.bfloat16): - model = ReferenceEncoder( - input_channels=128, - output_channels=64, - residual_channels=384, - residual_layers=20, - dilation_cycle=4, - num_heads=8, - ) - x = torch.randn(4, 128, 64) - mask = torch.ones(4, 64, dtype=torch.bool) - y = model(x, mask) - print(y.shape) - loss = F.mse_loss(y, torch.randn(4, 64)) - loss.backward() diff --git a/xinference/thirdparty/fish_speech/fish_speech/models/vqgan/modules/wavenet.py b/xinference/thirdparty/fish_speech/fish_speech/models/vqgan/modules/wavenet.py deleted file mode 100644 index e7cc011c3e..0000000000 --- a/xinference/thirdparty/fish_speech/fish_speech/models/vqgan/modules/wavenet.py +++ /dev/null @@ -1,225 +0,0 @@ -import math -from typing import Optional - -import torch -import torch.nn.functional as F -from torch import nn - - -class Mish(nn.Module): - def forward(self, x): - return x * torch.tanh(F.softplus(x)) - - -class DiffusionEmbedding(nn.Module): - """Diffusion Step Embedding""" - - def __init__(self, d_denoiser): - super(DiffusionEmbedding, self).__init__() - self.dim = d_denoiser - - def forward(self, x): - device = x.device - half_dim = self.dim // 2 - emb = math.log(10000) / (half_dim - 1) - emb = torch.exp(torch.arange(half_dim, device=device) * -emb) - emb = x[:, None] * emb[None, :] - emb = torch.cat((emb.sin(), emb.cos()), dim=-1) - return emb - - -class LinearNorm(nn.Module): - """LinearNorm Projection""" - - def __init__(self, in_features, out_features, bias=False): - super(LinearNorm, self).__init__() - self.linear = nn.Linear(in_features, out_features, bias) - - nn.init.xavier_uniform_(self.linear.weight) - if bias: - nn.init.constant_(self.linear.bias, 0.0) - - def forward(self, x): - x = self.linear(x) - return x - - -class 
ConvNorm(nn.Module): - """1D Convolution""" - - def __init__( - self, - in_channels, - out_channels, - kernel_size=1, - stride=1, - padding=None, - dilation=1, - bias=True, - w_init_gain="linear", - ): - super(ConvNorm, self).__init__() - - if padding is None: - assert kernel_size % 2 == 1 - padding = int(dilation * (kernel_size - 1) / 2) - - self.conv = nn.Conv1d( - in_channels, - out_channels, - kernel_size=kernel_size, - stride=stride, - padding=padding, - dilation=dilation, - bias=bias, - ) - nn.init.kaiming_normal_(self.conv.weight) - - def forward(self, signal): - conv_signal = self.conv(signal) - - return conv_signal - - -class ResidualBlock(nn.Module): - """Residual Block""" - - def __init__( - self, - residual_channels, - use_linear_bias=False, - dilation=1, - condition_channels=None, - ): - super(ResidualBlock, self).__init__() - self.conv_layer = ConvNorm( - residual_channels, - 2 * residual_channels, - kernel_size=3, - stride=1, - padding=dilation, - dilation=dilation, - ) - - if condition_channels is not None: - self.diffusion_projection = LinearNorm( - residual_channels, residual_channels, use_linear_bias - ) - self.condition_projection = ConvNorm( - condition_channels, 2 * residual_channels, kernel_size=1 - ) - - self.output_projection = ConvNorm( - residual_channels, 2 * residual_channels, kernel_size=1 - ) - - def forward(self, x, condition=None, diffusion_step=None): - y = x - - if diffusion_step is not None: - diffusion_step = self.diffusion_projection(diffusion_step).unsqueeze(-1) - y = y + diffusion_step - - y = self.conv_layer(y) - - if condition is not None: - condition = self.condition_projection(condition) - y = y + condition - - gate, filter = torch.chunk(y, 2, dim=1) - y = torch.sigmoid(gate) * torch.tanh(filter) - - y = self.output_projection(y) - residual, skip = torch.chunk(y, 2, dim=1) - - return (x + residual) / math.sqrt(2.0), skip - - -class WaveNet(nn.Module): - def __init__( - self, - input_channels: Optional[int] = None, - output_channels: Optional[int] = None, - residual_channels: int = 512, - residual_layers: int = 20, - dilation_cycle: Optional[int] = 4, - is_diffusion: bool = False, - condition_channels: Optional[int] = None, - ): - super().__init__() - - # Input projection - self.input_projection = None - if input_channels is not None and input_channels != residual_channels: - self.input_projection = ConvNorm( - input_channels, residual_channels, kernel_size=1 - ) - - if input_channels is None: - input_channels = residual_channels - - self.input_channels = input_channels - - # Residual layers - self.residual_layers = nn.ModuleList( - [ - ResidualBlock( - residual_channels=residual_channels, - use_linear_bias=False, - dilation=2 ** (i % dilation_cycle) if dilation_cycle else 1, - condition_channels=condition_channels, - ) - for i in range(residual_layers) - ] - ) - - # Skip projection - self.skip_projection = ConvNorm( - residual_channels, residual_channels, kernel_size=1 - ) - - # Output projection - self.output_projection = None - if output_channels is not None and output_channels != residual_channels: - self.output_projection = ConvNorm( - residual_channels, output_channels, kernel_size=1 - ) - - if is_diffusion: - self.diffusion_embedding = DiffusionEmbedding(residual_channels) - self.mlp = nn.Sequential( - LinearNorm(residual_channels, residual_channels * 4, False), - Mish(), - LinearNorm(residual_channels * 4, residual_channels, False), - ) - - self.apply(self._init_weights) - - def _init_weights(self, m): - if isinstance(m, (nn.Conv1d, 
nn.Linear)): - nn.init.trunc_normal_(m.weight, std=0.02) - if getattr(m, "bias", None) is not None: - nn.init.constant_(m.bias, 0) - - def forward(self, x, t=None, condition=None): - if self.input_projection is not None: - x = self.input_projection(x) - x = F.silu(x) - - if t is not None: - t = self.diffusion_embedding(t) - t = self.mlp(t) - - skip = [] - for layer in self.residual_layers: - x, skip_connection = layer(x, condition, t) - skip.append(skip_connection) - - x = torch.sum(torch.stack(skip), dim=0) / math.sqrt(len(self.residual_layers)) - x = self.skip_projection(x) - - if self.output_projection is not None: - x = F.silu(x) - x = self.output_projection(x) - - return x diff --git a/xinference/thirdparty/fish_speech/fish_speech/text/clean.py b/xinference/thirdparty/fish_speech/fish_speech/text/clean.py index 76d9dc9033..c228dfcd13 100644 --- a/xinference/thirdparty/fish_speech/fish_speech/text/clean.py +++ b/xinference/thirdparty/fish_speech/fish_speech/text/clean.py @@ -1,61 +1,24 @@ -import itertools import re -LANGUAGE_UNICODE_RANGE_MAP = { - "ZH": [(0x4E00, 0x9FFF)], - "JP": [(0x4E00, 0x9FFF), (0x3040, 0x309F), (0x30A0, 0x30FF), (0x31F0, 0x31FF)], - "EN": [(0x0000, 0x007F)], -} - SYMBOLS_MAPPING = { - ":": ",", - ";": ",", - ",": ",", - "。": ".", - "!": "!", - "?": "?", - "\n": ".", - "·": ",", - "、": ",", - "...": "…", "“": "'", "”": "'", "‘": "'", "’": "'", - "(": "'", - ")": "'", - "(": "'", - ")": "'", - "《": "'", - "》": "'", - "【": "'", - "】": "'", - "[": "'", - "]": "'", - "—": "-", - "~": "-", - "~": "-", - "・": "-", - "「": "'", - "」": "'", - ";": ",", - ":": ",", + "【": "", + "】": "", + "[": "", + "]": "", + "(": "", + ")": "", + "(": "", + ")": "", + "・": "·", } REPLACE_SYMBOL_REGEX = re.compile( "|".join(re.escape(p) for p in SYMBOLS_MAPPING.keys()) ) -ALL_KNOWN_UTF8_RANGE = list( - itertools.chain.from_iterable(LANGUAGE_UNICODE_RANGE_MAP.values()) -) -REMOVE_UNKNOWN_SYMBOL_REGEX = re.compile( - "[^" - + "".join( - f"{re.escape(chr(start))}-{re.escape(chr(end))}" - for start, end in ALL_KNOWN_UTF8_RANGE - ) - + "]" -) def clean_text(text): @@ -64,6 +27,5 @@ def clean_text(text): # Replace all chinese symbols with their english counterparts text = REPLACE_SYMBOL_REGEX.sub(lambda x: SYMBOLS_MAPPING[x.group()], text) - text = REMOVE_UNKNOWN_SYMBOL_REGEX.sub("", text) return text diff --git a/xinference/thirdparty/fish_speech/fish_speech/text/spliter.py b/xinference/thirdparty/fish_speech/fish_speech/text/spliter.py index 5528cd3a63..d4bb995487 100644 --- a/xinference/thirdparty/fish_speech/fish_speech/text/spliter.py +++ b/xinference/thirdparty/fish_speech/fish_speech/text/spliter.py @@ -71,9 +71,9 @@ def split_text(text, length): texts = [text] texts = map(protect_float, texts) - texts = break_text(texts, length, {".", "!", "?"}) + texts = break_text(texts, length, {".", "!", "?", "。", "!", "?"}) texts = map(unprotect_float, texts) - texts = break_text(texts, length, {","}) + texts = break_text(texts, length, {",", ","}) texts = break_text(texts, length, {" "}) texts = list(break_text_by_length(texts, length)) diff --git a/xinference/thirdparty/fish_speech/fish_speech/train.py b/xinference/thirdparty/fish_speech/fish_speech/train.py index a6a344097a..41b3642f88 100644 --- a/xinference/thirdparty/fish_speech/fish_speech/train.py +++ b/xinference/thirdparty/fish_speech/fish_speech/train.py @@ -1,4 +1,6 @@ import os + +os.environ["USE_LIBUV"] = "0" import sys from typing import Optional diff --git a/xinference/thirdparty/fish_speech/fish_speech/webui/manage.py 
b/xinference/thirdparty/fish_speech/fish_speech/webui/manage.py index 9c183acd7c..4ec3fcac25 100644 --- a/xinference/thirdparty/fish_speech/fish_speech/webui/manage.py +++ b/xinference/thirdparty/fish_speech/fish_speech/webui/manage.py @@ -1,9 +1,11 @@ from __future__ import annotations +import os + +os.environ["USE_LIBUV"] = "0" import datetime import html import json -import os import platform import shutil import signal @@ -469,7 +471,7 @@ def train_process( "--config-name", "firefly_gan_vq", "--checkpoint-path", - "checkpoints/fish-speech-1.2-sft/firefly-gan-vq-fsq-4x1024-42hz-generator.pth", + "checkpoints/fish-speech-1.4/firefly-gan-vq-fsq-8x1024-21hz-generator.pth", ] ) @@ -485,7 +487,7 @@ def train_process( "16", ] ) - ckpt_path = "checkpoints/fish-speech-1.2-sft/model.pth" + ckpt_path = "checkpoints/fish-speech-1.4/model.pth" lora_prefix = "lora_" if llama_use_lora else "" llama_name = lora_prefix + "text2semantic_" + new_project latest = next( @@ -862,7 +864,7 @@ def llama_quantify(llama_weight, quantify_mode): minimum=1, maximum=32, step=1, - value=4, + value=2, ) llama_data_max_length_slider = gr.Slider( label=i18n("Maximum Length per Sample"), @@ -870,7 +872,7 @@ def llama_quantify(llama_weight, quantify_mode): minimum=1024, maximum=4096, step=128, - value=1024, + value=2048, ) with gr.Row(equal_height=False): llama_precision_dropdown = gr.Dropdown( @@ -925,9 +927,9 @@ def llama_quantify(llama_weight, quantify_mode): "Type the path or select from the dropdown" ), choices=[ - "checkpoints/fish-speech-1.2-sft/model.pth", + "checkpoints/fish-speech-1.4/model.pth", ], - value="checkpoints/fish-speech-1.2-sft/model.pth", + value="checkpoints/fish-speech-1.4/model.pth", allow_custom_value=True, interactive=True, ) @@ -979,7 +981,7 @@ def llama_quantify(llama_weight, quantify_mode): "Type the path or select from the dropdown" ), choices=list_llama_models(), - value="checkpoints/fish-speech-1.2-sft", + value="checkpoints/fish-speech-1.4", allow_custom_value=True, interactive=True, ) @@ -1042,7 +1044,7 @@ def llama_quantify(llama_weight, quantify_mode): "Type the path or select from the dropdown" ), choices=list_decoder_models(), - value="checkpoints/fish-speech-1.2-sft/firefly-gan-vq-fsq-4x1024-42hz-generator.pth", + value="checkpoints/fish-speech-1.4/firefly-gan-vq-fsq-8x1024-21hz-generator.pth", allow_custom_value=True, ) infer_decoder_config = gr.Dropdown( @@ -1060,7 +1062,7 @@ def llama_quantify(llama_weight, quantify_mode): info=i18n( "Type the path or select from the dropdown" ), - value="checkpoints/fish-speech-1.2-sft", + value="checkpoints/fish-speech-1.4", choices=list_llama_models(), allow_custom_value=True, ) diff --git a/xinference/thirdparty/fish_speech/tools/api.py b/xinference/thirdparty/fish_speech/tools/api.py index 29869b267f..7fcc9330ae 100644 --- a/xinference/thirdparty/fish_speech/tools/api.py +++ b/xinference/thirdparty/fish_speech/tools/api.py @@ -9,16 +9,20 @@ from argparse import ArgumentParser from http import HTTPStatus from pathlib import Path -from typing import Annotated, Literal, Optional +from typing import Annotated, Any, Literal, Optional import numpy as np +import ormsgpack # import pyrootutils import soundfile as sf import torch import torchaudio +# from baize.datastructures import ContentType # from kui.asgi import ( # Body, +# FactoryClass, # HTTPException, +# HttpRequest, # HttpView, # JSONResponse, # Kui, @@ -27,14 +31,16 @@ # ) # from kui.asgi.routing import MultimethodRoutes from loguru import logger -from pydantic import BaseModel, Field 
+from pydantic import BaseModel, Field, conint # pyrootutils.setup_root(__file__, indicator=".project-root", pythonpath=True) # from fish_speech.models.vqgan.lit_module import VQGAN from fish_speech.models.vqgan.modules.firefly import FireflyArchitecture +from fish_speech.text.chn_text_norm.text import Text as ChnNormedText from fish_speech.utils import autocast_exclude_mps -# from tools.auto_rerank import batch_asr, calculate_wer, is_chinese, load_model +from tools.commons import ServeReferenceAudio, ServeTTSRequest +from tools.file import AUDIO_EXTENSIONS, audio_to_bytes, list_files, read_ref_text from tools.llama.generate import ( GenerateRequest, GenerateResponse, @@ -82,11 +88,8 @@ async def other_exception_handler(exc: "Exception"): def load_audio(reference_audio, sr): if len(reference_audio) > 255 or not Path(reference_audio).exists(): - try: - audio_data = base64.b64decode(reference_audio) - reference_audio = io.BytesIO(audio_data) - except base64.binascii.Error: - raise ValueError("Invalid path or base64 string") + audio_data = reference_audio + reference_audio = io.BytesIO(audio_data) waveform, original_sr = torchaudio.load( reference_audio, backend="sox" if sys.platform == "linux" else "soundfile" @@ -145,7 +148,7 @@ def decode_vq_tokens( return decoder_model.decode( indices=codes[None], feature_lengths=feature_lengths, - ).squeeze() + )[0].squeeze() raise ValueError(f"Unknown model type: {type(decoder_model)}") @@ -153,58 +156,6 @@ def decode_vq_tokens( # routes = MultimethodRoutes(base_class=HttpView) -def get_random_paths(base_path, data, speaker, emotion): - if base_path and data and speaker and emotion and (Path(base_path).exists()): - if speaker in data and emotion in data[speaker]: - files = data[speaker][emotion] - lab_files = [f for f in files if f.endswith(".lab")] - wav_files = [f for f in files if f.endswith(".wav")] - - if lab_files and wav_files: - selected_lab = random.choice(lab_files) - selected_wav = random.choice(wav_files) - - lab_path = Path(base_path) / speaker / emotion / selected_lab - wav_path = Path(base_path) / speaker / emotion / selected_wav - if lab_path.exists() and wav_path.exists(): - return lab_path, wav_path - - return None, None - - -def load_json(json_file): - if not json_file: - logger.info("Not using a json file") - return None - try: - with open(json_file, "r", encoding="utf-8") as file: - data = json.load(file) - except FileNotFoundError: - logger.warning(f"ref json not found: {json_file}") - data = None - except Exception as e: - logger.warning(f"Loading json failed: {e}") - data = None - return data - - -class InvokeRequest(BaseModel): - text: str = "你说的对, 但是原神是一款由米哈游自主研发的开放世界手游." 
- reference_text: Optional[str] = None - reference_audio: Optional[str] = None - max_new_tokens: int = 1024 - chunk_length: Annotated[int, Field(ge=0, le=500, strict=True)] = 100 - top_p: Annotated[float, Field(ge=0.1, le=1.0, strict=True)] = 0.7 - repetition_penalty: Annotated[float, Field(ge=0.9, le=2.0, strict=True)] = 1.2 - temperature: Annotated[float, Field(ge=0.1, le=1.0, strict=True)] = 0.7 - emotion: Optional[str] = None - format: Literal["wav", "mp3", "flac"] = "wav" - streaming: bool = False - ref_json: Optional[str] = "ref_data.json" - ref_base: Optional[str] = "ref_data" - speaker: Optional[str] = None - - def get_content_type(audio_format): if audio_format == "wav": return "audio/wav" @@ -217,35 +168,52 @@ def get_content_type(audio_format): @torch.inference_mode() -def inference(req: InvokeRequest): - # Parse reference audio aka prompt - prompt_tokens = None - - ref_data = load_json(req.ref_json) - ref_base = req.ref_base - - lab_path, wav_path = get_random_paths(ref_base, ref_data, req.speaker, req.emotion) - - if lab_path and wav_path: - with open(lab_path, "r", encoding="utf-8") as lab_file: - ref_text = lab_file.read() - req.reference_audio = wav_path - req.reference_text = ref_text - logger.info("ref_path: " + str(wav_path)) - logger.info("ref_text: " + ref_text) - - # Parse reference audio aka prompt - prompt_tokens = encode_reference( - decoder_model=decoder_model, - reference_audio=req.reference_audio, - enable_reference_audio=req.reference_audio is not None, - ) - logger.info(f"ref_text: {req.reference_text}") +def inference(req: ServeTTSRequest): + + idstr: str | None = req.reference_id + if idstr is not None: + ref_folder = Path("references") / idstr + ref_folder.mkdir(parents=True, exist_ok=True) + ref_audios = list_files( + ref_folder, AUDIO_EXTENSIONS, recursive=True, sort=False + ) + prompt_tokens = [ + encode_reference( + decoder_model=decoder_model, + reference_audio=audio_to_bytes(str(ref_audio)), + enable_reference_audio=True, + ) + for ref_audio in ref_audios + ] + prompt_texts = [ + read_ref_text(str(ref_audio.with_suffix(".lab"))) + for ref_audio in ref_audios + ] + + else: + # Parse reference audio aka prompt + refs = req.references + if refs is None: + refs = [] + prompt_tokens = [ + encode_reference( + decoder_model=decoder_model, + reference_audio=ref.audio, + enable_reference_audio=True, + ) + for ref in refs + ] + prompt_texts = [ref.text for ref in refs] + # LLAMA Inference request = dict( device=decoder_model.device, max_new_tokens=req.max_new_tokens, - text=req.text, + text=( + req.text + if not req.normalize + else ChnNormedText(raw_text=req.text).normalize() + ), top_p=req.top_p, repetition_penalty=req.repetition_penalty, temperature=req.temperature, @@ -254,7 +222,7 @@ def inference(req: InvokeRequest): chunk_length=req.chunk_length, max_length=2048, prompt_tokens=prompt_tokens, - prompt_text=req.reference_text, + prompt_text=prompt_texts, ) response_queue = queue.Queue() @@ -307,40 +275,7 @@ def inference(req: InvokeRequest): yield fake_audios -def auto_rerank_inference(req: InvokeRequest, use_auto_rerank: bool = True): - if not use_auto_rerank: - # 如果不使用 auto_rerank,直接调用原始的 inference 函数 - return inference(req) - - zh_model, en_model = load_model() - max_attempts = 5 - best_wer = float("inf") - best_audio = None - - for attempt in range(max_attempts): - # 调用原始的 inference 函数 - audio_generator = inference(req) - fake_audios = next(audio_generator) - - asr_result = batch_asr( - zh_model if is_chinese(req.text) else en_model, [fake_audios], 
44100 - )[0] - wer = calculate_wer(req.text, asr_result["text"]) - - if wer <= 0.1 and not asr_result["huge_gap"]: - return fake_audios - - if wer < best_wer: - best_wer = wer - best_audio = fake_audios - - if attempt == max_attempts - 1: - break - - return best_audio - - -async def inference_async(req: InvokeRequest): +async def inference_async(req: ServeTTSRequest): for chunk in inference(req): yield chunk @@ -349,9 +284,9 @@ async def buffer_to_async_generator(buffer): yield buffer -# @routes.http.post("/v1/invoke") +# @routes.http.post("/v1/tts") # async def api_invoke_model( -# req: Annotated[InvokeRequest, Body(exclusive=True)], +# req: Annotated[ServeTTSRequest, Body(exclusive=True)], # ): # """ # Invoke model and generate audio @@ -410,21 +345,20 @@ def parse_args(): parser.add_argument( "--llama-checkpoint-path", type=str, - default="checkpoints/fish-speech-1.2-sft", + default="checkpoints/fish-speech-1.4", ) parser.add_argument( "--decoder-checkpoint-path", type=str, - default="checkpoints/fish-speech-1.2-sft/firefly-gan-vq-fsq-4x1024-42hz-generator.pth", + default="checkpoints/fish-speech-1.4/firefly-gan-vq-fsq-8x1024-21hz-generator.pth", ) parser.add_argument("--decoder-config-name", type=str, default="firefly_gan_vq") parser.add_argument("--device", type=str, default="cuda") parser.add_argument("--half", action="store_true") parser.add_argument("--compile", action="store_true") parser.add_argument("--max-text-length", type=int, default=0) - parser.add_argument("--listen", type=str, default="127.0.0.1:8000") + parser.add_argument("--listen", type=str, default="127.0.0.1:8080") parser.add_argument("--workers", type=int, default=1) - parser.add_argument("--use-auto-rerank", type=bool, default=True) return parser.parse_args() @@ -436,18 +370,30 @@ def parse_args(): # }, # ).routes # +# +# class MsgPackRequest(HttpRequest): +# async def data(self) -> Annotated[Any, ContentType("application/msgpack")]: +# if self.content_type == "application/msgpack": +# return ormsgpack.unpackb(await self.body) +# +# raise HTTPException( +# HTTPStatus.UNSUPPORTED_MEDIA_TYPE, +# headers={"Accept": "application/msgpack"}, +# ) +# +# # app = Kui( # routes=routes + openapi[1:], # Remove the default route # exception_handlers={ # HTTPException: http_execption_handler, # Exception: other_exception_handler, # }, +# factory_class=FactoryClass(http=MsgPackRequest), # cors_config={}, # ) if __name__ == "__main__": - import threading import uvicorn @@ -474,18 +420,17 @@ def parse_args(): # Dry run to check if the model is loaded correctly and avoid the first-time latency list( inference( - InvokeRequest( + ServeTTSRequest( text="Hello world.", - reference_text=None, - reference_audio=None, - max_new_tokens=0, + references=[], + reference_id=None, + max_new_tokens=1024, + chunk_length=200, top_p=0.7, repetition_penalty=1.2, temperature=0.7, emotion=None, format="wav", - ref_base=None, - ref_json=None, ) ) ) diff --git a/xinference/thirdparty/fish_speech/tools/auto_rerank.py b/xinference/thirdparty/fish_speech/tools/auto_rerank.py deleted file mode 100644 index 0297d63d77..0000000000 --- a/xinference/thirdparty/fish_speech/tools/auto_rerank.py +++ /dev/null @@ -1,159 +0,0 @@ -import os - -os.environ["MODELSCOPE_CACHE"] = ".cache/" - -import string -import time -from threading import Lock - -import librosa -import numpy as np -import opencc -import torch -from faster_whisper import WhisperModel - -t2s_converter = opencc.OpenCC("t2s") - - -def load_model(*, device="cuda"): - model = WhisperModel( - "medium", - 
device=device, - compute_type="float16", - download_root="faster_whisper", - ) - print("faster_whisper loaded!") - return model - - -@torch.no_grad() -def batch_asr_internal(model: WhisperModel, audios, sr): - resampled_audios = [] - for audio in audios: - - if isinstance(audio, np.ndarray): - audio = torch.from_numpy(audio).float() - - if audio.dim() > 1: - audio = audio.squeeze() - - assert audio.dim() == 1 - audio_np = audio.numpy() - resampled_audio = librosa.resample(audio_np, orig_sr=sr, target_sr=16000) - resampled_audios.append(resampled_audio) - - trans_results = [] - - for resampled_audio in resampled_audios: - segments, info = model.transcribe( - resampled_audio, - language=None, - beam_size=5, - initial_prompt="Punctuation is needed in any language.", - ) - trans_results.append(list(segments)) - - results = [] - for trans_res, audio in zip(trans_results, audios): - - duration = len(audio) / sr * 1000 - huge_gap = False - max_gap = 0.0 - - text = None - last_tr = None - - for tr in trans_res: - delta = tr.text.strip() - if tr.id > 1: - max_gap = max(tr.start - last_tr.end, max_gap) - text += delta - else: - text = delta - - last_tr = tr - if max_gap > 3.0: - huge_gap = True - break - - sim_text = t2s_converter.convert(text) - results.append( - { - "text": sim_text, - "duration": duration, - "huge_gap": huge_gap, - } - ) - - return results - - -global_lock = Lock() - - -def batch_asr(model, audios, sr): - return batch_asr_internal(model, audios, sr) - - -def is_chinese(text): - return True - - -def calculate_wer(text1, text2, debug=False): - chars1 = remove_punctuation(text1) - chars2 = remove_punctuation(text2) - - m, n = len(chars1), len(chars2) - - if m > n: - chars1, chars2 = chars2, chars1 - m, n = n, m - - prev = list(range(m + 1)) # row 0 distance: [0, 1, 2, ...] 
- curr = [0] * (m + 1) - - for j in range(1, n + 1): - curr[0] = j - for i in range(1, m + 1): - if chars1[i - 1] == chars2[j - 1]: - curr[i] = prev[i - 1] - else: - curr[i] = min(prev[i], curr[i - 1], prev[i - 1]) + 1 - prev, curr = curr, prev - - edits = prev[m] - tot = max(len(chars1), len(chars2)) - wer = edits / tot - - if debug: - print(" gt: ", chars1) - print(" pred: ", chars2) - print(" edits/tot = wer: ", edits, "/", tot, "=", wer) - - return wer - - -def remove_punctuation(text): - chinese_punctuation = ( - " \n\t”“!?。。"#$%&'()*+,-/:;<=>@[\]^_`{|}~⦅⦆「」、、〃《》「」『』【】〔〕〖〗〘〙〚〛〜〝〞〟〰〾〿–—" - '‛""„‟…‧﹏' - ) - all_punctuation = string.punctuation + chinese_punctuation - translator = str.maketrans("", "", all_punctuation) - text_without_punctuation = text.translate(translator) - return text_without_punctuation - - -if __name__ == "__main__": - model = load_model() - audios = [ - librosa.load("44100.wav", sr=44100)[0], - librosa.load("lengyue.wav", sr=44100)[0], - ] - print(np.array(audios[0])) - print(batch_asr(model, audios, 44100)) - - start_time = time.time() - for _ in range(10): - print(batch_asr(model, audios, 44100)) - print("Time taken:", time.time() - start_time) diff --git a/xinference/thirdparty/fish_speech/tools/commons.py b/xinference/thirdparty/fish_speech/tools/commons.py new file mode 100644 index 0000000000..f81cadec1e --- /dev/null +++ b/xinference/thirdparty/fish_speech/tools/commons.py @@ -0,0 +1,35 @@ +from typing import Annotated, Literal, Optional + +from pydantic import BaseModel, Field, conint + + +class ServeReferenceAudio(BaseModel): + audio: bytes + text: str + + +class ServeTTSRequest(BaseModel): + text: str + chunk_length: Annotated[int, conint(ge=100, le=300, strict=True)] = 200 + # Audio format + format: Literal["wav", "pcm", "mp3"] = "wav" + mp3_bitrate: Literal[64, 128, 192] = 128 + # References audios for in-context learning + references: list[ServeReferenceAudio] = [] + # Reference id + # For example, if you want use https://fish.audio/m/7f92f8afb8ec43bf81429cc1c9199cb1/ + # Just pass 7f92f8afb8ec43bf81429cc1c9199cb1 + reference_id: str | None = None + # Normalize text for en & zh, this increase stability for numbers + normalize: bool = True + mp3_bitrate: Optional[int] = 64 + opus_bitrate: Optional[int] = -1000 + # Balance mode will reduce latency to 300ms, but may decrease stability + latency: Literal["normal", "balanced"] = "normal" + # not usually used below + streaming: bool = False + emotion: Optional[str] = None + max_new_tokens: int = 1024 + top_p: Annotated[float, Field(ge=0.1, le=1.0, strict=True)] = 0.7 + repetition_penalty: Annotated[float, Field(ge=0.9, le=2.0, strict=True)] = 1.2 + temperature: Annotated[float, Field(ge=0.1, le=1.0, strict=True)] = 0.7 diff --git a/xinference/thirdparty/fish_speech/tools/download_models.py b/xinference/thirdparty/fish_speech/tools/download_models.py index 480f3be0f4..9e79c34c43 100644 --- a/xinference/thirdparty/fish_speech/tools/download_models.py +++ b/xinference/thirdparty/fish_speech/tools/download_models.py @@ -22,8 +22,8 @@ def check_and_download_files(repo_id, file_list, local_dir): # 1st -repo_id_1 = "fishaudio/fish-speech-1.2-sft" -local_dir_1 = "./checkpoints/fish-speech-1.2-sft" +repo_id_1 = "fishaudio/fish-speech-1.4" +local_dir_1 = "./checkpoints/fish-speech-1.4" files_1 = [ "model.pth", "README.md", @@ -31,7 +31,7 @@ def check_and_download_files(repo_id, file_list, local_dir): "tokenizer_config.json", "tokenizer.json", "config.json", - "firefly-gan-vq-fsq-4x1024-42hz-generator.pth", + 
"firefly-gan-vq-fsq-8x1024-21hz-generator.pth", ] # 3rd diff --git a/xinference/thirdparty/fish_speech/tools/file.py b/xinference/thirdparty/fish_speech/tools/file.py index b4b8051d6f..f7a0597365 100644 --- a/xinference/thirdparty/fish_speech/tools/file.py +++ b/xinference/thirdparty/fish_speech/tools/file.py @@ -1,3 +1,4 @@ +import base64 from pathlib import Path from typing import Union @@ -23,6 +24,22 @@ } +def audio_to_bytes(file_path): + if not file_path or not Path(file_path).exists(): + return None + with open(file_path, "rb") as wav_file: + wav = wav_file.read() + return wav + + +def read_ref_text(ref_text): + path = Path(ref_text) + if path.exists() and path.is_file(): + with path.open("r", encoding="utf-8") as file: + return file.read() + return ref_text + + def list_files( path: Union[Path, str], extensions: set[str] = None, diff --git a/xinference/thirdparty/fish_speech/tools/gen_ref.py b/xinference/thirdparty/fish_speech/tools/gen_ref.py deleted file mode 100644 index a771903b02..0000000000 --- a/xinference/thirdparty/fish_speech/tools/gen_ref.py +++ /dev/null @@ -1,36 +0,0 @@ -import json -from pathlib import Path - - -def scan_folder(base_path): - wav_lab_pairs = {} - - base = Path(base_path) - for suf in ["wav", "lab"]: - for f in base.rglob(f"*.{suf}"): - relative_path = f.relative_to(base) - parts = relative_path.parts - print(parts) - if len(parts) >= 3: - character = parts[0] - emotion = parts[1] - - if character not in wav_lab_pairs: - wav_lab_pairs[character] = {} - if emotion not in wav_lab_pairs[character]: - wav_lab_pairs[character][emotion] = [] - wav_lab_pairs[character][emotion].append(str(f.name)) - - return wav_lab_pairs - - -def save_to_json(data, output_file): - with open(output_file, "w", encoding="utf-8") as file: - json.dump(data, file, ensure_ascii=False, indent=2) - - -base_path = "ref_data" -out_ref_file = "ref_data.json" - -wav_lab_pairs = scan_folder(base_path) -save_to_json(wav_lab_pairs, out_ref_file) diff --git a/xinference/thirdparty/fish_speech/tools/llama/build_dataset.py b/xinference/thirdparty/fish_speech/tools/llama/build_dataset.py index 20e2219956..fc5ef120cc 100644 --- a/xinference/thirdparty/fish_speech/tools/llama/build_dataset.py +++ b/xinference/thirdparty/fish_speech/tools/llama/build_dataset.py @@ -13,7 +13,7 @@ from fish_speech.datasets.protos.text_data_pb2 import Semantics, Sentence, TextData from fish_speech.datasets.protos.text_data_stream import pack_pb_stream -from fish_speech.utils.file import load_filelist +from tools.file import load_filelist # To avoid CPU overload os.environ["MKL_NUM_THREADS"] = "1" diff --git a/xinference/thirdparty/fish_speech/tools/llama/generate.py b/xinference/thirdparty/fish_speech/tools/llama/generate.py index 934c185145..ad9c549996 100644 --- a/xinference/thirdparty/fish_speech/tools/llama/generate.py +++ b/xinference/thirdparty/fish_speech/tools/llama/generate.py @@ -2,6 +2,7 @@ import queue import threading import time +from contextlib import nullcontext from dataclasses import dataclass from pathlib import Path from typing import Literal, Optional, Tuple, Union @@ -93,15 +94,20 @@ def decode_one_token_ar( **sampling_kwargs, ) -> torch.Tensor: x = model.forward_generate(x, input_pos) + + sampling_kwargs_main = sampling_kwargs.copy() + sampling_kwargs_main["temperature"] = 0.1 + sampling_kwargs_main["top_p"] = 0.1 + sampling_kwargs_main["repetition_penalty"] = 1.0 + codebooks = [ sample( x.logits, - previous_tokens=( - previous_tokens[0] if previous_tokens is not None else None - ), # Disable 
repetition penalty for the token codebook - **sampling_kwargs, + previous_tokens=None, # Disable repetition penalty for the token codebook + **sampling_kwargs_main, )[0] ] + x = x.hidden_states # Cleanup the cache @@ -136,11 +142,16 @@ def decode_one_token_naive( ) -> torch.Tensor: x = model.forward_generate(x, input_pos) + sampling_kwargs_main = sampling_kwargs.copy() + sampling_kwargs_main["temperature"] = 0.1 + sampling_kwargs_main["top_p"] = 0.1 + sampling_kwargs_main["repetition_penalty"] = 1.0 + codebooks = [ sample( - x.token_logits, + x.logits, previous_tokens=None, # Disable repetition penalty for the token codebook - **sampling_kwargs, + **sampling_kwargs_main, )[0] ] @@ -181,8 +192,12 @@ def decode_n_tokens( else: window = previous_tokens[:, i - win_size : i] - with torch.backends.cuda.sdp_kernel( - enable_flash=False, enable_mem_efficient=False, enable_math=True + with ( + torch.backends.cuda.sdp_kernel( + enable_flash=False, enable_mem_efficient=False, enable_math=True + ) + if torch.cuda.is_available() + else nullcontext() ): # Actually better for Inductor to codegen attention here next_token = decode_one_token( model=model, @@ -222,25 +237,11 @@ def generate( # create an empty tensor of the expected final shape and fill in the current tokens T = prompt.size(1) - if max_new_tokens: - if T + max_new_tokens > model.config.max_seq_len: - max_new_tokens = model.config.max_seq_len - T - logger.info(f"Truncating max_new_tokens to {max_new_tokens}") - - T_new = T + max_new_tokens - else: - T_new = model.config.max_seq_len - max_new_tokens = T_new - T - device, dtype = prompt.device, prompt.dtype - with torch.device(device): - model.setup_caches( - max_batch_size=1, max_seq_len=T_new, dtype=next(model.parameters()).dtype - ) codebook_dim = 1 + model.config.num_codebooks # create an empty tensor of the expected final shape and fill in the current tokens - empty = torch.empty((codebook_dim, T_new), dtype=dtype, device=device) + empty = torch.empty((codebook_dim, max_new_tokens), dtype=dtype, device=device) empty[:, :T] = prompt seq = empty input_pos = torch.arange(0, T, device=device) @@ -560,6 +561,10 @@ def worker(): model, decode_one_token = load_model( checkpoint_path, device, precision, compile=compile ) + with torch.device(device): + model.setup_caches( + max_batch_size=1, max_seq_len=2048, dtype=next(model.parameters()).dtype + ) init_event.set() while True: @@ -607,7 +612,7 @@ def worker(): @click.option( "--checkpoint-path", type=click.Path(path_type=Path, exists=True), - default="checkpoints/fish-speech-1.2-sft", + default="checkpoints/fish-speech-1.4", ) @click.option("--device", type=str, default="cuda") @click.option("--compile/--no-compile", default=False) diff --git a/xinference/thirdparty/fish_speech/tools/llama/merge_lora.py b/xinference/thirdparty/fish_speech/tools/llama/merge_lora.py index f12eece8d2..c1bd3cbd72 100644 --- a/xinference/thirdparty/fish_speech/tools/llama/merge_lora.py +++ b/xinference/thirdparty/fish_speech/tools/llama/merge_lora.py @@ -15,7 +15,7 @@ @click.command() @click.option("--lora-config", type=str, default="r_8_alpha_16") -@click.option("--base-weight", type=str, default="checkpoints/fish-speech-1.2-sft") +@click.option("--base-weight", type=str, default="checkpoints/fish-speech-1.4") @click.option("--lora-weight", type=str, required=True) @click.option("--output", type=str, required=True) def merge(lora_config, base_weight, lora_weight, output): diff --git a/xinference/thirdparty/fish_speech/tools/llama/quantize.py 
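# Editorial sketch, not part of the patch: the conditional context-manager
# pattern introduced in decode_n_tokens above, shown in isolation. When CUDA is
# unavailable, contextlib.nullcontext() stands in for the
# torch.backends.cuda.sdp_kernel(...) guard so the same `with` block still runs
# on CPU.
from contextlib import nullcontext

import torch

ctx = (
    torch.backends.cuda.sdp_kernel(
        enable_flash=False, enable_mem_efficient=False, enable_math=True
    )
    if torch.cuda.is_available()
    else nullcontext()
)
with ctx:
    pass  # the per-token decode call would run here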
b/xinference/thirdparty/fish_speech/tools/llama/quantize.py index aae32fcce7..e629d944b5 100644 --- a/xinference/thirdparty/fish_speech/tools/llama/quantize.py +++ b/xinference/thirdparty/fish_speech/tools/llama/quantize.py @@ -428,7 +428,7 @@ def generate_folder_name(): @click.option( "--checkpoint-path", type=click.Path(path_type=Path, exists=True), - default="checkpoints/fish-speech-1.2-sft", + default="checkpoints/fish-speech-1.4", ) @click.option( "--mode", type=str, default="int8", help="type of quantization to perform" @@ -451,7 +451,7 @@ def quantize(checkpoint_path: Path, mode: str, groupsize: int, timestamp: str) - precision=precision, compile=False, ) - vq_model = "firefly-gan-vq-fsq-4x1024-42hz-generator.pth" + vq_model = "firefly-gan-vq-fsq-8x1024-21hz-generator.pth" now = timestamp if timestamp != "None" else generate_folder_name() if mode == "int8": diff --git a/xinference/thirdparty/fish_speech/tools/merge_asr_files.py b/xinference/thirdparty/fish_speech/tools/merge_asr_files.py deleted file mode 100644 index cc12062095..0000000000 --- a/xinference/thirdparty/fish_speech/tools/merge_asr_files.py +++ /dev/null @@ -1,55 +0,0 @@ -import os -from pathlib import Path - -from pydub import AudioSegment -from tqdm import tqdm - -from tools.file import AUDIO_EXTENSIONS, list_files - - -def merge_and_delete_files(save_dir, original_files): - save_path = Path(save_dir) - audio_slice_files = list_files( - path=save_dir, extensions=AUDIO_EXTENSIONS.union([".lab"]), recursive=True - ) - audio_files = {} - label_files = {} - for file_path in tqdm(audio_slice_files, desc="Merging audio files"): - rel_path = Path(file_path).relative_to(save_path) - (save_path / rel_path.parent).mkdir(parents=True, exist_ok=True) - if file_path.suffix == ".wav": - prefix = rel_path.parent / file_path.stem.rsplit("-", 1)[0] - if prefix == rel_path.parent / file_path.stem: - continue - audio = AudioSegment.from_wav(file_path) - if prefix in audio_files.keys(): - audio_files[prefix] = audio_files[prefix] + audio - else: - audio_files[prefix] = audio - - elif file_path.suffix == ".lab": - prefix = rel_path.parent / file_path.stem.rsplit("-", 1)[0] - if prefix == rel_path.parent / file_path.stem: - continue - with open(file_path, "r", encoding="utf-8") as f: - label = f.read() - if prefix in label_files.keys(): - label_files[prefix] = label_files[prefix] + ", " + label - else: - label_files[prefix] = label - - for prefix, audio in audio_files.items(): - output_audio_path = save_path / f"{prefix}.wav" - audio.export(output_audio_path, format="wav") - - for prefix, label in label_files.items(): - output_label_path = save_path / f"{prefix}.lab" - with open(output_label_path, "w", encoding="utf-8") as f: - f.write(label) - - for file_path in original_files: - os.remove(file_path) - - -if __name__ == "__main__": - merge_and_delete_files("/made/by/spicysama/laziman", [__file__]) diff --git a/xinference/thirdparty/fish_speech/tools/msgpack_api.py b/xinference/thirdparty/fish_speech/tools/msgpack_api.py new file mode 100644 index 0000000000..67f907bf55 --- /dev/null +++ b/xinference/thirdparty/fish_speech/tools/msgpack_api.py @@ -0,0 +1,34 @@ +import httpx +import ormsgpack + +from tools.commons import ServeReferenceAudio, ServeTTSRequest + +# priority: ref_id > references +request = ServeTTSRequest( + text="你说的对, 但是原神是一款由米哈游自主研发的开放世界手游.", + # reference_id="114514", + references=[ + ServeReferenceAudio( + audio=open("lengyue.wav", "rb").read(), + text=open("lengyue.lab", "r", encoding="utf-8").read(), + ) + ], + 
streaming=True, +) + +with ( + httpx.Client() as client, + open("hello.wav", "wb") as f, +): + with client.stream( + "POST", + "http://127.0.0.1:8080/v1/tts", + content=ormsgpack.packb(request, option=ormsgpack.OPT_SERIALIZE_PYDANTIC), + headers={ + "authorization": "Bearer YOUR_API_KEY", + "content-type": "application/msgpack", + }, + timeout=None, + ) as response: + for chunk in response.iter_bytes(): + f.write(chunk) diff --git a/xinference/thirdparty/fish_speech/tools/post_api.py b/xinference/thirdparty/fish_speech/tools/post_api.py index 153893078e..c20dc455c3 100644 --- a/xinference/thirdparty/fish_speech/tools/post_api.py +++ b/xinference/thirdparty/fish_speech/tools/post_api.py @@ -1,40 +1,19 @@ import argparse import base64 -import json import wave -from pathlib import Path +import ormsgpack import pyaudio import requests +from pydub import AudioSegment +from pydub.playback import play +from tools.commons import ServeReferenceAudio, ServeTTSRequest +from tools.file import audio_to_bytes, read_ref_text -def wav_to_base64(file_path): - if not file_path or not Path(file_path).exists(): - return None - with open(file_path, "rb") as wav_file: - wav_content = wav_file.read() - base64_encoded = base64.b64encode(wav_content) - return base64_encoded.decode("utf-8") +def parse_args(): -def read_ref_text(ref_text): - path = Path(ref_text) - if path.exists() and path.is_file(): - with path.open("r", encoding="utf-8") as file: - return file.read() - return ref_text - - -def play_audio(audio_content, format, channels, rate): - p = pyaudio.PyAudio() - stream = p.open(format=format, channels=channels, rate=rate, output=True) - stream.write(audio_content) - stream.stop_stream() - stream.close() - p.terminate() - - -if __name__ == "__main__": parser = argparse.ArgumentParser( description="Send a WAV file and text to a server and receive synthesized audio." 
) @@ -43,16 +22,24 @@ def play_audio(audio_content, format, channels, rate): "--url", "-u", type=str, - default="http://127.0.0.1:8080/v1/invoke", + default="http://127.0.0.1:8080/v1/tts", help="URL of the server", ) parser.add_argument( "--text", "-t", type=str, required=True, help="Text to be synthesized" ) + parser.add_argument( + "--reference_id", + "-id", + type=str, + default=None, + help="ID of the reference model o be used for the speech", + ) parser.add_argument( "--reference_audio", "-ra", type=str, + nargs="+", default=None, help="Path to the WAV file", ) @@ -60,9 +47,30 @@ def play_audio(audio_content, format, channels, rate): "--reference_text", "-rt", type=str, + nargs="+", default=None, help="Reference text for voice synthesis", ) + parser.add_argument( + "--output", + "-o", + type=str, + default="generated_audio", + help="Output audio file name", + ) + parser.add_argument( + "--play", + type=bool, + default=True, + help="Whether to play audio after receiving data", + ) + parser.add_argument("--normalize", type=bool, default=True) + parser.add_argument( + "--format", type=str, choices=["wav", "mp3", "flac"], default="wav" + ) + parser.add_argument("--mp3_bitrate", type=int, default=64) + parser.add_argument("--opus_bitrate", type=int, default=-1000) + parser.add_argument("--latency", type=str, default="normal", help="延迟选项") parser.add_argument( "--max_new_tokens", type=int, @@ -88,7 +96,6 @@ def play_audio(audio_content, format, channels, rate): "--speaker", type=str, default=None, help="Speaker ID for voice synthesis" ) parser.add_argument("--emotion", type=str, default=None, help="Speaker's Emotion") - parser.add_argument("--format", type=str, default="wav", help="Audio format") parser.add_argument( "--streaming", type=bool, default=False, help="Enable streaming response" ) @@ -97,18 +104,42 @@ def play_audio(audio_content, format, channels, rate): ) parser.add_argument("--rate", type=int, default=44100, help="Sample rate for audio") - args = parser.parse_args() + return parser.parse_args() - base64_audio = wav_to_base64(args.reference_audio) - ref_text = args.reference_text - if ref_text: - ref_text = read_ref_text(ref_text) +if __name__ == "__main__": + + args = parse_args() + + idstr: str | None = args.reference_id + # priority: ref_id > [{text, audio},...] 
+ if idstr is None: + ref_audios = args.reference_audio + ref_texts = args.reference_text + if ref_audios is None: + byte_audios = [] + else: + byte_audios = [audio_to_bytes(ref_audio) for ref_audio in ref_audios] + if ref_texts is None: + ref_texts = [] + else: + ref_texts = [read_ref_text(ref_text) for ref_text in ref_texts] + else: + byte_audios = [] + ref_texts = [] + pass # in api.py data = { "text": args.text, - "reference_text": ref_text, - "reference_audio": base64_audio, + "references": [ + ServeReferenceAudio(audio=ref_audio, text=ref_text) + for ref_text, ref_audio in zip(ref_texts, byte_audios) + ], + "reference_id": idstr, + "normalize": args.normalize, + "format": args.format, + "mp3_bitrate": args.mp3_bitrate, + "opus_bitrate": args.opus_bitrate, "max_new_tokens": args.max_new_tokens, "chunk_length": args.chunk_length, "top_p": args.top_p, @@ -116,22 +147,30 @@ def play_audio(audio_content, format, channels, rate): "temperature": args.temperature, "speaker": args.speaker, "emotion": args.emotion, - "format": args.format, "streaming": args.streaming, } - response = requests.post(args.url, json=data, stream=args.streaming) + pydantic_data = ServeTTSRequest(**data) - audio_format = pyaudio.paInt16 # Assuming 16-bit PCM format + response = requests.post( + args.url, + data=ormsgpack.packb(pydantic_data, option=ormsgpack.OPT_SERIALIZE_PYDANTIC), + stream=args.streaming, + headers={ + "authorization": "Bearer YOUR_API_KEY", + "content-type": "application/msgpack", + }, + ) if response.status_code == 200: if args.streaming: p = pyaudio.PyAudio() + audio_format = pyaudio.paInt16 # Assuming 16-bit PCM format stream = p.open( format=audio_format, channels=args.channels, rate=args.rate, output=True ) - wf = wave.open("generated_audio.wav", "wb") + wf = wave.open(f"{args.output}.wav", "wb") wf.setnchannels(args.channels) wf.setsampwidth(p.get_sample_size(audio_format)) wf.setframerate(args.rate) @@ -153,12 +192,14 @@ def play_audio(audio_content, format, channels, rate): wf.close() else: audio_content = response.content - - with open("generated_audio.wav", "wb") as audio_file: + audio_path = f"{args.output}.{args.format}" + with open(audio_path, "wb") as audio_file: audio_file.write(audio_content) - play_audio(audio_content, audio_format, args.channels, args.rate) - print("Audio has been saved to 'generated_audio.wav'.") + audio = AudioSegment.from_file(audio_path, format=args.format) + if args.play: + play(audio) + print(f"Audio has been saved to '{audio_path}'.") else: print(f"Request failed with status code {response.status_code}") print(response.json()) diff --git a/xinference/thirdparty/fish_speech/tools/sensevoice/fun_asr.py b/xinference/thirdparty/fish_speech/tools/sensevoice/fun_asr.py index 02c15a5976..6789316d51 100644 --- a/xinference/thirdparty/fish_speech/tools/sensevoice/fun_asr.py +++ b/xinference/thirdparty/fish_speech/tools/sensevoice/fun_asr.py @@ -26,7 +26,7 @@ def uvr5_cli( output_folder: Path, audio_files: list[Path] | None = None, output_format: str = "flac", - model: str = "BS-Roformer-Viperx-1296.ckpt", + model: str = "BS-Roformer-Viperx-1297.ckpt", ): # ["BS-Roformer-Viperx-1297.ckpt", "BS-Roformer-Viperx-1296.ckpt", "BS-Roformer-Viperx-1053.ckpt", "Mel-Roformer-Viperx-1143.ckpt"] sepr = Separator( diff --git a/xinference/thirdparty/fish_speech/tools/smart_pad.py b/xinference/thirdparty/fish_speech/tools/smart_pad.py index 9772168f51..de9dc154f2 100644 --- a/xinference/thirdparty/fish_speech/tools/smart_pad.py +++ 
b/xinference/thirdparty/fish_speech/tools/smart_pad.py @@ -15,21 +15,34 @@ def process(file): waveform, sample_rate = torchaudio.load(str(file), backend="sox") + if waveform.size(0) > 1: + waveform = waveform.mean(dim=0, keepdim=True) + loudness = librosa.feature.rms( y=waveform.numpy().squeeze(), frame_length=2048, hop_length=512, center=True )[0] + for i in range(len(loudness) - 1, 0, -1): if loudness[i] > threshold: break - silent_time = (len(loudness) - i) * 512 / sample_rate + end_silent_time = (len(loudness) - i) * 512 / sample_rate - if silent_time <= 0.3: - random_time = random.uniform(0.3, 0.7) + if end_silent_time <= 0.3: + random_time = random.uniform(0.3, 0.7) - end_silent_time waveform = F.pad( waveform, (0, int(random_time * sample_rate)), mode="constant", value=0 ) + for i in range(len(loudness)): + if loudness[i] > threshold: + break + + start_silent_time = i * 512 / sample_rate + + if start_silent_time > 0.02: + waveform = waveform[:, int((start_silent_time - 0.02) * sample_rate) :] + torchaudio.save(uri=str(file), src=waveform, sample_rate=sample_rate) diff --git a/xinference/thirdparty/fish_speech/tools/vqgan/extract_vq.py b/xinference/thirdparty/fish_speech/tools/vqgan/extract_vq.py index bc6bc40830..c24eb3f46a 100644 --- a/xinference/thirdparty/fish_speech/tools/vqgan/extract_vq.py +++ b/xinference/thirdparty/fish_speech/tools/vqgan/extract_vq.py @@ -42,7 +42,7 @@ @lru_cache(maxsize=1) def get_model( config_name: str = "firefly_gan_vq", - checkpoint_path: str = "checkpoints/fish-speech-1.2-sft/firefly-gan-vq-fsq-4x1024-42hz-generator.pth", + checkpoint_path: str = "checkpoints/fish-speech-1.4/firefly-gan-vq-fsq-8x1024-21hz-generator.pth", device: str | torch.device = "cuda", ): with initialize(version_base="1.3", config_path="../../fish_speech/configs"): @@ -133,7 +133,7 @@ def process_batch(files: list[Path], model) -> float: @click.option("--config-name", default="firefly_gan_vq") @click.option( "--checkpoint-path", - default="checkpoints/fish-speech-1.2-sft/firefly-gan-vq-fsq-4x1024-42hz-generator.pth", + default="checkpoints/fish-speech-1.4/firefly-gan-vq-fsq-8x1024-21hz-generator.pth", ) @click.option("--batch-size", default=64) @click.option("--filelist", default=None, type=Path) diff --git a/xinference/thirdparty/fish_speech/tools/vqgan/inference.py b/xinference/thirdparty/fish_speech/tools/vqgan/inference.py index 17c9034d7b..b6bc7531c4 100644 --- a/xinference/thirdparty/fish_speech/tools/vqgan/inference.py +++ b/xinference/thirdparty/fish_speech/tools/vqgan/inference.py @@ -59,7 +59,7 @@ def load_model(config_name, checkpoint_path, device="cuda"): @click.option("--config-name", default="firefly_gan_vq") @click.option( "--checkpoint-path", - default="checkpoints/fish-speech-1.2-sft/firefly-gan-vq-fsq-4x1024-42hz-generator.pth", + default="checkpoints/fish-speech-1.4/firefly-gan-vq-fsq-8x1024-21hz-generator.pth", ) @click.option( "--device", @@ -103,7 +103,9 @@ def main(input_path, output_path, config_name, checkpoint_path, device): # Restore feature_lengths = torch.tensor([indices.shape[1]], device=device) - fake_audios = model.decode(indices=indices[None], feature_lengths=feature_lengths) + fake_audios, _ = model.decode( + indices=indices[None], feature_lengths=feature_lengths + ) audio_time = fake_audios.shape[-1] / model.spec_transform.sample_rate logger.info( diff --git a/xinference/thirdparty/fish_speech/tools/webui.py b/xinference/thirdparty/fish_speech/tools/webui.py index f64ff923b0..a52f548cc9 100644 --- 
a/xinference/thirdparty/fish_speech/tools/webui.py +++ b/xinference/thirdparty/fish_speech/tools/webui.py @@ -23,7 +23,6 @@ from fish_speech.text.chn_text_norm.text import Text as ChnNormedText from fish_speech.utils import autocast_exclude_mps from tools.api import decode_vq_tokens, encode_reference -from tools.auto_rerank import batch_asr, calculate_wer, is_chinese, load_model from tools.llama.generate import ( GenerateRequest, GenerateResponse, @@ -40,9 +39,9 @@ {i18n("A text-to-speech model based on VQ-GAN and Llama developed by [Fish Audio](https://fish.audio).")} -{i18n("You can find the source code [here](https://github.com/fishaudio/fish-speech) and models [here](https://huggingface.co/fishaudio/fish-speech-1).")} +{i18n("You can find the source code [here](https://github.com/fishaudio/fish-speech) and models [here](https://huggingface.co/fishaudio/fish-speech-1.4).")} -{i18n("Related code are released under BSD-3-Clause License, and weights are released under CC BY-NC-SA 4.0 License.")} +{i18n("Related code and weights are released under CC BY-NC-SA 4.0 License.")} {i18n("We are not responsible for any misuse of the model, please consider your local laws and regulations before using it.")} """ @@ -160,66 +159,6 @@ def inference( gc.collect() -def inference_with_auto_rerank( - text, - enable_reference_audio, - reference_audio, - reference_text, - max_new_tokens, - chunk_length, - top_p, - repetition_penalty, - temperature, - use_auto_rerank, - streaming=False, -): - - max_attempts = 2 if use_auto_rerank else 1 - best_wer = float("inf") - best_audio = None - best_sample_rate = None - - for attempt in range(max_attempts): - audio_generator = inference( - text, - enable_reference_audio, - reference_audio, - reference_text, - max_new_tokens, - chunk_length, - top_p, - repetition_penalty, - temperature, - streaming=False, - ) - - # 获取音频数据 - for _ in audio_generator: - pass - _, (sample_rate, audio), message = _ - - if audio is None: - return None, None, message - - if not use_auto_rerank: - return None, (sample_rate, audio), None - - asr_result = batch_asr(asr_model, [audio], sample_rate)[0] - wer = calculate_wer(text, asr_result["text"]) - if wer <= 0.3 and not asr_result["huge_gap"]: - return None, (sample_rate, audio), None - - if wer < best_wer: - best_wer = wer - best_audio = audio - best_sample_rate = sample_rate - - if attempt == max_attempts - 1: - break - - return None, (best_sample_rate, best_audio), None - - inference_stream = partial(inference, streaming=True) n_audios = 4 @@ -239,13 +178,12 @@ def inference_wrapper( repetition_penalty, temperature, batch_infer_num, - if_load_asr_model, ): audios = [] errors = [] for _ in range(batch_infer_num): - result = inference_with_auto_rerank( + result = inference( text, enable_reference_audio, reference_audio, @@ -255,10 +193,9 @@ def inference_wrapper( top_p, repetition_penalty, temperature, - if_load_asr_model, ) - _, audio_data, error_message = result + _, audio_data, error_message = next(result) audios.append( gr.Audio(value=audio_data if audio_data else None, visible=True), @@ -301,42 +238,6 @@ def normalize_text(user_input, use_normalization): asr_model = None -def change_if_load_asr_model(if_load): - global asr_model - - if if_load: - gr.Warning("Loading faster whisper model...") - if asr_model is None: - asr_model = load_model() - return gr.Checkbox(label="Unload faster whisper model", value=if_load) - - if if_load is False: - gr.Warning("Unloading faster whisper model...") - del asr_model - asr_model = None - if 
torch.cuda.is_available(): - torch.cuda.empty_cache() - gc.collect() - return gr.Checkbox(label="Load faster whisper model", value=if_load) - - -def change_if_auto_label(if_load, if_auto_label, enable_ref, ref_audio, ref_text): - if if_load and asr_model is not None: - if ( - if_auto_label - and enable_ref - and ref_audio is not None - and ref_text.strip() == "" - ): - data, sample_rate = librosa.load(ref_audio) - res = batch_asr(asr_model, [data], sample_rate)[0] - ref_text = res["text"] - else: - gr.Warning("Whisper model not loaded!") - - return gr.Textbox(value=ref_text) - - def build_app(): with gr.Blocks(theme=gr.themes.Base()) as app: gr.Markdown(HEADER_MD) @@ -367,23 +268,17 @@ def build_app(): with gr.Row(): if_refine_text = gr.Checkbox( label=i18n("Text Normalization"), - value=True, - scale=1, - ) - - if_load_asr_model = gr.Checkbox( - label=i18n("Load / Unload ASR model for auto-reranking"), value=False, - scale=3, + scale=1, ) with gr.Row(): with gr.Tab(label=i18n("Advanced Config")): chunk_length = gr.Slider( label=i18n("Iterative Prompt Length, 0 means off"), - minimum=0, - maximum=500, - value=100, + minimum=50, + maximum=300, + value=200, step=8, ) @@ -434,12 +329,6 @@ def build_app(): type="filepath", ) with gr.Row(): - if_auto_label = gr.Checkbox( - label=i18n("Auto Labeling"), - min_width=100, - scale=0, - value=False, - ) reference_text = gr.Textbox( label=i18n("Reference Text"), lines=1, @@ -494,28 +383,6 @@ def build_app(): fn=normalize_text, inputs=[text, if_refine_text], outputs=[refined_text] ) - if_load_asr_model.change( - fn=change_if_load_asr_model, - inputs=[if_load_asr_model], - outputs=[if_load_asr_model], - ) - - if_auto_label.change( - fn=lambda: gr.Textbox(value=""), - inputs=[], - outputs=[reference_text], - ).then( - fn=change_if_auto_label, - inputs=[ - if_load_asr_model, - if_auto_label, - enable_reference_audio, - reference_audio, - reference_text, - ], - outputs=[reference_text], - ) - # # Submit generate.click( inference_wrapper, @@ -530,7 +397,6 @@ def build_app(): repetition_penalty, temperature, batch_infer_num, - if_load_asr_model, ], [stream_audio, *global_audio_list, *global_error_list], concurrency_limit=1, @@ -560,12 +426,12 @@ def parse_args(): parser.add_argument( "--llama-checkpoint-path", type=Path, - default="checkpoints/fish-speech-1.2-sft", + default="checkpoints/fish-speech-1.4", ) parser.add_argument( "--decoder-checkpoint-path", type=Path, - default="checkpoints/fish-speech-1.2-sft/firefly-gan-vq-fsq-4x1024-42hz-generator.pth", + default="checkpoints/fish-speech-1.4/firefly-gan-vq-fsq-8x1024-21hz-generator.pth", ) parser.add_argument("--decoder-config-name", type=str, default="firefly_gan_vq") parser.add_argument("--device", type=str, default="cuda") @@ -605,8 +471,8 @@ def parse_args(): enable_reference_audio=False, reference_audio=None, reference_text="", - max_new_tokens=0, - chunk_length=100, + max_new_tokens=1024, + chunk_length=200, top_p=0.7, repetition_penalty=1.2, temperature=0.7, From 42d9c340c4ae084c71b215c88d30c2c9d636508e Mon Sep 17 00:00:00 2001 From: Xuye Qin Date: Fri, 13 Sep 2024 12:16:59 +0800 Subject: [PATCH 04/17] FEAT: support sdapi/img2img (#2293) --- xinference/api/restful_api.py | 67 ++++++++++++++++++++++++++- xinference/core/model.py | 14 ++++++ xinference/model/image/sdapi.py | 80 +++++++++++++++++++++++++++------ xinference/types.py | 2 +- 4 files changed, 147 insertions(+), 16 deletions(-) diff --git a/xinference/api/restful_api.py b/xinference/api/restful_api.py index 15b5cc52d4..d12273ba13 100644 --- 
a/xinference/api/restful_api.py +++ b/xinference/api/restful_api.py @@ -63,7 +63,7 @@ CreateCompletion, ImageList, PeftModelConfig, - SDAPITxt2imgResult, + SDAPIResult, VideoList, max_tokens_field, ) @@ -138,6 +138,24 @@ class SDAPITxt2imgRequst(BaseModel): width: Optional[int] = 512 height: Optional[int] = 512 sampler_name: Optional[str] = None + denoising_strength: Optional[float] = None + kwargs: Optional[str] = None + user: Optional[str] = None + + +class SDAPIImg2imgRequst(BaseModel): + model: Optional[str] + init_images: Optional[list] + prompt: Optional[str] = "" + negative_prompt: Optional[str] = "" + steps: Optional[int] = None + seed: Optional[int] = -1 + cfg_scale: Optional[float] = 7.0 + override_settings: Optional[dict] = {} + width: Optional[int] = 512 + height: Optional[int] = 512 + sampler_name: Optional[str] = None + denoising_strength: Optional[float] = None kwargs: Optional[str] = None user: Optional[str] = None @@ -574,7 +592,18 @@ async def internal_exception_handler(request: Request, exc: Exception): "/sdapi/v1/txt2img", self.sdapi_txt2img, methods=["POST"], - response_model=SDAPITxt2imgResult, + response_model=SDAPIResult, + dependencies=( + [Security(self._auth_service, scopes=["models:read"])] + if self.is_authenticated() + else None + ), + ) + self._router.add_api_route( + "/sdapi/v1/img2img", + self.sdapi_img2img, + methods=["POST"], + response_model=SDAPIResult, dependencies=( [Security(self._auth_service, scopes=["models:read"])] if self.is_authenticated() @@ -1569,6 +1598,40 @@ async def sdapi_txt2img(self, request: Request) -> Response: await self._report_error_event(model_uid, str(e)) raise HTTPException(status_code=500, detail=str(e)) + async def sdapi_img2img(self, request: Request) -> Response: + body = SDAPIImg2imgRequst.parse_obj(await request.json()) + model_uid = body.model or body.override_settings.get("sd_model_checkpoint") + + try: + if not model_uid: + raise ValueError("Unknown model") + model = await (await self._get_supervisor_ref()).get_model(model_uid) + except ValueError as ve: + logger.error(str(ve), exc_info=True) + await self._report_error_event(model_uid, str(ve)) + raise HTTPException(status_code=400, detail=str(ve)) + except Exception as e: + logger.error(e, exc_info=True) + await self._report_error_event(model_uid, str(e)) + raise HTTPException(status_code=500, detail=str(e)) + + try: + kwargs = dict(body) + kwargs.update(json.loads(body.kwargs) if body.kwargs else {}) + image_list = await model.img2img( + **kwargs, + ) + return Response(content=image_list, media_type="application/json") + except RuntimeError as re: + logger.error(re, exc_info=True) + await self._report_error_event(model_uid, str(re)) + self.handle_request_limit_error(re) + raise HTTPException(status_code=400, detail=str(re)) + except Exception as e: + logger.error(e, exc_info=True) + await self._report_error_event(model_uid, str(e)) + raise HTTPException(status_code=500, detail=str(e)) + async def create_variations( self, model: str = Form(...), diff --git a/xinference/core/model.py b/xinference/core/model.py index 327582163c..1f711fb117 100644 --- a/xinference/core/model.py +++ b/xinference/core/model.py @@ -793,6 +793,20 @@ async def image_to_image( f"Model {self._model.model_spec} is not for creating image." 
) + @request_limit + @log_async(logger=logger) + async def img2img( + self, + **kwargs, + ): + kwargs.pop("request_id", None) + if hasattr(self._model, "img2img"): + return await self._call_wrapper_json( + self._model.img2img, + **kwargs, + ) + raise AttributeError(f"Model {self._model.model_spec} is not for img2img.") + @log_async( logger=logger, ignore_kwargs=["image"], diff --git a/xinference/model/image/sdapi.py b/xinference/model/image/sdapi.py index 10337b114d..b3af166299 100644 --- a/xinference/model/image/sdapi.py +++ b/xinference/model/image/sdapi.py @@ -11,30 +11,48 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. - +import base64 +import io import warnings +from PIL import Image + class SDAPIToDiffusersConverter: - txt2img_identical_args = [ + txt2img_identical_args = { "prompt", "negative_prompt", "seed", "width", "height", "sampler_name", - ] + } txt2img_arg_mapping = { "steps": "num_inference_steps", "cfg_scale": "guidance_scale", + "denoising_strength": "strength", + } + img2img_identical_args = { + "prompt", + "negative_prompt", + "seed", + "width", + "height", + "sampler_name", + } + img2img_arg_mapping = { + "init_images": "image", + "steps": "num_inference_steps", + "cfg_scale": "guidance_scale", + "denoising_strength": "strength", } @staticmethod - def convert_txt2img_to_diffusers(params: dict) -> dict: + def convert_to_diffusers(sd_type: str, params: dict) -> dict: diffusers_params = {} - identical_args = set(SDAPIToDiffusersConverter.txt2img_identical_args) - mapping_args = SDAPIToDiffusersConverter.txt2img_arg_mapping + identical_args = getattr(SDAPIToDiffusersConverter, f"{sd_type}_identical_args") + mapping_args = getattr(SDAPIToDiffusersConverter, f"{sd_type}_arg_mapping") for param, value in params.items(): if param in identical_args: diffusers_params[param] = value @@ -45,13 +63,17 @@ def convert_txt2img_to_diffusers(params: dict) -> dict: return diffusers_params + @staticmethod + def get_available_args(sd_type: str) -> set: + identical_args = getattr(SDAPIToDiffusersConverter, f"{sd_type}_identical_args") + mapping_args = getattr(SDAPIToDiffusersConverter, f"{sd_type}_arg_mapping") + return identical_args.union(mapping_args) + class SDAPIDiffusionModelMixin: - def txt2img(self, **kwargs): - available_args = set( - SDAPIToDiffusersConverter.txt2img_identical_args - + list(SDAPIToDiffusersConverter.txt2img_arg_mapping) - ) + @staticmethod + def _check_kwargs(sd_type: str, kwargs: dict): + available_args = SDAPIToDiffusersConverter.get_available_args(sd_type) unknown_args = [] available_kwargs = {} for arg, value in kwargs.items(): @@ -64,14 +86,20 @@ def txt2img(self, **kwargs): f"Some args are not supported for now and will be ignored: {unknown_args}" ) - converted_kwargs = SDAPIToDiffusersConverter.convert_txt2img_to_diffusers( - available_kwargs + converted_kwargs = SDAPIToDiffusersConverter.convert_to_diffusers( + sd_type, available_kwargs ) + width, height = converted_kwargs.pop("width", None), converted_kwargs.pop( "height", None ) if width and height: converted_kwargs["size"] = f"{width}*{height}" + + return converted_kwargs + + def txt2img(self, **kwargs): + converted_kwargs = self._check_kwargs("txt2img", kwargs) result = self.text_to_image(response_format="b64_json", **converted_kwargs) # type: ignore # convert to SD API result @@ -80,3 +108,29 @@ def txt2img(self, **kwargs): "info": {"created": result["created"]}, 
"parameters": {}, } + + @staticmethod + def _decode_b64_img(img_str: str) -> Image: + # img_str in a format: "data:image/png;base64," + raw_b64_img(image) + f, data = img_str.split(",", 1) + f, encode_type = f.split(";", 1) + assert encode_type == "base64" + f = f.split("/", 1)[1] + b = base64.b64decode(data) + return Image.open(io.BytesIO(b), formats=[f]) + + def img2img(self, **kwargs): + init_images = kwargs.pop("init_images", []) + kwargs["init_images"] = [self._decode_b64_img(i) for i in init_images] + clip_skip = kwargs.get("override_settings", {}).get("clip_skip") + converted_kwargs = self._check_kwargs("img2img", kwargs) + if clip_skip: + converted_kwargs["clip_skip"] = clip_skip + result = self.image_to_image(response_format="b64_json", **converted_kwargs) # type: ignore + + # convert to SD API result + return { + "images": [r["b64_json"] for r in result["data"]], + "info": {"created": result["created"]}, + "parameters": {}, + } diff --git a/xinference/types.py b/xinference/types.py index 31c0c28635..613d8709bb 100644 --- a/xinference/types.py +++ b/xinference/types.py @@ -47,7 +47,7 @@ class ImageList(TypedDict): data: List[Image] -class SDAPITxt2imgResult(TypedDict): +class SDAPIResult(TypedDict): images: List[str] parameters: dict info: dict From a9380becb24eec1e747a83cf7319a895c5dc3e71 Mon Sep 17 00:00:00 2001 From: Xuye Qin Date: Fri, 13 Sep 2024 15:24:52 +0800 Subject: [PATCH 05/17] FEAT: support flux.1 image2image and inpainting (#2296) --- .github/workflows/python.yaml | 1 + xinference/model/image/model_spec.json | 8 ++++++-- .../model/image/model_spec_modelscope.json | 8 ++++++-- .../model/image/stable_diffusion/core.py | 20 ++++++++++++++++--- 4 files changed, 30 insertions(+), 7 deletions(-) diff --git a/.github/workflows/python.yaml b/.github/workflows/python.yaml index 5be70aa4a0..5c75b2814c 100644 --- a/.github/workflows/python.yaml +++ b/.github/workflows/python.yaml @@ -171,6 +171,7 @@ jobs: ${{ env.SELF_HOST_PYTHON }} -m pip install -U "loguru" ${{ env.SELF_HOST_PYTHON }} -m pip install -U "natsort" ${{ env.SELF_HOST_PYTHON }} -m pip install -U "loralib" + ${{ env.SELF_HOST_PYTHON }} -m pip install -U "ormsgpack" ${{ env.SELF_HOST_PYTHON }} -m pip uninstall -y opencc ${{ env.SELF_HOST_PYTHON }} -m pip uninstall -y "faster_whisper" ${{ env.SELF_HOST_PYTHON }} -m pytest --timeout=1500 \ diff --git a/xinference/model/image/model_spec.json b/xinference/model/image/model_spec.json index 891e9d5765..04386dd2e5 100644 --- a/xinference/model/image/model_spec.json +++ b/xinference/model/image/model_spec.json @@ -5,7 +5,9 @@ "model_id": "black-forest-labs/FLUX.1-schnell", "model_revision": "768d12a373ed5cc9ef9a9dea7504dc09fcc14842", "model_ability": [ - "text2image" + "text2image", + "image2image", + "inpainting" ] }, { @@ -14,7 +16,9 @@ "model_id": "black-forest-labs/FLUX.1-dev", "model_revision": "01aa605f2c300568dd6515476f04565a954fcb59", "model_ability": [ - "text2image" + "text2image", + "image2image", + "inpainting" ] }, { diff --git a/xinference/model/image/model_spec_modelscope.json b/xinference/model/image/model_spec_modelscope.json index bbc5d57010..b39bfc543d 100644 --- a/xinference/model/image/model_spec_modelscope.json +++ b/xinference/model/image/model_spec_modelscope.json @@ -6,7 +6,9 @@ "model_id": "AI-ModelScope/FLUX.1-schnell", "model_revision": "master", "model_ability": [ - "text2image" + "text2image", + "image2image", + "inpainting" ] }, { @@ -16,7 +18,9 @@ "model_id": "AI-ModelScope/FLUX.1-dev", "model_revision": "master", "model_ability": [ - 
"text2image" + "text2image", + "image2image", + "inpainting" ] }, { diff --git a/xinference/model/image/stable_diffusion/core.py b/xinference/model/image/stable_diffusion/core.py index 5344e62de2..eed9739b2c 100644 --- a/xinference/model/image/stable_diffusion/core.py +++ b/xinference/model/image/stable_diffusion/core.py @@ -14,6 +14,7 @@ import base64 import contextlib +import inspect import logging import os import re @@ -408,12 +409,24 @@ def image_to_image( width, height = image.size kwargs["width"] = width kwargs["height"] = height - + else: + # SD3 image2image cannot accept width and height + parameters = inspect.signature(model.__call__).parameters # type: ignore + allow_width_height = False + for param in parameters.values(): + if param.kind == inspect.Parameter.VAR_KEYWORD: + allow_width_height = True + break + if "width" in parameters or "height" in parameters: + allow_width_height = True + if allow_width_height: + kwargs["width"], kwargs["height"] = image.size + + kwargs["negative_prompt"] = negative_prompt self._filter_kwargs(kwargs) return self._call_model( image=image, prompt=prompt, - negative_prompt=negative_prompt, num_images_per_prompt=n, response_format=response_format, model=model, @@ -463,11 +476,12 @@ def inpainting( # calculate actual image size after padding width, height = image.size + kwargs["negative_prompt"] = negative_prompt + self._filter_kwargs(kwargs) return self._call_model( image=image, mask_image=mask_image, prompt=prompt, - negative_prompt=negative_prompt, height=height, width=width, num_images_per_prompt=n, From b7c70229886ab06f4e8d0d58ddeb91093f7801bd Mon Sep 17 00:00:00 2001 From: Jun-Howie <62869005+Jun-Howie@users.noreply.github.com> Date: Fri, 13 Sep 2024 22:22:37 +0800 Subject: [PATCH 06/17] FEAT: Support yi-coder-chat (#2302) Co-authored-by: JunHowie --- xinference/model/llm/llm_family.json | 77 ++++++++++++++++++ .../model/llm/llm_family_modelscope.json | 81 +++++++++++++++++++ xinference/model/llm/vllm/core.py | 2 + 3 files changed, 160 insertions(+) diff --git a/xinference/model/llm/llm_family.json b/xinference/model/llm/llm_family.json index e997098e65..1dfeca1fb4 100644 --- a/xinference/model/llm/llm_family.json +++ b/xinference/model/llm/llm_family.json @@ -7093,5 +7093,82 @@ "stop": [ "<|end▁of▁sentence|>" ] + }, + { + "version": 1, + "context_length": 131072, + "model_name": "yi-coder-chat", + "model_lang": [ + "en" + ], + "model_ability": [ + "chat" + ], + "model_description": "Yi-Coder is a series of open-source code language models that delivers state-of-the-art coding performance with fewer than 10 billion parameters.Excelling in long-context understanding with a maximum context length of 128K tokens.Supporting 52 major programming languages, including popular ones such as Java, Python, JavaScript, and C++.", + "model_specs": [ + { + "model_format": "pytorch", + "model_size_in_billions": 9, + "quantizations": [ + "none" + ], + "model_id": "01ai/Yi-Coder-9B-Chat", + "model_revision": "356a1f8d4e4a606d0b879e54191ca809918576b8" + }, + { + "model_format": "pytorch", + "model_size_in_billions": "1_5", + "quantizations": [ + "none" + ], + "model_id": "01ai/Yi-Coder-1.5B-Chat", + "model_revision": "92fdd1b2f1539ac990e7f4a921db5601da2f0299" + } + ], + "chat_template": "{% if messages[0]['role'] == 'system' %}{% set system_message = messages[0]['content'] %}{% endif %}{% if system_message is defined %}{{ '<|im_start|>system\n' + system_message + '<|im_end|>\n' }}{% endif %}{% for message in messages %}{% set content = message['content'] %}{% 
if message['role'] == 'user' %}{{ '<|im_start|>user\n' + content + '<|im_end|>\n<|im_start|>assistant\n' }}{% elif message['role'] == 'assistant' %}{{ content + '<|im_end|>' + '\n' }}{% endif %}{% endfor %}", + "stop_token_ids": [ + 1, + 2, + 6, + 7 + ], + "stop": [ + "<|startoftext|>", + "<|endoftext|>", + "<|im_start|>", + "<|im_end|>" + ] + }, + { + "version": 1, + "context_length": 131072, + "model_name": "yi-coder", + "model_lang": [ + "en" + ], + "model_ability": [ + "generate" + ], + "model_description": "Yi-Coder is a series of open-source code language models that delivers state-of-the-art coding performance with fewer than 10 billion parameters.Excelling in long-context understanding with a maximum context length of 128K tokens.Supporting 52 major programming languages, including popular ones such as Java, Python, JavaScript, and C++.", + "model_specs": [ + { + "model_format": "pytorch", + "model_size_in_billions": 9, + "quantizations": [ + "none" + ], + "model_id": "01-ai/Yi-Coder-9B", + "model_revision": "e20f8087a9507ac8bce409dc5db5d0c608124238" + }, + { + "model_format": "pytorch", + "model_size_in_billions": "1_5", + "quantizations": [ + "none" + ], + "model_id": "01-ai/Yi-Coder-1.5B", + "model_revision": "00e59e64f47d3c78e4cfbdd345888479797e8109" + } + ] } ] diff --git a/xinference/model/llm/llm_family_modelscope.json b/xinference/model/llm/llm_family_modelscope.json index f4386e85fa..b7b0da1b13 100644 --- a/xinference/model/llm/llm_family_modelscope.json +++ b/xinference/model/llm/llm_family_modelscope.json @@ -4808,5 +4808,86 @@ "stop": [ "<|end▁of▁sentence|>" ] + }, + { + "version": 1, + "context_length": 131072, + "model_name": "yi-coder-chat", + "model_lang": [ + "en" + ], + "model_ability": [ + "chat" + ], + "model_description": "Yi-Coder is a series of open-source code language models that delivers state-of-the-art coding performance with fewer than 10 billion parameters.Excelling in long-context understanding with a maximum context length of 128K tokens.Supporting 52 major programming languages, including popular ones such as Java, Python, JavaScript, and C++.", + "model_specs": [ + { + "model_format": "pytorch", + "model_size_in_billions": 9, + "quantizations": [ + "none" + ], + "model_hub": "modelscope", + "model_id": "01ai/Yi-Coder-9B-Chat", + "model_revision": "master" + }, + { + "model_format": "pytorch", + "model_size_in_billions": "1_5", + "quantizations": [ + "none" + ], + "model_hub": "modelscope", + "model_id": "01ai/Yi-Coder-1.5B-Chat", + "model_revision": "master" + } + ], + "chat_template": "{% if messages[0]['role'] == 'system' %}{% set system_message = messages[0]['content'] %}{% endif %}{% if system_message is defined %}{{ '<|im_start|>system\n' + system_message + '<|im_end|>\n' }}{% endif %}{% for message in messages %}{% set content = message['content'] %}{% if message['role'] == 'user' %}{{ '<|im_start|>user\n' + content + '<|im_end|>\n<|im_start|>assistant\n' }}{% elif message['role'] == 'assistant' %}{{ content + '<|im_end|>' + '\n' }}{% endif %}{% endfor %}", + "stop_token_ids": [ + 1, + 2, + 6, + 7 + ], + "stop": [ + "<|startoftext|>", + "<|endoftext|>", + "<|im_start|>", + "<|im_end|>" + ] + }, + { + "version": 1, + "context_length": 131072, + "model_name": "yi-coder", + "model_lang": [ + "en" + ], + "model_ability": [ + "generate" + ], + "model_description": "Yi-Coder is a series of open-source code language models that delivers state-of-the-art coding performance with fewer than 10 billion parameters.Excelling in long-context understanding 
with a maximum context length of 128K tokens.Supporting 52 major programming languages, including popular ones such as Java, Python, JavaScript, and C++.", + "model_specs": [ + { + "model_format": "pytorch", + "model_size_in_billions": 9, + "quantizations": [ + "none" + ], + "model_hub": "modelscope", + "model_id": "01ai/Yi-Coder-9B", + "model_revision": "master" + }, + { + "model_format": "pytorch", + "model_size_in_billions": "1_5", + "quantizations": [ + "none" + ], + "model_hub": "modelscope", + "model_id": "01ai/Yi-Coder-1.5B", + "model_revision": "master" + } + ] } ] diff --git a/xinference/model/llm/vllm/core.py b/xinference/model/llm/vllm/core.py index e531769a18..811fd5d342 100644 --- a/xinference/model/llm/vllm/core.py +++ b/xinference/model/llm/vllm/core.py @@ -104,6 +104,7 @@ class VLLMGenerateConfig(TypedDict, total=False): "code-llama-python", "deepseek", "deepseek-coder", + "yi-coder", ] VLLM_SUPPORTED_CHAT_MODELS = [ "llama-2-chat", @@ -130,6 +131,7 @@ class VLLMGenerateConfig(TypedDict, total=False): "codegeex4", "deepseek-chat", "deepseek-coder-instruct", + "yi-coder-chat", ] if VLLM_INSTALLED and vllm.__version__ >= "0.3.0": VLLM_SUPPORTED_CHAT_MODELS.append("qwen1.5-chat") From 26666356f80f61d21cca80389f7fc47ea9c1caa7 Mon Sep 17 00:00:00 2001 From: Xuye Qin Date: Sat, 14 Sep 2024 12:27:04 +0800 Subject: [PATCH 07/17] BUG: fix sampler_name for img2img (#2301) --- xinference/model/image/sdapi.py | 2 +- .../model/image/stable_diffusion/core.py | 69 +++++++++---------- 2 files changed, 32 insertions(+), 39 deletions(-) diff --git a/xinference/model/image/sdapi.py b/xinference/model/image/sdapi.py index b3af166299..6ef21d48ab 100644 --- a/xinference/model/image/sdapi.py +++ b/xinference/model/image/sdapi.py @@ -30,7 +30,7 @@ class SDAPIToDiffusersConverter: txt2img_arg_mapping = { "steps": "num_inference_steps", "cfg_scale": "guidance_scale", - "denoising_strength": "strength", + # "denoising_strength": "strength", } img2img_identical_args = { "prompt", diff --git a/xinference/model/image/stable_diffusion/core.py b/xinference/model/image/stable_diffusion/core.py index eed9739b2c..5a7e99fe33 100644 --- a/xinference/model/image/stable_diffusion/core.py +++ b/xinference/model/image/stable_diffusion/core.py @@ -24,7 +24,7 @@ from concurrent.futures import ThreadPoolExecutor from functools import partial from io import BytesIO -from typing import TYPE_CHECKING, Dict, List, Optional, Union +from typing import TYPE_CHECKING, Any, Dict, List, Optional, Union import PIL.Image import torch @@ -168,7 +168,9 @@ def load(self): self._kwargs[text_encoder_name] = text_encoder self._kwargs["device_map"] = "balanced" - logger.debug("Loading model %s", AutoPipelineModel) + logger.debug( + "Loading model from %s, kwargs: %s", self._model_path, self._kwargs + ) self._model = AutoPipelineModel.from_pretrained( self._model_path, **self._kwargs, @@ -183,11 +185,12 @@ def load(self): self._model.enable_attention_slicing() self._apply_lora() - def _get_scheduler(self, sampler_name: str): + @staticmethod + def _get_scheduler(model: Any, sampler_name: str): if not sampler_name: return - assert self._model is not None + assert model is not None import diffusers @@ -195,80 +198,73 @@ def _get_scheduler(self, sampler_name: str): # to get A1111 <> Diffusers Scheduler mapping if sampler_name == "DPM++ 2M": return diffusers.DPMSolverMultistepScheduler.from_config( - self._model.scheduler.config + model.scheduler.config ) elif sampler_name == "DPM++ 2M Karras": return 
diffusers.DPMSolverMultistepScheduler.from_config( - self._model.scheduler.config, use_karras_sigmas=True + model.scheduler.config, use_karras_sigmas=True ) elif sampler_name == "DPM++ 2M SDE": return diffusers.DPMSolverMultistepScheduler.from_config( - self._model.scheduler.config, algorithm_type="sde-dpmsolver++" + model.scheduler.config, algorithm_type="sde-dpmsolver++" ) elif sampler_name == "DPM++ 2M SDE Karras": return diffusers.DPMSolverMultistepScheduler.from_config( - self._model.scheduler.config, + model.scheduler.config, algorithm_type="sde-dpmsolver++", use_karras_sigmas=True, ) elif sampler_name == "DPM++ SDE": return diffusers.DPMSolverSinglestepScheduler.from_config( - self._model.scheduler.config + model.scheduler.config ) elif sampler_name == "DPM++ SDE Karras": return diffusers.DPMSolverSinglestepScheduler.from_config( - self._model.scheduler.config, use_karras_sigmas=True + model.scheduler.config, use_karras_sigmas=True ) elif sampler_name == "DPM2": - return diffusers.KDPM2DiscreteScheduler.from_config( - self._model.scheduler.config - ) + return diffusers.KDPM2DiscreteScheduler.from_config(model.scheduler.config) elif sampler_name == "DPM2 Karras": return diffusers.KDPM2DiscreteScheduler.from_config( - self._model.scheduler.config, use_karras_sigmas=True + model.scheduler.config, use_karras_sigmas=True ) elif sampler_name == "DPM2 a": return diffusers.KDPM2AncestralDiscreteScheduler.from_config( - self._model.scheduler.config + model.scheduler.config ) elif sampler_name == "DPM2 a Karras": return diffusers.KDPM2AncestralDiscreteScheduler.from_config( - self._model.scheduler.config, use_karras_sigmas=True + model.scheduler.config, use_karras_sigmas=True ) elif sampler_name == "Euler": - return diffusers.EulerDiscreteScheduler.from_config( - self._model.scheduler.config - ) + return diffusers.EulerDiscreteScheduler.from_config(model.scheduler.config) elif sampler_name == "Euler a": return diffusers.EulerAncestralDiscreteScheduler.from_config( - self._model.scheduler.config + model.scheduler.config ) elif sampler_name == "Heun": - return diffusers.HeunDiscreteScheduler.from_config( - self._model.scheduler.config - ) + return diffusers.HeunDiscreteScheduler.from_config(model.scheduler.config) elif sampler_name == "LMS": - return diffusers.LMSDiscreteScheduler.from_config( - self._model.scheduler.config - ) + return diffusers.LMSDiscreteScheduler.from_config(model.scheduler.config) elif sampler_name == "LMS Karras": return diffusers.LMSDiscreteScheduler.from_config( - self._model.scheduler.config, use_karras_sigmas=True + model.scheduler.config, use_karras_sigmas=True ) else: raise ValueError(f"Unknown sampler: {sampler_name}") + @staticmethod @contextlib.contextmanager - def _reset_when_done(self, sampler_name: str): - assert self._model is not None - scheduler = self._get_scheduler(sampler_name) + def _reset_when_done(model: Any, sampler_name: str): + assert model is not None + scheduler = DiffusionModel._get_scheduler(model, sampler_name) if scheduler: - default_scheduler = self._model.scheduler - self._model.scheduler = scheduler + default_scheduler = model.scheduler + model.scheduler = scheduler try: yield finally: - self._model.scheduler = default_scheduler + model.scheduler = default_scheduler else: yield @@ -292,11 +288,8 @@ def _call_model( kwargs["generator"] = generator.manual_seed(seed) sampler_name = kwargs.pop("sampler_name", None) assert callable(model) - with self._reset_when_done(sampler_name): - logger.debug( - "stable diffusion args: %s", - kwargs, - ) + 
with self._reset_when_done(model, sampler_name): + logger.debug("stable diffusion args: %s, model: %s", kwargs, model) images = model(**kwargs).images # revert padding if padded From 961d355102007e3cd7963a353105b2422a31d4fd Mon Sep 17 00:00:00 2001 From: codingl2k1 <138426806+codingl2k1@users.noreply.github.com> Date: Sat, 14 Sep 2024 07:22:13 +0200 Subject: [PATCH 08/17] FEAT: qwen2 audio (#2271) --- xinference/core/tests/test_restful_api.py | 74 ++++++++ xinference/model/llm/__init__.py | 2 + xinference/model/llm/llm_family.json | 74 ++++++++ xinference/model/llm/llm_family.py | 4 +- .../model/llm/llm_family_modelscope.json | 68 +++++++ xinference/model/llm/transformers/core.py | 2 + .../model/llm/transformers/qwen2_audio.py | 168 ++++++++++++++++++ 7 files changed, 391 insertions(+), 1 deletion(-) create mode 100644 xinference/model/llm/transformers/qwen2_audio.py diff --git a/xinference/core/tests/test_restful_api.py b/xinference/core/tests/test_restful_api.py index 0c50eb256d..af22ca7a8b 100644 --- a/xinference/core/tests/test_restful_api.py +++ b/xinference/core/tests/test_restful_api.py @@ -1240,3 +1240,77 @@ def test_launch_model_by_version(setup): # delete again url = f"{endpoint}/v1/models/test_qwen15" requests.delete(url) + + +@pytest.mark.skip(reason="Cost too many resources.") +def test_restful_api_for_qwen_audio(setup): + model_name = "qwen2-audio-instruct" + + endpoint, _ = setup + url = f"{endpoint}/v1/models" + + # list + response = requests.get(url) + response_data = response.json() + assert len(response_data["data"]) == 0 + + # launch + payload = { + "model_uid": "test_audio", + "model_name": model_name, + "model_engine": "transformers", + "model_size_in_billions": 7, + "model_format": "pytorch", + "quantization": "none", + } + + response = requests.post(url, json=payload) + response_data = response.json() + model_uid_res = response_data["model_uid"] + assert model_uid_res == "test_audio" + + response = requests.get(url) + response_data = response.json() + assert len(response_data["data"]) == 1 + + url = f"{endpoint}/v1/chat/completions" + payload = { + "model": model_uid_res, + "messages": [ + {"role": "system", "content": "You are a helpful assistant."}, + { + "role": "user", + "content": [ + { + "type": "audio", + "audio_url": "https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen2-Audio/audio/glass-breaking-151256.mp3", + }, + {"type": "text", "text": "What's that sound?"}, + ], + }, + {"role": "assistant", "content": "It is the sound of glass shattering."}, + { + "role": "user", + "content": [ + {"type": "text", "text": "What can you do when you hear that?"}, + ], + }, + { + "role": "assistant", + "content": "Stay alert and cautious, and check if anyone is hurt or if there is any damage to property.", + }, + { + "role": "user", + "content": [ + { + "type": "audio", + "audio_url": "https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen2-Audio/audio/1272-128104-0000.flac", + }, + {"type": "text", "text": "What does the person say?"}, + ], + }, + ], + } + response = requests.post(url, json=payload) + completion = response.json() + assert len(completion["choices"][0]["message"]) > 0 diff --git a/xinference/model/llm/__init__.py b/xinference/model/llm/__init__.py index 5a7895eb1a..f971e65661 100644 --- a/xinference/model/llm/__init__.py +++ b/xinference/model/llm/__init__.py @@ -146,6 +146,7 @@ def _install(): from .transformers.internlm2 import Internlm2PytorchChatModel from .transformers.minicpmv25 import MiniCPMV25Model from .transformers.minicpmv26 import 
MiniCPMV26Model + from .transformers.qwen2_audio import Qwen2AudioChatModel from .transformers.qwen2_vl import Qwen2VLChatModel from .transformers.qwen_vl import QwenVLChatModel from .transformers.yi_vl import YiVLChatModel @@ -177,6 +178,7 @@ def _install(): Internlm2PytorchChatModel, QwenVLChatModel, Qwen2VLChatModel, + Qwen2AudioChatModel, YiVLChatModel, DeepSeekVLChatModel, InternVLChatModel, diff --git a/xinference/model/llm/llm_family.json b/xinference/model/llm/llm_family.json index 1dfeca1fb4..77dda1a84d 100644 --- a/xinference/model/llm/llm_family.json +++ b/xinference/model/llm/llm_family.json @@ -6947,6 +6947,80 @@ "" ] }, + { + "version":1, + "context_length":32768, + "model_name":"qwen2-audio-instruct", + "model_lang":[ + "en", + "zh" + ], + "model_ability":[ + "chat", + "audio" + ], + "model_description":"Qwen2-Audio: A large-scale audio-language model which is capable of accepting various audio signal inputs and performing audio analysis or direct textual responses with regard to speech instructions.", + "model_specs":[ + { + "model_format":"pytorch", + "model_size_in_billions":7, + "quantizations":[ + "none" + ], + "model_id":"Qwen/Qwen2-Audio-7B-Instruct", + "model_revision":"bac62d2c6808845904c709c17a0402d817558c64" + } + ], + "prompt_style":{ + "style_name":"QWEN", + "system_prompt":"You are a helpful assistant", + "roles":[ + "user", + "assistant" + ], + "stop": [ + "<|im_end|>", + "<|endoftext|>" + ] + } + }, + { + "version":1, + "context_length":32768, + "model_name":"qwen2-audio", + "model_lang":[ + "en", + "zh" + ], + "model_ability":[ + "chat", + "audio" + ], + "model_description":"Qwen2-Audio: A large-scale audio-language model which is capable of accepting various audio signal inputs and performing audio analysis or direct textual responses with regard to speech instructions.", + "model_specs":[ + { + "model_format":"pytorch", + "model_size_in_billions":7, + "quantizations":[ + "none" + ], + "model_id":"Qwen/Qwen2-Audio-7B", + "model_revision":"8577bc71d330c8fa32ffe9f8a1374100759f2466" + } + ], + "prompt_style":{ + "style_name":"QWEN", + "system_prompt":"You are a helpful assistant", + "roles":[ + "user", + "assistant" + ], + "stop": [ + "<|im_end|>", + "<|endoftext|>" + ] + } + }, { "version": 1, "context_length": 128000, diff --git a/xinference/model/llm/llm_family.py b/xinference/model/llm/llm_family.py index 555921f18f..413b4229ae 100644 --- a/xinference/model/llm/llm_family.py +++ b/xinference/model/llm/llm_family.py @@ -132,7 +132,9 @@ class LLMFamilyV1(BaseModel): context_length: Optional[int] = DEFAULT_CONTEXT_LENGTH model_name: str model_lang: List[str] - model_ability: List[Literal["embed", "generate", "chat", "tools", "vision"]] + model_ability: List[ + Literal["embed", "generate", "chat", "tools", "vision", "audio"] + ] model_description: Optional[str] # reason for not required str here: legacy registration model_family: Optional[str] diff --git a/xinference/model/llm/llm_family_modelscope.json b/xinference/model/llm/llm_family_modelscope.json index b7b0da1b13..fdaab458aa 100644 --- a/xinference/model/llm/llm_family_modelscope.json +++ b/xinference/model/llm/llm_family_modelscope.json @@ -4656,6 +4656,74 @@ "" ] }, + { + "version": 1, + "context_length": 32768, + "model_name": "qwen2-audio-instruct", + "model_lang": [ + "en", + "zh" + ], + "model_ability": [ + "chat", + "audio" + ], + "model_description": "Qwen2-Audio: A large-scale audio-language model which is capable of accepting various audio signal inputs and performing audio analysis or direct 
textual responses with regard to speech instructions.", + "model_specs": [ + { + "model_format": "pytorch", + "model_size_in_billions": 7, + "quantizations": [ + "none" + ], + "model_hub": "modelscope", + "model_id": "qwen/Qwen2-Audio-7B-Instruct", + "model_revision": "master" + } + ], + "prompt_style": { + "style_name": "QWEN", + "system_prompt": "You are a helpful assistant", + "roles": [ + "user", + "assistant" + ] + } + }, + { + "version": 1, + "context_length": 32768, + "model_name": "qwen2-audio", + "model_lang": [ + "en", + "zh" + ], + "model_ability": [ + "chat", + "audio" + ], + "model_description": "Qwen2-Audio: A large-scale audio-language model which is capable of accepting various audio signal inputs and performing audio analysis or direct textual responses with regard to speech instructions.", + "model_specs": [ + { + "model_format": "pytorch", + "model_size_in_billions": 7, + "quantizations": [ + "none" + ], + "model_hub": "modelscope", + "model_id": "qwen/Qwen2-Audio-7B", + "model_revision": "master" + } + ], + "prompt_style": { + "style_name": "QWEN", + "system_prompt": "You are a helpful assistant", + "roles": [ + "user", + "assistant" + ] + } + }, { "version": 1, "context_length": 128000, diff --git a/xinference/model/llm/transformers/core.py b/xinference/model/llm/transformers/core.py index a451b7accd..e42ca6d513 100644 --- a/xinference/model/llm/transformers/core.py +++ b/xinference/model/llm/transformers/core.py @@ -65,6 +65,8 @@ "MiniCPM-V-2.6", "glm-4v", "qwen2-vl-instruct", + "qwen2-audio", + "qwen2-audio-instruct", "deepseek-v2", "deepseek-v2-chat", "deepseek-v2.5", diff --git a/xinference/model/llm/transformers/qwen2_audio.py b/xinference/model/llm/transformers/qwen2_audio.py new file mode 100644 index 0000000000..653f7217f8 --- /dev/null +++ b/xinference/model/llm/transformers/qwen2_audio.py @@ -0,0 +1,168 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
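+# Qwen2AudioChatModel wires Qwen2-Audio into the transformers backend:
+# chat messages may carry {"type": "audio", "audio_url": ...} items, which
+# are downloaded, resampled with librosa to the processor's sampling rate,
+# and fed to Qwen2AudioForConditionalGeneration together with the
+# chat-templated text. Both blocking and streaming generation are supported.
+#
+# A minimal sketch of exercising the model over the REST API (assuming a
+# locally running Xinference server and a model launched with uid
+# "my-qwen2-audio"; the endpoint, uid and sample URL are placeholders):
+#
+#     import requests
+#     payload = {
+#         "model": "my-qwen2-audio",
+#         "messages": [{
+#             "role": "user",
+#             "content": [
+#                 {"type": "audio", "audio_url": "https://example.com/sample.flac"},
+#                 {"type": "text", "text": "What does the person say?"},
+#             ],
+#         }],
+#     }
+#     r = requests.post("http://127.0.0.1:9997/v1/chat/completions", json=payload)
+#     print(r.json()["choices"][0]["message"])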
+import logging +import uuid +from io import BytesIO +from typing import Dict, Iterator, List, Optional, Union +from urllib.request import urlopen + +import numpy as np + +from ....model.utils import select_device +from ....types import ChatCompletion, ChatCompletionChunk, CompletionChunk +from ..llm_family import LLMFamilyV1, LLMSpecV1 +from ..utils import generate_chat_completion, generate_completion_chunk +from .core import PytorchChatModel, PytorchGenerateConfig + +logger = logging.getLogger(__name__) + + +class Qwen2AudioChatModel(PytorchChatModel): + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + self._processor = None + self._model = None + self._device = None + + @classmethod + def match( + cls, model_family: "LLMFamilyV1", model_spec: "LLMSpecV1", quantization: str + ) -> bool: + llm_family = model_family.model_family or model_family.model_name + if "qwen2-audio".lower() in llm_family.lower(): + return True + return False + + def load(self): + from transformers import AutoProcessor, Qwen2AudioForConditionalGeneration + + device = self._pytorch_model_config.get("device", "auto") + device = select_device(device) + self._device = device + # for multiple GPU, set back to auto to make multiple devices work + device = "auto" if device == "cuda" else device + + self._processor = AutoProcessor.from_pretrained( + self.model_path, + device_map=device, + # trust_remote_code=True, + code_revision=self.model_spec.model_revision, + ) + self._model = Qwen2AudioForConditionalGeneration.from_pretrained( + self.model_path, + device_map=device, + # trust_remote_code=True, + revision=self.model_spec.model_revision, + ) + + def _transform_messages( + self, + messages: List[Dict], + ): + import librosa + + text = self._processor.apply_chat_template( + messages, add_generation_prompt=True, tokenize=False + ) + audios: List[np.ndarray] = [] + for msg in messages: + content = msg["content"] + if isinstance(content, List): + for item in content: # type: ignore + if item.get("type") == "audio" and "audio_url" in item: + audio = librosa.load( + BytesIO(urlopen(item["audio_url"]).read()), + sr=self._processor.feature_extractor.sampling_rate, + )[0] + audios.append(audio) + + return text, audios + + def chat( + self, + messages: List[Dict], + generate_config: Optional[PytorchGenerateConfig] = None, + ) -> Union[ChatCompletion, Iterator[ChatCompletionChunk]]: + text, audios = self._transform_messages(messages) + inputs = self._processor( + text=text, audios=audios, return_tensors="pt", padding=True + ) + inputs.input_ids = inputs.input_ids.to(self._device) + generate_config = generate_config if generate_config else {} + stream = generate_config.get("stream", False) if generate_config else False + + if stream: + it = self._generate_stream(inputs, generate_config) + return self._to_chat_completion_chunks(it) + else: + c = self._generate(inputs, generate_config) + return c + + def _generate(self, inputs, config: PytorchGenerateConfig = {}) -> ChatCompletion: + generate_ids = self._model.generate( + **inputs, + max_length=config.get("max_tokens", 512), + ) + generate_ids = generate_ids[:, inputs.input_ids.size(1) :] + response = self._processor.batch_decode( + generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False + )[0] + return generate_chat_completion(self.model_uid, response) + + def _generate_stream( + self, inputs, config: PytorchGenerateConfig = {} + ) -> Iterator[CompletionChunk]: + from threading import Thread + + from transformers import TextIteratorStreamer 
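+        # Generation runs in a background thread; TextIteratorStreamer then
+        # yields decoded text pieces as they are produced, and each piece is
+        # wrapped into a completion chunk below.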
+ + tokenizer = self._processor.tokenizer + streamer = TextIteratorStreamer( + tokenizer, timeout=60.0, skip_prompt=True, skip_special_tokens=True + ) + + gen_kwargs = { + "max_new_tokens": config.get("max_tokens", 512), + "streamer": streamer, + **inputs, + } + + thread = Thread(target=self._model.generate, kwargs=gen_kwargs) + thread.start() + + completion_id = str(uuid.uuid1()) + for new_text in streamer: + yield generate_completion_chunk( + chunk_text=new_text, + finish_reason=None, + chunk_id=completion_id, + model_uid=self.model_uid, + prompt_tokens=-1, + completion_tokens=-1, + total_tokens=-1, + has_choice=True, + has_content=True, + ) + + yield generate_completion_chunk( + chunk_text=None, + finish_reason="stop", + chunk_id=completion_id, + model_uid=self.model_uid, + prompt_tokens=-1, + completion_tokens=-1, + total_tokens=-1, + has_choice=True, + has_content=False, + ) From 4aa58615ae4fd4dc3313411f6b485274f7d31c18 Mon Sep 17 00:00:00 2001 From: amumu96 <128140880+amumu96@users.noreply.github.com> Date: Sat, 14 Sep 2024 15:23:21 +0800 Subject: [PATCH 09/17] BUG: modify vllm image version (#2311) --- xinference/deploy/docker/Dockerfile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/xinference/deploy/docker/Dockerfile b/xinference/deploy/docker/Dockerfile index 5ee3f11771..810a440ecd 100644 --- a/xinference/deploy/docker/Dockerfile +++ b/xinference/deploy/docker/Dockerfile @@ -1,4 +1,4 @@ -FROM vllm/vllm-openai:latest +FROM vllm/vllm-openai:0.6.0 COPY . /opt/inference WORKDIR /opt/inference From 4c5e752920fba416432cad7debd3722a75c3e8a2 Mon Sep 17 00:00:00 2001 From: amumu96 <128140880+amumu96@users.noreply.github.com> Date: Sat, 14 Sep 2024 15:37:10 +0800 Subject: [PATCH 10/17] Bug: modify vllm image version (#2312) Co-authored-by: wuzhaoxin <15667065080@162.com> --- xinference/deploy/docker/Dockerfile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/xinference/deploy/docker/Dockerfile b/xinference/deploy/docker/Dockerfile index 810a440ecd..3d6afc44c3 100644 --- a/xinference/deploy/docker/Dockerfile +++ b/xinference/deploy/docker/Dockerfile @@ -1,4 +1,4 @@ -FROM vllm/vllm-openai:0.6.0 +FROM vllm/vllm-openai:v0.6.0 COPY . /opt/inference WORKDIR /opt/inference From 91c0fe85cd153158780e717d41bb3fd8036e43ff Mon Sep 17 00:00:00 2001 From: yiboyasss <143868051+yiboyasss@users.noreply.github.com> Date: Sun, 15 Sep 2024 17:19:21 +0800 Subject: [PATCH 11/17] BUG: [UI] Fix registration page bug. 
(#2315) --- xinference/web/ui/src/scenes/register_model/registerModel.js | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/xinference/web/ui/src/scenes/register_model/registerModel.js b/xinference/web/ui/src/scenes/register_model/registerModel.js index ca02e781b4..f35196b3b5 100644 --- a/xinference/web/ui/src/scenes/register_model/registerModel.js +++ b/xinference/web/ui/src/scenes/register_model/registerModel.js @@ -686,12 +686,12 @@ const RegisterModelComponent = ({ modelType, customData }) => { const handleFamilyAlert = () => { if ( - formData.model_ability.includes('vision') && + formData.model_ability?.includes('vision') && !family?.vision?.includes(formData.model_family) ) { return true } else if ( - formData.model_ability.includes('tools') && + formData.model_ability?.includes('tools') && !family?.tools?.includes(formData.model_family) ) { return true From 065686edc64e0af418ca0071f2f5d541dcdbe440 Mon Sep 17 00:00:00 2001 From: codingl2k1 <138426806+codingl2k1@users.noreply.github.com> Date: Wed, 18 Sep 2024 11:54:00 +0200 Subject: [PATCH 12/17] BUG: Fix CosyVoice missing output (#2320) --- xinference/model/audio/cosyvoice.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/xinference/model/audio/cosyvoice.py b/xinference/model/audio/cosyvoice.py index 39bcb7aa6c..9be452f473 100644 --- a/xinference/model/audio/cosyvoice.py +++ b/xinference/model/audio/cosyvoice.py @@ -122,10 +122,10 @@ def _generator_stream(): last_pos = new_last_pos def _generator_block(): - chunk = next(output) - assert isinstance(chunk, dict), "Expected data to be of type dict" + chunks = [o["tts_speech"] for o in output] + t = torch.cat(chunks, dim=1) with BytesIO() as out: - torchaudio.save(out, chunk["tts_speech"], 22050, format=response_format) + torchaudio.save(out, t, 22050, format=response_format) return out.getvalue() return _generator_stream() if stream else _generator_block() From a461ad926fa088d567cd7c96a6aba3468b0a0779 Mon Sep 17 00:00:00 2001 From: Jun-Howie <62869005+Jun-Howie@users.noreply.github.com> Date: Thu, 19 Sep 2024 05:40:53 +0800 Subject: [PATCH 13/17] FEAT: Support Qwen 2.5 (#2325) --- xinference/model/llm/llm_family.json | 360 ++++++++++++++++ .../model/llm/llm_family_modelscope.json | 388 ++++++++++++++++++ xinference/model/llm/vllm/core.py | 1 + 3 files changed, 749 insertions(+) diff --git a/xinference/model/llm/llm_family.json b/xinference/model/llm/llm_family.json index 77dda1a84d..70b17daa61 100644 --- a/xinference/model/llm/llm_family.json +++ b/xinference/model/llm/llm_family.json @@ -7244,5 +7244,365 @@ "model_revision": "00e59e64f47d3c78e4cfbdd345888479797e8109" } ] + }, + { + "version": 1, + "context_length": 131072, + "model_name": "qwen2.5-instruct", + "model_lang": [ + "en", + "zh" + ], + "model_ability": [ + "chat", + "tools" + ], + "model_description": "Qwen2.5 is the latest series of Qwen large language models. 
For Qwen2.5, we release a number of base language models and instruction-tuned language models ranging from 0.5 to 72 billion parameters.", + "model_specs": [ + { + "model_format": "pytorch", + "model_size_in_billions": "0_5", + "quantizations": [ + "4-bit", + "8-bit", + "none" + ], + "model_id": "Qwen/Qwen2.5-0.5B-Instruct" + }, + { + "model_format": "pytorch", + "model_size_in_billions": "1_5", + "quantizations": [ + "4-bit", + "8-bit", + "none" + ], + "model_id": "Qwen/Qwen2.5-1.5B-Instruct" + }, + { + "model_format": "pytorch", + "model_size_in_billions": 3, + "quantizations": [ + "4-bit", + "8-bit", + "none" + ], + "model_id": "Qwen/Qwen2.5-3B-Instruct" + }, + { + "model_format": "pytorch", + "model_size_in_billions": 7, + "quantizations": [ + "4-bit", + "8-bit", + "none" + ], + "model_id": "Qwen/Qwen2.5-7B-Instruct" + }, + { + "model_format": "pytorch", + "model_size_in_billions": 14, + "quantizations": [ + "4-bit", + "8-bit", + "none" + ], + "model_id": "Qwen/Qwen2.5-14B-Instruct" + }, + { + "model_format": "pytorch", + "model_size_in_billions": 32, + "quantizations": [ + "4-bit", + "8-bit", + "none" + ], + "model_id": "Qwen/Qwen2.5-32B-Instruct" + }, + { + "model_format": "pytorch", + "model_size_in_billions": 72, + "quantizations": [ + "4-bit", + "8-bit", + "none" + ], + "model_id": "Qwen/Qwen2.5-72B-Instruct" + }, + { + "model_format": "gptq", + "model_size_in_billions": "0_5", + "quantizations": [ + "Int4", + "Int8" + ], + "model_id": "Qwen/Qwen2.5-0.5B-Instruct-GPTQ-{quantization}" + }, + { + "model_format": "gptq", + "model_size_in_billions": "1_5", + "quantizations": [ + "Int4", + "Int8" + ], + "model_id": "Qwen/Qwen2.5-1.5B-Instruct-GPTQ-{quantization}" + }, + { + "model_format": "gptq", + "model_size_in_billions": 3, + "quantizations": [ + "Int4", + "Int8" + ], + "model_id": "Qwen/Qwen2.5-3B-Instruct-GPTQ-{quantization}" + }, + { + "model_format": "gptq", + "model_size_in_billions": 7, + "quantizations": [ + "Int4", + "Int8" + ], + "model_id": "Qwen/Qwen2.5-7B-Instruct-GPTQ-{quantization}" + }, + { + "model_format": "gptq", + "model_size_in_billions": 14, + "quantizations": [ + "Int4", + "Int8" + ], + "model_id": "Qwen/Qwen2.5-14B-Instruct-GPTQ-{quantization}" + }, + { + "model_format": "gptq", + "model_size_in_billions": 32, + "quantizations": [ + "Int4", + "Int8" + ], + "model_id": "Qwen/Qwen2.5-32B-Instruct-GPTQ-{quantization}" + }, + { + "model_format": "gptq", + "model_size_in_billions": 72, + "quantizations": [ + "Int4", + "Int8" + ], + "model_id": "Qwen/Qwen2.5-72B-Instruct-GPTQ-{quantization}" + }, + { + "model_format": "awq", + "model_size_in_billions": "0_5", + "quantizations": [ + "Int4" + ], + "model_id": "Qwen/Qwen2.5-0.5B-Instruct-AWQ" + }, + { + "model_format": "awq", + "model_size_in_billions": "1_5", + "quantizations": [ + "Int4" + ], + "model_id": "Qwen/Qwen2.5-1.5B-Instruct-AWQ" + }, + { + "model_format": "awq", + "model_size_in_billions": 3, + "quantizations": [ + "Int4" + ], + "model_id": "Qwen/Qwen2.5-3B-Instruct-AWQ" + }, + { + "model_format": "awq", + "model_size_in_billions": 7, + "quantizations": [ + "Int4" + ], + "model_id": "Qwen/Qwen2.5-7B-Instruct-AWQ" + }, + { + "model_format": "awq", + "model_size_in_billions": 14, + "quantizations": [ + "Int4" + ], + "model_id": "Qwen/Qwen2.5-14B-Instruct-AWQ" + }, + { + "model_format": "awq", + "model_size_in_billions": 32, + "quantizations": [ + "Int4" + ], + "model_id": "Qwen/Qwen2.5-32B-Instruct-AWQ" + }, + { + "model_format": "awq", + "model_size_in_billions": 72, + "quantizations": [ + "Int4" + ], + 
"model_id": "Qwen/Qwen2.5-72B-Instruct-AWQ" + }, + { + "model_format": "ggufv2", + "model_size_in_billions": "0_5", + "quantizations": [ + "q2_k", + "q3_k_m", + "q4_0", + "q4_k_m", + "q5_0", + "q5_k_m", + "q6_k", + "q8_0", + "fp16" + ], + "model_id": "Qwen/Qwen2.5-0.5B-Instruct-GGUF", + "model_file_name_template": "qwen2_5-0_5b-instruct-{quantization}.gguf" + }, + { + "model_format": "ggufv2", + "model_size_in_billions": "1_5", + "quantizations": [ + "q2_k", + "q3_k_m", + "q4_0", + "q4_k_m", + "q5_0", + "q5_k_m", + "q6_k", + "q8_0", + "fp16" + ], + "model_id": "Qwen/Qwen2.5-1.5B-Instruct-GGUF", + "model_file_name_template": "qwen2_5-1_5b-instruct-{quantization}.gguf" + }, + { + "model_format": "ggufv2", + "model_size_in_billions": 3, + "quantizations": [ + "q2_k", + "q3_k_m", + "q4_0", + "q4_k_m", + "q5_0", + "q5_k_m", + "q6_k", + "q8_0", + "fp16" + ], + "model_id": "Qwen/Qwen2.5-3B-Instruct-GGUF", + "model_file_name_template": "qwen2_5-3b-instruct-{quantization}.gguf" + }, + { + "model_format": "ggufv2", + "model_size_in_billions": 7, + "quantizations": [ + "q2_k", + "q3_k_m", + "q4_0", + "q4_k_m", + "q5_0", + "q5_k_m", + "q6_k", + "q8_0", + "fp16" + ], + "model_id": "Qwen/Qwen2.5-7B-Instruct-GGUF", + "model_file_name_template": "qwen2_5-7b-instruct-{quantization}.gguf" + }, + { + "model_format": "ggufv2", + "model_size_in_billions": 14, + "quantizations": [ + "q2_k", + "q3_k_m", + "q4_0", + "q4_k_m", + "q5_0", + "q5_k_m", + "q6_k", + "q8_0", + "fp16" + ], + "model_id": "Qwen/Qwen2.5-14B-Instruct-GGUF", + "model_file_name_template": "qwen2_5-14b-instruct-{quantization}.gguf" + }, + { + "model_format": "ggufv2", + "model_size_in_billions": 32, + "quantizations": [ + "q2_k", + "q3_k_m", + "q4_0", + "q4_k_m", + "q5_0", + "q5_k_m", + "q6_k", + "q8_0", + "fp16" + ], + "model_id": "Qwen/Qwen2.5-32B-Instruct-GGUF", + "model_file_name_template": "qwen2_5-32b-instruct-{quantization}.gguf" + }, + { + "model_format": "ggufv2", + "model_size_in_billions": 72, + "quantizations": [ + "q2_k", + "q3_k_m", + "q4_0", + "q4_k_m", + "q5_0", + "q5_k_m", + "q6_k", + "q8_0", + "fp16" + ], + "model_id": "Qwen/Qwen2.5-72B-Instruct-GGUF", + "model_file_name_template": "qwen2_5-72b-instruct-{quantization}.gguf", + "model_file_name_split_template": "qwen2_5-72b-instruct-{quantization}-{part}.gguf", + "quantization_parts": { + "q5_0": [ + "00001-of-00002", + "00002-of-00002" + ], + "q5_k_m": [ + "00001-of-00002", + "00002-of-00002" + ], + "q6_k": [ + "00001-of-00002", + "00002-of-00002" + ], + "q8_0": [ + "00001-of-00002", + "00002-of-00002" + ], + "fp16": [ + "00001-of-00004", + "00002-of-00004", + "00003-of-00004", + "00004-of-00004" + ] + } + } + ], + "chat_template": "{%- if tools %}\n {{- '<|im_start|>system\\n' }}\n {%- if messages[0]['role'] == 'system' %}\n {{- messages[0]['content'] }}\n {%- else %}\n {{- 'You are Qwen, created by Alibaba Cloud. You are a helpful assistant.' 
}}\n {%- endif %}\n {{- \"\\n\\n# Tools\\n\\nYou may call one or more functions to assist with the user query.\\n\\nYou are provided with function signatures within XML tags:\\n\" }}\n {%- for tool in tools %}\n {{- \"\\n\" }}\n {{- tool | tojson }}\n {%- endfor %}\n {{- \"\\n\\n\\nFor each function call, return a json object with function name and arguments within XML tags:\\n\\n{{\\\"name\\\": , \\\"arguments\\\": }}\\n<|im_end|>\\n\" }}\n{%- else %}\n {%- if messages[0]['role'] == 'system' %}\n {{- '<|im_start|>system\\n' + messages[0]['content'] + '<|im_end|>\\n' }}\n {%- else %}\n {{- '<|im_start|>system\\nYou are Qwen, created by Alibaba Cloud. You are a helpful assistant.<|im_end|>\\n' }}\n {%- endif %}\n{%- endif %}\n{%- for message in messages %}\n {%- if (message.role == \"user\") or (message.role == \"system\" and not loop.first) or (message.role == \"assistant\" and not message.tool_calls) %}\n {{- '<|im_start|>' + message.role + '\\n' + message.content + '<|im_end|>' + '\\n' }}\n {%- elif message.role == \"assistant\" %}\n {{- '<|im_start|>' + message.role }}\n {%- if message.content %}\n {{- '\\n' + message.content }}\n {%- endif %}\n {%- for tool_call in message.tool_calls %}\n {%- if tool_call.function is defined %}\n {%- set tool_call = tool_call.function %}\n {%- endif %}\n {{- '\\n\\n{\"name\": \"' }}\n {{- tool_call.name }}\n {{- '\", \"arguments\": ' }}\n {{- tool_call.arguments | tojson }}\n {{- '}\\n' }}\n {%- endfor %}\n {{- '<|im_end|>\\n' }}\n {%- elif message.role == \"tool\" %}\n {%- if (loop.index0 == 0) or (messages[loop.index0 - 1].role != \"tool\") %}\n {{- '<|im_start|>user' }}\n {%- endif %}\n {{- '\\n\\n' }}\n {{- message.content }}\n {{- '\\n' }}\n {%- if loop.last or (messages[loop.index0 + 1].role != \"tool\") %}\n {{- '<|im_end|>\\n' }}\n {%- endif %}\n {%- endif %}\n{%- endfor %}\n{%- if add_generation_prompt %}\n {{- '<|im_start|>assistant\\n' }}\n{%- endif %}\n", + "stop_token_ids": [ + 151643, + 151644, + 151645 + ], + "stop": [ + "<|endoftext|>", + "<|im_start|>", + "<|im_end|>" + ] } ] diff --git a/xinference/model/llm/llm_family_modelscope.json b/xinference/model/llm/llm_family_modelscope.json index fdaab458aa..7309ee9651 100644 --- a/xinference/model/llm/llm_family_modelscope.json +++ b/xinference/model/llm/llm_family_modelscope.json @@ -4957,5 +4957,393 @@ "model_revision": "master" } ] + }, + { + "version": 1, + "context_length": 131072, + "model_name": "qwen2.5-instruct", + "model_lang": [ + "en", + "zh" + ], + "model_ability": [ + "chat", + "tools" + ], + "model_description": "Qwen2.5 is the latest series of Qwen large language models. 
For Qwen2.5, we release a number of base language models and instruction-tuned language models ranging from 0.5 to 72 billion parameters.", + "model_specs": [ + { + "model_format": "pytorch", + "model_size_in_billions": "0_5", + "quantizations": [ + "4-bit", + "8-bit", + "none" + ], + "model_id": "qwen/Qwen2.5-0.5B-Instruct", + "model_hub": "modelscope" + }, + { + "model_format": "pytorch", + "model_size_in_billions": "1_5", + "quantizations": [ + "4-bit", + "8-bit", + "none" + ], + "model_id": "qwen/Qwen2.5-1.5B-Instruct", + "model_hub": "modelscope" + }, + { + "model_format": "pytorch", + "model_size_in_billions": 3, + "quantizations": [ + "4-bit", + "8-bit", + "none" + ], + "model_id": "qwen/Qwen2.5-3B-Instruct", + "model_hub": "modelscope" + }, + { + "model_format": "pytorch", + "model_size_in_billions": 7, + "quantizations": [ + "4-bit", + "8-bit", + "none" + ], + "model_id": "qwen/Qwen2.5-7B-Instruct", + "model_hub": "modelscope" + }, + { + "model_format": "pytorch", + "model_size_in_billions": 14, + "quantizations": [ + "4-bit", + "8-bit", + "none" + ], + "model_id": "qwen/Qwen2.5-14B-Instruct", + "model_hub": "modelscope" + }, + { + "model_format": "pytorch", + "model_size_in_billions": 32, + "quantizations": [ + "4-bit", + "8-bit", + "none" + ], + "model_id": "qwen/Qwen2.5-32B-Instruct", + "model_hub": "modelscope" + }, + { + "model_format": "pytorch", + "model_size_in_billions": 72, + "quantizations": [ + "4-bit", + "8-bit", + "none" + ], + "model_id": "qwen/Qwen2.5-72B-Instruct", + "model_hub": "modelscope" + }, + { + "model_format": "gptq", + "model_size_in_billions": "0_5", + "quantizations": [ + "Int4", + "Int8" + ], + "model_id": "qwen/Qwen2.5-0.5B-Instruct-GPTQ-{quantization}", + "model_hub": "modelscope" + }, + { + "model_format": "gptq", + "model_size_in_billions": "1_5", + "quantizations": [ + "Int4", + "Int8" + ], + "model_id": "qwen/Qwen2.5-1.5B-Instruct-GPTQ-{quantization}", + "model_hub": "modelscope" + }, + { + "model_format": "gptq", + "model_size_in_billions": 3, + "quantizations": [ + "Int4", + "Int8" + ], + "model_id": "qwen/Qwen2.5-3B-Instruct-GPTQ-{quantization}", + "model_hub": "modelscope" + }, + { + "model_format": "gptq", + "model_size_in_billions": 7, + "quantizations": [ + "Int4", + "Int8" + ], + "model_id": "qwen/Qwen2.5-7B-Instruct-GPTQ-{quantization}", + "model_hub": "modelscope" + }, + { + "model_format": "gptq", + "model_size_in_billions": 14, + "quantizations": [ + "Int4", + "Int8" + ], + "model_id": "qwen/Qwen2.5-14B-Instruct-GPTQ-{quantization}", + "model_hub": "modelscope" + }, + { + "model_format": "gptq", + "model_size_in_billions": 32, + "quantizations": [ + "Int4", + "Int8" + ], + "model_id": "qwen/Qwen2.5-32B-Instruct-GPTQ-{quantization}", + "model_hub": "modelscope" + }, + { + "model_format": "gptq", + "model_size_in_billions": 72, + "quantizations": [ + "Int4", + "Int8" + ], + "model_id": "qwen/Qwen2.5-72B-Instruct-GPTQ-{quantization}", + "model_hub": "modelscope" + }, + { + "model_format": "awq", + "model_size_in_billions": "0_5", + "quantizations": [ + "Int4" + ], + "model_id": "qwen/Qwen2-0.5B-Instruct-AWQ", + "model_hub": "modelscope" + }, + { + "model_format": "awq", + "model_size_in_billions": "1_5", + "quantizations": [ + "Int4" + ], + "model_id": "qwen/Qwen2-1.5B-Instruct-AWQ", + "model_hub": "modelscope" + }, + { + "model_format": "awq", + "model_size_in_billions": 3, + "quantizations": [ + "Int4" + ], + "model_id": "qwen/Qwen2.5-3B-Instruct-AWQ", + "model_hub": "modelscope" + }, + { + "model_format": "awq", + 
"model_size_in_billions": 7, + "quantizations": [ + "Int4" + ], + "model_id": "qwen/Qwen2.5-7B-Instruct-AWQ", + "model_hub": "modelscope" + }, + { + "model_format": "awq", + "model_size_in_billions":14, + "quantizations": [ + "Int4" + ], + "model_id": "qwen/Qwen2.5-14B-Instruct-AWQ", + "model_hub": "modelscope" + }, + { + "model_format": "awq", + "model_size_in_billions": 32, + "quantizations": [ + "Int4" + ], + "model_id": "qwen/Qwen2.5-32B-Instruct-AWQ", + "model_hub": "modelscope" + }, + { + "model_format": "awq", + "model_size_in_billions": 72, + "quantizations": [ + "Int4" + ], + "model_id": "qwen/Qwen2.5-72B-Instruct-AWQ", + "model_hub": "modelscope" + }, + { + "model_format": "ggufv2", + "model_size_in_billions": "0_5", + "quantizations": [ + "q2_k", + "q3_k_m", + "q4_0", + "q4_k_m", + "q5_0", + "q5_k_m", + "q6_k", + "q8_0", + "fp16" + ], + "model_id": "qwen/Qwen2.5-0.5B-Instruct-GGUF", + "model_file_name_template": "qwen2_5-0_5b-instruct-{quantization}.gguf", + "model_hub": "modelscope" + }, + { + "model_format": "ggufv2", + "model_size_in_billions": "1_5", + "quantizations": [ + "q2_k", + "q3_k_m", + "q4_0", + "q4_k_m", + "q5_0", + "q5_k_m", + "q6_k", + "q8_0", + "fp16" + ], + "model_id": "qwen/Qwen2.5-1.5B-Instruct-GGUF", + "model_file_name_template": "qwen2_5-1_5b-instruct-{quantization}.gguf", + "model_hub": "modelscope" + }, + { + "model_format": "ggufv2", + "model_size_in_billions": 3, + "quantizations": [ + "q2_k", + "q3_k_m", + "q4_0", + "q4_k_m", + "q5_0", + "q5_k_m", + "q6_k", + "q8_0", + "fp16" + ], + "model_id": "qwen/Qwen2.5-3B-Instruct-GGUF", + "model_file_name_template": "qwen2_5-3b-instruct-{quantization}.gguf", + "model_hub": "modelscope" + }, + { + "model_format": "ggufv2", + "model_size_in_billions": 7, + "quantizations": [ + "q2_k", + "q3_k_m", + "q4_0", + "q4_k_m", + "q5_0", + "q5_k_m", + "q6_k", + "q8_0", + "fp16" + ], + "model_id": "qwen/Qwen2.5-7B-Instruct-GGUF", + "model_file_name_template": "qwen2_5-7b-instruct-{quantization}.gguf", + "model_hub": "modelscope" + }, + { + "model_format": "ggufv2", + "model_size_in_billions": 14, + "quantizations": [ + "q2_k", + "q3_k_m", + "q4_0", + "q4_k_m", + "q5_0", + "q5_k_m", + "q6_k", + "q8_0", + "fp16" + ], + "model_id": "qwen/Qwen2.5-14B-Instruct-GGUF", + "model_file_name_template": "qwen2_5-14b-instruct-{quantization}.gguf", + "model_hub": "modelscope" + }, + { + "model_format": "ggufv2", + "model_size_in_billions": 32, + "quantizations": [ + "q2_k", + "q3_k_m", + "q4_0", + "q4_k_m", + "q5_0", + "q5_k_m", + "q6_k", + "q8_0", + "fp16" + ], + "model_id": "qwen/Qwen2.5-32B-Instruct-GGUF", + "model_file_name_template": "qwen2_5-32b-instruct-{quantization}.gguf", + "model_hub": "modelscope" + }, + { + "model_format": "ggufv2", + "model_size_in_billions": 72, + "quantizations": [ + "q2_k", + "q3_k_m", + "q4_0", + "q4_k_m", + "q5_0", + "q5_k_m", + "q6_k", + "q8_0", + "fp16" + ], + "model_id": "qwen/Qwen2.5-72B-Instruct-GGUF", + "model_hub": "modelscope", + "model_file_name_template": "qwen2_5-72b-instruct-{quantization}.gguf", + "model_file_name_split_template": "qwen2_5-72b-instruct-{quantization}-{part}.gguf", + "quantization_parts": { + "q5_0": [ + "00001-of-00002", + "00002-of-00002" + ], + "q5_k_m": [ + "00001-of-00002", + "00002-of-00002" + ], + "q6_k": [ + "00001-of-00002", + "00002-of-00002" + ], + "q8_0": [ + "00001-of-00002", + "00002-of-00002" + ], + "fp16": [ + "00001-of-00004", + "00002-of-00004", + "00003-of-00004", + "00004-of-00004" + ] + } + } + ], + "chat_template": "{%- if tools %}\n {{- 
'<|im_start|>system\\n' }}\n {%- if messages[0]['role'] == 'system' %}\n {{- messages[0]['content'] }}\n {%- else %}\n {{- 'You are Qwen, created by Alibaba Cloud. You are a helpful assistant.' }}\n {%- endif %}\n {{- \"\\n\\n# Tools\\n\\nYou may call one or more functions to assist with the user query.\\n\\nYou are provided with function signatures within XML tags:\\n\" }}\n {%- for tool in tools %}\n {{- \"\\n\" }}\n {{- tool | tojson }}\n {%- endfor %}\n {{- \"\\n\\n\\nFor each function call, return a json object with function name and arguments within XML tags:\\n\\n{{\\\"name\\\": , \\\"arguments\\\": }}\\n<|im_end|>\\n\" }}\n{%- else %}\n {%- if messages[0]['role'] == 'system' %}\n {{- '<|im_start|>system\\n' + messages[0]['content'] + '<|im_end|>\\n' }}\n {%- else %}\n {{- '<|im_start|>system\\nYou are Qwen, created by Alibaba Cloud. You are a helpful assistant.<|im_end|>\\n' }}\n {%- endif %}\n{%- endif %}\n{%- for message in messages %}\n {%- if (message.role == \"user\") or (message.role == \"system\" and not loop.first) or (message.role == \"assistant\" and not message.tool_calls) %}\n {{- '<|im_start|>' + message.role + '\\n' + message.content + '<|im_end|>' + '\\n' }}\n {%- elif message.role == \"assistant\" %}\n {{- '<|im_start|>' + message.role }}\n {%- if message.content %}\n {{- '\\n' + message.content }}\n {%- endif %}\n {%- for tool_call in message.tool_calls %}\n {%- if tool_call.function is defined %}\n {%- set tool_call = tool_call.function %}\n {%- endif %}\n {{- '\\n\\n{\"name\": \"' }}\n {{- tool_call.name }}\n {{- '\", \"arguments\": ' }}\n {{- tool_call.arguments | tojson }}\n {{- '}\\n' }}\n {%- endfor %}\n {{- '<|im_end|>\\n' }}\n {%- elif message.role == \"tool\" %}\n {%- if (loop.index0 == 0) or (messages[loop.index0 - 1].role != \"tool\") %}\n {{- '<|im_start|>user' }}\n {%- endif %}\n {{- '\\n\\n' }}\n {{- message.content }}\n {{- '\\n' }}\n {%- if loop.last or (messages[loop.index0 + 1].role != \"tool\") %}\n {{- '<|im_end|>\\n' }}\n {%- endif %}\n {%- endif %}\n{%- endfor %}\n{%- if add_generation_prompt %}\n {{- '<|im_start|>assistant\\n' }}\n{%- endif %}\n", + "stop_token_ids": [ + 151643, + 151644, + 151645 + ], + "stop": [ + "<|endoftext|>", + "<|im_start|>", + "<|im_end|>" + ] } ] diff --git a/xinference/model/llm/vllm/core.py b/xinference/model/llm/vllm/core.py index 811fd5d342..3aaee0738f 100644 --- a/xinference/model/llm/vllm/core.py +++ b/xinference/model/llm/vllm/core.py @@ -138,6 +138,7 @@ class VLLMGenerateConfig(TypedDict, total=False): VLLM_SUPPORTED_MODELS.append("codeqwen1.5") VLLM_SUPPORTED_CHAT_MODELS.append("codeqwen1.5-chat") VLLM_SUPPORTED_CHAT_MODELS.append("qwen2-instruct") + VLLM_SUPPORTED_CHAT_MODELS.append("qwen2.5-instruct") if VLLM_INSTALLED and vllm.__version__ >= "0.3.2": VLLM_SUPPORTED_CHAT_MODELS.append("gemma-it") From 9820786a56ade6af783c69e96a8f30319b30f1f2 Mon Sep 17 00:00:00 2001 From: amumu96 <128140880+amumu96@users.noreply.github.com> Date: Fri, 20 Sep 2024 14:32:00 +0800 Subject: [PATCH 14/17] BUG: support old register llm format (#2335) --- xinference/model/llm/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/xinference/model/llm/__init__.py b/xinference/model/llm/__init__.py index f971e65661..a9f05a9b25 100644 --- a/xinference/model/llm/__init__.py +++ b/xinference/model/llm/__init__.py @@ -121,7 +121,7 @@ def register_custom_model(): with codecs.open( os.path.join(user_defined_llm_dir, f), encoding="utf-8" ) as fd: - user_defined_llm_family = 
CustomLLMFamilyV1.parse_obj(json.load(fd)) + user_defined_llm_family = CustomLLMFamilyV1.parse_raw(fd.read()) register_llm(user_defined_llm_family, persist=False) except Exception as e: warnings.warn(f"{user_defined_llm_dir}/{f} has error, {e}") From 3cc9bc525667e2161ae072d3d892d33b2723b2a9 Mon Sep 17 00:00:00 2001 From: Xuye Qin Date: Fri, 20 Sep 2024 15:06:33 +0800 Subject: [PATCH 15/17] BUG: fix stable diffusion from dify tool (#2336) --- xinference/core/model.py | 4 +- .../model/image/stable_diffusion/core.py | 49 ++++++++++++------- 2 files changed, 32 insertions(+), 21 deletions(-) diff --git a/xinference/core/model.py b/xinference/core/model.py index 1f711fb117..2274f422c0 100644 --- a/xinference/core/model.py +++ b/xinference/core/model.py @@ -769,7 +769,7 @@ async def image_to_image( self, image: "PIL.Image", prompt: str, - negative_prompt: str, + negative_prompt: Optional[str] = None, n: int = 1, size: Optional[str] = None, response_format: str = "url", @@ -777,12 +777,12 @@ async def image_to_image( **kwargs, ): kwargs.pop("request_id", None) + kwargs["negative_prompt"] = negative_prompt if hasattr(self._model, "image_to_image"): return await self._call_wrapper_json( self._model.image_to_image, image, prompt, - negative_prompt, n, size, response_format, diff --git a/xinference/model/image/stable_diffusion/core.py b/xinference/model/image/stable_diffusion/core.py index 5a7e99fe33..53151b2c19 100644 --- a/xinference/model/image/stable_diffusion/core.py +++ b/xinference/model/image/stable_diffusion/core.py @@ -21,6 +21,7 @@ import sys import time import uuid +import warnings from concurrent.futures import ThreadPoolExecutor from functools import partial from io import BytesIO @@ -31,7 +32,7 @@ from PIL import ImageOps from ....constants import XINFERENCE_IMAGE_DIR -from ....device_utils import move_model_to_available_device +from ....device_utils import get_available_device, move_model_to_available_device from ....types import Image, ImageList, LoRA from ..sdapi import SDAPIDiffusionModelMixin @@ -60,6 +61,23 @@ ] +def model_accept_param(params: Union[str, List[str]], model: Any) -> bool: + params = [params] if isinstance(params, str) else params + # model is diffusers Pipeline + parameters = inspect.signature(model.__call__).parameters # type: ignore + allow_params = False + for param in parameters.values(): + if param.kind == inspect.Parameter.VAR_KEYWORD: + # the __call__ can accept **kwargs, + # we treat it as it can accept any parameters + allow_params = True + break + if not allow_params: + if all(param in parameters for param in params): + allow_params = True + return allow_params + + class DiffusionModel(SDAPIDiffusionModelMixin): def __init__( self, @@ -187,7 +205,7 @@ def load(self): @staticmethod def _get_scheduler(model: Any, sampler_name: str): - if not sampler_name: + if not sampler_name or sampler_name == "default": return assert model is not None @@ -283,13 +301,14 @@ def _call_model( origin_size = kwargs.pop("origin_size", None) seed = kwargs.pop("seed", None) if seed is not None: - kwargs["generator"] = generator = torch.Generator(device=self._model.device) # type: ignore + kwargs["generator"] = generator = torch.Generator(device=get_available_device()) # type: ignore if seed != -1: kwargs["generator"] = generator.manual_seed(seed) sampler_name = kwargs.pop("sampler_name", None) assert callable(model) with self._reset_when_done(model, sampler_name): logger.debug("stable diffusion args: %s, model: %s", kwargs, model) + self._filter_kwargs(model, kwargs) images = 
model(**kwargs).images # revert padding if padded @@ -328,11 +347,17 @@ def _gen_base64_image(_img): raise ValueError(f"Unsupported response format: {response_format}") @classmethod - def _filter_kwargs(cls, kwargs: dict): + def _filter_kwargs(cls, model, kwargs: dict): for arg in ["negative_prompt", "num_inference_steps"]: if not kwargs.get(arg): kwargs.pop(arg, None) + for key in list(kwargs): + allow_key = model_accept_param(key, model) + if not allow_key: + warnings.warn(f"{type(model)} cannot accept `{key}`, will ignore it") + kwargs.pop(key) + def text_to_image( self, prompt: str, @@ -346,7 +371,6 @@ def text_to_image( width, height = map(int, re.split(r"[^\d]+", size)) generate_kwargs = self._model_spec.default_generate_config.copy() # type: ignore generate_kwargs.update({k: v for k, v in kwargs.items() if v is not None}) - self._filter_kwargs(generate_kwargs) return self._call_model( prompt=prompt, height=height, @@ -368,7 +392,6 @@ def image_to_image( self, image: PIL.Image, prompt: Optional[Union[str, List[str]]] = None, - negative_prompt: Optional[Union[str, List[str]]] = None, n: int = 1, size: Optional[str] = None, response_format: str = "url", @@ -404,19 +427,10 @@ def image_to_image( kwargs["height"] = height else: # SD3 image2image cannot accept width and height - parameters = inspect.signature(model.__call__).parameters # type: ignore - allow_width_height = False - for param in parameters.values(): - if param.kind == inspect.Parameter.VAR_KEYWORD: - allow_width_height = True - break - if "width" in parameters or "height" in parameters: - allow_width_height = True + allow_width_height = model_accept_param(["width", "height"], model) if allow_width_height: kwargs["width"], kwargs["height"] = image.size - kwargs["negative_prompt"] = negative_prompt - self._filter_kwargs(kwargs) return self._call_model( image=image, prompt=prompt, @@ -431,7 +445,6 @@ def inpainting( image: PIL.Image, mask_image: PIL.Image, prompt: Optional[Union[str, List[str]]] = None, - negative_prompt: Optional[Union[str, List[str]]] = None, n: int = 1, size: str = "1024*1024", response_format: str = "url", @@ -469,8 +482,6 @@ def inpainting( # calculate actual image size after padding width, height = image.size - kwargs["negative_prompt"] = negative_prompt - self._filter_kwargs(kwargs) return self._call_model( image=image, mask_image=mask_image, From 67bd4db700b060948a3b77d6f53920b90c947a6d Mon Sep 17 00:00:00 2001 From: Xuye Qin Date: Fri, 20 Sep 2024 16:11:16 +0800 Subject: [PATCH 16/17] DOC: update models for doc and readme (#2330) --- README.md | 8 +- README_zh_CN.md | 8 +- doc/source/getting_started/installation.rst | 4 +- .../builtin/audio/fishspeech-1.2-sft.rst | 19 - .../models/builtin/audio/fishspeech-1.4.rst | 19 + doc/source/models/builtin/audio/index.rst | 2 +- .../models/builtin/image/flux.1-dev.rst | 2 +- .../models/builtin/image/flux.1-schnell.rst | 2 +- .../builtin/llm/deepseek-v2-chat-0628.rst | 31 ++ .../models/builtin/llm/deepseek-v2-chat.rst | 47 ++ .../models/builtin/llm/deepseek-v2.5.rst | 31 ++ doc/source/models/builtin/llm/deepseek-v2.rst | 47 ++ doc/source/models/builtin/llm/index.rst | 63 +++ .../builtin/llm/qwen2-audio-instruct.rst | 31 ++ doc/source/models/builtin/llm/qwen2-audio.rst | 31 ++ .../models/builtin/llm/qwen2.5-instruct.rst | 463 ++++++++++++++++++ .../models/builtin/llm/yi-coder-chat.rst | 47 ++ doc/source/models/builtin/llm/yi-coder.rst | 47 ++ doc/source/user_guide/backends.rst | 4 +- 19 files changed, 874 insertions(+), 32 deletions(-) delete mode 100644 
doc/source/models/builtin/audio/fishspeech-1.2-sft.rst create mode 100644 doc/source/models/builtin/audio/fishspeech-1.4.rst create mode 100644 doc/source/models/builtin/llm/deepseek-v2-chat-0628.rst create mode 100644 doc/source/models/builtin/llm/deepseek-v2-chat.rst create mode 100644 doc/source/models/builtin/llm/deepseek-v2.5.rst create mode 100644 doc/source/models/builtin/llm/deepseek-v2.rst create mode 100644 doc/source/models/builtin/llm/qwen2-audio-instruct.rst create mode 100644 doc/source/models/builtin/llm/qwen2-audio.rst create mode 100644 doc/source/models/builtin/llm/qwen2.5-instruct.rst create mode 100644 doc/source/models/builtin/llm/yi-coder-chat.rst create mode 100644 doc/source/models/builtin/llm/yi-coder.rst diff --git a/README.md b/README.md index 576dff498e..f478bfd37b 100644 --- a/README.md +++ b/README.md @@ -34,14 +34,14 @@ potential of cutting-edge AI models. - Support speech recognition model: [#929](https://github.com/xorbitsai/inference/pull/929) - Metrics support: [#906](https://github.com/xorbitsai/inference/pull/906) ### New Models +- Built-in support for [Qwen 2.5 Series](https://qwenlm.github.io/blog/qwen2.5/): [#2325](https://github.com/xorbitsai/inference/pull/2325) +- Built-in support for [Fish Speech V1.4](https://huggingface.co/fishaudio/fish-speech-1.4): [#2295](https://github.com/xorbitsai/inference/pull/2295) +- Built-in support for [DeepSeek-V2.5](https://huggingface.co/deepseek-ai/DeepSeek-V2.5): [#2292](https://github.com/xorbitsai/inference/pull/2292) +- Built-in support for [Qwen2-Audio](https://github.com/QwenLM/Qwen2-Audio): [#2271](https://github.com/xorbitsai/inference/pull/2271) - Built-in support for [Qwen2-vl-instruct](https://github.com/QwenLM/Qwen2-VL): [#2205](https://github.com/xorbitsai/inference/pull/2205) - Built-in support for [MiniCPM3-4B](https://huggingface.co/openbmb/MiniCPM3-4B): [#2263](https://github.com/xorbitsai/inference/pull/2263) - Built-in support for [CogVideoX](https://github.com/THUDM/CogVideo): [#2049](https://github.com/xorbitsai/inference/pull/2049) - Built-in support for [flux.1-schnell & flux.1-dev](https://www.basedlabs.ai/tools/flux1): [#2007](https://github.com/xorbitsai/inference/pull/2007) -- Built-in support for [MiniCPM-V 2.6](https://github.com/OpenBMB/MiniCPM-V): [#2031](https://github.com/xorbitsai/inference/pull/2031) -- Built-in support for [Kolors](https://huggingface.co/Kwai-Kolors/Kolors): [#2028](https://github.com/xorbitsai/inference/pull/2028) -- Built-in support for [SenseVoice](https://github.com/FunAudioLLM/SenseVoice): [#2008](https://github.com/xorbitsai/inference/pull/2008) -- Built-in support for [Mistral Large 2](https://mistral.ai/news/mistral-large-2407/): [#1944](https://github.com/xorbitsai/inference/pull/1944) ### Integrations - [Dify](https://docs.dify.ai/advanced/model-configuration/xinference): an LLMOps platform that enables developers (and even non-developers) to quickly build useful applications based on large language models, ensuring they are visual, operable, and improvable. - [FastGPT](https://github.com/labring/FastGPT): a knowledge-based platform built on the LLM, offers out-of-the-box data processing and model invocation capabilities, allows for workflow orchestration through Flow visualization. 
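As a quick smoke test for the newly documented models, the sketch below queries a running Xinference server through its OpenAI-compatible API. It is a minimal illustration only: the endpoint ``http://127.0.0.1:9997/v1``, the placeholder API key, and the model UID ``qwen2.5-instruct`` are assumptions for a default local deployment and should be adjusted to match how the model was actually launched::

    # Minimal sketch: call a newly supported model (e.g. qwen2.5-instruct) through
    # Xinference's OpenAI-compatible endpoint. The endpoint, key, and model UID below
    # are assumptions for a default local deployment, not values taken from this patch.
    from openai import OpenAI

    client = OpenAI(
        base_url="http://127.0.0.1:9997/v1",  # default local Xinference address; adjust as needed
        api_key="not-used",                   # ignored unless authentication is enabled on the server
    )

    resp = client.chat.completions.create(
        model="qwen2.5-instruct",  # the model UID used at launch time (assumed here to equal the model name)
        messages=[
            {"role": "user", "content": "Summarize what a Mixture-of-Experts model is in one sentence."}
        ],
        max_tokens=128,
    )
    print(resp.choices[0].message.content)
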
diff --git a/README_zh_CN.md b/README_zh_CN.md index 08a1f80b27..cd155e3997 100644 --- a/README_zh_CN.md +++ b/README_zh_CN.md @@ -31,14 +31,14 @@ Xorbits Inference(Xinference)是一个性能强大且功能全面的分布 - 支持语音识别模型: [#929](https://github.com/xorbitsai/inference/pull/929) - 增加 Metrics 统计信息: [#906](https://github.com/xorbitsai/inference/pull/906) ### 新模型 +- 内置 [Qwen 2.5 Series](https://qwenlm.github.io/blog/qwen2.5/): [#2325](https://github.com/xorbitsai/inference/pull/2325) +- 内置 [Fish Speech V1.4](https://huggingface.co/fishaudio/fish-speech-1.4): [#2295](https://github.com/xorbitsai/inference/pull/2295) +- 内置 [DeepSeek-V2.5](https://huggingface.co/deepseek-ai/DeepSeek-V2.5): [#2292](https://github.com/xorbitsai/inference/pull/2292) +- 内置 [Qwen2-Audio](https://github.com/QwenLM/Qwen2-Audio): [#2271](https://github.com/xorbitsai/inference/pull/2271) - 内置 [Qwen2-vl-instruct](https://github.com/QwenLM/Qwen2-VL): [#2205](https://github.com/xorbitsai/inference/pull/2205) - 内置 [MiniCPM3-4B](https://huggingface.co/openbmb/MiniCPM3-4B): [#2263](https://github.com/xorbitsai/inference/pull/2263) - 内置 [CogVideoX](https://github.com/THUDM/CogVideo): [#2049](https://github.com/xorbitsai/inference/pull/2049) - 内置 [flux.1-schnell & flux.1-dev](https://www.basedlabs.ai/tools/flux1): [#2007](https://github.com/xorbitsai/inference/pull/2007) -- 内置 [MiniCPM-V 2.6](https://github.com/OpenBMB/MiniCPM-V): [#2031](https://github.com/xorbitsai/inference/pull/2031) -- 内置 [Kolors](https://huggingface.co/Kwai-Kolors/Kolors): [#2028](https://github.com/xorbitsai/inference/pull/2028) -- 内置 [SenseVoice](https://github.com/FunAudioLLM/SenseVoice): [#2008](https://github.com/xorbitsai/inference/pull/2008) -- 内置 [Mistral Large 2](https://mistral.ai/news/mistral-large-2407/): [#1944](https://github.com/xorbitsai/inference/pull/1944) ### 集成 - [FastGPT](https://doc.fastai.site/docs/development/custom-models/xinference/):一个基于 LLM 大模型的开源 AI 知识库构建平台。提供了开箱即用的数据处理、模型调用、RAG 检索、可视化 AI 工作流编排等能力,帮助您轻松实现复杂的问答场景。 - [Dify](https://docs.dify.ai/advanced/model-configuration/xinference): 一个涵盖了大型语言模型开发、部署、维护和优化的 LLMOps 平台。 diff --git a/doc/source/getting_started/installation.rst b/doc/source/getting_started/installation.rst index e52384bee7..8490f93439 100644 --- a/doc/source/getting_started/installation.rst +++ b/doc/source/getting_started/installation.rst @@ -44,7 +44,8 @@ Currently, supported models include: - ``codestral-v0.1`` - ``Yi``, ``Yi-1.5``, ``Yi-chat``, ``Yi-1.5-chat``, ``Yi-1.5-chat-16k`` - ``code-llama``, ``code-llama-python``, ``code-llama-instruct`` -- ``deepseek``, ``deepseek-coder``, ``deepseek-chat``, ``deepseek-coder-instruct`` +- ``deepseek``, ``deepseek-coder``, ``deepseek-chat``, ``deepseek-coder-instruct``, ``deepseek-v2-chat``, ``deepseek-v2-chat-0628``, ``deepseek-v2.5`` +- ``yi-coder``, ``yi-coder-chat`` - ``codeqwen1.5``, ``codeqwen1.5-chat`` - ``baichuan-2-chat`` - ``internlm2-chat`` @@ -56,6 +57,7 @@ Currently, supported models include: - ``codegeex4`` - ``qwen1.5-chat``, ``qwen1.5-moe-chat`` - ``qwen2-instruct``, ``qwen2-moe-instruct`` +- ``qwen2.5-instruct`` - ``gemma-it``, ``gemma-2-it`` - ``orion-chat``, ``orion-chat-rag`` - ``c4ai-command-r-v01`` diff --git a/doc/source/models/builtin/audio/fishspeech-1.2-sft.rst b/doc/source/models/builtin/audio/fishspeech-1.2-sft.rst deleted file mode 100644 index 3afac1f7e3..0000000000 --- a/doc/source/models/builtin/audio/fishspeech-1.2-sft.rst +++ /dev/null @@ -1,19 +0,0 @@ -.. 
_models_builtin_fishspeech-1.2-sft: - -================== -FishSpeech-1.2-SFT -================== - -- **Model Name:** FishSpeech-1.2-SFT -- **Model Family:** FishAudio -- **Abilities:** text-to-audio -- **Multilingual:** True - -Specifications -^^^^^^^^^^^^^^ - -- **Model ID:** fishaudio/fish-speech-1.2-sft - -Execute the following command to launch the model:: - - xinference launch --model-name FishSpeech-1.2-SFT --model-type audio \ No newline at end of file diff --git a/doc/source/models/builtin/audio/fishspeech-1.4.rst b/doc/source/models/builtin/audio/fishspeech-1.4.rst new file mode 100644 index 0000000000..c256495d67 --- /dev/null +++ b/doc/source/models/builtin/audio/fishspeech-1.4.rst @@ -0,0 +1,19 @@ +.. _models_builtin_fishspeech-1.4: + +============== +FishSpeech-1.4 +============== + +- **Model Name:** FishSpeech-1.4 +- **Model Family:** FishAudio +- **Abilities:** text-to-audio +- **Multilingual:** True + +Specifications +^^^^^^^^^^^^^^ + +- **Model ID:** fishaudio/fish-speech-1.4 + +Execute the following command to launch the model:: + + xinference launch --model-name FishSpeech-1.4 --model-type audio \ No newline at end of file diff --git a/doc/source/models/builtin/audio/index.rst b/doc/source/models/builtin/audio/index.rst index 8959b2b94f..d4b6b886ac 100644 --- a/doc/source/models/builtin/audio/index.rst +++ b/doc/source/models/builtin/audio/index.rst @@ -25,7 +25,7 @@ The following is a list of built-in audio models in Xinference: cosyvoice-300m-sft - fishspeech-1.2-sft + fishspeech-1.4 sensevoicesmall diff --git a/doc/source/models/builtin/image/flux.1-dev.rst b/doc/source/models/builtin/image/flux.1-dev.rst index 829bcbfd75..3a16cfe0a7 100644 --- a/doc/source/models/builtin/image/flux.1-dev.rst +++ b/doc/source/models/builtin/image/flux.1-dev.rst @@ -6,7 +6,7 @@ FLUX.1-dev - **Model Name:** FLUX.1-dev - **Model Family:** stable_diffusion -- **Abilities:** text2image +- **Abilities:** text2image, image2image, inpainting - **Available ControlNet:** None Specifications diff --git a/doc/source/models/builtin/image/flux.1-schnell.rst b/doc/source/models/builtin/image/flux.1-schnell.rst index 268f5a1720..df82d2069f 100644 --- a/doc/source/models/builtin/image/flux.1-schnell.rst +++ b/doc/source/models/builtin/image/flux.1-schnell.rst @@ -6,7 +6,7 @@ FLUX.1-schnell - **Model Name:** FLUX.1-schnell - **Model Family:** stable_diffusion -- **Abilities:** text2image +- **Abilities:** text2image, image2image, inpainting - **Available ControlNet:** None Specifications diff --git a/doc/source/models/builtin/llm/deepseek-v2-chat-0628.rst b/doc/source/models/builtin/llm/deepseek-v2-chat-0628.rst new file mode 100644 index 0000000000..d6e91cb248 --- /dev/null +++ b/doc/source/models/builtin/llm/deepseek-v2-chat-0628.rst @@ -0,0 +1,31 @@ +.. _models_llm_deepseek-v2-chat-0628: + +======================================== +deepseek-v2-chat-0628 +======================================== + +- **Context Length:** 128000 +- **Model Name:** deepseek-v2-chat-0628 +- **Languages:** en, zh +- **Abilities:** chat +- **Description:** DeepSeek-V2-Chat-0628 is an improved version of DeepSeek-V2-Chat. 
+ +Specifications +^^^^^^^^^^^^^^ + + +Model Spec 1 (pytorch, 236 Billion) +++++++++++++++++++++++++++++++++++++++++ + +- **Model Format:** pytorch +- **Model Size (in billions):** 236 +- **Quantizations:** 4-bit, 8-bit, none +- **Engines**: vLLM, Transformers, SGLang (vLLM and SGLang only available for quantization none) +- **Model ID:** deepseek-ai/DeepSeek-V2-Chat-0628 +- **Model Hubs**: `Hugging Face `__, `ModelScope `__ + +Execute the following command to launch the model, remember to replace ``${quantization}`` with your +chosen quantization method from the options listed above:: + + xinference launch --model-engine ${engine} --model-name deepseek-v2-chat-0628 --size-in-billions 236 --model-format pytorch --quantization ${quantization} + diff --git a/doc/source/models/builtin/llm/deepseek-v2-chat.rst b/doc/source/models/builtin/llm/deepseek-v2-chat.rst new file mode 100644 index 0000000000..84595c2bbb --- /dev/null +++ b/doc/source/models/builtin/llm/deepseek-v2-chat.rst @@ -0,0 +1,47 @@ +.. _models_llm_deepseek-v2-chat: + +======================================== +deepseek-v2-chat +======================================== + +- **Context Length:** 128000 +- **Model Name:** deepseek-v2-chat +- **Languages:** en, zh +- **Abilities:** chat +- **Description:** DeepSeek-V2, a strong Mixture-of-Experts (MoE) language model characterized by economical training and efficient inference. + +Specifications +^^^^^^^^^^^^^^ + + +Model Spec 1 (pytorch, 16 Billion) +++++++++++++++++++++++++++++++++++++++++ + +- **Model Format:** pytorch +- **Model Size (in billions):** 16 +- **Quantizations:** 4-bit, 8-bit, none +- **Engines**: vLLM, Transformers, SGLang (vLLM and SGLang only available for quantization none) +- **Model ID:** deepseek-ai/DeepSeek-V2-Lite-Chat +- **Model Hubs**: `Hugging Face `__, `ModelScope `__ + +Execute the following command to launch the model, remember to replace ``${quantization}`` with your +chosen quantization method from the options listed above:: + + xinference launch --model-engine ${engine} --model-name deepseek-v2-chat --size-in-billions 16 --model-format pytorch --quantization ${quantization} + + +Model Spec 2 (pytorch, 236 Billion) +++++++++++++++++++++++++++++++++++++++++ + +- **Model Format:** pytorch +- **Model Size (in billions):** 236 +- **Quantizations:** 4-bit, 8-bit, none +- **Engines**: vLLM, Transformers, SGLang (vLLM and SGLang only available for quantization none) +- **Model ID:** deepseek-ai/DeepSeek-V2-Chat +- **Model Hubs**: `Hugging Face `__, `ModelScope `__ + +Execute the following command to launch the model, remember to replace ``${quantization}`` with your +chosen quantization method from the options listed above:: + + xinference launch --model-engine ${engine} --model-name deepseek-v2-chat --size-in-billions 236 --model-format pytorch --quantization ${quantization} + diff --git a/doc/source/models/builtin/llm/deepseek-v2.5.rst b/doc/source/models/builtin/llm/deepseek-v2.5.rst new file mode 100644 index 0000000000..5f5b9475d4 --- /dev/null +++ b/doc/source/models/builtin/llm/deepseek-v2.5.rst @@ -0,0 +1,31 @@ +.. _models_llm_deepseek-v2.5: + +======================================== +deepseek-v2.5 +======================================== + +- **Context Length:** 128000 +- **Model Name:** deepseek-v2.5 +- **Languages:** en, zh +- **Abilities:** chat +- **Description:** DeepSeek-V2.5 is an upgraded version that combines DeepSeek-V2-Chat and DeepSeek-Coder-V2-Instruct. 
The new model integrates the general and coding abilities of the two previous versions. + +Specifications +^^^^^^^^^^^^^^ + + +Model Spec 1 (pytorch, 236 Billion) +++++++++++++++++++++++++++++++++++++++++ + +- **Model Format:** pytorch +- **Model Size (in billions):** 236 +- **Quantizations:** 4-bit, 8-bit, none +- **Engines**: vLLM, Transformers, SGLang (vLLM and SGLang only available for quantization none) +- **Model ID:** deepseek-ai/DeepSeek-V2.5 +- **Model Hubs**: `Hugging Face `__, `ModelScope `__ + +Execute the following command to launch the model, remember to replace ``${quantization}`` with your +chosen quantization method from the options listed above:: + + xinference launch --model-engine ${engine} --model-name deepseek-v2.5 --size-in-billions 236 --model-format pytorch --quantization ${quantization} + diff --git a/doc/source/models/builtin/llm/deepseek-v2.rst b/doc/source/models/builtin/llm/deepseek-v2.rst new file mode 100644 index 0000000000..4102b9568c --- /dev/null +++ b/doc/source/models/builtin/llm/deepseek-v2.rst @@ -0,0 +1,47 @@ +.. _models_llm_deepseek-v2: + +======================================== +deepseek-v2 +======================================== + +- **Context Length:** 128000 +- **Model Name:** deepseek-v2 +- **Languages:** en, zh +- **Abilities:** generate +- **Description:** DeepSeek-V2, a strong Mixture-of-Experts (MoE) language model characterized by economical training and efficient inference. + +Specifications +^^^^^^^^^^^^^^ + + +Model Spec 1 (pytorch, 16 Billion) +++++++++++++++++++++++++++++++++++++++++ + +- **Model Format:** pytorch +- **Model Size (in billions):** 16 +- **Quantizations:** 4-bit, 8-bit, none +- **Engines**: Transformers +- **Model ID:** deepseek-ai/DeepSeek-V2-Lite +- **Model Hubs**: `Hugging Face `__, `ModelScope `__ + +Execute the following command to launch the model, remember to replace ``${quantization}`` with your +chosen quantization method from the options listed above:: + + xinference launch --model-engine ${engine} --model-name deepseek-v2 --size-in-billions 16 --model-format pytorch --quantization ${quantization} + + +Model Spec 2 (pytorch, 236 Billion) +++++++++++++++++++++++++++++++++++++++++ + +- **Model Format:** pytorch +- **Model Size (in billions):** 236 +- **Quantizations:** 4-bit, 8-bit, none +- **Engines**: Transformers +- **Model ID:** deepseek-ai/DeepSeek-V2 +- **Model Hubs**: `Hugging Face `__, `ModelScope `__ + +Execute the following command to launch the model, remember to replace ``${quantization}`` with your +chosen quantization method from the options listed above:: + + xinference launch --model-engine ${engine} --model-name deepseek-v2 --size-in-billions 236 --model-format pytorch --quantization ${quantization} + diff --git a/doc/source/models/builtin/llm/index.rst b/doc/source/models/builtin/llm/index.rst index bab4b1093d..73bd2b9894 100644 --- a/doc/source/models/builtin/llm/index.rst +++ b/doc/source/models/builtin/llm/index.rst @@ -126,6 +126,26 @@ The following is a list of built-in LLM in Xinference: - 16384 - deepseek-coder-instruct is a model initialized from deepseek-coder-base and fine-tuned on 2B tokens of instruction data. + * - :ref:`deepseek-v2 ` + - generate + - 128000 + - DeepSeek-V2, a strong Mixture-of-Experts (MoE) language model characterized by economical training and efficient inference. + + * - :ref:`deepseek-v2-chat ` + - chat + - 128000 + - DeepSeek-V2, a strong Mixture-of-Experts (MoE) language model characterized by economical training and efficient inference. 
+ + * - :ref:`deepseek-v2-chat-0628 ` + - chat + - 128000 + - DeepSeek-V2-Chat-0628 is an improved version of DeepSeek-V2-Chat. + + * - :ref:`deepseek-v2.5 ` + - chat + - 128000 + - DeepSeek-V2.5 is an upgraded version that combines DeepSeek-V2-Chat and DeepSeek-Coder-V2-Instruct. The new model integrates the general and coding abilities of the two previous versions. + * - :ref:`deepseek-vl-chat ` - chat, vision - 4096 @@ -371,6 +391,16 @@ The following is a list of built-in LLM in Xinference: - 32768 - Qwen1.5-MoE is a transformer-based MoE decoder-only language model pretrained on a large amount of data. + * - :ref:`qwen2-audio ` + - chat, audio + - 32768 + - Qwen2-Audio: A large-scale audio-language model which is capable of accepting various audio signal inputs and performing audio analysis or direct textual responses with regard to speech instructions. + + * - :ref:`qwen2-audio-instruct ` + - chat, audio + - 32768 + - Qwen2-Audio: A large-scale audio-language model which is capable of accepting various audio signal inputs and performing audio analysis or direct textual responses with regard to speech instructions. + * - :ref:`qwen2-instruct ` - chat, tools - 32768 @@ -386,6 +416,11 @@ The following is a list of built-in LLM in Xinference: - 32768 - Qwen2-VL: To See the World More Clearly.Qwen2-VL is the latest version of the vision language models in the Qwen model familities. + * - :ref:`qwen2.5-instruct ` + - chat, tools + - 131072 + - Qwen2.5 is the latest series of Qwen large language models. For Qwen2.5, we release a number of base language models and instruction-tuned language models ranging from 0.5 to 72 billion parameters. + * - :ref:`seallm_v2 ` - generate - 8192 @@ -471,6 +506,16 @@ The following is a list of built-in LLM in Xinference: - 4096 - The Yi series models are large language models trained from scratch by developers at 01.AI. + * - :ref:`yi-coder ` + - generate + - 131072 + - Yi-Coder is a series of open-source code language models that delivers state-of-the-art coding performance with fewer than 10 billion parameters.Excelling in long-context understanding with a maximum context length of 128K tokens.Supporting 52 major programming languages, including popular ones such as Java, Python, JavaScript, and C++. + + * - :ref:`yi-coder-chat ` + - chat + - 131072 + - Yi-Coder is a series of open-source code language models that delivers state-of-the-art coding performance with fewer than 10 billion parameters.Excelling in long-context understanding with a maximum context length of 128K tokens.Supporting 52 major programming languages, including popular ones such as Java, Python, JavaScript, and C++. + * - :ref:`yi-vl-chat ` - chat, vision - 4096 @@ -525,6 +570,14 @@ The following is a list of built-in LLM in Xinference: deepseek-coder-instruct + deepseek-v2 + + deepseek-v2-chat + + deepseek-v2-chat-0628 + + deepseek-v2.5 + deepseek-vl-chat gemma-2-it @@ -623,12 +676,18 @@ The following is a list of built-in LLM in Xinference: qwen1.5-moe-chat + qwen2-audio + + qwen2-audio-instruct + qwen2-instruct qwen2-moe-instruct qwen2-vl-instruct + qwen2.5-instruct + seallm_v2 seallm_v2.5 @@ -663,6 +722,10 @@ The following is a list of built-in LLM in Xinference: yi-chat + yi-coder + + yi-coder-chat + yi-vl-chat diff --git a/doc/source/models/builtin/llm/qwen2-audio-instruct.rst b/doc/source/models/builtin/llm/qwen2-audio-instruct.rst new file mode 100644 index 0000000000..2d126a387e --- /dev/null +++ b/doc/source/models/builtin/llm/qwen2-audio-instruct.rst @@ -0,0 +1,31 @@ +.. 
_models_llm_qwen2-audio-instruct: + +======================================== +qwen2-audio-instruct +======================================== + +- **Context Length:** 32768 +- **Model Name:** qwen2-audio-instruct +- **Languages:** en, zh +- **Abilities:** chat, audio +- **Description:** Qwen2-Audio: A large-scale audio-language model which is capable of accepting various audio signal inputs and performing audio analysis or direct textual responses with regard to speech instructions. + +Specifications +^^^^^^^^^^^^^^ + + +Model Spec 1 (pytorch, 7 Billion) +++++++++++++++++++++++++++++++++++++++++ + +- **Model Format:** pytorch +- **Model Size (in billions):** 7 +- **Quantizations:** none +- **Engines**: Transformers +- **Model ID:** Qwen/Qwen2-Audio-7B-Instruct +- **Model Hubs**: `Hugging Face `__, `ModelScope `__ + +Execute the following command to launch the model, remember to replace ``${quantization}`` with your +chosen quantization method from the options listed above:: + + xinference launch --model-engine ${engine} --model-name qwen2-audio-instruct --size-in-billions 7 --model-format pytorch --quantization ${quantization} + diff --git a/doc/source/models/builtin/llm/qwen2-audio.rst b/doc/source/models/builtin/llm/qwen2-audio.rst new file mode 100644 index 0000000000..2973390c44 --- /dev/null +++ b/doc/source/models/builtin/llm/qwen2-audio.rst @@ -0,0 +1,31 @@ +.. _models_llm_qwen2-audio: + +======================================== +qwen2-audio +======================================== + +- **Context Length:** 32768 +- **Model Name:** qwen2-audio +- **Languages:** en, zh +- **Abilities:** chat, audio +- **Description:** Qwen2-Audio: A large-scale audio-language model which is capable of accepting various audio signal inputs and performing audio analysis or direct textual responses with regard to speech instructions. + +Specifications +^^^^^^^^^^^^^^ + + +Model Spec 1 (pytorch, 7 Billion) +++++++++++++++++++++++++++++++++++++++++ + +- **Model Format:** pytorch +- **Model Size (in billions):** 7 +- **Quantizations:** none +- **Engines**: Transformers +- **Model ID:** Qwen/Qwen2-Audio-7B +- **Model Hubs**: `Hugging Face `__, `ModelScope `__ + +Execute the following command to launch the model, remember to replace ``${quantization}`` with your +chosen quantization method from the options listed above:: + + xinference launch --model-engine ${engine} --model-name qwen2-audio --size-in-billions 7 --model-format pytorch --quantization ${quantization} + diff --git a/doc/source/models/builtin/llm/qwen2.5-instruct.rst b/doc/source/models/builtin/llm/qwen2.5-instruct.rst new file mode 100644 index 0000000000..6e6b4db35e --- /dev/null +++ b/doc/source/models/builtin/llm/qwen2.5-instruct.rst @@ -0,0 +1,463 @@ +.. _models_llm_qwen2.5-instruct: + +======================================== +qwen2.5-instruct +======================================== + +- **Context Length:** 131072 +- **Model Name:** qwen2.5-instruct +- **Languages:** en, zh +- **Abilities:** chat, tools +- **Description:** Qwen2.5 is the latest series of Qwen large language models. For Qwen2.5, we release a number of base language models and instruction-tuned language models ranging from 0.5 to 72 billion parameters. 
+ +Specifications +^^^^^^^^^^^^^^ + + +Model Spec 1 (pytorch, 0_5 Billion) +++++++++++++++++++++++++++++++++++++++++ + +- **Model Format:** pytorch +- **Model Size (in billions):** 0_5 +- **Quantizations:** 4-bit, 8-bit, none +- **Engines**: vLLM, Transformers (vLLM only available for quantization none) +- **Model ID:** Qwen/Qwen2.5-0.5B-Instruct +- **Model Hubs**: `Hugging Face `__, `ModelScope `__ + +Execute the following command to launch the model, remember to replace ``${quantization}`` with your +chosen quantization method from the options listed above:: + + xinference launch --model-engine ${engine} --model-name qwen2.5-instruct --size-in-billions 0_5 --model-format pytorch --quantization ${quantization} + + +Model Spec 2 (pytorch, 1_5 Billion) +++++++++++++++++++++++++++++++++++++++++ + +- **Model Format:** pytorch +- **Model Size (in billions):** 1_5 +- **Quantizations:** 4-bit, 8-bit, none +- **Engines**: vLLM, Transformers (vLLM only available for quantization none) +- **Model ID:** Qwen/Qwen2.5-1.5B-Instruct +- **Model Hubs**: `Hugging Face `__, `ModelScope `__ + +Execute the following command to launch the model, remember to replace ``${quantization}`` with your +chosen quantization method from the options listed above:: + + xinference launch --model-engine ${engine} --model-name qwen2.5-instruct --size-in-billions 1_5 --model-format pytorch --quantization ${quantization} + + +Model Spec 3 (pytorch, 3 Billion) +++++++++++++++++++++++++++++++++++++++++ + +- **Model Format:** pytorch +- **Model Size (in billions):** 3 +- **Quantizations:** 4-bit, 8-bit, none +- **Engines**: vLLM, Transformers (vLLM only available for quantization none) +- **Model ID:** Qwen/Qwen2.5-3B-Instruct +- **Model Hubs**: `Hugging Face `__, `ModelScope `__ + +Execute the following command to launch the model, remember to replace ``${quantization}`` with your +chosen quantization method from the options listed above:: + + xinference launch --model-engine ${engine} --model-name qwen2.5-instruct --size-in-billions 3 --model-format pytorch --quantization ${quantization} + + +Model Spec 4 (pytorch, 7 Billion) +++++++++++++++++++++++++++++++++++++++++ + +- **Model Format:** pytorch +- **Model Size (in billions):** 7 +- **Quantizations:** 4-bit, 8-bit, none +- **Engines**: vLLM, Transformers (vLLM only available for quantization none) +- **Model ID:** Qwen/Qwen2.5-7B-Instruct +- **Model Hubs**: `Hugging Face `__, `ModelScope `__ + +Execute the following command to launch the model, remember to replace ``${quantization}`` with your +chosen quantization method from the options listed above:: + + xinference launch --model-engine ${engine} --model-name qwen2.5-instruct --size-in-billions 7 --model-format pytorch --quantization ${quantization} + + +Model Spec 5 (pytorch, 14 Billion) +++++++++++++++++++++++++++++++++++++++++ + +- **Model Format:** pytorch +- **Model Size (in billions):** 14 +- **Quantizations:** 4-bit, 8-bit, none +- **Engines**: vLLM, Transformers (vLLM only available for quantization none) +- **Model ID:** Qwen/Qwen2.5-14B-Instruct +- **Model Hubs**: `Hugging Face `__, `ModelScope `__ + +Execute the following command to launch the model, remember to replace ``${quantization}`` with your +chosen quantization method from the options listed above:: + + xinference launch --model-engine ${engine} --model-name qwen2.5-instruct --size-in-billions 14 --model-format pytorch --quantization ${quantization} + + +Model Spec 6 (pytorch, 32 Billion) +++++++++++++++++++++++++++++++++++++++++ + +- **Model Format:** 
pytorch +- **Model Size (in billions):** 32 +- **Quantizations:** 4-bit, 8-bit, none +- **Engines**: vLLM, Transformers (vLLM only available for quantization none) +- **Model ID:** Qwen/Qwen2.5-32B-Instruct +- **Model Hubs**: `Hugging Face `__, `ModelScope `__ + +Execute the following command to launch the model, remember to replace ``${quantization}`` with your +chosen quantization method from the options listed above:: + + xinference launch --model-engine ${engine} --model-name qwen2.5-instruct --size-in-billions 32 --model-format pytorch --quantization ${quantization} + + +Model Spec 7 (pytorch, 72 Billion) +++++++++++++++++++++++++++++++++++++++++ + +- **Model Format:** pytorch +- **Model Size (in billions):** 72 +- **Quantizations:** 4-bit, 8-bit, none +- **Engines**: vLLM, Transformers (vLLM only available for quantization none) +- **Model ID:** Qwen/Qwen2.5-72B-Instruct +- **Model Hubs**: `Hugging Face `__, `ModelScope `__ + +Execute the following command to launch the model, remember to replace ``${quantization}`` with your +chosen quantization method from the options listed above:: + + xinference launch --model-engine ${engine} --model-name qwen2.5-instruct --size-in-billions 72 --model-format pytorch --quantization ${quantization} + + +Model Spec 8 (gptq, 0_5 Billion) +++++++++++++++++++++++++++++++++++++++++ + +- **Model Format:** gptq +- **Model Size (in billions):** 0_5 +- **Quantizations:** Int4, Int8 +- **Engines**: vLLM, Transformers +- **Model ID:** Qwen/Qwen2.5-0.5B-Instruct-GPTQ-{quantization} +- **Model Hubs**: `Hugging Face `__, `ModelScope `__ + +Execute the following command to launch the model, remember to replace ``${quantization}`` with your +chosen quantization method from the options listed above:: + + xinference launch --model-engine ${engine} --model-name qwen2.5-instruct --size-in-billions 0_5 --model-format gptq --quantization ${quantization} + + +Model Spec 9 (gptq, 1_5 Billion) +++++++++++++++++++++++++++++++++++++++++ + +- **Model Format:** gptq +- **Model Size (in billions):** 1_5 +- **Quantizations:** Int4, Int8 +- **Engines**: vLLM, Transformers +- **Model ID:** Qwen/Qwen2.5-1.5B-Instruct-GPTQ-{quantization} +- **Model Hubs**: `Hugging Face `__, `ModelScope `__ + +Execute the following command to launch the model, remember to replace ``${quantization}`` with your +chosen quantization method from the options listed above:: + + xinference launch --model-engine ${engine} --model-name qwen2.5-instruct --size-in-billions 1_5 --model-format gptq --quantization ${quantization} + + +Model Spec 10 (gptq, 3 Billion) +++++++++++++++++++++++++++++++++++++++++ + +- **Model Format:** gptq +- **Model Size (in billions):** 3 +- **Quantizations:** Int4, Int8 +- **Engines**: vLLM, Transformers +- **Model ID:** Qwen/Qwen2.5-3B-Instruct-GPTQ-{quantization} +- **Model Hubs**: `Hugging Face `__, `ModelScope `__ + +Execute the following command to launch the model, remember to replace ``${quantization}`` with your +chosen quantization method from the options listed above:: + + xinference launch --model-engine ${engine} --model-name qwen2.5-instruct --size-in-billions 3 --model-format gptq --quantization ${quantization} + + +Model Spec 11 (gptq, 7 Billion) +++++++++++++++++++++++++++++++++++++++++ + +- **Model Format:** gptq +- **Model Size (in billions):** 7 +- **Quantizations:** Int4, Int8 +- **Engines**: vLLM, Transformers +- **Model ID:** Qwen/Qwen2.5-7B-Instruct-GPTQ-{quantization} +- **Model Hubs**: `Hugging Face `__, `ModelScope `__ + +Execute the following command to 
launch the model, remember to replace ``${quantization}`` with your +chosen quantization method from the options listed above:: + + xinference launch --model-engine ${engine} --model-name qwen2.5-instruct --size-in-billions 7 --model-format gptq --quantization ${quantization} + + +Model Spec 12 (gptq, 14 Billion) +++++++++++++++++++++++++++++++++++++++++ + +- **Model Format:** gptq +- **Model Size (in billions):** 14 +- **Quantizations:** Int4, Int8 +- **Engines**: vLLM, Transformers +- **Model ID:** Qwen/Qwen2.5-14B-Instruct-GPTQ-{quantization} +- **Model Hubs**: `Hugging Face `__, `ModelScope `__ + +Execute the following command to launch the model, remember to replace ``${quantization}`` with your +chosen quantization method from the options listed above:: + + xinference launch --model-engine ${engine} --model-name qwen2.5-instruct --size-in-billions 14 --model-format gptq --quantization ${quantization} + + +Model Spec 13 (gptq, 32 Billion) +++++++++++++++++++++++++++++++++++++++++ + +- **Model Format:** gptq +- **Model Size (in billions):** 32 +- **Quantizations:** Int4, Int8 +- **Engines**: vLLM, Transformers +- **Model ID:** Qwen/Qwen2.5-32B-Instruct-GPTQ-{quantization} +- **Model Hubs**: `Hugging Face `__, `ModelScope `__ + +Execute the following command to launch the model, remember to replace ``${quantization}`` with your +chosen quantization method from the options listed above:: + + xinference launch --model-engine ${engine} --model-name qwen2.5-instruct --size-in-billions 32 --model-format gptq --quantization ${quantization} + + +Model Spec 14 (gptq, 72 Billion) +++++++++++++++++++++++++++++++++++++++++ + +- **Model Format:** gptq +- **Model Size (in billions):** 72 +- **Quantizations:** Int4, Int8 +- **Engines**: vLLM, Transformers +- **Model ID:** Qwen/Qwen2.5-72B-Instruct-GPTQ-{quantization} +- **Model Hubs**: `Hugging Face `__, `ModelScope `__ + +Execute the following command to launch the model, remember to replace ``${quantization}`` with your +chosen quantization method from the options listed above:: + + xinference launch --model-engine ${engine} --model-name qwen2.5-instruct --size-in-billions 72 --model-format gptq --quantization ${quantization} + + +Model Spec 15 (awq, 0_5 Billion) +++++++++++++++++++++++++++++++++++++++++ + +- **Model Format:** awq +- **Model Size (in billions):** 0_5 +- **Quantizations:** Int4 +- **Engines**: vLLM, Transformers +- **Model ID:** Qwen/Qwen2.5-0.5B-Instruct-AWQ +- **Model Hubs**: `Hugging Face `__, `ModelScope `__ + +Execute the following command to launch the model, remember to replace ``${quantization}`` with your +chosen quantization method from the options listed above:: + + xinference launch --model-engine ${engine} --model-name qwen2.5-instruct --size-in-billions 0_5 --model-format awq --quantization ${quantization} + + +Model Spec 16 (awq, 1_5 Billion) +++++++++++++++++++++++++++++++++++++++++ + +- **Model Format:** awq +- **Model Size (in billions):** 1_5 +- **Quantizations:** Int4 +- **Engines**: vLLM, Transformers +- **Model ID:** Qwen/Qwen2.5-1.5B-Instruct-AWQ +- **Model Hubs**: `Hugging Face `__, `ModelScope `__ + +Execute the following command to launch the model, remember to replace ``${quantization}`` with your +chosen quantization method from the options listed above:: + + xinference launch --model-engine ${engine} --model-name qwen2.5-instruct --size-in-billions 1_5 --model-format awq --quantization ${quantization} + + +Model Spec 17 (awq, 3 Billion) +++++++++++++++++++++++++++++++++++++++++ + +- **Model Format:** awq 
+- **Model Size (in billions):** 3 +- **Quantizations:** Int4 +- **Engines**: vLLM, Transformers +- **Model ID:** Qwen/Qwen2.5-3B-Instruct-AWQ +- **Model Hubs**: `Hugging Face `__, `ModelScope `__ + +Execute the following command to launch the model, remember to replace ``${quantization}`` with your +chosen quantization method from the options listed above:: + + xinference launch --model-engine ${engine} --model-name qwen2.5-instruct --size-in-billions 3 --model-format awq --quantization ${quantization} + + +Model Spec 18 (awq, 7 Billion) +++++++++++++++++++++++++++++++++++++++++ + +- **Model Format:** awq +- **Model Size (in billions):** 7 +- **Quantizations:** Int4 +- **Engines**: vLLM, Transformers +- **Model ID:** Qwen/Qwen2.5-7B-Instruct-AWQ +- **Model Hubs**: `Hugging Face `__, `ModelScope `__ + +Execute the following command to launch the model, remember to replace ``${quantization}`` with your +chosen quantization method from the options listed above:: + + xinference launch --model-engine ${engine} --model-name qwen2.5-instruct --size-in-billions 7 --model-format awq --quantization ${quantization} + + +Model Spec 19 (awq, 14 Billion) +++++++++++++++++++++++++++++++++++++++++ + +- **Model Format:** awq +- **Model Size (in billions):** 14 +- **Quantizations:** Int4 +- **Engines**: vLLM, Transformers +- **Model ID:** Qwen/Qwen2.5-14B-Instruct-AWQ +- **Model Hubs**: `Hugging Face `__, `ModelScope `__ + +Execute the following command to launch the model, remember to replace ``${quantization}`` with your +chosen quantization method from the options listed above:: + + xinference launch --model-engine ${engine} --model-name qwen2.5-instruct --size-in-billions 14 --model-format awq --quantization ${quantization} + + +Model Spec 20 (awq, 32 Billion) +++++++++++++++++++++++++++++++++++++++++ + +- **Model Format:** awq +- **Model Size (in billions):** 32 +- **Quantizations:** Int4 +- **Engines**: vLLM, Transformers +- **Model ID:** Qwen/Qwen2.5-32B-Instruct-AWQ +- **Model Hubs**: `Hugging Face `__, `ModelScope `__ + +Execute the following command to launch the model, remember to replace ``${quantization}`` with your +chosen quantization method from the options listed above:: + + xinference launch --model-engine ${engine} --model-name qwen2.5-instruct --size-in-billions 32 --model-format awq --quantization ${quantization} + + +Model Spec 21 (awq, 72 Billion) +++++++++++++++++++++++++++++++++++++++++ + +- **Model Format:** awq +- **Model Size (in billions):** 72 +- **Quantizations:** Int4 +- **Engines**: vLLM, Transformers +- **Model ID:** Qwen/Qwen2.5-72B-Instruct-AWQ +- **Model Hubs**: `Hugging Face `__, `ModelScope `__ + +Execute the following command to launch the model, remember to replace ``${quantization}`` with your +chosen quantization method from the options listed above:: + + xinference launch --model-engine ${engine} --model-name qwen2.5-instruct --size-in-billions 72 --model-format awq --quantization ${quantization} + + +Model Spec 22 (ggufv2, 0_5 Billion) +++++++++++++++++++++++++++++++++++++++++ + +- **Model Format:** ggufv2 +- **Model Size (in billions):** 0_5 +- **Quantizations:** q2_k, q3_k_m, q4_0, q4_k_m, q5_0, q5_k_m, q6_k, q8_0, fp16 +- **Engines**: llama.cpp +- **Model ID:** Qwen/Qwen2.5-0.5B-Instruct-GGUF +- **Model Hubs**: `Hugging Face `__, `ModelScope `__ + +Execute the following command to launch the model, remember to replace ``${quantization}`` with your +chosen quantization method from the options listed above:: + + xinference launch --model-engine ${engine} 
--model-name qwen2.5-instruct --size-in-billions 0_5 --model-format ggufv2 --quantization ${quantization} + + +Model Spec 23 (ggufv2, 1_5 Billion) +++++++++++++++++++++++++++++++++++++++++ + +- **Model Format:** ggufv2 +- **Model Size (in billions):** 1_5 +- **Quantizations:** q2_k, q3_k_m, q4_0, q4_k_m, q5_0, q5_k_m, q6_k, q8_0, fp16 +- **Engines**: llama.cpp +- **Model ID:** Qwen/Qwen2.5-1.5B-Instruct-GGUF +- **Model Hubs**: `Hugging Face `__, `ModelScope `__ + +Execute the following command to launch the model, remember to replace ``${quantization}`` with your +chosen quantization method from the options listed above:: + + xinference launch --model-engine ${engine} --model-name qwen2.5-instruct --size-in-billions 1_5 --model-format ggufv2 --quantization ${quantization} + + +Model Spec 24 (ggufv2, 3 Billion) +++++++++++++++++++++++++++++++++++++++++ + +- **Model Format:** ggufv2 +- **Model Size (in billions):** 3 +- **Quantizations:** q2_k, q3_k_m, q4_0, q4_k_m, q5_0, q5_k_m, q6_k, q8_0, fp16 +- **Engines**: llama.cpp +- **Model ID:** Qwen/Qwen2.5-3B-Instruct-GGUF +- **Model Hubs**: `Hugging Face `__, `ModelScope `__ + +Execute the following command to launch the model, remember to replace ``${quantization}`` with your +chosen quantization method from the options listed above:: + + xinference launch --model-engine ${engine} --model-name qwen2.5-instruct --size-in-billions 3 --model-format ggufv2 --quantization ${quantization} + + +Model Spec 25 (ggufv2, 7 Billion) +++++++++++++++++++++++++++++++++++++++++ + +- **Model Format:** ggufv2 +- **Model Size (in billions):** 7 +- **Quantizations:** q2_k, q3_k_m, q4_0, q4_k_m, q5_0, q5_k_m, q6_k, q8_0, fp16 +- **Engines**: llama.cpp +- **Model ID:** Qwen/Qwen2.5-7B-Instruct-GGUF +- **Model Hubs**: `Hugging Face `__, `ModelScope `__ + +Execute the following command to launch the model, remember to replace ``${quantization}`` with your +chosen quantization method from the options listed above:: + + xinference launch --model-engine ${engine} --model-name qwen2.5-instruct --size-in-billions 7 --model-format ggufv2 --quantization ${quantization} + + +Model Spec 26 (ggufv2, 14 Billion) +++++++++++++++++++++++++++++++++++++++++ + +- **Model Format:** ggufv2 +- **Model Size (in billions):** 14 +- **Quantizations:** q2_k, q3_k_m, q4_0, q4_k_m, q5_0, q5_k_m, q6_k, q8_0, fp16 +- **Engines**: llama.cpp +- **Model ID:** Qwen/Qwen2.5-14B-Instruct-GGUF +- **Model Hubs**: `Hugging Face `__, `ModelScope `__ + +Execute the following command to launch the model, remember to replace ``${quantization}`` with your +chosen quantization method from the options listed above:: + + xinference launch --model-engine ${engine} --model-name qwen2.5-instruct --size-in-billions 14 --model-format ggufv2 --quantization ${quantization} + + +Model Spec 27 (ggufv2, 32 Billion) +++++++++++++++++++++++++++++++++++++++++ + +- **Model Format:** ggufv2 +- **Model Size (in billions):** 32 +- **Quantizations:** q2_k, q3_k_m, q4_0, q4_k_m, q5_0, q5_k_m, q6_k, q8_0, fp16 +- **Engines**: llama.cpp +- **Model ID:** Qwen/Qwen2.5-32B-Instruct-GGUF +- **Model Hubs**: `Hugging Face `__, `ModelScope `__ + +Execute the following command to launch the model, remember to replace ``${quantization}`` with your +chosen quantization method from the options listed above:: + + xinference launch --model-engine ${engine} --model-name qwen2.5-instruct --size-in-billions 32 --model-format ggufv2 --quantization ${quantization} + + +Model Spec 28 (ggufv2, 72 Billion) +++++++++++++++++++++++++++++++++++++++++ + +- 
**Model Format:** ggufv2
+- **Model Size (in billions):** 72
+- **Quantizations:** q2_k, q3_k_m, q4_0, q4_k_m, q5_0, q5_k_m, q6_k, q8_0, fp16
+- **Engines**: llama.cpp
+- **Model ID:** Qwen/Qwen2.5-72B-Instruct-GGUF
+- **Model Hubs**: `Hugging Face `__, `ModelScope `__
+
+Execute the following command to launch the model, remember to replace ``${quantization}`` with your
+chosen quantization method from the options listed above::
+
+   xinference launch --model-engine ${engine} --model-name qwen2.5-instruct --size-in-billions 72 --model-format ggufv2 --quantization ${quantization}
+
diff --git a/doc/source/models/builtin/llm/yi-coder-chat.rst b/doc/source/models/builtin/llm/yi-coder-chat.rst
new file mode 100644
index 0000000000..af4368ae98
--- /dev/null
+++ b/doc/source/models/builtin/llm/yi-coder-chat.rst
@@ -0,0 +1,47 @@
+.. _models_llm_yi-coder-chat:
+
+========================================
+yi-coder-chat
+========================================
+
+- **Context Length:** 131072
+- **Model Name:** yi-coder-chat
+- **Languages:** en
+- **Abilities:** chat
+- **Description:** Yi-Coder is a series of open-source code language models that delivers state-of-the-art coding performance with fewer than 10 billion parameters. It excels in long-context understanding with a maximum context length of 128K tokens and supports 52 major programming languages, including popular ones such as Java, Python, JavaScript, and C++.
+
+Specifications
+^^^^^^^^^^^^^^
+
+
+Model Spec 1 (pytorch, 9 Billion)
+++++++++++++++++++++++++++++++++++++++++
+
+- **Model Format:** pytorch
+- **Model Size (in billions):** 9
+- **Quantizations:** none
+- **Engines**: vLLM, Transformers
+- **Model ID:** 01ai/Yi-Coder-9B-Chat
+- **Model Hubs**: `Hugging Face `__, `ModelScope `__
+
+Execute the following command to launch the model, remember to replace ``${quantization}`` with your
+chosen quantization method from the options listed above::
+
+   xinference launch --model-engine ${engine} --model-name yi-coder-chat --size-in-billions 9 --model-format pytorch --quantization ${quantization}
+
+
+Model Spec 2 (pytorch, 1_5 Billion)
+++++++++++++++++++++++++++++++++++++++++
+
+- **Model Format:** pytorch
+- **Model Size (in billions):** 1_5
+- **Quantizations:** none
+- **Engines**: vLLM, Transformers
+- **Model ID:** 01ai/Yi-Coder-1.5B-Chat
+- **Model Hubs**: `Hugging Face `__, `ModelScope `__
+
+Execute the following command to launch the model, remember to replace ``${quantization}`` with your
+chosen quantization method from the options listed above::
+
+   xinference launch --model-engine ${engine} --model-name yi-coder-chat --size-in-billions 1_5 --model-format pytorch --quantization ${quantization}
+
diff --git a/doc/source/models/builtin/llm/yi-coder.rst b/doc/source/models/builtin/llm/yi-coder.rst
new file mode 100644
index 0000000000..347a3bc9d1
--- /dev/null
+++ b/doc/source/models/builtin/llm/yi-coder.rst
@@ -0,0 +1,47 @@
+.. _models_llm_yi-coder:
+
+========================================
+yi-coder
+========================================
+
+- **Context Length:** 131072
+- **Model Name:** yi-coder
+- **Languages:** en
+- **Abilities:** generate
+- **Description:** Yi-Coder is a series of open-source code language models that delivers state-of-the-art coding performance with fewer than 10 billion parameters. It excels in long-context understanding with a maximum context length of 128K tokens and supports 52 major programming languages, including popular ones such as Java, Python, JavaScript, and C++.
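+
+Besides the ``xinference launch`` commands listed under each spec below, the model can also be launched and
+queried programmatically through the Xinference Python client. The snippet that follows is a minimal sketch,
+assuming a supervisor already running at ``http://127.0.0.1:9997`` and the 9 billion parameter pytorch spec
+from the Specifications section below; adjust the endpoint, engine, and generate config to your own deployment::
+
+   from xinference.client import Client
+
+   # Assumes a Xinference supervisor is reachable at this endpoint.
+   client = Client("http://127.0.0.1:9997")
+
+   # Launch the 9B pytorch build; vLLM and Transformers are the engines listed for it.
+   uid = client.launch_model(
+       model_name="yi-coder",
+       model_engine="transformers",
+       model_format="pytorch",
+       model_size_in_billions=9,
+       quantization="none",
+   )
+
+   # yi-coder is a generate-style model, so call generate() rather than chat().
+   completion = client.get_model(uid).generate(
+       "def quicksort(arr):",
+       generate_config={"max_tokens": 128},
+   )
+   print(completion["choices"][0]["text"])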
+ +Specifications +^^^^^^^^^^^^^^ + + +Model Spec 1 (pytorch, 9 Billion) +++++++++++++++++++++++++++++++++++++++++ + +- **Model Format:** pytorch +- **Model Size (in billions):** 9 +- **Quantizations:** none +- **Engines**: vLLM, Transformers +- **Model ID:** 01-ai/Yi-Coder-9B +- **Model Hubs**: `Hugging Face `__, `ModelScope `__ + +Execute the following command to launch the model, remember to replace ``${quantization}`` with your +chosen quantization method from the options listed above:: + + xinference launch --model-engine ${engine} --model-name yi-coder --size-in-billions 9 --model-format pytorch --quantization ${quantization} + + +Model Spec 2 (pytorch, 1_5 Billion) +++++++++++++++++++++++++++++++++++++++++ + +- **Model Format:** pytorch +- **Model Size (in billions):** 1_5 +- **Quantizations:** none +- **Engines**: vLLM, Transformers +- **Model ID:** 01-ai/Yi-Coder-1.5B +- **Model Hubs**: `Hugging Face `__, `ModelScope `__ + +Execute the following command to launch the model, remember to replace ``${quantization}`` with your +chosen quantization method from the options listed above:: + + xinference launch --model-engine ${engine} --model-name yi-coder --size-in-billions 1_5 --model-format pytorch --quantization ${quantization} + diff --git a/doc/source/user_guide/backends.rst b/doc/source/user_guide/backends.rst index 57126871e8..2cbb924e03 100644 --- a/doc/source/user_guide/backends.rst +++ b/doc/source/user_guide/backends.rst @@ -51,7 +51,8 @@ Currently, supported model includes: - ``codestral-v0.1`` - ``Yi``, ``Yi-1.5``, ``Yi-chat``, ``Yi-1.5-chat``, ``Yi-1.5-chat-16k`` - ``code-llama``, ``code-llama-python``, ``code-llama-instruct`` -- ``deepseek``, ``deepseek-coder``, ``deepseek-chat``, ``deepseek-coder-instruct`` +- ``deepseek``, ``deepseek-coder``, ``deepseek-chat``, ``deepseek-coder-instruct``, ``deepseek-v2-chat``, ``deepseek-v2-chat-0628``, ``deepseek-v2.5`` +- ``yi-coder``, ``yi-coder-chat`` - ``codeqwen1.5``, ``codeqwen1.5-chat`` - ``baichuan-2-chat`` - ``internlm2-chat`` @@ -63,6 +64,7 @@ Currently, supported model includes: - ``codegeex4`` - ``qwen1.5-chat``, ``qwen1.5-moe-chat`` - ``qwen2-instruct``, ``qwen2-moe-instruct`` +- ``qwen2.5-instruct`` - ``gemma-it``, ``gemma-2-it`` - ``orion-chat``, ``orion-chat-rag`` - ``c4ai-command-r-v01`` From 5de46e94c23785fa7e17e3e1d00c3afb6cb1c919 Mon Sep 17 00:00:00 2001 From: amumu96 <128140880+amumu96@users.noreply.github.com> Date: Fri, 20 Sep 2024 16:58:06 +0800 Subject: [PATCH 17/17] FEAT: support qwen2.5-coder-instruct and qwen2.5 sglang (#2332) Co-authored-by: wuzhaoxin <15667065080@162.com> --- xinference/model/llm/llm_family.json | 550 ++++++++++++++++- .../model/llm/llm_family_modelscope.json | 565 +++++++++++++++++- xinference/model/llm/sglang/core.py | 4 + xinference/model/llm/vllm/core.py | 4 + 4 files changed, 1062 insertions(+), 61 deletions(-) diff --git a/xinference/model/llm/llm_family.json b/xinference/model/llm/llm_family.json index 70b17daa61..471b4febc3 100644 --- a/xinference/model/llm/llm_family.json +++ b/xinference/model/llm/llm_family.json @@ -6874,7 +6874,7 @@ "model_id":"Qwen/Qwen2-VL-7B-Instruct-GPTQ-Int8", "model_revision":"3d152a77eaccfd72d59baedb0b183a1b8fd56e48" }, - { + { "model_format":"gptq", "model_size_in_billions":7, "quantizations":[ @@ -6883,7 +6883,7 @@ "model_id":"Qwen/Qwen2-VL-7B-Instruct-GPTQ-Int4", "model_revision":"5ab897112fa83b9699826be8753ef9184585c77d" }, - { + { "model_format":"awq", "model_size_in_billions":7, "quantizations":[ @@ -6891,6 +6891,31 @@ ], 
"model_id":"Qwen/Qwen2-VL-7B-Instruct-AWQ", "model_revision":"f94216e8b513933bccd567bcd9b7350199f32538" + }, + { + "model_format":"pytorch", + "model_size_in_billions":72, + "quantizations":[ + "none" + ], + "model_id":"Qwen/Qwen2-VL-72B-Instruct" + }, + { + "model_format":"awq", + "model_size_in_billions":72, + "quantizations":[ + "Int4" + ], + "model_id":"Qwen/Qwen2-VL-72B-Instruct-AWQ" + }, + { + "model_format":"gptq", + "model_size_in_billions":72, + "quantizations":[ + "Int4", + "Int8" + ], + "model_id":"Qwen/Qwen2-VL-72B-Instruct-GPTQ-{quantization}" } ], "prompt_style":{ @@ -7247,7 +7272,99 @@ }, { "version": 1, - "context_length": 131072, + "context_length": 32768, + "model_name": "qwen2.5", + "model_lang": [ + "en", + "zh" + ], + "model_ability": [ + "generate" + ], + "model_description": "Qwen2.5 is the latest series of Qwen large language models. For Qwen2.5, we release a number of base language models and instruction-tuned language models ranging from 0.5 to 72 billion parameters.", + "model_specs": [ + { + "model_format": "pytorch", + "model_size_in_billions": "0_5", + "quantizations": [ + "4-bit", + "8-bit", + "none" + ], + "model_id": "Qwen/Qwen2.5-0.5B", + "model_revision": "2630d3d2321bc1f1878f702166d1b2af019a7310" + }, + { + "model_format": "pytorch", + "model_size_in_billions": "1_5", + "quantizations": [ + "4-bit", + "8-bit", + "none" + ], + "model_id": "Qwen/Qwen2.5-1.5B", + "model_revision": "e5dfabbcffd9b0c7b31d89b82c5a6b72e663f32c" + }, + { + "model_format": "pytorch", + "model_size_in_billions": 3, + "quantizations": [ + "4-bit", + "8-bit", + "none" + ], + "model_id": "Qwen/Qwen2.5-3B", + "model_revision": "e4aa5ac50aa507415cda96cc99eb77ad0a3d2d34" + }, + { + "model_format": "pytorch", + "model_size_in_billions": 7, + "quantizations": [ + "4-bit", + "8-bit", + "none" + ], + "model_id": "Qwen/Qwen2.5-7B", + "model_revision": "09a0bac5707b43ec44508eab308b0846320c1ed4" + }, + { + "model_format": "pytorch", + "model_size_in_billions": 14, + "quantizations": [ + "4-bit", + "8-bit", + "none" + ], + "model_id": "Qwen/Qwen2.5-14B", + "model_revision": "d02b64ba1ce86bf9948668a13f82709600431ccc" + }, + { + "model_format": "pytorch", + "model_size_in_billions": 32, + "quantizations": [ + "4-bit", + "8-bit", + "none" + ], + "model_id": "Qwen/Qwen2.5-32B", + "model_revision": "ff23665d01c3665be5fdb271d18a62090b65c06d" + }, + { + "model_format": "pytorch", + "model_size_in_billions": 72, + "quantizations": [ + "4-bit", + "8-bit", + "none" + ], + "model_id": "Qwen/Qwen2.5-72B", + "model_revision": "587cc4061cf6a7cc0d429d05c109447e5cf063af" + } + ] + }, + { + "version": 1, + "context_length": 32768, "model_name": "qwen2.5-instruct", "model_lang": [ "en", @@ -7459,11 +7576,10 @@ "q5_0", "q5_k_m", "q6_k", - "q8_0", - "fp16" + "q8_0" ], "model_id": "Qwen/Qwen2.5-0.5B-Instruct-GGUF", - "model_file_name_template": "qwen2_5-0_5b-instruct-{quantization}.gguf" + "model_file_name_template": "qwen2.5-0.5b-instruct-{quantization}.gguf" }, { "model_format": "ggufv2", @@ -7476,11 +7592,10 @@ "q5_0", "q5_k_m", "q6_k", - "q8_0", - "fp16" + "q8_0" ], "model_id": "Qwen/Qwen2.5-1.5B-Instruct-GGUF", - "model_file_name_template": "qwen2_5-1_5b-instruct-{quantization}.gguf" + "model_file_name_template": "qwen2.5-1.5b-instruct-{quantization}.gguf" }, { "model_format": "ggufv2", @@ -7493,11 +7608,10 @@ "q5_0", "q5_k_m", "q6_k", - "q8_0", - "fp16" + "q8_0" ], "model_id": "Qwen/Qwen2.5-3B-Instruct-GGUF", - "model_file_name_template": "qwen2_5-3b-instruct-{quantization}.gguf" + "model_file_name_template": 
"qwen2.5-3b-instruct-{quantization}.gguf" }, { "model_format": "ggufv2", @@ -7510,11 +7624,37 @@ "q5_0", "q5_k_m", "q6_k", - "q8_0", - "fp16" + "q8_0" ], "model_id": "Qwen/Qwen2.5-7B-Instruct-GGUF", - "model_file_name_template": "qwen2_5-7b-instruct-{quantization}.gguf" + "model_file_name_template": "qwen2.5-7b-instruct-{quantization}.gguf", + "model_file_name_split_template": "qwen2.5-7b-instruct-{quantization}-{part}.gguf", + "quantization_parts": { + "q4_0": [ + "00001-of-00002", + "00002-of-00002" + ], + "q4_k_m": [ + "00001-of-00002", + "00002-of-00002" + ], + "q5_0": [ + "00001-of-00002", + "00002-of-00002" + ], + "q5_k_m": [ + "00001-of-00002", + "00002-of-00002" + ], + "q6_k": [ + "00001-of-00002", + "00002-of-00002" + ], + "q8_0": [ + "00001-of-00002", + "00002-of-00002" + ] + } }, { "model_format": "ggufv2", @@ -7527,11 +7667,53 @@ "q5_0", "q5_k_m", "q6_k", - "q8_0", - "fp16" + "q8_0" ], "model_id": "Qwen/Qwen2.5-14B-Instruct-GGUF", - "model_file_name_template": "qwen2_5-14b-instruct-{quantization}.gguf" + "model_file_name_template": "qwen2.5-14b-instruct-{quantization}.gguf", + "model_file_name_split_template": "qwen2.5-14b-instruct-{quantization}-{part}.gguf", + "quantization_parts": { + "q2_k": [ + "00001-of-00002", + "00002-of-00002" + ], + "q3_k_m": [ + "00001-of-00002", + "00002-of-00002" + ], + "q4_0": [ + "00001-of-00003", + "00002-of-00003", + "00003-of-00003" + ], + "q4_k_m": [ + "00001-of-00003", + "00002-of-00003", + "00003-of-00003" + ], + "q5_0": [ + "00001-of-00003", + "00002-of-00003", + "00003-of-00003" + ], + "q5_k_m": [ + "00001-of-00003", + "00002-of-00003", + "00003-of-00003" + ], + "q6_k": [ + "00001-of-00004", + "00002-of-00004", + "00003-of-00004", + "00004-of-00004" + ], + "q8_0": [ + "00001-of-00004", + "00002-of-00004", + "00003-of-00004", + "00004-of-00004" + ] + } }, { "model_format": "ggufv2", @@ -7544,11 +7726,76 @@ "q5_0", "q5_k_m", "q6_k", - "q8_0", - "fp16" + "q8_0" ], "model_id": "Qwen/Qwen2.5-32B-Instruct-GGUF", - "model_file_name_template": "qwen2_5-32b-instruct-{quantization}.gguf" + "model_file_name_template": "qwen2_5-32b-instruct-{quantization}.gguf", + "model_file_name_split_template": "qwen2.5-32b-instruct-{quantization}-{part}.gguf", + "quantization_parts": { + "q2_k": [ + "00001-of-00004", + "00002-of-00004", + "00003-of-00004", + "00004-of-00004" + ], + "q3_k_m": [ + "00001-of-00005", + "00002-of-00005", + "00003-of-00005", + "00004-of-00005", + "00005-of-00005" + ], + "q4_0": [ + "00001-of-00005", + "00002-of-00005", + "00003-of-00005", + "00004-of-00005", + "00005-of-00005" + ], + "q4_k_m": [ + "00001-of-00005", + "00002-of-00005", + "00003-of-00005", + "00004-of-00005", + "00005-of-00005" + ], + "q5_0": [ + "00001-of-00006", + "00002-of-00006", + "00003-of-00006", + "00004-of-00006", + "00005-of-00006", + "00006-of-00006" + ], + "q5_k_m": [ + "00001-of-00006", + "00002-of-00006", + "00003-of-00006", + "00004-of-00006", + "00005-of-00006", + "00006-of-00006" + ], + "q6_k": [ + "00001-of-00007", + "00002-of-00007", + "00003-of-00007", + "00004-of-00007", + "00005-of-00007", + "00006-of-00007", + "00007-of-00007" + ], + "q8_0": [ + "00001-of-00009", + "00002-of-00009", + "00003-of-00009", + "00004-of-00009", + "00005-of-00009", + "00006-of-00009", + "00007-of-00009", + "00008-of-00009", + "00009-of-00009" + ] + } }, { "model_format": "ggufv2", @@ -7566,8 +7813,254 @@ ], "model_id": "Qwen/Qwen2.5-72B-Instruct-GGUF", "model_file_name_template": "qwen2_5-72b-instruct-{quantization}.gguf", - "model_file_name_split_template": 
"qwen2_5-72b-instruct-{quantization}-{part}.gguf", + "model_file_name_split_template": "qwen2.5-72b-instruct-{quantization}-{part}.gguf", + "quantization_parts": { + "q2_k": [ + "00001-of-00007", + "00002-of-00007", + "00003-of-00007", + "00004-of-00007", + "00005-of-00007", + "00006-of-00007", + "00007-of-00007" + ], + "q3_k_m": [ + "00001-of-00009", + "00002-of-00009", + "00003-of-00009", + "00004-of-00009", + "00005-of-00009", + "00006-of-00009", + "00007-of-00009", + "00008-of-00009", + "00009-of-00009" + ], + "q4_0": [ + "00001-of-00011", + "00002-of-00011", + "00003-of-00011", + "00004-of-00011", + "00005-of-00011", + "00006-of-00011", + "00007-of-00011", + "00008-of-00011", + "00009-of-00011", + "00010-of-00011", + "00011-of-00011" + ], + "q4_k_m": [ + "00001-of-00012", + "00002-of-00012", + "00003-of-00012", + "00004-of-00012", + "00005-of-00012", + "00006-of-00012", + "00007-of-00012", + "00008-of-00012", + "00009-of-00012", + "00010-of-00012", + "00011-of-00012", + "00012-of-00012" + ], + "q5_0": [ + "00001-of-00013", + "00002-of-00013", + "00003-of-00013", + "00004-of-00013", + "00005-of-00013", + "00006-of-00013", + "00007-of-00013", + "00008-of-00013", + "00009-of-00013", + "00010-of-00013", + "00011-of-00013", + "00012-of-00013", + "00013-of-00013" + ], + "q5_k_m": [ + "00001-of-00014", + "00002-of-00014", + "00003-of-00014", + "00004-of-00014", + "00005-of-00014", + "00006-of-00014", + "00007-of-00014", + "00008-of-00014", + "00009-of-00014", + "00010-of-00014", + "00011-of-00014", + "00012-of-00014", + "00013-of-00014", + "00014-of-00014" + ], + "q6_k": [ + "00001-of-00016", + "00002-of-00016", + "00003-of-00016", + "00004-of-00016", + "00005-of-00016", + "00006-of-00016", + "00007-of-00016", + "00008-of-00016", + "00009-of-00016", + "00010-of-00016", + "00011-of-00016", + "00012-of-00016", + "00013-of-00016", + "00014-of-00016", + "00015-of-00016", + "00016-of-00016" + ], + "q8_0": [ + "00001-of-00021", + "00002-of-00021", + "00003-of-00021", + "00004-of-00021", + "00005-of-00021", + "00006-of-00021", + "00007-of-00021", + "00008-of-00021", + "00009-of-00021", + "00010-of-00021", + "00011-of-00021", + "00012-of-00021", + "00013-of-00021", + "00014-of-00021", + "00015-of-00021", + "00016-of-00021", + "00017-of-00021", + "00018-of-00021", + "00019-of-00021", + "00020-of-00021", + "00021-of-00021" + ] + } + } + ], + "chat_template": "{%- if tools %}\n {{- '<|im_start|>system\\n' }}\n {%- if messages[0]['role'] == 'system' %}\n {{- messages[0]['content'] }}\n {%- else %}\n {{- 'You are Qwen, created by Alibaba Cloud. You are a helpful assistant.' }}\n {%- endif %}\n {{- \"\\n\\n# Tools\\n\\nYou may call one or more functions to assist with the user query.\\n\\nYou are provided with function signatures within XML tags:\\n\" }}\n {%- for tool in tools %}\n {{- \"\\n\" }}\n {{- tool | tojson }}\n {%- endfor %}\n {{- \"\\n\\n\\nFor each function call, return a json object with function name and arguments within XML tags:\\n\\n{{\\\"name\\\": , \\\"arguments\\\": }}\\n<|im_end|>\\n\" }}\n{%- else %}\n {%- if messages[0]['role'] == 'system' %}\n {{- '<|im_start|>system\\n' + messages[0]['content'] + '<|im_end|>\\n' }}\n {%- else %}\n {{- '<|im_start|>system\\nYou are Qwen, created by Alibaba Cloud. 
You are a helpful assistant.<|im_end|>\\n' }}\n {%- endif %}\n{%- endif %}\n{%- for message in messages %}\n {%- if (message.role == \"user\") or (message.role == \"system\" and not loop.first) or (message.role == \"assistant\" and not message.tool_calls) %}\n {{- '<|im_start|>' + message.role + '\\n' + message.content + '<|im_end|>' + '\\n' }}\n {%- elif message.role == \"assistant\" %}\n {{- '<|im_start|>' + message.role }}\n {%- if message.content %}\n {{- '\\n' + message.content }}\n {%- endif %}\n {%- for tool_call in message.tool_calls %}\n {%- if tool_call.function is defined %}\n {%- set tool_call = tool_call.function %}\n {%- endif %}\n {{- '\\n\\n{\"name\": \"' }}\n {{- tool_call.name }}\n {{- '\", \"arguments\": ' }}\n {{- tool_call.arguments | tojson }}\n {{- '}\\n' }}\n {%- endfor %}\n {{- '<|im_end|>\\n' }}\n {%- elif message.role == \"tool\" %}\n {%- if (loop.index0 == 0) or (messages[loop.index0 - 1].role != \"tool\") %}\n {{- '<|im_start|>user' }}\n {%- endif %}\n {{- '\\n\\n' }}\n {{- message.content }}\n {{- '\\n' }}\n {%- if loop.last or (messages[loop.index0 + 1].role != \"tool\") %}\n {{- '<|im_end|>\\n' }}\n {%- endif %}\n {%- endif %}\n{%- endfor %}\n{%- if add_generation_prompt %}\n {{- '<|im_start|>assistant\\n' }}\n{%- endif %}\n", + "stop_token_ids": [ + 151643, + 151644, + 151645 + ], + "stop": [ + "<|endoftext|>", + "<|im_start|>", + "<|im_end|>" + ] + }, + { + "version": 1, + "context_length": 32768, + "model_name": "qwen2.5-coder", + "model_lang": [ + "en", + "zh" + ], + "model_ability": [ + "generate" + ], + "model_description": "Qwen2.5-Coder is the latest series of Code-Specific Qwen large language models (formerly known as CodeQwen).", + "model_specs": [ + { + "model_format": "pytorch", + "model_size_in_billions": "1_5", + "quantizations": [ + "4-bit", + "8-bit", + "none" + ], + "model_id": "Qwen/Qwen2.5-Coder-1.5B", + "model_revision": "d3586cfe793730945f8e4d7ef31032a3ee50247d" + }, + { + "model_format": "pytorch", + "model_size_in_billions": 7, + "quantizations": [ + "4-bit", + "8-bit", + "none" + ], + "model_id": "Qwen/Qwen2.5-Coder-7B", + "model_revision": "30b6a7e874a78d46b80fa1db3194ea427dd41b08" + } + ] + }, + { + "version": 1, + "context_length": 32768, + "model_name": "qwen2.5-coder-instruct", + "model_lang": [ + "en", + "zh" + ], + "model_ability": [ + "chat", + "tools" + ], + "model_description": "Qwen2.5-Coder is the latest series of Code-Specific Qwen large language models (formerly known as CodeQwen).", + "model_specs": [ + { + "model_format": "pytorch", + "model_size_in_billions": "1_5", + "quantizations": [ + "4-bit", + "8-bit", + "none" + ], + "model_id": "Qwen/Qwen2.5-Coder-1.5B-Instruct" + }, + { + "model_format": "pytorch", + "model_size_in_billions": 7, + "quantizations": [ + "4-bit", + "8-bit", + "none" + ], + "model_id": "Qwen/Qwen2.5-Coder-7B-Instruct" + }, + { + "model_format": "ggufv2", + "model_size_in_billions": "1_5", + "quantizations": [ + "q2_k", + "q3_k_m", + "q4_0", + "q4_k_m", + "q5_0", + "q5_k_m", + "q6_k", + "q8_0" + ], + "model_id": "Qwen/Qwen2.5-Coder-1.5B-Instruct-GGUF", + "model_file_name_template": "qwen2.5-coder-1.5b-instruct-{quantization}.gguf" + }, + { + "model_format": "ggufv2", + "model_size_in_billions": 7, + "quantizations": [ + "q2_k", + "q3_k_m", + "q4_0", + "q4_k_m", + "q5_0", + "q5_k_m", + "q6_k", + "q8_0" + ], + "model_id": "Qwen/Qwen2.5-Coder-7B-Instruct-GGUF", + "model_file_name_template": "qwen2.5-coder-7b-instruct-{quantization}.gguf", + "model_file_name_split_template": 
"qwen2.5-coder-7b-instruct-{quantization}-{part}.gguf", "quantization_parts": { + "q4_0": [ + "00001-of-00002", + "00002-of-00002" + ], + "q4_k_m": [ + "00001-of-00002", + "00002-of-00002" + ], "q5_0": [ "00001-of-00002", "00002-of-00002" @@ -7581,19 +8074,14 @@ "00002-of-00002" ], "q8_0": [ - "00001-of-00002", - "00002-of-00002" - ], - "fp16": [ - "00001-of-00004", - "00002-of-00004", - "00003-of-00004", - "00004-of-00004" + "00001-of-00003", + "00002-of-00003", + "00003-of-00003" ] } } ], - "chat_template": "{%- if tools %}\n {{- '<|im_start|>system\\n' }}\n {%- if messages[0]['role'] == 'system' %}\n {{- messages[0]['content'] }}\n {%- else %}\n {{- 'You are Qwen, created by Alibaba Cloud. You are a helpful assistant.' }}\n {%- endif %}\n {{- \"\\n\\n# Tools\\n\\nYou may call one or more functions to assist with the user query.\\n\\nYou are provided with function signatures within XML tags:\\n\" }}\n {%- for tool in tools %}\n {{- \"\\n\" }}\n {{- tool | tojson }}\n {%- endfor %}\n {{- \"\\n\\n\\nFor each function call, return a json object with function name and arguments within XML tags:\\n\\n{{\\\"name\\\": , \\\"arguments\\\": }}\\n<|im_end|>\\n\" }}\n{%- else %}\n {%- if messages[0]['role'] == 'system' %}\n {{- '<|im_start|>system\\n' + messages[0]['content'] + '<|im_end|>\\n' }}\n {%- else %}\n {{- '<|im_start|>system\\nYou are Qwen, created by Alibaba Cloud. You are a helpful assistant.<|im_end|>\\n' }}\n {%- endif %}\n{%- endif %}\n{%- for message in messages %}\n {%- if (message.role == \"user\") or (message.role == \"system\" and not loop.first) or (message.role == \"assistant\" and not message.tool_calls) %}\n {{- '<|im_start|>' + message.role + '\\n' + message.content + '<|im_end|>' + '\\n' }}\n {%- elif message.role == \"assistant\" %}\n {{- '<|im_start|>' + message.role }}\n {%- if message.content %}\n {{- '\\n' + message.content }}\n {%- endif %}\n {%- for tool_call in message.tool_calls %}\n {%- if tool_call.function is defined %}\n {%- set tool_call = tool_call.function %}\n {%- endif %}\n {{- '\\n\\n{\"name\": \"' }}\n {{- tool_call.name }}\n {{- '\", \"arguments\": ' }}\n {{- tool_call.arguments | tojson }}\n {{- '}\\n' }}\n {%- endfor %}\n {{- '<|im_end|>\\n' }}\n {%- elif message.role == \"tool\" %}\n {%- if (loop.index0 == 0) or (messages[loop.index0 - 1].role != \"tool\") %}\n {{- '<|im_start|>user' }}\n {%- endif %}\n {{- '\\n\\n' }}\n {{- message.content }}\n {{- '\\n' }}\n {%- if loop.last or (messages[loop.index0 + 1].role != \"tool\") %}\n {{- '<|im_end|>\\n' }}\n {%- endif %}\n {%- endif %}\n{%- endfor %}\n{%- if add_generation_prompt %}\n {{- '<|im_start|>assistant\\n' }}\n{%- endif %}\n", + "chat_template": "{%- if tools %}\n {{- '<|im_start|>system\\n' }}\n {%- if messages[0]['role'] == 'system' %}\n {{- messages[0]['content'] }}\n {%- else %}\n {{- 'You are a helpful assistant.' 
}}\n {%- endif %}\n {{- \"\\n\\n# Tools\\n\\nYou may call one or more functions to assist with the user query.\\n\\nYou are provided with function signatures within XML tags:\\n\" }}\n {%- for tool in tools %}\n {{- \"\\n\" }}\n {{- tool | tojson }}\n {%- endfor %}\n {{- \"\\n\\n\\nFor each function call, return a json object with function name and arguments within XML tags:\\n\\n{{\\\"name\\\": , \\\"arguments\\\": }}\\n<|im_end|>\\n\" }}\n{%- else %}\n {%- if messages[0]['role'] == 'system' %}\n {{- '<|im_start|>system\\n' + messages[0]['content'] + '<|im_end|>\\n' }}\n {%- else %}\n {{- '<|im_start|>system\\nYou are a helpful assistant.<|im_end|>\\n' }}\n {%- endif %}\n{%- endif %}\n{%- for message in messages %}\n {%- if (message.role == \"user\") or (message.role == \"system\" and not loop.first) or (message.role == \"assistant\" and not message.tool_calls) %}\n {{- '<|im_start|>' + message.role + '\\n' + message.content + '<|im_end|>' + '\\n' }}\n {%- elif message.role == \"assistant\" %}\n {{- '<|im_start|>' + message.role }}\n {%- if message.content %}\n {{- '\\n' + message.content }}\n {%- endif %}\n {%- for tool_call in message.tool_calls %}\n {%- if tool_call.function is defined %}\n {%- set tool_call = tool_call.function %}\n {%- endif %}\n {{- '\\n\\n{\"name\": \"' }}\n {{- tool_call.name }}\n {{- '\", \"arguments\": ' }}\n {{- tool_call.arguments | tojson }}\n {{- '}\\n' }}\n {%- endfor %}\n {{- '<|im_end|>\\n' }}\n {%- elif message.role == \"tool\" %}\n {%- if (loop.index0 == 0) or (messages[loop.index0 - 1].role != \"tool\") %}\n {{- '<|im_start|>user' }}\n {%- endif %}\n {{- '\\n\\n' }}\n {{- message.content }}\n {{- '\\n' }}\n {%- if loop.last or (messages[loop.index0 + 1].role != \"tool\") %}\n {{- '<|im_end|>\\n' }}\n {%- endif %}\n {%- endif %}\n{%- endfor %}\n{%- if add_generation_prompt %}\n {{- '<|im_start|>assistant\\n' }}\n{%- endif %}\n", "stop_token_ids": [ 151643, 151644, diff --git a/xinference/model/llm/llm_family_modelscope.json b/xinference/model/llm/llm_family_modelscope.json index 7309ee9651..daf726e8c7 100644 --- a/xinference/model/llm/llm_family_modelscope.json +++ b/xinference/model/llm/llm_family_modelscope.json @@ -4602,6 +4602,34 @@ "model_hub": "modelscope", "model_id":"qwen/Qwen2-VL-2B-Instruct-AWQ", "model_revision":"master" + }, + { + "model_format":"pytorch", + "model_size_in_billions":72, + "quantizations":[ + "none" + ], + "model_id":"qwen/Qwen2-VL-72B-Instruct", + "model_hub": "modelscope" + }, + { + "model_format":"awq", + "model_size_in_billions":72, + "quantizations":[ + "Int4" + ], + "model_id":"qwen/Qwen2-VL-72B-Instruct-AWQ", + "model_hub": "modelscope" + }, + { + "model_format":"gptq", + "model_size_in_billions":72, + "quantizations":[ + "Int4", + "Int8" + ], + "model_id":"qwen/Qwen2-VL-72B-Instruct-GPTQ-{quantization}", + "model_hub": "modelscope" } ], "prompt_style": { @@ -4960,7 +4988,106 @@ }, { "version": 1, - "context_length": 131072, + "context_length": 32768, + "model_name": "qwen2.5", + "model_lang": [ + "en", + "zh" + ], + "model_ability": [ + "generate" + ], + "model_description": "Qwen2.5 is the latest series of Qwen large language models. 
For Qwen2.5, we release a number of base language models and instruction-tuned language models ranging from 0.5 to 72 billion parameters.", + "model_specs": [ + { + "model_format": "pytorch", + "model_size_in_billions": "0_5", + "quantizations": [ + "4-bit", + "8-bit", + "none" + ], + "model_id": "qwen/Qwen2.5-0.5B", + "model_revision": "master", + "model_hub": "modelscope" + }, + { + "model_format": "pytorch", + "model_size_in_billions": "1_5", + "quantizations": [ + "4-bit", + "8-bit", + "none" + ], + "model_id": "qwen/Qwen2.5-1.5B", + "model_revision": "master", + "model_hub": "modelscope" + }, + { + "model_format": "pytorch", + "model_size_in_billions": 3, + "quantizations": [ + "4-bit", + "8-bit", + "none" + ], + "model_id": "qwen/Qwen2.5-3B", + "model_revision": "master", + "model_hub": "modelscope" + }, + { + "model_format": "pytorch", + "model_size_in_billions": 7, + "quantizations": [ + "4-bit", + "8-bit", + "none" + ], + "model_id": "qwen/Qwen2.5-7B", + "model_revision": "master", + "model_hub": "modelscope" + }, + { + "model_format": "pytorch", + "model_size_in_billions": 14, + "quantizations": [ + "4-bit", + "8-bit", + "none" + ], + "model_id": "qwen/Qwen2.5-14B", + "model_revision": "master", + "model_hub": "modelscope" + }, + { + "model_format": "pytorch", + "model_size_in_billions": 32, + "quantizations": [ + "4-bit", + "8-bit", + "none" + ], + "model_id": "qwen/Qwen2.5-32B", + "model_revision": "master", + "model_hub": "modelscope" + }, + { + "model_format": "pytorch", + "model_size_in_billions": 72, + "quantizations": [ + "4-bit", + "8-bit", + "none" + ], + "model_id": "qwen/Qwen2.5-72B", + "model_revision": "master", + "model_hub": "modelscope" + } + ] + }, + { + "version": 1, + "context_length": 32768, "model_name": "qwen2.5-instruct", "model_lang": [ "en", @@ -5193,11 +5320,10 @@ "q5_0", "q5_k_m", "q6_k", - "q8_0", - "fp16" + "q8_0" ], "model_id": "qwen/Qwen2.5-0.5B-Instruct-GGUF", - "model_file_name_template": "qwen2_5-0_5b-instruct-{quantization}.gguf", + "model_file_name_template": "qwen2.5-0.5b-instruct-{quantization}.gguf", "model_hub": "modelscope" }, { @@ -5211,11 +5337,10 @@ "q5_0", "q5_k_m", "q6_k", - "q8_0", - "fp16" + "q8_0" ], "model_id": "qwen/Qwen2.5-1.5B-Instruct-GGUF", - "model_file_name_template": "qwen2_5-1_5b-instruct-{quantization}.gguf", + "model_file_name_template": "qwen2.5-1.5b-instruct-{quantization}.gguf", "model_hub": "modelscope" }, { @@ -5229,11 +5354,10 @@ "q5_0", "q5_k_m", "q6_k", - "q8_0", - "fp16" + "q8_0" ], "model_id": "qwen/Qwen2.5-3B-Instruct-GGUF", - "model_file_name_template": "qwen2_5-3b-instruct-{quantization}.gguf", + "model_file_name_template": "qwen2.5-3b-instruct-{quantization}.gguf", "model_hub": "modelscope" }, { @@ -5247,12 +5371,38 @@ "q5_0", "q5_k_m", "q6_k", - "q8_0", - "fp16" + "q8_0" ], "model_id": "qwen/Qwen2.5-7B-Instruct-GGUF", "model_file_name_template": "qwen2_5-7b-instruct-{quantization}.gguf", - "model_hub": "modelscope" + "model_hub": "modelscope", + "model_file_name_split_template": "qwen2.5-7b-instruct-{quantization}-{part}.gguf", + "quantization_parts": { + "q4_0": [ + "00001-of-00002", + "00002-of-00002" + ], + "q4_k_m": [ + "00001-of-00002", + "00002-of-00002" + ], + "q5_0": [ + "00001-of-00002", + "00002-of-00002" + ], + "q5_k_m": [ + "00001-of-00002", + "00002-of-00002" + ], + "q6_k": [ + "00001-of-00002", + "00002-of-00002" + ], + "q8_0": [ + "00001-of-00002", + "00002-of-00002" + ] + } }, { "model_format": "ggufv2", @@ -5265,11 +5415,53 @@ "q5_0", "q5_k_m", "q6_k", - "q8_0", - "fp16" + "q8_0" ], 
"model_id": "qwen/Qwen2.5-14B-Instruct-GGUF", - "model_file_name_template": "qwen2_5-14b-instruct-{quantization}.gguf", + "model_file_name_template": "qwen2.5-14b-instruct-{quantization}.gguf", + "model_file_name_split_template": "qwen2.5-14b-instruct-{quantization}-{part}.gguf", + "quantization_parts": { + "q2_k": [ + "00001-of-00002", + "00002-of-00002" + ], + "q3_k_m": [ + "00001-of-00002", + "00002-of-00002" + ], + "q4_0": [ + "00001-of-00003", + "00002-of-00003", + "00003-of-00003" + ], + "q4_k_m": [ + "00001-of-00003", + "00002-of-00003", + "00003-of-00003" + ], + "q5_0": [ + "00001-of-00003", + "00002-of-00003", + "00003-of-00003" + ], + "q5_k_m": [ + "00001-of-00003", + "00002-of-00003", + "00003-of-00003" + ], + "q6_k": [ + "00001-of-00004", + "00002-of-00004", + "00003-of-00004", + "00004-of-00004" + ], + "q8_0": [ + "00001-of-00004", + "00002-of-00004", + "00003-of-00004", + "00004-of-00004" + ] + }, "model_hub": "modelscope" }, { @@ -5283,11 +5475,76 @@ "q5_0", "q5_k_m", "q6_k", - "q8_0", - "fp16" + "q8_0" ], "model_id": "qwen/Qwen2.5-32B-Instruct-GGUF", "model_file_name_template": "qwen2_5-32b-instruct-{quantization}.gguf", + "model_file_name_split_template": "qwen2.5-32b-instruct-{quantization}-{part}.gguf", + "quantization_parts": { + "q2_k": [ + "00001-of-00004", + "00002-of-00004", + "00003-of-00004", + "00004-of-00004" + ], + "q3_k_m": [ + "00001-of-00005", + "00002-of-00005", + "00003-of-00005", + "00004-of-00005", + "00005-of-00005" + ], + "q4_0": [ + "00001-of-00005", + "00002-of-00005", + "00003-of-00005", + "00004-of-00005", + "00005-of-00005" + ], + "q4_k_m": [ + "00001-of-00005", + "00002-of-00005", + "00003-of-00005", + "00004-of-00005", + "00005-of-00005" + ], + "q5_0": [ + "00001-of-00006", + "00002-of-00006", + "00003-of-00006", + "00004-of-00006", + "00005-of-00006", + "00006-of-00006" + ], + "q5_k_m": [ + "00001-of-00006", + "00002-of-00006", + "00003-of-00006", + "00004-of-00006", + "00005-of-00006", + "00006-of-00006" + ], + "q6_k": [ + "00001-of-00007", + "00002-of-00007", + "00003-of-00007", + "00004-of-00007", + "00005-of-00007", + "00006-of-00007", + "00007-of-00007" + ], + "q8_0": [ + "00001-of-00009", + "00002-of-00009", + "00003-of-00009", + "00004-of-00009", + "00005-of-00009", + "00006-of-00009", + "00007-of-00009", + "00008-of-00009", + "00009-of-00009" + ] + }, "model_hub": "modelscope" }, { @@ -5301,40 +5558,288 @@ "q5_0", "q5_k_m", "q6_k", - "q8_0", - "fp16" + "q8_0" ], "model_id": "qwen/Qwen2.5-72B-Instruct-GGUF", "model_hub": "modelscope", "model_file_name_template": "qwen2_5-72b-instruct-{quantization}.gguf", - "model_file_name_split_template": "qwen2_5-72b-instruct-{quantization}-{part}.gguf", + "model_file_name_split_template": "qwen2.5-72b-instruct-{quantization}-{part}.gguf", "quantization_parts": { + "q2_k": [ + "00001-of-00007", + "00002-of-00007", + "00003-of-00007", + "00004-of-00007", + "00005-of-00007", + "00006-of-00007", + "00007-of-00007" + ], + "q3_k_m": [ + "00001-of-00009", + "00002-of-00009", + "00003-of-00009", + "00004-of-00009", + "00005-of-00009", + "00006-of-00009", + "00007-of-00009", + "00008-of-00009", + "00009-of-00009" + ], + "q4_0": [ + "00001-of-00011", + "00002-of-00011", + "00003-of-00011", + "00004-of-00011", + "00005-of-00011", + "00006-of-00011", + "00007-of-00011", + "00008-of-00011", + "00009-of-00011", + "00010-of-00011", + "00011-of-00011" + ], + "q4_k_m": [ + "00001-of-00012", + "00002-of-00012", + "00003-of-00012", + "00004-of-00012", + "00005-of-00012", + "00006-of-00012", + "00007-of-00012", + 
"00008-of-00012", + "00009-of-00012", + "00010-of-00012", + "00011-of-00012", + "00012-of-00012" + ], "q5_0": [ + "00001-of-00013", + "00002-of-00013", + "00003-of-00013", + "00004-of-00013", + "00005-of-00013", + "00006-of-00013", + "00007-of-00013", + "00008-of-00013", + "00009-of-00013", + "00010-of-00013", + "00011-of-00013", + "00012-of-00013", + "00013-of-00013" + ], + "q5_k_m": [ + "00001-of-00014", + "00002-of-00014", + "00003-of-00014", + "00004-of-00014", + "00005-of-00014", + "00006-of-00014", + "00007-of-00014", + "00008-of-00014", + "00009-of-00014", + "00010-of-00014", + "00011-of-00014", + "00012-of-00014", + "00013-of-00014", + "00014-of-00014" + ], + "q6_k": [ + "00001-of-00016", + "00002-of-00016", + "00003-of-00016", + "00004-of-00016", + "00005-of-00016", + "00006-of-00016", + "00007-of-00016", + "00008-of-00016", + "00009-of-00016", + "00010-of-00016", + "00011-of-00016", + "00012-of-00016", + "00013-of-00016", + "00014-of-00016", + "00015-of-00016", + "00016-of-00016" + ], + "q8_0": [ + "00001-of-00021", + "00002-of-00021", + "00003-of-00021", + "00004-of-00021", + "00005-of-00021", + "00006-of-00021", + "00007-of-00021", + "00008-of-00021", + "00009-of-00021", + "00010-of-00021", + "00011-of-00021", + "00012-of-00021", + "00013-of-00021", + "00014-of-00021", + "00015-of-00021", + "00016-of-00021", + "00017-of-00021", + "00018-of-00021", + "00019-of-00021", + "00020-of-00021", + "00021-of-00021" + ] + } + } + ], + "chat_template": "{%- if tools %}\n {{- '<|im_start|>system\\n' }}\n {%- if messages[0]['role'] == 'system' %}\n {{- messages[0]['content'] }}\n {%- else %}\n {{- 'You are Qwen, created by Alibaba Cloud. You are a helpful assistant.' }}\n {%- endif %}\n {{- \"\\n\\n# Tools\\n\\nYou may call one or more functions to assist with the user query.\\n\\nYou are provided with function signatures within XML tags:\\n\" }}\n {%- for tool in tools %}\n {{- \"\\n\" }}\n {{- tool | tojson }}\n {%- endfor %}\n {{- \"\\n\\n\\nFor each function call, return a json object with function name and arguments within XML tags:\\n\\n{{\\\"name\\\": , \\\"arguments\\\": }}\\n<|im_end|>\\n\" }}\n{%- else %}\n {%- if messages[0]['role'] == 'system' %}\n {{- '<|im_start|>system\\n' + messages[0]['content'] + '<|im_end|>\\n' }}\n {%- else %}\n {{- '<|im_start|>system\\nYou are Qwen, created by Alibaba Cloud. 
You are a helpful assistant.<|im_end|>\\n' }}\n {%- endif %}\n{%- endif %}\n{%- for message in messages %}\n {%- if (message.role == \"user\") or (message.role == \"system\" and not loop.first) or (message.role == \"assistant\" and not message.tool_calls) %}\n {{- '<|im_start|>' + message.role + '\\n' + message.content + '<|im_end|>' + '\\n' }}\n {%- elif message.role == \"assistant\" %}\n {{- '<|im_start|>' + message.role }}\n {%- if message.content %}\n {{- '\\n' + message.content }}\n {%- endif %}\n {%- for tool_call in message.tool_calls %}\n {%- if tool_call.function is defined %}\n {%- set tool_call = tool_call.function %}\n {%- endif %}\n {{- '\\n\\n{\"name\": \"' }}\n {{- tool_call.name }}\n {{- '\", \"arguments\": ' }}\n {{- tool_call.arguments | tojson }}\n {{- '}\\n' }}\n {%- endfor %}\n {{- '<|im_end|>\\n' }}\n {%- elif message.role == \"tool\" %}\n {%- if (loop.index0 == 0) or (messages[loop.index0 - 1].role != \"tool\") %}\n {{- '<|im_start|>user' }}\n {%- endif %}\n {{- '\\n\\n' }}\n {{- message.content }}\n {{- '\\n' }}\n {%- if loop.last or (messages[loop.index0 + 1].role != \"tool\") %}\n {{- '<|im_end|>\\n' }}\n {%- endif %}\n {%- endif %}\n{%- endfor %}\n{%- if add_generation_prompt %}\n {{- '<|im_start|>assistant\\n' }}\n{%- endif %}\n", + "stop_token_ids": [ + 151643, + 151644, + 151645 + ], + "stop": [ + "<|endoftext|>", + "<|im_start|>", + "<|im_end|>" + ] + }, + { + "version": 1, + "context_length": 32768, + "model_name": "qwen2.5-coder", + "model_lang": [ + "en", + "zh" + ], + "model_ability": [ + "generate" + ], + "model_description": "Qwen2.5-Coder is the latest series of Code-Specific Qwen large language models (formerly known as CodeQwen).", + "model_specs": [ + { + "model_format": "pytorch", + "model_size_in_billions": "1_5", + "quantizations": [ + "4-bit", + "8-bit", + "none" + ], + "model_id": "qwen/Qwen2.5-Coder-1.5B", + "model_revision": "master", + "model_hub": "modelscope" + }, + { + "model_format": "pytorch", + "model_size_in_billions": 7, + "quantizations": [ + "4-bit", + "8-bit", + "none" + ], + "model_id": "qwen/Qwen2.5-Coder-7B", + "model_revision": "master", + "model_hub": "modelscope" + } + ] + }, + { + "version": 1, + "context_length": 32768, + "model_name": "qwen2.5-coder-instruct", + "model_lang": [ + "en", + "zh" + ], + "model_ability": [ + "chat", + "tools" + ], + "model_description": "Qwen2.5-Coder is the latest series of Code-Specific Qwen large language models (formerly known as CodeQwen).", + "model_specs": [ + { + "model_format": "pytorch", + "model_size_in_billions": "1_5", + "quantizations": [ + "4-bit", + "8-bit", + "none" + ], + "model_id": "qwen/Qwen2.5-Coder-1.5B-Instruct", + "model_revision": "master", + "model_hub": "modelscope" + }, + { + "model_format": "pytorch", + "model_size_in_billions": 7, + "quantizations": [ + "4-bit", + "8-bit", + "none" + ], + "model_id": "qwen/Qwen2.5-Coder-7B-Instruct", + "model_revision": "master", + "model_hub": "modelscope" + }, + { + "model_format": "ggufv2", + "model_size_in_billions": "1_5", + "quantizations": [ + "q2_k", + "q3_k_m", + "q4_0", + "q4_k_m", + "q5_0", + "q5_k_m", + "q6_k", + "q8_0" + ], + "model_hub": "modelscope", + "model_id": "qwen/Qwen2.5-Coder-1.5B-Instruct-GGUF", + "model_file_name_template": "qwen2.5-coder-1.5b-instruct-{quantization}.gguf" + }, + { + "model_format": "ggufv2", + "model_size_in_billions": 7, + "quantizations": [ + "q2_k", + "q3_k_m", + "q4_0", + "q4_k_m", + "q5_0", + "q5_k_m", + "q6_k", + "q8_0" + ], + "model_hub": "modelscope", + "model_id": 
"qwen/Qwen2.5-Coder-7B-Instruct-GGUF", + "model_file_name_template": "qwen2.5-coder-7b-instruct-{quantization}.gguf", + "model_file_name_split_template": "qwen2.5-coder-7b-instruct-{quantization}-{part}.gguf", + "quantization_parts": { + "q4_0": [ "00001-of-00002", "00002-of-00002" ], - "q5_k_m": [ + "q4_k_m": [ "00001-of-00002", "00002-of-00002" ], - "q6_k": [ + "q5_0": [ "00001-of-00002", "00002-of-00002" ], - "q8_0": [ + "q5_k_m": [ "00001-of-00002", "00002-of-00002" ], - "fp16": [ - "00001-of-00004", - "00002-of-00004", - "00003-of-00004", - "00004-of-00004" + "q6_k": [ + "00001-of-00002", + "00002-of-00002" + ], + "q8_0": [ + "00001-of-00003", + "00002-of-00003", + "00003-of-00003" ] } } ], - "chat_template": "{%- if tools %}\n {{- '<|im_start|>system\\n' }}\n {%- if messages[0]['role'] == 'system' %}\n {{- messages[0]['content'] }}\n {%- else %}\n {{- 'You are Qwen, created by Alibaba Cloud. You are a helpful assistant.' }}\n {%- endif %}\n {{- \"\\n\\n# Tools\\n\\nYou may call one or more functions to assist with the user query.\\n\\nYou are provided with function signatures within XML tags:\\n\" }}\n {%- for tool in tools %}\n {{- \"\\n\" }}\n {{- tool | tojson }}\n {%- endfor %}\n {{- \"\\n\\n\\nFor each function call, return a json object with function name and arguments within XML tags:\\n\\n{{\\\"name\\\": , \\\"arguments\\\": }}\\n<|im_end|>\\n\" }}\n{%- else %}\n {%- if messages[0]['role'] == 'system' %}\n {{- '<|im_start|>system\\n' + messages[0]['content'] + '<|im_end|>\\n' }}\n {%- else %}\n {{- '<|im_start|>system\\nYou are Qwen, created by Alibaba Cloud. You are a helpful assistant.<|im_end|>\\n' }}\n {%- endif %}\n{%- endif %}\n{%- for message in messages %}\n {%- if (message.role == \"user\") or (message.role == \"system\" and not loop.first) or (message.role == \"assistant\" and not message.tool_calls) %}\n {{- '<|im_start|>' + message.role + '\\n' + message.content + '<|im_end|>' + '\\n' }}\n {%- elif message.role == \"assistant\" %}\n {{- '<|im_start|>' + message.role }}\n {%- if message.content %}\n {{- '\\n' + message.content }}\n {%- endif %}\n {%- for tool_call in message.tool_calls %}\n {%- if tool_call.function is defined %}\n {%- set tool_call = tool_call.function %}\n {%- endif %}\n {{- '\\n\\n{\"name\": \"' }}\n {{- tool_call.name }}\n {{- '\", \"arguments\": ' }}\n {{- tool_call.arguments | tojson }}\n {{- '}\\n' }}\n {%- endfor %}\n {{- '<|im_end|>\\n' }}\n {%- elif message.role == \"tool\" %}\n {%- if (loop.index0 == 0) or (messages[loop.index0 - 1].role != \"tool\") %}\n {{- '<|im_start|>user' }}\n {%- endif %}\n {{- '\\n\\n' }}\n {{- message.content }}\n {{- '\\n' }}\n {%- if loop.last or (messages[loop.index0 + 1].role != \"tool\") %}\n {{- '<|im_end|>\\n' }}\n {%- endif %}\n {%- endif %}\n{%- endfor %}\n{%- if add_generation_prompt %}\n {{- '<|im_start|>assistant\\n' }}\n{%- endif %}\n", + "chat_template": "{%- if tools %}\n {{- '<|im_start|>system\\n' }}\n {%- if messages[0]['role'] == 'system' %}\n {{- messages[0]['content'] }}\n {%- else %}\n {{- 'You are a helpful assistant.' 
}}\n {%- endif %}\n {{- \"\\n\\n# Tools\\n\\nYou may call one or more functions to assist with the user query.\\n\\nYou are provided with function signatures within XML tags:\\n\" }}\n {%- for tool in tools %}\n {{- \"\\n\" }}\n {{- tool | tojson }}\n {%- endfor %}\n {{- \"\\n\\n\\nFor each function call, return a json object with function name and arguments within XML tags:\\n\\n{{\\\"name\\\": , \\\"arguments\\\": }}\\n<|im_end|>\\n\" }}\n{%- else %}\n {%- if messages[0]['role'] == 'system' %}\n {{- '<|im_start|>system\\n' + messages[0]['content'] + '<|im_end|>\\n' }}\n {%- else %}\n {{- '<|im_start|>system\\nYou are a helpful assistant.<|im_end|>\\n' }}\n {%- endif %}\n{%- endif %}\n{%- for message in messages %}\n {%- if (message.role == \"user\") or (message.role == \"system\" and not loop.first) or (message.role == \"assistant\" and not message.tool_calls) %}\n {{- '<|im_start|>' + message.role + '\\n' + message.content + '<|im_end|>' + '\\n' }}\n {%- elif message.role == \"assistant\" %}\n {{- '<|im_start|>' + message.role }}\n {%- if message.content %}\n {{- '\\n' + message.content }}\n {%- endif %}\n {%- for tool_call in message.tool_calls %}\n {%- if tool_call.function is defined %}\n {%- set tool_call = tool_call.function %}\n {%- endif %}\n {{- '\\n\\n{\"name\": \"' }}\n {{- tool_call.name }}\n {{- '\", \"arguments\": ' }}\n {{- tool_call.arguments | tojson }}\n {{- '}\\n' }}\n {%- endfor %}\n {{- '<|im_end|>\\n' }}\n {%- elif message.role == \"tool\" %}\n {%- if (loop.index0 == 0) or (messages[loop.index0 - 1].role != \"tool\") %}\n {{- '<|im_start|>user' }}\n {%- endif %}\n {{- '\\n\\n' }}\n {{- message.content }}\n {{- '\\n' }}\n {%- if loop.last or (messages[loop.index0 + 1].role != \"tool\") %}\n {{- '<|im_end|>\\n' }}\n {%- endif %}\n {%- endif %}\n{%- endfor %}\n{%- if add_generation_prompt %}\n {{- '<|im_start|>assistant\\n' }}\n{%- endif %}\n", "stop_token_ids": [ 151643, 151644, diff --git a/xinference/model/llm/sglang/core.py b/xinference/model/llm/sglang/core.py index 621b9b0a59..a413f2ad0f 100644 --- a/xinference/model/llm/sglang/core.py +++ b/xinference/model/llm/sglang/core.py @@ -68,6 +68,8 @@ class SGLANGGenerateConfig(TypedDict, total=False): "llama-3.1", "mistral-v0.1", "mixtral-v0.1", + "qwen2.5", + "qwen2.5-coder", ] SGLANG_SUPPORTED_CHAT_MODELS = [ "llama-2-chat", @@ -85,6 +87,8 @@ class SGLANGGenerateConfig(TypedDict, total=False): "deepseek-v2.5", "deepseek-v2-chat", "deepseek-v2-chat-0628", + "qwen2.5-instruct", + "qwen2.5-coder-instruct", ] diff --git a/xinference/model/llm/vllm/core.py b/xinference/model/llm/vllm/core.py index 3aaee0738f..8b28701778 100644 --- a/xinference/model/llm/vllm/core.py +++ b/xinference/model/llm/vllm/core.py @@ -138,7 +138,11 @@ class VLLMGenerateConfig(TypedDict, total=False): VLLM_SUPPORTED_MODELS.append("codeqwen1.5") VLLM_SUPPORTED_CHAT_MODELS.append("codeqwen1.5-chat") VLLM_SUPPORTED_CHAT_MODELS.append("qwen2-instruct") + VLLM_SUPPORTED_MODELS.append("qwen2.5") VLLM_SUPPORTED_CHAT_MODELS.append("qwen2.5-instruct") + VLLM_SUPPORTED_MODELS.append("qwen2.5-coder") + VLLM_SUPPORTED_CHAT_MODELS.append("qwen2.5-coder-instruct") + if VLLM_INSTALLED and vllm.__version__ >= "0.3.2": VLLM_SUPPORTED_CHAT_MODELS.append("gemma-it")
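
With the registry changes above, the Qwen2.5 generate and chat entries become selectable for the vLLM and
SGLang engines in addition to Transformers. As a quick end-to-end check, the newly added
``qwen2.5-coder-instruct`` can be launched and queried through the Python client. The snippet below is a
minimal sketch, assuming a supervisor at ``http://127.0.0.1:9997``, a client version whose ``chat()`` accepts
OpenAI-style messages, and the 7B pytorch spec registered in ``llm_family.json`` above::

   from xinference.client import Client

   client = Client("http://127.0.0.1:9997")

   # "qwen2.5-coder-instruct" is now present in VLLM_SUPPORTED_CHAT_MODELS and
   # SGLANG_SUPPORTED_CHAT_MODELS, so the vLLM and SGLang engines can be chosen for it.
   uid = client.launch_model(
       model_name="qwen2.5-coder-instruct",
       model_engine="vllm",  # or "sglang" / "transformers"
       model_format="pytorch",
       model_size_in_billions=7,
       quantization="none",
   )

   model = client.get_model(uid)
   response = model.chat(
       messages=[{"role": "user", "content": "Write a binary search in Python."}],
       generate_config={"max_tokens": 256},
   )
   print(response["choices"][0]["message"]["content"])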