From 36980c67d63874b80742bf4f0a317cea7876651b Mon Sep 17 00:00:00 2001
From: JunHowie
Date: Thu, 19 Sep 2024 00:05:51 +0800
Subject: [PATCH 1/2] Support Qwen 2.5

---
 xinference/model/llm/llm_family.json            | 405 ++++++++++++++++
 .../model/llm/llm_family_modelscope.json        | 435 ++++++++++++++++++
 xinference/model/llm/vllm/core.py               |   7 +-
 3 files changed, 841 insertions(+), 6 deletions(-)

diff --git a/xinference/model/llm/llm_family.json b/xinference/model/llm/llm_family.json
index 77dda1a84d..0f82301d9b 100644
--- a/xinference/model/llm/llm_family.json
+++ b/xinference/model/llm/llm_family.json
@@ -6947,6 +6947,51 @@
       ""
     ]
   },
+  {
+    "version": 1,
+    "context_length": 131072,
+    "model_name": "yi-coder-chat",
+    "model_lang": [
+      "en"
+    ],
+    "model_ability": [
+      "chat"
+    ],
+    "model_description": "Yi-Coder is a series of open-source code language models that delivers state-of-the-art coding performance with fewer than 10 billion parameters. It excels at long-context understanding with a maximum context length of 128K tokens and supports 52 major programming languages, including popular ones such as Java, Python, JavaScript, and C++.",
+    "model_specs": [
+      {
+        "model_format": "pytorch",
+        "model_size_in_billions": 9,
+        "quantizations": [
+          "none"
+        ],
+        "model_id": "01ai/Yi-Coder-9B-Chat",
+        "model_revision": "356a1f8d4e4a606d0b879e54191ca809918576b8"
+      },
+      {
+        "model_format": "pytorch",
+        "model_size_in_billions": "1_5",
+        "quantizations": [
+          "none"
+        ],
+        "model_id": "01ai/Yi-Coder-1.5B-Chat",
+        "model_revision": "92fdd1b2f1539ac990e7f4a921db5601da2f0299"
+      }
+    ],
+    "chat_template": "{% if messages[0]['role'] == 'system' %}{% set system_message = messages[0]['content'] %}{% endif %}{% if system_message is defined %}{{ '<|im_start|>system\n' + system_message + '<|im_end|>\n' }}{% endif %}{% for message in messages %}{% set content = message['content'] %}{% if message['role'] == 'user' %}{{ '<|im_start|>user\n' + content + '<|im_end|>\n<|im_start|>assistant\n' }}{% elif message['role'] == 'assistant' %}{{ content + '<|im_end|>' + '\n' }}{% endif %}{% endfor %}",
+    "stop_token_ids": [
+      1,
+      2,
+      6,
+      7
+    ],
+    "stop": [
+      "<|startoftext|>",
+      "<|endoftext|>",
+      "<|im_start|>",
+      "<|im_end|>"
+    ]
+  },
   {
     "version":1,
     "context_length":32768,
@@ -7244,5 +7289,365 @@
         "model_revision": "00e59e64f47d3c78e4cfbdd345888479797e8109"
       }
     ]
+  },
+  {
+    "version": 1,
+    "context_length": 131072,
+    "model_name": "qwen2.5-instruct",
+    "model_lang": [
+      "en",
+      "zh"
+    ],
+    "model_ability": [
+      "chat",
+      "tools"
+    ],
+    "model_description": "Qwen2.5 is the latest series of Qwen large language models. For Qwen2.5, we release a number of base language models and instruction-tuned language models ranging from 0.5 to 72 billion parameters.",
+    "model_specs": [
+      {
+        "model_format": "pytorch",
+        "model_size_in_billions": "0_5",
+        "quantizations": [
+          "4-bit",
+          "8-bit",
+          "none"
+        ],
+        "model_id": "Qwen/Qwen2.5-0.5B-Instruct"
+      },
+      {
+        "model_format": "pytorch",
+        "model_size_in_billions": "1_5",
+        "quantizations": [
+          "4-bit",
+          "8-bit",
+          "none"
+        ],
+        "model_id": "Qwen/Qwen2.5-1.5B-Instruct"
+      },
+      {
+        "model_format": "pytorch",
+        "model_size_in_billions": 3,
+        "quantizations": [
+          "4-bit",
+          "8-bit",
+          "none"
+        ],
+        "model_id": "Qwen/Qwen2.5-3B-Instruct"
+      },
+      {
+        "model_format": "pytorch",
+        "model_size_in_billions": 7,
+        "quantizations": [
+          "4-bit",
+          "8-bit",
+          "none"
+        ],
+        "model_id": "Qwen/Qwen2.5-7B-Instruct"
+      },
+      {
+        "model_format": "pytorch",
+        "model_size_in_billions": 14,
+        "quantizations": [
+          "4-bit",
+          "8-bit",
+          "none"
+        ],
+        "model_id": "Qwen/Qwen2.5-14B-Instruct"
+      },
+      {
+        "model_format": "pytorch",
+        "model_size_in_billions": 32,
+        "quantizations": [
+          "4-bit",
+          "8-bit",
+          "none"
+        ],
+        "model_id": "Qwen/Qwen2.5-32B-Instruct"
+      },
+      {
+        "model_format": "pytorch",
+        "model_size_in_billions": 72,
+        "quantizations": [
+          "4-bit",
+          "8-bit",
+          "none"
+        ],
+        "model_id": "Qwen/Qwen2.5-72B-Instruct"
+      },
+      {
+        "model_format": "gptq",
+        "model_size_in_billions": "0_5",
+        "quantizations": [
+          "Int4",
+          "Int8"
+        ],
+        "model_id": "Qwen/Qwen2.5-0.5B-Instruct-GPTQ-{quantization}"
+      },
+      {
+        "model_format": "gptq",
+        "model_size_in_billions": "1_5",
+        "quantizations": [
+          "Int4",
+          "Int8"
+        ],
+        "model_id": "Qwen/Qwen2.5-1.5B-Instruct-GPTQ-{quantization}"
+      },
+      {
+        "model_format": "gptq",
+        "model_size_in_billions": 3,
+        "quantizations": [
+          "Int4",
+          "Int8"
+        ],
+        "model_id": "Qwen/Qwen2.5-3B-Instruct-GPTQ-{quantization}"
+      },
+      {
+        "model_format": "gptq",
+        "model_size_in_billions": 7,
+        "quantizations": [
+          "Int4",
+          "Int8"
+        ],
+        "model_id": "Qwen/Qwen2.5-7B-Instruct-GPTQ-{quantization}"
+      },
+      {
+        "model_format": "gptq",
+        "model_size_in_billions": 14,
+        "quantizations": [
+          "Int4",
+          "Int8"
+        ],
+        "model_id": "Qwen/Qwen2.5-14B-Instruct-GPTQ-{quantization}"
+      },
+      {
+        "model_format": "gptq",
+        "model_size_in_billions": 32,
+        "quantizations": [
+          "Int4",
+          "Int8"
+        ],
+        "model_id": "Qwen/Qwen2.5-32B-Instruct-GPTQ-{quantization}"
+      },
+      {
+        "model_format": "gptq",
+        "model_size_in_billions": 72,
+        "quantizations": [
+          "Int4",
+          "Int8"
+        ],
+        "model_id": "Qwen/Qwen2.5-72B-Instruct-GPTQ-{quantization}"
+      },
+      {
+        "model_format": "awq",
+        "model_size_in_billions": "0_5",
+        "quantizations": [
+          "Int4"
+        ],
+        "model_id": "Qwen/Qwen2.5-0.5B-Instruct-AWQ"
+      },
+      {
+        "model_format": "awq",
+        "model_size_in_billions": "1_5",
+        "quantizations": [
+          "Int4"
+        ],
+        "model_id": "Qwen/Qwen2.5-1.5B-Instruct-AWQ"
+      },
+      {
+        "model_format": "awq",
+        "model_size_in_billions": 3,
+        "quantizations": [
+          "Int4"
+        ],
+        "model_id": "Qwen/Qwen2.5-3B-Instruct-AWQ"
+      },
+      {
+        "model_format": "awq",
+        "model_size_in_billions": 7,
+        "quantizations": [
+          "Int4"
+        ],
+        "model_id": "Qwen/Qwen2.5-7B-Instruct-AWQ"
+      },
+      {
+        "model_format": "awq",
+        "model_size_in_billions": 14,
+        "quantizations": [
+          "Int4"
+        ],
+        "model_id": "Qwen/Qwen2.5-14B-Instruct-AWQ"
+      },
+      {
+        "model_format": "awq",
+        "model_size_in_billions": 32,
+        "quantizations": [
+          "Int4"
+        ],
+        "model_id": "Qwen/Qwen2.5-32B-Instruct-AWQ"
+      },
+      {
+        "model_format": "awq",
+        "model_size_in_billions": 72,
+        "quantizations": [
+          "Int4"
+        ],
"model_id": "Qwen/Qwen2.5-72B-Instruct-AWQ" + }, + { + "model_format": "ggufv2", + "model_size_in_billions": "0_5", + "quantizations": [ + "q2_k", + "q3_k_m", + "q4_0", + "q4_k_m", + "q5_0", + "q5_k_m", + "q6_k", + "q8_0", + "fp16" + ], + "model_id": "Qwen/Qwen2.5-0.5B-Instruct-GGUF", + "model_file_name_template": "qwen2_5-0_5b-instruct-{quantization}.gguf" + }, + { + "model_format": "ggufv2", + "model_size_in_billions": "1_5", + "quantizations": [ + "q2_k", + "q3_k_m", + "q4_0", + "q4_k_m", + "q5_0", + "q5_k_m", + "q6_k", + "q8_0", + "fp16" + ], + "model_id": "Qwen/Qwen2.5-1.5B-Instruct-GGUF", + "model_file_name_template": "qwen2_5-1_5b-instruct-{quantization}.gguf" + }, + { + "model_format": "ggufv2", + "model_size_in_billions": 3, + "quantizations": [ + "q2_k", + "q3_k_m", + "q4_0", + "q4_k_m", + "q5_0", + "q5_k_m", + "q6_k", + "q8_0", + "fp16" + ], + "model_id": "Qwen/Qwen2.5-3B-Instruct-GGUF", + "model_file_name_template": "qwen2_5-3b-instruct-{quantization}.gguf" + }, + { + "model_format": "ggufv2", + "model_size_in_billions": 7, + "quantizations": [ + "q2_k", + "q3_k_m", + "q4_0", + "q4_k_m", + "q5_0", + "q5_k_m", + "q6_k", + "q8_0", + "fp16" + ], + "model_id": "Qwen/Qwen2.5-7B-Instruct-GGUF", + "model_file_name_template": "qwen2_5-7b-instruct-{quantization}.gguf" + }, + { + "model_format": "ggufv2", + "model_size_in_billions": 14, + "quantizations": [ + "q2_k", + "q3_k_m", + "q4_0", + "q4_k_m", + "q5_0", + "q5_k_m", + "q6_k", + "q8_0", + "fp16" + ], + "model_id": "Qwen/Qwen2.5-14B-Instruct-GGUF", + "model_file_name_template": "qwen2_5-14b-instruct-{quantization}.gguf" + }, + { + "model_format": "ggufv2", + "model_size_in_billions": 32, + "quantizations": [ + "q2_k", + "q3_k_m", + "q4_0", + "q4_k_m", + "q5_0", + "q5_k_m", + "q6_k", + "q8_0", + "fp16" + ], + "model_id": "Qwen/Qwen2.5-32B-Instruct-GGUF", + "model_file_name_template": "qwen2_5-32b-instruct-{quantization}.gguf" + }, + { + "model_format": "ggufv2", + "model_size_in_billions": 72, + "quantizations": [ + "q2_k", + "q3_k_m", + "q4_0", + "q4_k_m", + "q5_0", + "q5_k_m", + "q6_k", + "q8_0", + "fp16" + ], + "model_id": "Qwen/Qwen2.5-72B-Instruct-GGUF", + "model_file_name_template": "qwen2_5-72b-instruct-{quantization}.gguf", + "model_file_name_split_template": "qwen2_5-72b-instruct-{quantization}-{part}.gguf", + "quantization_parts": { + "q5_0": [ + "00001-of-00002", + "00002-of-00002" + ], + "q5_k_m": [ + "00001-of-00002", + "00002-of-00002" + ], + "q6_k": [ + "00001-of-00002", + "00002-of-00002" + ], + "q8_0": [ + "00001-of-00002", + "00002-of-00002" + ], + "fp16": [ + "00001-of-00004", + "00002-of-00004", + "00003-of-00004", + "00004-of-00004" + ] + } + } + ], + "chat_template": "{%- if tools %}\n {{- '<|im_start|>system\\n' }}\n {%- if messages[0]['role'] == 'system' %}\n {{- messages[0]['content'] }}\n {%- else %}\n {{- 'You are Qwen, created by Alibaba Cloud. You are a helpful assistant.' 
+    "chat_template": "{%- if tools %}\n {{- '<|im_start|>system\\n' }}\n {%- if messages[0]['role'] == 'system' %}\n {{- messages[0]['content'] }}\n {%- else %}\n {{- 'You are Qwen, created by Alibaba Cloud. You are a helpful assistant.' }}\n {%- endif %}\n {{- \"\\n\\n# Tools\\n\\nYou may call one or more functions to assist with the user query.\\n\\nYou are provided with function signatures within <tools></tools> XML tags:\\n<tools>\" }}\n {%- for tool in tools %}\n {{- \"\\n\" }}\n {{- tool | tojson }}\n {%- endfor %}\n {{- \"\\n</tools>\\n\\nFor each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:\\n<tool_call>\\n{{\\\"name\\\": <function-name>, \\\"arguments\\\": <args-json-object>}}\\n</tool_call><|im_end|>\\n\" }}\n{%- else %}\n {%- if messages[0]['role'] == 'system' %}\n {{- '<|im_start|>system\\n' + messages[0]['content'] + '<|im_end|>\\n' }}\n {%- else %}\n {{- '<|im_start|>system\\nYou are Qwen, created by Alibaba Cloud. You are a helpful assistant.<|im_end|>\\n' }}\n {%- endif %}\n{%- endif %}\n{%- for message in messages %}\n {%- if (message.role == \"user\") or (message.role == \"system\" and not loop.first) or (message.role == \"assistant\" and not message.tool_calls) %}\n {{- '<|im_start|>' + message.role + '\\n' + message.content + '<|im_end|>' + '\\n' }}\n {%- elif message.role == \"assistant\" %}\n {{- '<|im_start|>' + message.role }}\n {%- if message.content %}\n {{- '\\n' + message.content }}\n {%- endif %}\n {%- for tool_call in message.tool_calls %}\n {%- if tool_call.function is defined %}\n {%- set tool_call = tool_call.function %}\n {%- endif %}\n {{- '\\n<tool_call>\\n{\"name\": \"' }}\n {{- tool_call.name }}\n {{- '\", \"arguments\": ' }}\n {{- tool_call.arguments | tojson }}\n {{- '}\\n</tool_call>' }}\n {%- endfor %}\n {{- '<|im_end|>\\n' }}\n {%- elif message.role == \"tool\" %}\n {%- if (loop.index0 == 0) or (messages[loop.index0 - 1].role != \"tool\") %}\n {{- '<|im_start|>user' }}\n {%- endif %}\n {{- '\\n<tool_response>\\n' }}\n {{- message.content }}\n {{- '\\n</tool_response>' }}\n {%- if loop.last or (messages[loop.index0 + 1].role != \"tool\") %}\n {{- '<|im_end|>\\n' }}\n {%- endif %}\n {%- endif %}\n{%- endfor %}\n{%- if add_generation_prompt %}\n {{- '<|im_start|>assistant\\n' }}\n{%- endif %}\n",
+    "stop_token_ids": [
+      151643,
+      151644,
+      151645
+    ],
+    "stop": [
+      "<|endoftext|>",
+      "<|im_start|>",
+      "<|im_end|>"
+    ]
+  }
 ]

diff --git a/xinference/model/llm/llm_family_modelscope.json b/xinference/model/llm/llm_family_modelscope.json
index fdaab458aa..dc7be1c829 100644
--- a/xinference/model/llm/llm_family_modelscope.json
+++ b/xinference/model/llm/llm_family_modelscope.json
@@ -4656,6 +4656,53 @@
       ""
     ]
   },
+  {
+    "version": 1,
+    "context_length": 131072,
+    "model_name": "yi-coder-chat",
+    "model_lang": [
+      "en"
+    ],
+    "model_ability": [
+      "chat"
+    ],
+    "model_description": "Yi-Coder is a series of open-source code language models that delivers state-of-the-art coding performance with fewer than 10 billion parameters. It excels at long-context understanding with a maximum context length of 128K tokens and supports 52 major programming languages, including popular ones such as Java, Python, JavaScript, and C++.",
+    "model_specs": [
+      {
+        "model_format": "pytorch",
+        "model_size_in_billions": 9,
+        "quantizations": [
+          "none"
+        ],
+        "model_hub": "modelscope",
+        "model_id": "01ai/Yi-Coder-9B-Chat",
+        "model_revision": "master"
+      },
+      {
+        "model_format": "pytorch",
+        "model_size_in_billions": "1_5",
+        "quantizations": [
+          "none"
+        ],
+        "model_hub": "modelscope",
+        "model_id": "01ai/Yi-Coder-1.5B-Chat",
+        "model_revision": "master"
+      }
+    ],
+    "chat_template": "{% if messages[0]['role'] == 'system' %}{% set system_message = messages[0]['content'] %}{% endif %}{% if system_message is defined %}{{ '<|im_start|>system\n' + system_message + '<|im_end|>\n' }}{% endif %}{% for message in messages %}{% set content = message['content'] %}{% if message['role'] == 'user' %}{{ '<|im_start|>user\n' + content + '<|im_end|>\n<|im_start|>assistant\n' }}{% elif message['role'] == 'assistant' %}{{ content + '<|im_end|>' + '\n' }}{% endif %}{% endfor %}",
+    "stop_token_ids": [
+      1,
+      2,
+      6,
+      7
+    ],
+    "stop": [
+      "<|startoftext|>",
+      "<|endoftext|>",
+      "<|im_start|>",
+      "<|im_end|>"
+    ]
+  },
   {
     "version": 1,
     "context_length": 32768,
@@ -4957,5 +5004,393 @@
         "model_revision": "master"
       }
     ]
+  },
+  {
+    "version": 1,
+    "context_length": 131072,
+    "model_name": "qwen2.5-instruct",
+    "model_lang": [
+      "en",
+      "zh"
+    ],
+    "model_ability": [
+      "chat",
+      "tools"
+    ],
+    "model_description": "Qwen2.5 is the latest series of Qwen large language models. For Qwen2.5, we release a number of base language models and instruction-tuned language models ranging from 0.5 to 72 billion parameters.",
+    "model_specs": [
+      {
+        "model_format": "pytorch",
+        "model_size_in_billions": "0_5",
+        "quantizations": [
+          "4-bit",
+          "8-bit",
+          "none"
+        ],
+        "model_id": "qwen/Qwen2.5-0.5B-Instruct",
+        "model_hub": "modelscope"
+      },
+      {
+        "model_format": "pytorch",
+        "model_size_in_billions": "1_5",
+        "quantizations": [
+          "4-bit",
+          "8-bit",
+          "none"
+        ],
+        "model_id": "qwen/Qwen2.5-1.5B-Instruct",
+        "model_hub": "modelscope"
+      },
+      {
+        "model_format": "pytorch",
+        "model_size_in_billions": 3,
+        "quantizations": [
+          "4-bit",
+          "8-bit",
+          "none"
+        ],
+        "model_id": "qwen/Qwen2.5-3B-Instruct",
+        "model_hub": "modelscope"
+      },
+      {
+        "model_format": "pytorch",
+        "model_size_in_billions": 7,
+        "quantizations": [
+          "4-bit",
+          "8-bit",
+          "none"
+        ],
+        "model_id": "qwen/Qwen2.5-7B-Instruct",
+        "model_hub": "modelscope"
+      },
+      {
+        "model_format": "pytorch",
+        "model_size_in_billions": 14,
+        "quantizations": [
+          "4-bit",
+          "8-bit",
+          "none"
+        ],
+        "model_id": "qwen/Qwen2.5-14B-Instruct",
+        "model_hub": "modelscope"
+      },
+      {
+        "model_format": "pytorch",
+        "model_size_in_billions": 32,
+        "quantizations": [
+          "4-bit",
+          "8-bit",
+          "none"
+        ],
+        "model_id": "qwen/Qwen2.5-32B-Instruct",
+        "model_hub": "modelscope"
+      },
+      {
+        "model_format": "pytorch",
+        "model_size_in_billions": 72,
+        "quantizations": [
+          "4-bit",
+          "8-bit",
+          "none"
+        ],
+        "model_id": "qwen/Qwen2.5-72B-Instruct",
+        "model_hub": "modelscope"
+      },
+      {
+        "model_format": "gptq",
+        "model_size_in_billions": "0_5",
+        "quantizations": [
+          "Int4",
+          "Int8"
+        ],
+        "model_id": "qwen/Qwen2.5-0.5B-Instruct-GPTQ-{quantization}",
+        "model_hub": "modelscope"
+      },
+      {
+        "model_format": "gptq",
+        "model_size_in_billions": "1_5",
+        "quantizations": [
+          "Int4",
+          "Int8"
+        ],
+        "model_id": "qwen/Qwen2.5-1.5B-Instruct-GPTQ-{quantization}",
+        "model_hub": "modelscope"
+      },
+      {
+        "model_format": "gptq",
+        "model_size_in_billions": 3,
+        "quantizations": [
+          "Int4",
+          "Int8"
+        ],
+        "model_id": "qwen/Qwen2.5-3B-Instruct-GPTQ-{quantization}",
+        "model_hub": "modelscope"
+      },
+      {
+        "model_format": "gptq",
+        "model_size_in_billions": 7,
+        "quantizations": [
+          "Int4",
+          "Int8"
+        ],
+        "model_id": "qwen/Qwen2.5-7B-Instruct-GPTQ-{quantization}",
+        "model_hub": "modelscope"
+      },
+      {
+        "model_format": "gptq",
+        "model_size_in_billions": 14,
+        "quantizations": [
+          "Int4",
+          "Int8"
+        ],
+        "model_id": "qwen/Qwen2.5-14B-Instruct-GPTQ-{quantization}",
+        "model_hub": "modelscope"
+      },
+      {
+        "model_format": "gptq",
+        "model_size_in_billions": 32,
+        "quantizations": [
+          "Int4",
+          "Int8"
+        ],
+        "model_id": "qwen/Qwen2.5-32B-Instruct-GPTQ-{quantization}",
+        "model_hub": "modelscope"
+      },
+      {
+        "model_format": "gptq",
"model_size_in_billions": 72, + "quantizations": [ + "Int4", + "Int8" + ], + "model_id": "qwen/Qwen2.5-72B-Instruct-GPTQ-{quantization}", + "model_hub": "modelscope" + }, + { + "model_format": "awq", + "model_size_in_billions": "0_5", + "quantizations": [ + "Int4" + ], + "model_id": "qwen/Qwen2-0.5B-Instruct-AWQ", + "model_hub": "modelscope" + }, + { + "model_format": "awq", + "model_size_in_billions": "1_5", + "quantizations": [ + "Int4" + ], + "model_id": "qwen/Qwen2-1.5B-Instruct-AWQ", + "model_hub": "modelscope" + }, + { + "model_format": "awq", + "model_size_in_billions": 3, + "quantizations": [ + "Int4" + ], + "model_id": "qwen/Qwen2.5-3B-Instruct-AWQ", + "model_hub": "modelscope" + }, + { + "model_format": "awq", + "model_size_in_billions": 7, + "quantizations": [ + "Int4" + ], + "model_id": "qwen/Qwen2.5-7B-Instruct-AWQ", + "model_hub": "modelscope" + }, + { + "model_format": "awq", + "model_size_in_billions":14, + "quantizations": [ + "Int4" + ], + "model_id": "qwen/Qwen2.5-14B-Instruct-AWQ", + "model_hub": "modelscope" + }, + { + "model_format": "awq", + "model_size_in_billions": 32, + "quantizations": [ + "Int4" + ], + "model_id": "qwen/Qwen2.5-32B-Instruct-AWQ", + "model_hub": "modelscope" + }, + { + "model_format": "awq", + "model_size_in_billions": 72, + "quantizations": [ + "Int4" + ], + "model_id": "qwen/Qwen2.5-72B-Instruct-AWQ", + "model_hub": "modelscope" + }, + { + "model_format": "ggufv2", + "model_size_in_billions": "0_5", + "quantizations": [ + "q2_k", + "q3_k_m", + "q4_0", + "q4_k_m", + "q5_0", + "q5_k_m", + "q6_k", + "q8_0", + "fp16" + ], + "model_id": "qwen/Qwen2.5-0.5B-Instruct-GGUF", + "model_file_name_template": "qwen2_5-0_5b-instruct-{quantization}.gguf", + "model_hub": "modelscope" + }, + { + "model_format": "ggufv2", + "model_size_in_billions": "1_5", + "quantizations": [ + "q2_k", + "q3_k_m", + "q4_0", + "q4_k_m", + "q5_0", + "q5_k_m", + "q6_k", + "q8_0", + "fp16" + ], + "model_id": "qwen/Qwen2.5-1.5B-Instruct-GGUF", + "model_file_name_template": "qwen2_5-1_5b-instruct-{quantization}.gguf", + "model_hub": "modelscope" + }, + { + "model_format": "ggufv2", + "model_size_in_billions": 3, + "quantizations": [ + "q2_k", + "q3_k_m", + "q4_0", + "q4_k_m", + "q5_0", + "q5_k_m", + "q6_k", + "q8_0", + "fp16" + ], + "model_id": "qwen/Qwen2.5-3B-Instruct-GGUF", + "model_file_name_template": "qwen2_5-3b-instruct-{quantization}.gguf", + "model_hub": "modelscope" + }, + { + "model_format": "ggufv2", + "model_size_in_billions": 7, + "quantizations": [ + "q2_k", + "q3_k_m", + "q4_0", + "q4_k_m", + "q5_0", + "q5_k_m", + "q6_k", + "q8_0", + "fp16" + ], + "model_id": "qwen/Qwen2.5-7B-Instruct-GGUF", + "model_file_name_template": "qwen2_5-7b-instruct-{quantization}.gguf", + "model_hub": "modelscope" + }, + { + "model_format": "ggufv2", + "model_size_in_billions": 14, + "quantizations": [ + "q2_k", + "q3_k_m", + "q4_0", + "q4_k_m", + "q5_0", + "q5_k_m", + "q6_k", + "q8_0", + "fp16" + ], + "model_id": "qwen/Qwen2.5-14B-Instruct-GGUF", + "model_file_name_template": "qwen2_5-14b-instruct-{quantization}.gguf", + "model_hub": "modelscope" + }, + { + "model_format": "ggufv2", + "model_size_in_billions": 32, + "quantizations": [ + "q2_k", + "q3_k_m", + "q4_0", + "q4_k_m", + "q5_0", + "q5_k_m", + "q6_k", + "q8_0", + "fp16" + ], + "model_id": "qwen/Qwen2.5-32B-Instruct-GGUF", + "model_file_name_template": "qwen2_5-32b-instruct-{quantization}.gguf", + "model_hub": "modelscope" + }, + { + "model_format": "ggufv2", + "model_size_in_billions": 72, + "quantizations": [ + "q2_k", + "q3_k_m", + 
"q4_0", + "q4_k_m", + "q5_0", + "q5_k_m", + "q6_k", + "q8_0", + "fp16" + ], + "model_id": "qwen/Qwen2.5-72B-Instruct-GGUF", + "model_hub": "modelscope", + "model_file_name_template": "qwen2_5-72b-instruct-{quantization}.gguf", + "model_file_name_split_template": "qwen2_5-72b-instruct-{quantization}-{part}.gguf", + "quantization_parts": { + "q5_0": [ + "00001-of-00002", + "00002-of-00002" + ], + "q5_k_m": [ + "00001-of-00002", + "00002-of-00002" + ], + "q6_k": [ + "00001-of-00002", + "00002-of-00002" + ], + "q8_0": [ + "00001-of-00002", + "00002-of-00002" + ], + "fp16": [ + "00001-of-00004", + "00002-of-00004", + "00003-of-00004", + "00004-of-00004" + ] + } + } + ], + "chat_template": "{%- if tools %}\n {{- '<|im_start|>system\\n' }}\n {%- if messages[0]['role'] == 'system' %}\n {{- messages[0]['content'] }}\n {%- else %}\n {{- 'You are Qwen, created by Alibaba Cloud. You are a helpful assistant.' }}\n {%- endif %}\n {{- \"\\n\\n# Tools\\n\\nYou may call one or more functions to assist with the user query.\\n\\nYou are provided with function signatures within XML tags:\\n\" }}\n {%- for tool in tools %}\n {{- \"\\n\" }}\n {{- tool | tojson }}\n {%- endfor %}\n {{- \"\\n\\n\\nFor each function call, return a json object with function name and arguments within XML tags:\\n\\n{{\\\"name\\\": , \\\"arguments\\\": }}\\n<|im_end|>\\n\" }}\n{%- else %}\n {%- if messages[0]['role'] == 'system' %}\n {{- '<|im_start|>system\\n' + messages[0]['content'] + '<|im_end|>\\n' }}\n {%- else %}\n {{- '<|im_start|>system\\nYou are Qwen, created by Alibaba Cloud. You are a helpful assistant.<|im_end|>\\n' }}\n {%- endif %}\n{%- endif %}\n{%- for message in messages %}\n {%- if (message.role == \"user\") or (message.role == \"system\" and not loop.first) or (message.role == \"assistant\" and not message.tool_calls) %}\n {{- '<|im_start|>' + message.role + '\\n' + message.content + '<|im_end|>' + '\\n' }}\n {%- elif message.role == \"assistant\" %}\n {{- '<|im_start|>' + message.role }}\n {%- if message.content %}\n {{- '\\n' + message.content }}\n {%- endif %}\n {%- for tool_call in message.tool_calls %}\n {%- if tool_call.function is defined %}\n {%- set tool_call = tool_call.function %}\n {%- endif %}\n {{- '\\n\\n{\"name\": \"' }}\n {{- tool_call.name }}\n {{- '\", \"arguments\": ' }}\n {{- tool_call.arguments | tojson }}\n {{- '}\\n' }}\n {%- endfor %}\n {{- '<|im_end|>\\n' }}\n {%- elif message.role == \"tool\" %}\n {%- if (loop.index0 == 0) or (messages[loop.index0 - 1].role != \"tool\") %}\n {{- '<|im_start|>user' }}\n {%- endif %}\n {{- '\\n\\n' }}\n {{- message.content }}\n {{- '\\n' }}\n {%- if loop.last or (messages[loop.index0 + 1].role != \"tool\") %}\n {{- '<|im_end|>\\n' }}\n {%- endif %}\n {%- endif %}\n{%- endfor %}\n{%- if add_generation_prompt %}\n {{- '<|im_start|>assistant\\n' }}\n{%- endif %}\n", + "stop_token_ids": [ + 151643, + 151644, + 151645 + ], + "stop": [ + "<|endoftext|>", + "<|im_start|>", + "<|im_end|>" + ] } ] diff --git a/xinference/model/llm/vllm/core.py b/xinference/model/llm/vllm/core.py index 811fd5d342..740e33cfe6 100644 --- a/xinference/model/llm/vllm/core.py +++ b/xinference/model/llm/vllm/core.py @@ -132,6 +132,7 @@ class VLLMGenerateConfig(TypedDict, total=False): "deepseek-chat", "deepseek-coder-instruct", "yi-coder-chat", + "qwen2.5-instruct", ] if VLLM_INSTALLED and vllm.__version__ >= "0.3.0": VLLM_SUPPORTED_CHAT_MODELS.append("qwen1.5-chat") @@ -151,12 +152,6 @@ class VLLMGenerateConfig(TypedDict, total=False): 
VLLM_SUPPORTED_CHAT_MODELS.append("qwen2-moe-instruct") VLLM_SUPPORTED_CHAT_MODELS.append("c4ai-command-r-v01") -if VLLM_INSTALLED and vllm.__version__ >= "0.5.1": - VLLM_SUPPORTED_CHAT_MODELS.append("deepseek-v2-chat") - VLLM_SUPPORTED_CHAT_MODELS.append("deepseek-v2-chat-0628") - VLLM_SUPPORTED_CHAT_MODELS.append("deepseek-v2.5") - - if VLLM_INSTALLED and vllm.__version__ >= "0.5.3": VLLM_SUPPORTED_CHAT_MODELS.append("gemma-2-it") VLLM_SUPPORTED_CHAT_MODELS.append("mistral-nemo-instruct") From 46c6285c4d69f7d7ddf2744b06eb2b8ae1d97920 Mon Sep 17 00:00:00 2001 From: qinxuye Date: Thu, 19 Sep 2024 00:32:21 +0800 Subject: [PATCH 2/2] fix --- xinference/model/llm/llm_family.json | 45 ------------------ .../model/llm/llm_family_modelscope.json | 47 ------------------- xinference/model/llm/vllm/core.py | 8 +++- 3 files changed, 7 insertions(+), 93 deletions(-) diff --git a/xinference/model/llm/llm_family.json b/xinference/model/llm/llm_family.json index 0f82301d9b..70b17daa61 100644 --- a/xinference/model/llm/llm_family.json +++ b/xinference/model/llm/llm_family.json @@ -6947,51 +6947,6 @@ "" ] }, - { - "version": 1, - "context_length": 131072, - "model_name": "yi-coder-chat", - "model_lang": [ - "en" - ], - "model_ability": [ - "chat" - ], - "model_description": "Yi-Coder is a series of open-source code language models that delivers state-of-the-art coding performance with fewer than 10 billion parameters.Excelling in long-context understanding with a maximum context length of 128K tokens.Supporting 52 major programming languages, including popular ones such as Java, Python, JavaScript, and C++.", - "model_specs": [ - { - "model_format": "pytorch", - "model_size_in_billions": 9, - "quantizations": [ - "none" - ], - "model_id": "01ai/Yi-Coder-9B-Chat", - "model_revision": "356a1f8d4e4a606d0b879e54191ca809918576b8" - }, - { - "model_format": "pytorch", - "model_size_in_billions": "1_5", - "quantizations": [ - "none" - ], - "model_id": "01ai/Yi-Coder-1.5B-Chat", - "model_revision": "92fdd1b2f1539ac990e7f4a921db5601da2f0299" - } - ], - "chat_template": "{% if messages[0]['role'] == 'system' %}{% set system_message = messages[0]['content'] %}{% endif %}{% if system_message is defined %}{{ '<|im_start|>system\n' + system_message + '<|im_end|>\n' }}{% endif %}{% for message in messages %}{% set content = message['content'] %}{% if message['role'] == 'user' %}{{ '<|im_start|>user\n' + content + '<|im_end|>\n<|im_start|>assistant\n' }}{% elif message['role'] == 'assistant' %}{{ content + '<|im_end|>' + '\n' }}{% endif %}{% endfor %}", - "stop_token_ids": [ - 1, - 2, - 6, - 7 - ], - "stop": [ - "<|startoftext|>", - "<|endoftext|>", - "<|im_start|>", - "<|im_end|>" - ] - }, { "version":1, "context_length":32768, diff --git a/xinference/model/llm/llm_family_modelscope.json b/xinference/model/llm/llm_family_modelscope.json index dc7be1c829..7309ee9651 100644 --- a/xinference/model/llm/llm_family_modelscope.json +++ b/xinference/model/llm/llm_family_modelscope.json @@ -4656,53 +4656,6 @@ "" ] }, - { - "version": 1, - "context_length": 131072, - "model_name": "yi-coder-chat", - "model_lang": [ - "en" - ], - "model_ability": [ - "chat" - ], - "model_description": "Yi-Coder is a series of open-source code language models that delivers state-of-the-art coding performance with fewer than 10 billion parameters.Excelling in long-context understanding with a maximum context length of 128K tokens.Supporting 52 major programming languages, including popular ones such as Java, Python, JavaScript, and C++.", - 
"model_specs": [ - { - "model_format": "pytorch", - "model_size_in_billions": 9, - "quantizations": [ - "none" - ], - "model_hub": "modelscope", - "model_id": "01ai/Yi-Coder-9B-Chat", - "model_revision": "master" - }, - { - "model_format": "pytorch", - "model_size_in_billions": "1_5", - "quantizations": [ - "none" - ], - "model_hub": "modelscope", - "model_id": "01ai/Yi-Coder-1.5B-Chat", - "model_revision": "master" - } - ], - "chat_template": "{% if messages[0]['role'] == 'system' %}{% set system_message = messages[0]['content'] %}{% endif %}{% if system_message is defined %}{{ '<|im_start|>system\n' + system_message + '<|im_end|>\n' }}{% endif %}{% for message in messages %}{% set content = message['content'] %}{% if message['role'] == 'user' %}{{ '<|im_start|>user\n' + content + '<|im_end|>\n<|im_start|>assistant\n' }}{% elif message['role'] == 'assistant' %}{{ content + '<|im_end|>' + '\n' }}{% endif %}{% endfor %}", - "stop_token_ids": [ - 1, - 2, - 6, - 7 - ], - "stop": [ - "<|startoftext|>", - "<|endoftext|>", - "<|im_start|>", - "<|im_end|>" - ] - }, { "version": 1, "context_length": 32768, diff --git a/xinference/model/llm/vllm/core.py b/xinference/model/llm/vllm/core.py index 740e33cfe6..3aaee0738f 100644 --- a/xinference/model/llm/vllm/core.py +++ b/xinference/model/llm/vllm/core.py @@ -132,13 +132,13 @@ class VLLMGenerateConfig(TypedDict, total=False): "deepseek-chat", "deepseek-coder-instruct", "yi-coder-chat", - "qwen2.5-instruct", ] if VLLM_INSTALLED and vllm.__version__ >= "0.3.0": VLLM_SUPPORTED_CHAT_MODELS.append("qwen1.5-chat") VLLM_SUPPORTED_MODELS.append("codeqwen1.5") VLLM_SUPPORTED_CHAT_MODELS.append("codeqwen1.5-chat") VLLM_SUPPORTED_CHAT_MODELS.append("qwen2-instruct") + VLLM_SUPPORTED_CHAT_MODELS.append("qwen2.5-instruct") if VLLM_INSTALLED and vllm.__version__ >= "0.3.2": VLLM_SUPPORTED_CHAT_MODELS.append("gemma-it") @@ -152,6 +152,12 @@ class VLLMGenerateConfig(TypedDict, total=False): VLLM_SUPPORTED_CHAT_MODELS.append("qwen2-moe-instruct") VLLM_SUPPORTED_CHAT_MODELS.append("c4ai-command-r-v01") +if VLLM_INSTALLED and vllm.__version__ >= "0.5.1": + VLLM_SUPPORTED_CHAT_MODELS.append("deepseek-v2-chat") + VLLM_SUPPORTED_CHAT_MODELS.append("deepseek-v2-chat-0628") + VLLM_SUPPORTED_CHAT_MODELS.append("deepseek-v2.5") + + if VLLM_INSTALLED and vllm.__version__ >= "0.5.3": VLLM_SUPPORTED_CHAT_MODELS.append("gemma-2-it") VLLM_SUPPORTED_CHAT_MODELS.append("mistral-nemo-instruct")