Commit
update acc_strategy & fix citest (#2583)
Jintao-Huang authored Dec 8, 2024
1 parent ca312cf commit bf3e074
Showing 28 changed files with 227 additions and 752 deletions.
4 changes: 2 additions & 2 deletions README.md
@@ -255,8 +255,8 @@ pip install ms-swift -U
 - Method 2: Install SWIFT through source code (convenient for running training and inference scripts), please run the following commands:
 
 ```shell
-git clone https://github.com/modelscope/swift.git
-cd swift
+git clone https://github.com/modelscope/ms-swift.git
+cd ms-swift
 pip install -e '.[llm]'
 ```

4 changes: 2 additions & 2 deletions README_CN.md
@@ -256,8 +256,8 @@ pip install ms-swift -U
 - Method 2: Install SWIFT from source (convenient for running training and inference scripts); run the following commands:
 
 ```shell
-git clone https://github.com/modelscope/swift.git
-cd swift
+git clone https://github.com/modelscope/ms-swift.git
+cd ms-swift
 pip install -e '.[llm]'
 ```

6 changes: 5 additions & 1 deletion docs/source/GetStarted/快速开始.md
@@ -46,6 +46,7 @@ swift sft \
     swift/self-cognition#500 \
     --num_train_epochs 1 \
     --per_device_train_batch_size 1 \
+    --per_device_eval_batch_size 1 \
     --learning_rate 1e-4 \
     --lora_rank 8 \
     --lora_alpha 32 \
@@ -56,8 +57,11 @@ swift sft \
     --save_total_limit 2 \
     --logging_steps 5 \
     --max_length 2048 \
-    --model_author swift \
     --output_dir output \
+    --system 'You are a helpful assistant.' \
+    --warmup_ratio 0.05 \
+    --dataloader_num_workers 4 \
+    --model_author swift \
     --model_name swift-robot
 ```

2 changes: 1 addition & 1 deletion docs/source/Instruction/命令行参数.md
@@ -292,7 +292,7 @@ Vera uses the three parameters `target_modules`, `target_regex`, `modules_to_save`.
 - packing: Whether to use packing. Defaults to False.
 - 🔥lazy_tokenize: Whether to use lazy_tokenize. Defaults to False for LLM training and True for MLLM training.
 
-- acc_strategy: Strategy for computing training accuracy; either `sentence`-level or `token`-level accuracy. Defaults to `token`.
+- acc_strategy: Strategy for computing training accuracy; either `seq`-level or `token`-level accuracy. Defaults to `token`.
 - max_new_tokens: Maximum number of generated tokens when predict_with_generate=True. Defaults to 64.
 - temperature: Temperature when predict_with_generate=True. Defaults to 0.
 - optimizer: Name of a custom optimizer from the plugin.
4 changes: 2 additions & 2 deletions docs/source/Instruction/评测.md
@@ -50,8 +50,8 @@ pip install ms-swift[eval] -U
 Or install from source:
 
 ```shell
-git clone https://github.com/modelscope/swift.git
-cd swift
+git clone https://github.com/modelscope/ms-swift.git
+cd ms-swift
 pip install -e '.[eval]'
 ```

6 changes: 5 additions & 1 deletion docs/source_en/GetStarted/Quick-start.md
@@ -46,6 +46,7 @@ swift sft \
     swift/self-cognition#500 \
     --num_train_epochs 1 \
    --per_device_train_batch_size 1 \
+    --per_device_eval_batch_size 1 \
     --learning_rate 1e-4 \
     --lora_rank 8 \
     --lora_alpha 32 \
@@ -56,8 +57,11 @@ swift sft \
     --save_total_limit 2 \
     --logging_steps 5 \
     --max_length 2048 \
-    --model_author swift \
     --output_dir output \
+    --system 'You are a helpful assistant.' \
+    --warmup_ratio 0.05 \
+    --dataloader_num_workers 4 \
+    --model_author swift \
     --model_name swift-robot
 ```

2 changes: 1 addition & 1 deletion docs/source_en/Instruction/Command-line-parameters.md
@@ -297,7 +297,7 @@ Training arguments include the [base arguments](#base-arguments), [Seq2SeqTraine
 - packing: Whether to use packing, default is False.
 - 🔥lazy_tokenize: Whether to use lazy_tokenize, default is False during LLM training, default is True during MLLM training.
 
-- acc_strategy: Strategy for training accuracy, can be `sentence` or `token` level accuracy, default is `token`.
+- acc_strategy: Strategy for training accuracy, can be `seq` or `token` level accuracy, default is `token`.
 - max_new_tokens: Maximum generated token count when `predict_with_generate=True`, default 64.
 - temperature: Temperature when `predict_with_generate=True`, default 0.
 - optimizer: Custom optimizer name for plugin.
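
For reference, the renamed option distinguishes two granularities: `token` counts every correctly predicted token, while `seq` counts a sample as correct only if all of its tokens match. A minimal sketch of the distinction (illustrative only; this is not the implementation from this commit):

```python
import numpy as np

def token_acc(preds: np.ndarray, labels: np.ndarray, ignore_index: int = -100) -> float:
    """Fraction of non-padding tokens predicted correctly."""
    mask = labels != ignore_index
    return float((preds[mask] == labels[mask]).mean())

def seq_acc(preds: np.ndarray, labels: np.ndarray, ignore_index: int = -100) -> float:
    """Fraction of sequences whose non-padding tokens are all correct."""
    mask = labels != ignore_index
    return float(((preds == labels) | ~mask).all(axis=-1).mean())

preds = np.array([[1, 2, 3], [4, 5, 6]])
labels = np.array([[1, 2, 9], [4, 5, 6]])
print(token_acc(preds, labels))  # 0.8333... -> 5 of 6 tokens correct
print(seq_acc(preds, labels))    # 0.5       -> 1 of 2 sequences fully correct
```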
4 changes: 2 additions & 2 deletions docs/source_en/Instruction/Evaluation.md
@@ -50,8 +50,8 @@ pip install ms-swift[eval] -U
 Or install from the source code:
 
 ```shell
-git clone https://github.com/modelscope/swift.git
-cd swift
+git clone https://github.com/modelscope/ms-swift.git
+cd ms-swift
 pip install -e '.[eval]'
 ```

9 changes: 9 additions & 0 deletions examples/custom/infer.sh
@@ -0,0 +1,9 @@
+# sh examples/custom/infer.sh
+CUDA_VISIBLE_DEVICES=0 \
+swift infer \
+    --ckpt_dir output/vx-xxx/checkpoint-xxx \
+    --load_dataset_config true \
+    --infer_backend pt \
+    --max_batch_size 16 \
+    --max_new_tokens 256 \
+    --temperature 0
24 changes: 24 additions & 0 deletions examples/custom/sft.sh
@@ -0,0 +1,24 @@
+# sh examples/custom/sft.sh
+CUDA_VISIBLE_DEVICES=0 \
+swift sft \
+    --custom_register_path examples/custom/dataset.py \
+                           examples/custom/model.py \
+    --model AI-ModelScope/Nemotron-Mini-4B-Instruct \
+    --train_type lora \
+    --dataset swift/stsb \
+    --num_train_epochs 3 \
+    --per_device_train_batch_size 1 \
+    --per_device_eval_batch_size 1 \
+    --learning_rate 1e-4 \
+    --lora_rank 8 \
+    --lora_alpha 32 \
+    --target_modules all-linear \
+    --gradient_accumulation_steps 16 \
+    --eval_steps 100 \
+    --save_steps 100 \
+    --save_total_limit 2 \
+    --logging_steps 5 \
+    --warmup_ratio 0.05 \
+    --dataloader_num_workers 4 \
+    --max_length 2048 \
+    --output_dir output
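
The two `--custom_register_path` values above are ordinary Python files. As the `base_args.py` change later in this commit shows, each file's directory is appended to `sys.path` and the module is imported, so any registration calls at module top level run at import time. A standalone sketch of that mechanism (hypothetical helper, not part of the repo):

```python
import os
import sys

def load_register_files(paths):
    """Import each .py file so its top-level registration code executes."""
    for path in paths:
        folder, fname = os.path.split(os.path.abspath(path))
        sys.path.append(folder)                   # make the module importable
        module_name = os.path.splitext(fname)[0]  # 'dataset.py' -> 'dataset'
        __import__(module_name)

# load_register_files(['examples/custom/dataset.py', 'examples/custom/model.py'])
```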
2 changes: 1 addition & 1 deletion requirements/framework.txt
@@ -21,6 +21,6 @@ rouge
 safetensors
 tensorboard
 tqdm
-transformers>=4.33,<4.48
+transformers>=4.33,<4.47
 transformers_stream_generator
 trl>=0.11,<0.12
2 changes: 1 addition & 1 deletion swift/hub/hub.py
@@ -193,7 +193,7 @@ def try_login(cls, token: Optional[str] = None) -> bool:
         from modelscope import HubApi
         if token is None:
             token = os.environ.get('MODELSCOPE_API_TOKEN')
-        if token is not None:
+        if token:
             api = HubApi()
             api.login(token)
             return True
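
The switch from `if token is not None:` to `if token:` also skips login when the environment variable is set but empty, since `os.environ.get` returns `''` in that case. A quick illustration:

```python
import os

os.environ['MODELSCOPE_API_TOKEN'] = ''  # set, but blank
token = os.environ.get('MODELSCOPE_API_TOKEN')

print(token is not None)  # True  -- the old check would attempt login with ''
print(bool(token))        # False -- the new check skips login
```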
22 changes: 13 additions & 9 deletions swift/llm/argument/base_args/base_args.py
@@ -2,7 +2,7 @@
 import os
 import sys
 from dataclasses import dataclass, field, fields
-from typing import Any, Dict, Literal, Optional
+from typing import Any, Dict, List, Literal, Optional
 
 import json
 import torch
@@ -40,7 +40,7 @@ class BaseArguments(GenerationArguments, QuantizeArguments, DataArguments, Templ
         load_dataset_config (bool): Flag to determine if dataset configuration should be loaded. Default is False.
         use_hf (bool): Flag to determine if Hugging Face should be used. Default is False.
         hub_token (Optional[str]): SDK token for authentication. Default is None.
-        custom_register_path (Optional[str]): Path to custom .py file for dataset registration. Default is None.
+        custom_register_path (List[str]): Paths to custom .py files for dataset registration. Default is an empty list.
         ignore_args_error (bool): Flag to ignore argument errors for notebook compatibility. Default is False.
         use_swift_lora (bool): Use swift lora, a compatible argument
     """
@@ -55,20 +55,19 @@ class BaseArguments(GenerationArguments, QuantizeArguments, DataArguments, Templ
     # None: use env var `MODELSCOPE_API_TOKEN`
     hub_token: Optional[str] = field(
         default=None, metadata={'help': 'SDK token can be found in https://modelscope.cn/my/myaccesstoken'})
-    custom_register_path: Optional[str] = None  # .py
+    custom_register_path: List[str] = field(default_factory=list)  # .py
 
     # extra
     ignore_args_error: bool = False  # True: notebook compatibility
     use_swift_lora: bool = False  # True for using tuner_backend == swift, don't specify this unless you know what you are doing  # noqa
 
     def _init_custom_register(self) -> None:
         """Register custom .py file to datasets"""
-        if self.custom_register_path is None:
-            return
         self.custom_register_path = to_abspath(self.custom_register_path, True)
-        folder, fname = os.path.split(self.custom_register_path)
-        sys.path.append(folder)
-        __import__(fname.rstrip('.py'))
+        for path in self.custom_register_path:
+            folder, fname = os.path.split(path)
+            sys.path.append(folder)
+            __import__(fname.rstrip('.py'))
         logger.info(f'Successfully registered `{self.custom_register_path}`')
 
     def __post_init__(self):
@@ -127,7 +126,12 @@ def load_args_from_ckpt(self, checkpoint_dir: str) -> None:
             'bnb_4bit_quant_type', 'bnb_4bit_use_double_quant', 'split_dataset_ratio', 'model_name', 'model_author',
             'train_type', 'tuner_backend'
         ]
-        skip_keys = ['output_dir', 'deepspeed']
+        skip_keys = [
+            'output_dir',
+            'deepspeed',
+            'temperature',
+            'max_new_tokens',
+        ]
         for key in all_keys:
             if key in skip_keys:
                 continue
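
One caveat on the registration loop above: `str.rstrip('.py')` strips a trailing *character set*, not a suffix, so a register file named, say, `happy.py` would be imported as `ha`. The file names used in this repo's examples are unaffected; a suffix-safe variant would use `os.path.splitext` instead (a sketch, not the committed code):

```python
import os

fname = 'happy.py'
print(fname.rstrip('.py'))         # 'ha'    -- strips trailing '.', 'p', 'y' characters
print(os.path.splitext(fname)[0])  # 'happy' -- strips only the extension
```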
7 changes: 3 additions & 4 deletions swift/llm/argument/base_args/data_args.py
@@ -46,13 +46,12 @@ class DataArguments:
     model_author: List[str] = field(
         default_factory=lambda: [None, None], metadata={'help': "e.g. ['魔搭', 'ModelScope']"})
 
-    custom_dataset_info: Optional[str] = None  # .json
+    custom_dataset_info: List[str] = field(default_factory=list)  # .json
 
     def _init_custom_dataset_info(self):
         """register custom dataset_info.json to datasets"""
-        if self.custom_dataset_info is None:
-            return
-        register_dataset_info(self.custom_dataset_info)
+        for path in self.custom_dataset_info:
+            register_dataset_info(path)
 
     def __post_init__(self):
         if self.data_seed is None:
4 changes: 2 additions & 2 deletions swift/llm/argument/base_args/model_args.py
@@ -119,13 +119,13 @@ def _init_model_info(self) -> torch.dtype:
         return self.model_info.torch_dtype
 
     def __post_init__(self):
+        if self.model is None:
+            raise ValueError(f'Please set --model <model_id_or_path>`, model: {self.model}')
         self.model_suffix = get_model_name(self.model)
         self._init_device_map()
         self._init_torch_dtype()
 
     def get_model_kwargs(self):
-        if self.model is None:
-            raise ValueError('Please set --model <model_id_or_path>`')
         return {
             'model_id_or_path': self.model,
             'torch_dtype': self.torch_dtype,
8 changes: 6 additions & 2 deletions swift/llm/argument/train_args.py
@@ -96,7 +96,7 @@ class TrainArguments(TorchAccArguments, TunerArguments, Seq2SeqTrainingOverrideA
         num_labels (Optional[int]): Number of labels for classification tasks. Default is None.
         packing (bool): Flag to enable packing of datasets. Default is False.
         lazy_tokenize (Optional[bool]): Flag to enable lazy tokenization. Default is None.
-        acc_strategy (Literal['token', 'sentence']): Strategy for accumulation. Default is 'token'.
+        acc_strategy (Literal['token', 'seq']): Strategy for computing training accuracy. Default is 'token'.
         max_new_tokens (int): Maximum number of new tokens to generate. Default is 64.
         temperature (float): Temperature for sampling. Default is 0.
         optimizer (Optional[str]): Optimizer type to use, define it in the plugin package. Default is None.
@@ -113,7 +113,7 @@ class TrainArguments(TorchAccArguments, TunerArguments, Seq2SeqTrainingOverrideA
     lazy_tokenize: Optional[bool] = None
 
     # extra
-    acc_strategy: Literal['token', 'sentence'] = 'token'
+    acc_strategy: Literal['token', 'seq'] = 'token'
     max_new_tokens: int = 64
     temperature: float = 0.
     optimizer: Optional[str] = None
@@ -139,6 +139,10 @@ def __post_init__(self) -> None:
         self._init_deepspeed()
         self._init_device()
 
+        if self.streaming and self.lazy_tokenize:
+            self.lazy_tokenize = False
+            logger.warning('Streaming and lazy_tokenize are incompatible. '
+                           f'Setting args.lazy_tokenize: {self.lazy_tokenize}.')
         if self.lazy_tokenize is None:
             self.lazy_tokenize = self.model_meta.is_multimodal and not self.streaming
             logger.info(f'Setting args.lazy_tokenize: {self.lazy_tokenize}')
9 changes: 7 additions & 2 deletions swift/llm/infer/deploy.py
@@ -63,12 +63,16 @@ def lifespan(self, app: FastAPI):
         finally:
             self._compute_infer_stats()
 
-    async def get_available_models(self):
+    def _get_model_list(self):
         args = self.args
         model_list = [args.served_model_name or args.model_suffix]
         if args.lora_request_list is not None:
             model_list += [lora_request.lora_name for lora_request in args.lora_request_list]
-        data = [Model(id=model_id, owned_by=args.owned_by) for model_id in model_list]
+        return model_list
+
+    async def get_available_models(self):
+        model_list = self._get_model_list()
+        data = [Model(id=model_id, owned_by=self.args.owned_by) for model_id in model_list]
         return ModelList(data=data)
 
     async def _check_model(self, request: ChatCompletionRequest) -> Optional[str]:
@@ -156,6 +160,7 @@ async def create_completion(self, request: CompletionRequest, raw_request: Reque
 
     def run(self):
         args = self.args
+        logger.info(f'model_list: {self._get_model_list()}')
         uvicorn.run(
             self.app, host=args.host, port=args.port, ssl_keyfile=args.ssl_keyfile, ssl_certfile=args.ssl_certfile)

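
For context, `get_available_models` backs the OpenAI-compatible model listing, so the list that `run()` now logs is the same one a client can query. A hedged sketch of checking it (assuming a local `swift deploy` server on the default port and the standard `/v1/models` route):

```python
import json
from urllib.request import urlopen

# Adjust host/port to match your `swift deploy` invocation.
with urlopen('http://127.0.0.1:8000/v1/models') as resp:
    models = json.load(resp)

print([m['id'] for m in models['data']])  # served_model_name or model_suffix, plus any LoRA names
```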
2 changes: 1 addition & 1 deletion swift/llm/infer/infer.py
@@ -73,7 +73,7 @@ def get_infer_engine(args: InferArguments, **kwargs):
     def get_template(args, processor: Processor) -> Template:
         template_kwargs = args.get_template_kwargs()
         template = get_template(args.template, processor, use_chat_template=args.use_chat_template, **template_kwargs)
-        logger.info(f'default_system: {template.default_system}')
+        logger.info(f'default_system: {template.template_meta.default_system}')
         return template
 
     def main(self):
13 changes: 6 additions & 7 deletions swift/llm/template/base.py
@@ -79,10 +79,9 @@ def __init__(
         else:
             template_meta = deepcopy(template_meta)
         # if default_system is None. not change self.default_system
+        template_meta.check_system(default_system)
         if default_system is not None:
-            self.default_system = template_meta.check_system(default_system)
-        else:
-            self.default_system = template_meta.default_system
+            template_meta.default_system = default_system
 
         template_meta.init(tokenizer)
@@ -519,7 +518,7 @@ def _get_std_messages(messages):
 
     def _jinja_encode(self, inputs: StdTemplateInputs):
         messages = inputs.messages.copy()
-        if inputs.system:
+        if inputs.system is not None:
             messages.insert(0, {'role': 'system', 'content': inputs.system})
         if messages[-1]['content'] is None:
             messages.pop()
@@ -530,9 +529,9 @@ def _jinja_encode(self, inputs: StdTemplateInputs):
     def _swift_encode(self, inputs: StdTemplateInputs):
         template_meta = self.template_meta
         system = inputs.system
+        template_meta.check_system(system)
         if system is None:
-            system = self.default_system
-        system = template_meta.check_system(system)
+            system = template_meta.default_system
 
         res_context_list: List[Context] = []
         res_context_types: List[ContextType] = []
@@ -541,7 +540,7 @@ def _swift_encode(self, inputs: StdTemplateInputs):
             res_context_list.append(bos_tokens)
             res_context_types.append(ContextType.OTHER)
 
-        prefix = template_meta.system_prefix if system else template_meta.prefix
+        prefix = template_meta.prefix if system is None else template_meta.system_prefix
         self._concat_context_list(prefix, res_context_list, res_context_types, system=system)
 
         n_round = len(inputs.messages) // 2
12 changes: 5 additions & 7 deletions swift/llm/template/template_meta.py
@@ -94,7 +94,7 @@ def __post_init__(self):
             self.support_system = False
         else:
             self.support_system = True
-        self.default_system = self.check_system(self.default_system)
+        self.check_system(self.default_system)
 
         self.support_multi_round = self.chat_sep is not None
         if self.tool_prompt is None:
@@ -130,9 +130,7 @@ def init(self, tokenizer: PreTrainedTokenizerBase) -> None:
         if tokenizer.eos_token not in self.stop_words:
             self.stop_words.append(tokenizer.eos_token)
 
-    def check_system(self, system: str) -> str:
-        if system is None:
-            system = ''
-        if system:
-            assert self.support_system, f'The template does not support `system`, template_type: {self.template_type}'
-        return system
+    def check_system(self, system: Optional[str]) -> None:
+        if system is not None:
+            assert self.support_system, (
+                f'The template does not support `system`, template_type: {self.template_type}, system: {system}')
2 changes: 1 addition & 1 deletion swift/llm/train/sft.py
@@ -115,7 +115,7 @@ def _prepare_template(self, use_chat_template: bool) -> None:
         args = self.args
         template_kwargs = args.get_template_kwargs()
         template = get_template(args.template, self.processor, use_chat_template=use_chat_template, **template_kwargs)
-        logger.info(f'default_system: {template.default_system}')
+        logger.info(f'default_system: {template.template_meta.default_system}')
         if template.use_model:
             template.model = self.model
         self.template = template