You signed in with another tab or window. Reload to refresh your session. You signed out in another tab or window. Reload to refresh your session. You switched accounts on another tab or window. Reload to refresh your session. Dismiss alert
您好,工程里面给的序列并行训练脚本,看执行模式用的是 DDP。请问当前的 sequence_parallel 支持 device_map 以及多机模式吗?有没有关于这块基本原理的说明?
另外,试了下启动脚本,会报如下错误:
[rank1]: Traceback (most recent call last):
[rank1]: File "/load/ms-swift/swift/cli/sft.py", line 10, in <module>
[rank1]: sft_main()
[rank1]: File "/load/ms-swift/swift/llm/train/sft.py", line 272, in sft_main
[rank1]: return SwiftSft(args).main()
[rank1]: File "/load/ms-swift/swift/llm/base.py", line 45, in main
[rank1]: result = self.run()
[rank1]: File "/load/ms-swift/swift/llm/train/sft.py", line 119, in run
[rank1]: self.model = self.prepare_model(self.args, self.model, template=self.template, train_dataset=train_dataset)
[rank1]: File "/load/ms-swift/swift/llm/train/tuner.py", line 409, in prepare_model
[rank1]: dispatch_module_xtuner(model)
[rank1]: File "/load/ms-swift/swift/trainers/xtuner.py", line 45, in dispatch_module_xtuner
[rank1]: from xtuner.model.modules.dispatch import dispatch_modules
[rank1]: File "/root/miniforge3/envs/ms_swift/lib/python3.10/site-packages/xtuner/model/__init__.py", line 2, in <module>
[rank1]: from .internvl import InternVL_V1_5
[rank1]: File "/root/miniforge3/envs/ms_swift/lib/python3.10/site-packages/xtuner/model/internvl.py", line 8, in <module>
[rank1]: from mmengine.model import BaseModel
[rank1]: File "/root/miniforge3/envs/ms_swift/lib/python3.10/site-packages/mmengine/model/__init__.py", line 6, in <module>
[rank1]: from .base_model import BaseDataPreprocessor, BaseModel, ImgDataPreprocessor
[rank1]: File "/root/miniforge3/envs/ms_swift/lib/python3.10/site-packages/mmengine/model/base_model/__init__.py", line 2, in <module>
[rank1]: from .base_model import BaseModel
[rank1]: File "/root/miniforge3/envs/ms_swift/lib/python3.10/site-packages/mmengine/model/base_model/base_model.py", line 9, in <module>
[rank1]: from mmengine.optim import OptimWrapper
[rank1]: File "/root/miniforge3/envs/ms_swift/lib/python3.10/site-packages/mmengine/optim/__init__.py", line 2, in <module>
[rank1]: from .optimizer import (OPTIM_WRAPPER_CONSTRUCTORS, OPTIMIZERS,
[rank1]: File "/root/miniforge3/envs/ms_swift/lib/python3.10/site-packages/mmengine/optim/optimizer/__init__.py", line 5, in <module>
[rank1]: from .builder import (OPTIM_WRAPPER_CONSTRUCTORS, OPTIMIZERS,
[rank1]: File "/root/miniforge3/envs/ms_swift/lib/python3.10/site-packages/mmengine/optim/optimizer/builder.py", line 174, in <module>
[rank1]: TRANSFORMERS_OPTIMIZERS = register_transformers_optimizers()
[rank1]: File "/root/miniforge3/envs/ms_swift/lib/python3.10/site-packages/mmengine/optim/optimizer/builder.py", line 169, in register_transformers_optimizers
[rank1]: OPTIMIZERS.register_module(name='Adafactor', module=Adafactor)
[rank1]: File "/root/miniforge3/envs/ms_swift/lib/python3.10/site-packages/mmengine/registry/registry.py", line 661, in register_module
[rank1]: self._register_module(module=module, module_name=name, force=force)
[rank1]: File "/root/miniforge3/envs/ms_swift/lib/python3.10/site-packages/mmengine/registry/registry.py", line 611, in _register_module
[rank1]: raise KeyError(f'{name} is already registered in {self.name} '
[rank1]: KeyError: 'Adafactor is already registered in optimizer at torch.optim'
[rank0]: Traceback (most recent call last):
[rank0]: File "/load/ms-swift/swift/cli/sft.py", line 10, in <module>
[rank0]: sft_main()
[rank0]: File "/load/ms-swift/swift/llm/train/sft.py", line 272, in sft_main
[rank0]: return SwiftSft(args).main()
[rank0]: File "/load/ms-swift/swift/llm/base.py", line 45, in main
[rank0]: result = self.run()
[rank0]: File "/load/ms-swift/swift/llm/train/sft.py", line 119, in run
[rank0]: self.model = self.prepare_model(self.args, self.model, template=self.template, train_dataset=train_dataset)
[rank0]: File "/load/ms-swift/swift/llm/train/tuner.py", line 409, in prepare_model
[rank0]: dispatch_module_xtuner(model)
[rank0]: File "/load/ms-swift/swift/trainers/xtuner.py", line 45, in dispatch_module_xtuner
[rank0]: from xtuner.model.modules.dispatch import dispatch_modules
[rank0]: File "/root/miniforge3/envs/ms_swift/lib/python3.10/site-packages/xtuner/model/__init__.py", line 2, in <module>
[rank0]: from .internvl import InternVL_V1_5
[rank0]: File "/root/miniforge3/envs/ms_swift/lib/python3.10/site-packages/xtuner/model/internvl.py", line 8, in <module>
[rank0]: from mmengine.model import BaseModel
[rank0]: File "/root/miniforge3/envs/ms_swift/lib/python3.10/site-packages/mmengine/model/__init__.py", line 6, in <module>
[rank0]: from .base_model import BaseDataPreprocessor, BaseModel, ImgDataPreprocessor
[rank0]: File "/root/miniforge3/envs/ms_swift/lib/python3.10/site-packages/mmengine/model/base_model/__init__.py", line 2, in <module>
[rank0]: from .base_model import BaseModel
[rank0]: File "/root/miniforge3/envs/ms_swift/lib/python3.10/site-packages/mmengine/model/base_model/base_model.py", line 9, in <module>
[rank0]: from mmengine.optim import OptimWrapper
[rank0]: File "/root/miniforge3/envs/ms_swift/lib/python3.10/site-packages/mmengine/optim/__init__.py", line 2, in <module>
[rank0]: from .optimizer import (OPTIM_WRAPPER_CONSTRUCTORS, OPTIMIZERS,
[rank0]: File "/root/miniforge3/envs/ms_swift/lib/python3.10/site-packages/mmengine/optim/optimizer/__init__.py", line 5, in <module>
[rank0]: from .builder import (OPTIM_WRAPPER_CONSTRUCTORS, OPTIMIZERS,
[rank0]: File "/root/miniforge3/envs/ms_swift/lib/python3.10/site-packages/mmengine/optim/optimizer/builder.py", line 174, in <module>
[rank0]: TRANSFORMERS_OPTIMIZERS = register_transformers_optimizers()
[rank0]: File "/root/miniforge3/envs/ms_swift/lib/python3.10/site-packages/mmengine/optim/optimizer/builder.py", line 169, in register_transformers_optimizers
[rank0]: OPTIMIZERS.register_module(name='Adafactor', module=Adafactor)
[rank0]: File "/root/miniforge3/envs/ms_swift/lib/python3.10/site-packages/mmengine/registry/registry.py", line 661, in register_module
[rank0]: self._register_module(module=module, module_name=name, force=force)
[rank0]: File "/root/miniforge3/envs/ms_swift/lib/python3.10/site-packages/mmengine/registry/registry.py", line 611, in _register_module
[rank0]: raise KeyError(f'{name} is already registered in {self.name} '
[rank0]: KeyError: 'Adafactor is already registered in optimizer at torch.optim'
您好,工程里面给的序列并行训练脚本,看执行模式用的是 DDP。请问当前的 sequence_parallel 支持 device_map 以及多机模式吗?有没有关于这块基本原理的说明?
另外,试了下启动脚本,会报如下错误:
[rank1]: Traceback (most recent call last):
[rank1]: File "/load/ms-swift/swift/cli/sft.py", line 10, in <module>
[rank1]: sft_main()
[rank1]: File "/load/ms-swift/swift/llm/train/sft.py", line 272, in sft_main
[rank1]: return SwiftSft(args).main()
[rank1]: File "/load/ms-swift/swift/llm/base.py", line 45, in main
[rank1]: result = self.run()
[rank1]: File "/load/ms-swift/swift/llm/train/sft.py", line 119, in run
[rank1]: self.model = self.prepare_model(self.args, self.model, template=self.template, train_dataset=train_dataset)
[rank1]: File "/load/ms-swift/swift/llm/train/tuner.py", line 409, in prepare_model
[rank1]: dispatch_module_xtuner(model)
[rank1]: File "/load/ms-swift/swift/trainers/xtuner.py", line 45, in dispatch_module_xtuner
[rank1]: from xtuner.model.modules.dispatch import dispatch_modules
[rank1]: File "/root/miniforge3/envs/ms_swift/lib/python3.10/site-packages/xtuner/model/__init__.py", line 2, in <module>
[rank1]: from .internvl import InternVL_V1_5
[rank1]: File "/root/miniforge3/envs/ms_swift/lib/python3.10/site-packages/xtuner/model/internvl.py", line 8, in <module>
[rank1]: from mmengine.model import BaseModel
[rank1]: File "/root/miniforge3/envs/ms_swift/lib/python3.10/site-packages/mmengine/model/__init__.py", line 6, in <module>
[rank1]: from .base_model import BaseDataPreprocessor, BaseModel, ImgDataPreprocessor
[rank1]: File "/root/miniforge3/envs/ms_swift/lib/python3.10/site-packages/mmengine/model/base_model/__init__.py", line 2, in <module>
[rank1]: from .base_model import BaseModel
[rank1]: File "/root/miniforge3/envs/ms_swift/lib/python3.10/site-packages/mmengine/model/base_model/base_model.py", line 9, in <module>
[rank1]: from mmengine.optim import OptimWrapper
[rank1]: File "/root/miniforge3/envs/ms_swift/lib/python3.10/site-packages/mmengine/optim/__init__.py", line 2, in <module>
[rank1]: from .optimizer import (OPTIM_WRAPPER_CONSTRUCTORS, OPTIMIZERS,
[rank1]: File "/root/miniforge3/envs/ms_swift/lib/python3.10/site-packages/mmengine/optim/optimizer/__init__.py", line 5, in <module>
[rank1]: from .builder import (OPTIM_WRAPPER_CONSTRUCTORS, OPTIMIZERS,
[rank1]: File "/root/miniforge3/envs/ms_swift/lib/python3.10/site-packages/mmengine/optim/optimizer/builder.py", line 174, in <module>
[rank1]: TRANSFORMERS_OPTIMIZERS = register_transformers_optimizers()
[rank1]: File "/root/miniforge3/envs/ms_swift/lib/python3.10/site-packages/mmengine/optim/optimizer/builder.py", line 169, in register_transformers_optimizers
[rank1]: OPTIMIZERS.register_module(name='Adafactor', module=Adafactor)
[rank1]: File "/root/miniforge3/envs/ms_swift/lib/python3.10/site-packages/mmengine/registry/registry.py", line 661, in register_module
[rank1]: self._register_module(module=module, module_name=name, force=force)
[rank1]: File "/root/miniforge3/envs/ms_swift/lib/python3.10/site-packages/mmengine/registry/registry.py", line 611, in _register_module
[rank1]: raise KeyError(f'{name} is already registered in {self.name} '
[rank1]: KeyError: 'Adafactor is already registered in optimizer at torch.optim'
[rank0]: Traceback (most recent call last):
[rank0]: File "/load/ms-swift/swift/cli/sft.py", line 10, in <module>
[rank0]: sft_main()
[rank0]: File "/load/ms-swift/swift/llm/train/sft.py", line 272, in sft_main
[rank0]: return SwiftSft(args).main()
[rank0]: File "/load/ms-swift/swift/llm/base.py", line 45, in main
[rank0]: result = self.run()
[rank0]: File "/load/ms-swift/swift/llm/train/sft.py", line 119, in run
[rank0]: self.model = self.prepare_model(self.args, self.model, template=self.template, train_dataset=train_dataset)
[rank0]: File "/load/ms-swift/swift/llm/train/tuner.py", line 409, in prepare_model
[rank0]: dispatch_module_xtuner(model)
[rank0]: File "/load/ms-swift/swift/trainers/xtuner.py", line 45, in dispatch_module_xtuner
[rank0]: from xtuner.model.modules.dispatch import dispatch_modules
[rank0]: File "/root/miniforge3/envs/ms_swift/lib/python3.10/site-packages/xtuner/model/__init__.py", line 2, in <module>
[rank0]: from .internvl import InternVL_V1_5
[rank0]: File "/root/miniforge3/envs/ms_swift/lib/python3.10/site-packages/xtuner/model/internvl.py", line 8, in <module>
[rank0]: from mmengine.model import BaseModel
[rank0]: File "/root/miniforge3/envs/ms_swift/lib/python3.10/site-packages/mmengine/model/__init__.py", line 6, in <module>
[rank0]: from .base_model import BaseDataPreprocessor, BaseModel, ImgDataPreprocessor
[rank0]: File "/root/miniforge3/envs/ms_swift/lib/python3.10/site-packages/mmengine/model/base_model/__init__.py", line 2, in <module>
[rank0]: from .base_model import BaseModel
[rank0]: File "/root/miniforge3/envs/ms_swift/lib/python3.10/site-packages/mmengine/model/base_model/base_model.py", line 9, in <module>
[rank0]: from mmengine.optim import OptimWrapper
[rank0]: File "/root/miniforge3/envs/ms_swift/lib/python3.10/site-packages/mmengine/optim/__init__.py", line 2, in <module>
[rank0]: from .optimizer import (OPTIM_WRAPPER_CONSTRUCTORS, OPTIMIZERS,
[rank0]: File "/root/miniforge3/envs/ms_swift/lib/python3.10/site-packages/mmengine/optim/optimizer/__init__.py", line 5, in <module>
[rank0]: from .builder import (OPTIM_WRAPPER_CONSTRUCTORS, OPTIMIZERS,
[rank0]: File "/root/miniforge3/envs/ms_swift/lib/python3.10/site-packages/mmengine/optim/optimizer/builder.py", line 174, in <module>
[rank0]: TRANSFORMERS_OPTIMIZERS = register_transformers_optimizers()
[rank0]: File "/root/miniforge3/envs/ms_swift/lib/python3.10/site-packages/mmengine/optim/optimizer/builder.py", line 169, in register_transformers_optimizers
[rank0]: OPTIMIZERS.register_module(name='Adafactor', module=Adafactor)
[rank0]: File "/root/miniforge3/envs/ms_swift/lib/python3.10/site-packages/mmengine/registry/registry.py", line 661, in register_module
[rank0]: self._register_module(module=module, module_name=name, force=force)
[rank0]: File "/root/miniforge3/envs/ms_swift/lib/python3.10/site-packages/mmengine/registry/registry.py", line 611, in _register_module
[rank0]: raise KeyError(f'{name} is already registered in {self.name} '
[rank0]: KeyError: 'Adafactor is already registered in optimizer at torch.optim'
环境如下:
Package Version Editable project location
absl-py 2.1.0
accelerate 1.2.1
addict 2.4.0
aiofiles 23.2.1
aiohappyeyeballs 2.4.4
aiohttp 3.11.11
aiosignal 1.3.2
aliyun-python-sdk-core 2.16.0
aliyun-python-sdk-kms 2.16.5
altair 5.5.0
annotated-types 0.7.0
antlr4-python3-runtime 4.9.3
anyio 4.7.0
appdirs 1.4.4
argon2-cffi 23.1.0
argon2-cffi-bindings 21.2.0
arrow 1.3.0
arxiv 2.1.3
asttokens 3.0.0
async-lru 2.0.4
async-timeout 4.0.3
attrdict 2.0.1
attrs 24.3.0
babel 2.16.0
beautifulsoup4 4.12.3
binpacking 1.5.2
bitsandbytes 0.45.0
bleach 6.2.0
blinker 1.9.0
braceexpand 0.1.7
Brotli 1.1.0
cachetools 5.5.0
certifi 2024.12.14
cffi 1.17.1
charset-normalizer 3.4.0
click 8.1.8
colorama 0.4.6
comm 0.2.2
contourpy 1.3.1
cpm-kernels 1.0.11
crcmod 1.7
cryptography 44.0.0
cycler 0.12.1
dacite 1.8.1
dataclasses-json 0.6.7
datasets 3.0.1
debugpy 1.8.11
decorator 5.1.1
decord 0.6.0
deepspeed 0.16.2
defusedxml 0.7.1
dill 0.3.8
distro 1.9.0
docker-pycreds 0.4.0
docstring_parser 0.16
duckduckgo_search 5.3.1b1
editdistance 0.8.1
einops 0.8.0
et_xmlfile 2.0.0
eval_type_backport 0.2.2
evalscope 0.8.1
evaluate 0.4.3
exceptiongroup 1.2.2
executing 2.1.0
fastapi 0.115.6
fastjsonschema 2.21.1
feedparser 6.0.11
ffmpy 0.5.0
filelock 3.16.1
fire 0.7.0
flash_attn 2.7.2.post1
fonttools 4.55.3
fqdn 1.5.1
frozenlist 1.5.0
fsspec 2024.6.1
func_timeout 4.3.5
future 1.0.0
fuzzywuzzy 0.18.0
gitdb 4.0.11
GitPython 3.1.43
gradio 5.9.1
gradio_client 1.5.2
greenlet 3.1.1
griffe 0.49.0
grpcio 1.68.1
h11 0.14.0
h2 4.1.0
h5py 3.12.1
hjson 3.1.0
hpack 4.0.0
httpcore 1.0.7
httpx 0.28.1
httpx-sse 0.4.0
huggingface-hub 0.27.0
human-eval 1.0.3
hyperframe 6.0.1
idna 3.10
imageio 2.36.1
immutabledict 4.2.1
importlib_metadata 8.5.0
ipykernel 6.29.5
ipython 8.31.0
ipywidgets 8.1.5
isoduration 20.11.0
jedi 0.19.2
jieba 0.42.1
Jinja2 3.1.5
jiter 0.8.2
jmespath 0.10.0
joblib 1.4.2
json5 0.10.0
jsonlines 4.0.0
jsonpatch 1.33
jsonpointer 3.0.0
jsonschema 4.23.0
jsonschema-specifications 2024.10.1
jupyter 1.1.1
jupyter_client 8.6.3
jupyter-console 6.6.3
jupyter_core 5.7.2
jupyter-events 0.11.0
jupyter-lsp 2.2.5
jupyter_server 2.15.0
jupyter_server_terminals 0.5.3
jupyterlab 4.3.4
jupyterlab_pygments 0.3.0
jupyterlab_server 2.27.3
jupyterlab_widgets 3.0.13
kiwisolver 1.4.7
lagent 0.2.4
langchain 0.3.13
langchain-community 0.3.13
langchain-core 0.3.28
langchain-openai 0.2.14
langchain-text-splitters 0.3.4
langsmith 0.2.4
lazy_loader 0.4
Levenshtein 0.26.1
lxml 5.3.0
Markdown 3.7
markdown-it-py 3.0.0
MarkupSafe 2.1.5
marshmallow 3.23.2
matplotlib 3.10.0
matplotlib-inline 0.1.7
mdurl 0.1.2
mistune 3.0.2
mmengine 0.10.5
mmengine-lite 0.10.5
modelscope 1.21.0
mpmath 1.3.0
ms-opencompass 0.1.5
ms-swift 3.1.0.dev0 /load/ms-swift
ms-vlmeval 0.0.12
msgpack 1.1.0
mteb 1.19.4
multidict 6.1.0
multiprocess 0.70.16
mypy-extensions 1.0.0
narwhals 1.19.1
nbclient 0.10.2
nbconvert 7.16.4
nbformat 5.10.4
nest-asyncio 1.6.0
networkx 3.4.2
ninja 1.11.1.3
nltk 3.9.1
notebook 7.3.2
notebook_shim 0.2.4
numpy 1.26.4
nvidia-cublas-cu12 12.4.5.8
nvidia-cuda-cupti-cu12 12.4.127
nvidia-cuda-nvrtc-cu12 12.4.127
nvidia-cuda-runtime-cu12 12.4.127
nvidia-cudnn-cu12 9.1.0.70
nvidia-cufft-cu12 11.2.1.3
nvidia-curand-cu12 10.3.5.147
nvidia-cusolver-cu12 11.6.1.9
nvidia-cusparse-cu12 12.3.1.170
nvidia-ml-py 12.560.30
nvidia-nccl-cu12 2.21.5
nvidia-nvjitlink-cu12 12.4.127
nvidia-nvtx-cu12 12.4.127
omegaconf 2.3.0
openai 1.58.1
OpenCC 1.1.9
opencv-python 4.10.0.84
openpyxl 3.1.5
orjson 3.10.12
oss2 2.19.1
overrides 7.7.0
packaging 24.2
pandas 2.2.3
pandocfilters 1.5.1
parso 0.8.4
peft 0.14.0
pexpect 4.9.0
phx-class-registry 4.1.0
pillow 11.0.0
pip 24.3.1
platformdirs 4.3.6
plotly 5.24.1
ply 3.11
polars 1.17.1
portalocker 3.0.0
prettytable 3.12.0
prometheus_client 0.21.1
prompt_toolkit 3.0.48
propcache 0.2.1
protobuf 5.29.2
psutil 6.1.1
ptyprocess 0.7.0
pure_eval 0.2.3
py-cpuinfo 9.0.0
pyarrow 18.1.0
pycparser 2.22
pycryptodome 3.21.0
pydantic 2.10.4
pydantic_core 2.27.2
pydantic-settings 2.7.0
pydeck 0.9.1
pydub 0.25.1
Pygments 2.18.0
Pympler 1.1
pynvml 12.0.0
pyparsing 3.2.0
pypinyin 0.53.0
pysbd 0.3.4
python-dateutil 2.9.0.post0
python-dotenv 1.0.1
python-json-logger 3.2.1
python-Levenshtein 0.26.1
python-multipart 0.0.20
pytrec-eval-terrier 0.5.6
pytz 2024.2
PyYAML 6.0.2
pyzmq 26.2.0
ragas 0.2.7
rank-bm25 0.2.2
RapidFuzz 3.11.0
referencing 0.35.1
regex 2024.11.6
requests 2.32.3
requests-toolbelt 1.0.0
rfc3339-validator 0.1.4
rfc3986-validator 0.1.1
rich 13.9.4
rouge 1.0.1
rouge-chinese 1.0.3
rouge_score 0.1.2
rpds-py 0.22.3
ruff 0.8.4
sacrebleu 2.4.3
safehttpx 0.1.6
safetensors 0.4.5
scikit-image 0.25.0
scikit-learn 1.6.0
scipy 1.14.1
seaborn 0.13.2
semantic-version 2.10.0
Send2Trash 1.8.3
sentence-transformers 3.3.1
sentencepiece 0.2.0
sentry-sdk 2.19.2
setproctitle 1.3.4
setuptools 69.5.1
sgmllib3k 1.0.0
shellingham 1.5.4
shtab 1.7.1
simple-ddl-parser 1.7.1
simplejson 3.19.3
six 1.17.0
smmap 5.0.1
sniffio 1.3.1
socksio 1.0.0
sortedcontainers 2.4.0
soupsieve 2.6
SQLAlchemy 2.0.36
sse-starlette 2.2.0
stack-data 0.6.3
starlette 0.41.3
streamlit 1.41.1
sty 1.0.6
sympy 1.13.1
tabulate 0.9.0
tenacity 9.0.0
tensorboard 2.18.0
tensorboard-data-server 0.7.2
termcolor 2.5.0
terminado 0.18.1
threadpoolctl 3.5.0
tifffile 2024.12.12
tiktoken 0.8.0
timeout-decorator 0.5.0
tinycss2 1.4.0
tokenizers 0.21.0
toml 0.10.2
tomli 2.2.1
tomlkit 0.13.2
torch 2.5.1
torchvision 0.20.1
tornado 6.4.2
tqdm 4.67.1
traitlets 5.14.3
transformers 4.47.1
transformers-stream-generator 0.0.5
triton 3.1.0
trl 0.11.4
typeguard 4.4.1
typer 0.15.1
types-python-dateutil 2.9.0.20241206
typing_extensions 4.12.2
typing-inspect 0.9.0
tyro 0.9.4
tzdata 2024.2
unicorn 2.1.1
uri-template 1.3.0
urllib3 2.3.0
uvicorn 0.34.0
validators 0.34.0
wandb 0.19.1
watchdog 6.0.0
wcwidth 0.2.13
webcolors 24.11.1
webdataset 0.2.100
webencodings 0.5.1
websocket-client 1.8.0
websockets 14.1
Werkzeug 3.1.3
wheel 0.45.1
widgetsnbextension 4.0.13
XlsxWriter 3.2.0
xtuner 0.1.23
xxhash 3.5.0
yapf 0.43.0
yarl 1.18.3
zipp 3.21.0
The text was updated successfully, but these errors were encountered: