Reminder
System Info
Single machine with 8× A800 GPUs
CUDA 12.4
Reproduction
Parameter configuration:
flash_attn: fa2
use_unsloth_gc: true
enable_liger_kernel: true
weight_decay: 0.0001
optim: adamw_torch
per_device_train_batch_size: 4
gradient_accumulation_steps: 1
gradient_checkpointing: true
learning_rate: 5.0e-05
num_train_epochs: 3
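For context on the failing code path: with use_unsloth_gc: true, LLaMA-Factory swaps in a custom gradient-checkpointing autograd function that offloads each layer's input hidden states to CPU during the forward pass (the hidden_states.to("cpu", non_blocking=True) call at checkpointing.py line 55 in the traceback below) and restores them during backward. The following is a minimal, hypothetical sketch of that pattern, not the actual LLaMA-Factory source; it assumes a CUDA device, assumes the layer function returns a single tensor, and uses an explicitly pinned host buffer, since a non-blocking device-to-host copy is only truly asynchronous when the destination is page-locked:

```python
import torch

class OffloadedCheckpoint(torch.autograd.Function):
    """Hypothetical sketch of CPU-offloaded gradient checkpointing in the
    spirit of use_unsloth_gc; this is NOT the actual LLaMA-Factory code."""

    @staticmethod
    def forward(ctx, forward_fn, hidden_states, *args):
        # Stash the activations in a pinned (page-locked) host buffer: only
        # then is a non_blocking device-to-host copy genuinely asynchronous;
        # into pageable memory it silently degrades to a synchronous copy.
        saved = torch.empty(
            hidden_states.shape, dtype=hidden_states.dtype, pin_memory=True
        )
        saved.copy_(hidden_states, non_blocking=True)
        ctx.save_for_backward(saved)
        ctx.forward_fn, ctx.extra_args = forward_fn, args
        # autograd.Function.forward runs with grad disabled, so no graph is
        # built here; it is rebuilt during backward.
        return forward_fn(hidden_states, *args)

    @staticmethod
    def backward(ctx, grad_output):
        (saved,) = ctx.saved_tensors
        # Restore the activations to the GPU and recompute the forward pass
        # under grad so the layer's parameters receive gradients.
        hidden_states = saved.to("cuda", non_blocking=True).detach()
        hidden_states.requires_grad_(True)
        with torch.enable_grad():
            output = ctx.forward_fn(hidden_states, *ctx.extra_args)
        torch.autograd.backward(output, grad_output)
        return (None, hidden_states.grad) + (None,) * len(ctx.extra_args)
```

Note that a plain .to("cpu", non_blocking=True) into pageable memory normally falls back to a synchronous copy rather than raising, which hints that the "invalid argument" below may be an earlier asynchronous kernel failure surfacing at this call (see the debugging note after the traceback).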
1%|▊ | 53/8790 [16:19<42:25:50, 17.48s/it]
[rank7]: Traceback (most recent call last):
[rank7]: File "/data/private/wusq97/LLaMA-Factory/src/llamafactory/launcher.py", line 23, in
[rank7]: launch()
[rank7]: File "/data/private/wusq97/LLaMA-Factory/src/llamafactory/launcher.py", line 19, in launch
[rank7]: run_exp()
[rank7]: File "/data/private/wusq97/LLaMA-Factory/src/llamafactory/train/tuner.py", line 93, in run_exp
[rank7]: _training_function(config={"args": args, "callbacks": callbacks})
[rank7]: File "/data/private/wusq97/LLaMA-Factory/src/llamafactory/train/tuner.py", line 67, in _training_function
[rank7]: run_sft(model_args, data_args, training_args, finetuning_args, generating_args, callbacks)
[rank7]: File "/data/private/wusq97/LLaMA-Factory/src/llamafactory/train/sft/workflow.py", line 102, in run_sft
[rank7]: train_result = trainer.train(resume_from_checkpoint=training_args.resume_from_checkpoint)
[rank7]: File "/data/private/wusq97/ds/lib/python3.10/site-packages/transformers/trainer.py", line 2241, in train
[rank7]: return inner_training_loop(
[rank7]: File "/data/private/wusq97/ds/lib/python3.10/site-packages/transformers/trainer.py", line 2548, in _inner_training_loop
[rank7]: tr_loss_step = self.training_step(model, inputs, num_items_in_batch)
[rank7]: File "/data/private/wusq97/ds/lib/python3.10/site-packages/transformers/trainer.py", line 3698, in training_step
[rank7]: loss = self.compute_loss(model, inputs, num_items_in_batch=num_items_in_batch)
[rank7]: File "/data/private/wusq97/ds/lib/python3.10/site-packages/transformers/trainer.py", line 3759, in compute_loss
[rank7]: outputs = model(**inputs)
[rank7]: File "/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py", line 1736, in _wrapped_call_impl
[rank7]: return self._call_impl(*args, **kwargs)
[rank7]: File "/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py", line 1747, in _call_impl
[rank7]: return forward_call(*args, **kwargs)
[rank7]: File "/usr/local/lib/python3.10/dist-packages/deepspeed/utils/nvtx.py", line 18, in wrapped_fn
[rank7]: ret_val = func(*args, **kwargs)
[rank7]: File "/usr/local/lib/python3.10/dist-packages/deepspeed/runtime/engine.py", line 1899, in forward
[rank7]: loss = self.module(*inputs, **kwargs)
[rank7]: File "/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py", line 1736, in _wrapped_call_impl
[rank7]: return self._call_impl(*args, **kwargs)
[rank7]: File "/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py", line 1844, in _call_impl
[rank7]: return inner()
[rank7]: File "/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py", line 1790, in inner
[rank7]: result = forward_call(*args, **kwargs)
[rank7]: File "/data/private/wusq97/ds/lib/python3.10/site-packages/liger_kernel/transformers/model/qwen2.py", line 183, in lce_forward
[rank7]: outputs = self.model(
[rank7]: File "/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py", line 1736, in _wrapped_call_impl
[rank7]: return self._call_impl(*args, **kwargs)
[rank7]: File "/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py", line 1844, in _call_impl
[rank7]: return inner()
[rank7]: File "/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py", line 1790, in inner
[rank7]: result = forward_call(*args, **kwargs)
[rank7]: File "/data/private/wusq97/ds/lib/python3.10/site-packages/transformers/models/qwen2/modeling_qwen2.py", line 567, in forward
[rank7]: layer_outputs = self._gradient_checkpointing_func(
[rank7]: File "/data/private/wusq97/LLaMA-Factory/src/llamafactory/model/model_utils/checkpointing.py", line 97, in custom_gradient_checkpointing_func
[rank7]: return gradient_checkpointing_func(func, *args, **kwargs)
[rank7]: File "/usr/local/lib/python3.10/dist-packages/torch/autograd/function.py", line 575, in apply
[rank7]: return super().apply(*args, **kwargs) # type: ignore[misc]
[rank7]: File "/usr/local/lib/python3.10/dist-packages/torch/amp/autocast_mode.py", line 465, in decorate_fwd
[rank7]: return fwd(*args, **kwargs)
[rank7]: File "/data/private/wusq97/LLaMA-Factory/src/llamafactory/model/model_utils/checkpointing.py", line 55, in forward
[rank7]: saved_hidden_states = hidden_states.to("cpu", non_blocking=True)
[rank7]: RuntimeError: CUDA error: invalid argument
[rank7]: Compile with TORCH_USE_CUDA_DSA to enable device-side assertions.
W0305 14:37:46.855000 79247 torch/distributed/elastic/multiprocessing/api.py:897] Sending process 79312 closing signal SIGTERM
W0305 14:37:46.856000 79247 torch/distributed/elastic/multiprocessing/api.py:897] Sending process 79313 closing signal SIGTERM
W0305 14:37:46.857000 79247 torch/distributed/elastic/multiprocessing/api.py:897] Sending process 79314 closing signal SIGTERM
W0305 14:37:46.857000 79247 torch/distributed/elastic/multiprocessing/api.py:897] Sending process 79315 closing signal SIGTERM
W0305 14:37:46.858000 79247 torch/distributed/elastic/multiprocessing/api.py:897] Sending process 79316 closing signal SIGTERM
W0305 14:37:46.858000 79247 torch/distributed/elastic/multiprocessing/api.py:897] Sending process 79317 closing signal SIGTERM
W0305 14:37:46.859000 79247 torch/distributed/elastic/multiprocessing/api.py:897] Sending process 79318 closing signal SIGTERM
E0305 14:38:01.780000 79247 torch/distributed/elastic/multiprocessing/api.py:869] failed (exitcode: 1) local_rank: 7 (pid: 79319) of binary: /data/private/wusq97/ds/bin/python
Traceback (most recent call last):
File "/usr/local/bin/torchrun", line 8, in
sys.exit(main())
File "/usr/local/lib/python3.10/dist-packages/torch/distributed/elastic/multiprocessing/errors/init.py", line 355, in wrapper
return f(*args, **kwargs)
File "/usr/local/lib/python3.10/dist-packages/torch/distributed/run.py", line 919, in main
run(args)
File "/usr/local/lib/python3.10/dist-packages/torch/distributed/run.py", line 910, in run
elastic_launch(
File "/usr/local/lib/python3.10/dist-packages/torch/distributed/launcher/api.py", line 138, in call
return launch_agent(self._config, self._entrypoint, list(args))
File "/usr/local/lib/python3.10/dist-packages/torch/distributed/launcher/api.py", line 269, in launch_agent
raise ChildFailedError(
torch.distributed.elastic.multiprocessing.errors.ChildFailedError:
/data/private/wusq97/LLaMA-Factory/src/llamafactory/launcher.py FAILED
Failures:
<NO_OTHER_FAILURES>
Root Cause (first observed failure):
[0]:
time : 2025-03-05_14:37:46
host : mnbook-linsy38-deepseek-llm-2.mnbook-linsy38-deepseek-llm-nohead.aios.svc.cluster.local
rank : 7 (local_rank: 7)
exitcode : 1 (pid: 79319)
error_file: <N/A>
traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html
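Because CUDA reports kernel errors asynchronously, the frame blamed above (the CPU offload copy) is not necessarily where the failure originated. One way to localize the real failing call, at the cost of much slower execution, is to force synchronous kernel launches for a single debugging run; a minimal sketch:

```python
import os

# CUDA_LAUNCH_BLOCKING must be set before the first CUDA call, so setting it
# before importing torch is the safest place. With synchronous launches, the
# Python frame shown in the traceback matches the kernel that actually failed.
os.environ["CUDA_LAUNCH_BLOCKING"] = "1"

import torch  # noqa: E402
```

Equivalently, export CUDA_LAUNCH_BLOCKING=1 in the shell before launching torchrun. As a quick bisection, disabling use_unsloth_gc (falling back to stock gradient checkpointing) would show whether the offload path itself is implicated.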
Others
No response