lr=2e-4
lora_rank=8
lora_alpha=32
lora_trainable="q_proj,v_proj"
#lora_trainable="q_proj,v_proj,k_proj,o_proj,gate_proj,down_proj,up_proj"
#modules_to_save="embed_tokens,lm_head"
lora_dropout=0.05
pretrained_model='/home/ps/workspace/llm/Chinese-LLaMA-Alpaca/output/merged_13b'
chinese_tokenizer_path='/home/ps/workspace/llm/Chinese-LLaMA-Alpaca/output/merged_13b'
dataset_dir='/home/ps/workspace/llm/Chinese-LLaMA-Alpaca/data'
data_cache='/home/ps/workspace/llm/Chinese-LLaMA-Alpaca/cache'
max_steps=5
per_device_train_batch_size=1
per_device_eval_batch_size=1
gradient_accumulation_steps=8
output_dir='/home/ps/workspace/llm/Chinese-LLaMA-Alpaca/output/law_13b'
deepspeed_config_file=ds_zero2_no_offload.json
torchrun --nnodes 1 --nproc_per_node 2 run_clm_pt_with_peft.py \
--deepspeed ${deepspeed_config_file} \
--model_name_or_path ${pretrained_model} \
--tokenizer_name_or_path ${chinese_tokenizer_path} \
--dataset_dir ${dataset_dir} \
--data_cache_dir ${data_cache} \
--validation_split_percentage 0.001 \
--per_device_train_batch_size ${per_device_train_batch_size} \
--per_device_eval_batch_size ${per_device_eval_batch_size} \
--do_train \
--seed $RANDOM \
--fp16 \
--num_train_epochs 1 \
--lr_scheduler_type cosine \
--learning_rate ${lr} \
--warmup_ratio 0.05 \
--weight_decay 0.01 \
--logging_strategy steps \
--logging_steps 10 \
--save_strategy steps \
--save_total_limit 3 \
--save_steps 200 \
--max_steps ${max_steps} \
--gradient_accumulation_steps ${gradient_accumulation_steps} \
--preprocessing_num_workers 8 \
--block_size 512 \
--output_dir ${output_dir} \
--overwrite_output_dir \
--ddp_timeout 30000 \
--logging_first_step True \
--lora_rank ${lora_rank} \
--lora_alpha ${lora_alpha} \
--trainable ${lora_trainable} \
--lora_dropout ${lora_dropout} \
--torch_dtype float16
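The contents of ds_zero2_no_offload.json are not pasted here. Purely as a rough sketch (these are typical ZeRO stage-2 defaults, not necessarily the exact file used in this run), a no-offload ZeRO-2 config that the HF Trainer integration accepts looks roughly like this, with the "auto" values filled in from the TrainingArguments at launch:

```python
import json

# Sketch only: a typical ZeRO stage-2 DeepSpeed config without CPU offload.
# The "auto" placeholders are resolved by the HuggingFace Trainer integration
# from the command-line arguments (batch sizes, grad accumulation, fp16, ...).
ds_zero2_no_offload = {
    "fp16": {"enabled": "auto"},
    "zero_optimization": {
        "stage": 2,
        "allgather_partitions": True,
        "allgather_bucket_size": 1e8,
        "overlap_comm": True,
        "reduce_scatter": True,
        "reduce_bucket_size": 1e8,
        "contiguous_gradients": True,
    },
    "train_batch_size": "auto",
    "train_micro_batch_size_per_gpu": "auto",
    "gradient_accumulation_steps": "auto",
    "gradient_clipping": "auto",
    "steps_per_print": 2000,
    "wall_clock_breakdown": False,
}

with open("ds_zero2_no_offload.json", "w") as f:
    json.dump(ds_zero2_no_offload, f, indent=2)
```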
Dependencies (must be provided for code-related issues)
No response
Run logs or screenshots
OutOfMemoryError: CUDA out of memory. Tried to allocate 136.00 MiB (GPU 0; 23.65 GiB total capacity; 21.17 GiB already allocated; 73.25 MiB free; 21.17 GiB
reserved in total by PyTorch) If reserved memory is >> allocated memory try setting max_split_size_mb to avoid fragmentation. See documentation for Memory
Management and PYTORCH_CUDA_ALLOC_CONF
WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 551195 closing signal SIGTERM
ERROR:torch.distributed.elastic.multiprocessing.api:failed (exitcode: 1) local_rank: 0 (pid: 551194) of binary: /usr/bin/python3
Traceback (most recent call last):
  File "/home/ps/.local/bin/torchrun", line 8, in <module>
    sys.exit(main())
  File "/home/ps/.local/lib/python3.10/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 346, in wrapper
    return f(*args, **kwargs)
  File "/home/ps/.local/lib/python3.10/site-packages/torch/distributed/run.py", line 762, in main
    run(args)
  File "/home/ps/.local/lib/python3.10/site-packages/torch/distributed/run.py", line 753, in run
    elastic_launch(
  File "/home/ps/.local/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 132, in __call__
    return launch_agent(self._config, self._entrypoint, list(args))
  File "/home/ps/.local/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 246, in launch_agent
    raise ChildFailedError(
torch.distributed.elastic.multiprocessing.errors.ChildFailedError:
============================================================
run_clm_pt_with_peft.py FAILED
------------------------------------------------------------
Failures:
<NO_OTHER_FAILURES>
------------------------------------------------------------
Root Cause (first observed failure):
time : 2023-07-08_00:09:34
host : ps.ps
rank : 0 (local_rank: 0)
exitcode : 1 (pid: 551194)
error_file: <N/A>
traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html
@@ -535,6 +533,8 @@ def main():
             revision=model_args.model_revision,
             use_auth_token=True if model_args.use_auth_token else None,
             torch_dtype=torch_dtype,
+            load_in_8bit=True,
+            device_map='auto',
             low_cpu_mem_usage=True
     else:
@@ -558,6 +558,7 @@ def main():
             "- Continue pre-training Chinese Alpaca: 49954 / 49954 \n")
     model.resize_token_embeddings(len(tokenizer))
     if training_args.peft_path is not None:
         logger.info("Peft from pre-trained model")
         model = PeftModel.from_pretrained(model, training_args.peft_path)
@@ -581,11 +582,14 @@ def main():
             modules_to_save=modules_to_save)
         model = get_peft_model(model, peft_config)
         model.print_trainable_parameters()
         old_state_dict = model.state_dict
         model.state_dict = (
             lambda self, *_, **__: get_peft_model_state_dict(self, old_state_dict())
         ).__get__(model, type(model))
+        model = prepare_model_for_int8_training(model)
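For context, the intent of the change above is the usual int8 + LoRA recipe. A minimal standalone sketch of how these pieces normally fit together with peft/transformers (paths and LoRA hyperparameters are copied from the script above; this is not the actual run_clm_pt_with_peft.py code), with prepare_model_for_int8_training applied to the base model before get_peft_model wraps it:

```python
import torch
from transformers import AutoModelForCausalLM
from peft import LoraConfig, TaskType, get_peft_model, prepare_model_for_int8_training

# Sketch only: typical 8-bit base model + LoRA adapters (requires bitsandbytes).
model = AutoModelForCausalLM.from_pretrained(
    "/home/ps/workspace/llm/Chinese-LLaMA-Alpaca/output/merged_13b",
    load_in_8bit=True,      # quantized, frozen base weights
    device_map="auto",
)
model = prepare_model_for_int8_training(model)  # cast norms to fp32, enable input grads

peft_config = LoraConfig(
    task_type=TaskType.CAUSAL_LM,
    r=8,
    lora_alpha=32,
    lora_dropout=0.05,
    target_modules=["q_proj", "v_proj"],
)
model = get_peft_model(model, peft_config)      # only the LoRA weights stay trainable
model.print_trainable_parameters()
```

(Note that device_map="auto" lets accelerate shard the model across devices itself, which is generally not combined with DeepSpeed ZeRO, so this sketch covers the non-DeepSpeed path.)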
However, fine-tuning now fails with the error below (the optimizer hits an index-out-of-range error: IndexError: list index out of range). Any pointers on what might be causing this?
/home/ps/.local/lib/python3.10/site-packages/deepspeed/runtime/zero/stage_1_and_2.py:270 in __init__

    267 │         ), f"allgather_bucket_size must be a multiple of nccl_start_alignment_factor, {s
    268 │
    269 │         self.all_reduce_print = False
  ❱ 270 │         self.dtype = self.optimizer.param_groups[0]['params'][0].dtype
    271 │
    272 │         self.round_robin_bit16_groups = []
    273 │         self.round_robin_bit16_indices = []

IndexError: list index out of range
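For reference, the highlighted line can only raise IndexError when param_groups[0]['params'] is empty, i.e. no trainable parameters ended up in the optimizer's first parameter group. A minimal toy illustration (standalone, not the repo's code):

```python
import torch

# Illustration only: DeepSpeed's `self.optimizer.param_groups[0]['params'][0].dtype`
# fails with IndexError exactly when the first param group holds no tensors,
# e.g. because every parameter was frozen before the optimizer was built.
layer = torch.nn.Linear(4, 4)
groups = [
    {"params": [], "weight_decay": 0.01},                        # empty first group
    {"params": list(layer.parameters()), "weight_decay": 0.0},
]
opt = torch.optim.AdamW(groups, lr=2e-4)
print(opt.param_groups[0]["params"][0].dtype)  # IndexError: list index out of range
```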