#!/bin/bash
#
# Launch a single-node supervised fine-tuning (SFT) run of train_sft.py
# through torchrun, with DeepSpeed enabled.
#
# Relative paths (train_sft.py, ./ds_config.json, ./output_sft) are resolved
# against the current working directory, so run this script from the
# directory that contains them.
#
# To change the model, dataset, or batch shape, edit the configuration
# section below.

set -euo pipefail

# ---------------------------------------------------------------------------
# Paths
# ---------------------------------------------------------------------------
readonly MODEL_PATH="/model/BitCPM-CANN-1B-unquantized"
readonly DATA_PATH="/dataset/HuggingFaceH4_ultrachat_200k/data/train_sft-00000-of-00003-a3ecf92756993583.parquet"
readonly OUTPUT_DIR="./output_sft"
readonly DS_CONFIG="./ds_config.json"

# ---------------------------------------------------------------------------
# Parallelism and batch shape
# ---------------------------------------------------------------------------
# NUM_GPUS is the number of worker processes torchrun starts on this node.
# Keep it consistent with the device lists exported below.
readonly NUM_GPUS=8
readonly BATCH_SIZE_PER_GPU=2
readonly GRAD_ACCUM_STEPS=1
readonly MAX_SEQ_LENGTH=8192

# ---------------------------------------------------------------------------
# Device visibility
# ---------------------------------------------------------------------------
# The same eight device indices are exposed under both the Ascend and the
# CUDA variable names.
export ASCEND_RT_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
# NOTE(review): presumably tells DeepSpeed to skip its CUDA version check,
# which would otherwise fail on a non-CUDA (Ascend) host -- confirm.
export DS_SKIP_CUDA_CHECK=1

# ---------------------------------------------------------------------------
# Training arguments
# ---------------------------------------------------------------------------
# Built as an array so every value is passed as exactly one argument,
# regardless of spaces or glob characters in the configured paths.
train_args=(
  --model_name_or_path "$MODEL_PATH"
  --data_path "$DATA_PATH"
  --max_seq_length "$MAX_SEQ_LENGTH"
  --output_dir "$OUTPUT_DIR"
  --per_device_train_batch_size "$BATCH_SIZE_PER_GPU"
  --gradient_accumulation_steps "$GRAD_ACCUM_STEPS"
  --max_steps 100
  --learning_rate 2e-5
  --lr_scheduler_type cosine
  --warmup_ratio 0.2
  --weight_decay 0.0
  --logging_steps 2
  # NOTE(review): save_steps (500) is larger than max_steps (100). If these
  # flags follow Hugging Face Trainer semantics, no periodic checkpoint is
  # written during the run -- confirm train_sft.py saves the final model.
  --save_steps 500
  --save_total_limit 3
  --bf16
  --deepspeed "$DS_CONFIG"
  --gradient_checkpointing
  --seed 42
  --dataloader_num_workers 4
  --report_to tensorboard
  --logging_dir /data/tensorboard/sft
  --train_on_prompt false
  # JSON literal; single-quoted so the shell passes it through untouched.
  --gradient_checkpointing_kwargs '{"use_reentrant": false}'
)

# As the last command, torchrun's exit status becomes the script's status.
torchrun --nproc_per_node="$NUM_GPUS" train_sft.py "${train_args[@]}"