---
# Supervised fine-tuning run configuration (stage: sft, finetuning_type: full)
# for the local Qwen3-32B snapshot, trained on one preprocessed
# ShareGPT-formatted trace dataset.
# NOTE(review): key names appear to follow the LLaMA-Factory training-argument
# schema (see the sft/lf_configs/ DeepSpeed path) - confirm against the launcher.
# Keys are kept in the original (mostly alphabetical) order for easy diffing.

adam_beta2: 0.98
# Dataset field/tag names: each conversation lives under `messages`, and every
# turn is identified by `role_tag` / `content_tag` with the role values below.
assistant_tag: assistant
attn: fa2
bf16: true
content_tag: content
cutoff_len: 32768
dataloader_num_workers: 4
dataloader_persistent_workers: true
dataloader_pin_memory: true
dataset: /e/scratch/jureap59/raoof1/sft_data/hf_hub/datasets--DCAgent--g1_min_episodes_e1_gpt_long_top8_glm47_traces/snapshots/9828cc7d5cb31c19ed7e6dead76bd24dc2d66262_thinking_preprocessed
dataset_dir: ONLINE
datasets_cache_dir: /e/scratch/jureap59/raoof1/sft_data/arrow_cache
# NOTE(review): presumably seconds, i.e. an effectively unlimited collective
# timeout - confirm the unit with the trainer documentation.
ddp_timeout: 180000000
deepspeed: sft/lf_configs/deepspeed/ds_z3_accelerate.json
do_train: true
enable_liger_kernel: true
finetuning_type: full
formatting: sharegpt
gradient_accumulation_steps: 1
gradient_checkpointing: true
# hub_model_id is set, but push_to_hub below is false, so nothing is uploaded
# unless that flag is flipped.
hub_model_id: DCAgent/g1_gptlong_top8_32b
include_mfu: true
learning_rate: 4.0e-05
load_best_model_at_end: false
logging_steps: 5
logging_strategy: steps
lr_scheduler_type: cosine
# NOTE(review): 0.001 is three orders of magnitude below the usual clipping
# threshold of 1.0; confirm this aggressive clipping is intentional.
max_grad_norm: 0.001
messages: conversations
model_name_or_path: /e/scratch/jureap59/raoof1/sft_data/hf_hub/models--Qwen--Qwen3-32B/snapshots/9216db5781bf21249d130ec9da846c4624c16137
num_train_epochs: 5.0
optim: adamw_torch_fused
output_dir: /e/scratch/jureap59/raoof1/sft_data/checkpoints/sft_g1_gptlong_top8_32b__Qwen3-32B
overwrite_cache: true
per_device_train_batch_size: 1
plot_loss: true
preprocessing_num_workers: 16
pure_bf16: false
push_to_hub: false
role_tag: role
# Run label names the same base model as model_name_or_path and output_dir
# (Qwen3-32B) so experiment tracking is not mislabelled.
run_name: g1_gptlong_top8_32b__Qwen3-32B
# Checkpoint every 300 steps; only the most recent checkpoint is retained.
save_steps: 300
save_strategy: steps
save_total_limit: 1
seed: 42
stage: sft
template: qwen3
trust_remote_code: true
user_tag: user
warmup_ratio: 0.1
weight_decay: 0.04
disable_shuffling: true