Qwen2.5-3B-ReMax-math-reaso…/training_config.yaml

# Settings for the fine-tuning (trainer) side of the run.
# Values written as ${..name} look like OmegaConf-style relative interpolations
# pointing at the same-named top-level key of this file (model_path,
# fp32_lm_head, fp32_layer_prefix, force_restart, output_dir, seed,
# pop_old_data) -- TODO confirm against the config loader.
finetune:
  data: null
  model_class: causal-language-modeling
  # Points at the top-level model_path (Qwen/Qwen2.5-3B in this file).
  config_name: ${..model_path}
  optim: adamw_torch
  load_as_bf16: true
  fp32_lm_head: ${..fp32_lm_head}
  fp32_layer_prefix: ${..fp32_layer_prefix}
  # Both attention switches below are set; presumably they must agree -- confirm.
  use_flash_attention: true
  attn_implementation: flash_attention_2
  auto_device_map: false
  # LoRA adapter settings. `enabled` is false, so the remaining keys are
  # presumably inert for this run -- confirm.
  lora:
    enabled: false
    task_type: CAUSAL_LM
    base_model_8bit: false
    base_model_4bit: false
    r: 16
    alpha: 16
    dropout: 0.05
    bias: none
    target_modules: []
  force_restart: ${..force_restart}
  resume_dataloader: false
  # Optimisation hyperparameters.
  # train_batch_size (2) x gradient_accumulation_passes (128) = 256; presumably
  # the number of samples per optimizer step per process -- confirm.
  train_batch_size: 2
  valid_batch_size: 4
  weight_decay: 0.01
  learning_rate: 1.0e-06
  gradient_clipping_threshold: 0.3
  lr_scheduler_type: cosine
  num_warmup_steps: 25
  gradient_accumulation_passes: 128
  gradient_checkpointing: true
  reentrant_checkpointing: false
  max_train_steps: 1500
  interrupt_train_steps: -1
  max_eval_steps: -1
  # Same value as vllm_config.vllm_kwargs.max_model_len (8192) in this file.
  seq_length: 8192
  seq_packing: true
  # Resolves under the top-level output_dir, with a `finetune` suffix.
  output_dir: ${..output_dir}/finetune
  seed: ${..seed}
  save_checkpoint_steps: 100
  keep_intermediate_checkpoints: true
  trust_remote_code: false
  cuda_empty_cache: true
  sft_config_name: null
  n_examples: 0
  log_each_n_steps: 1
  also_save_steps: []
  use_safetensors: true
  save_final_training_state: true
  seq_parallel: 1
  objective: rl
  # Same name as preprocess.output (training_data).
  input: training_data
  send_weight_updates: true
  queue_size: 32
  # NOTE(review): max_lag and attempts are literals here (null / 8), whereas the
  # top-level keys of the same names are null / 16. Unlike pop_old_data they are
  # not interpolated -- confirm the 8 vs 16 difference is intended.
  max_lag: null
  weight_update_interval: 1
  pop_old_data: ${..pop_old_data}
  attempts: 8
  # `_target_` is a dotted path; presumably instantiated Hydra-style -- confirm.
  eval_callback:
    _target_: pipelinerl.finetune.utils.dummy_eval_callback
    config_name: ''
  # RL objective settings. kl_coef, final_kl_coef, entropy_bonus and
  # reward_minus_kl_coef are all 0.0; epsilon_low and epsilon_high are both 0.2.
  rl:
    policy_loss: ppo
    divide_advantage_by_std: false
    kl_coef: 0.0
    final_kl_coef: 0.0
    entropy_bonus: 0.0
    reward_minus_kl_coef: 0.0
    epsilon_low: 0.2
    epsilon_high: 0.2
    use_advantages: true
    relu_log_p_weights: false
    clamp_log_ratio_ref_new_value: 5
    # Three dots: presumably climbs from `rl` past `finetune` to the root, i.e.
    # llm.parameters.temperature (1.0 in this file) -- confirm.
    temperature: ${...llm.parameters.temperature}
    aggregate_loss: sum
    overlong_filtering: false
    adv_estimator: remax
    gamma: 1.0
    filter_zero_advantage_groups: false
# Reward table keyed by <answer outcome>_<finished | not_finished>.
# Both correct_answer_* entries are 1.0 and every other entry is 0, so in this
# configuration the finished / not_finished distinction does not change the
# reward value.
# NOTE(review): the zero entries are integers (0) while the correct entries are
# floats (1.0) -- confirm the consumer does not depend on the type.
rewards:
  correct_answer_finished: 1.0
  correct_answer_not_finished: 1.0
  wrong_answer_finished: 0
  wrong_answer_not_finished: 0
  no_answer_finished: 0
  no_answer_not_finished: 0
  unparsable_finished: 0
  unparsable_not_finished: 0
# Stream backend selection; `files` presumably means file-backed streams
# between jobs -- confirm.
streams:
  backend: files
# Top-level values referenced from `finetune` via ${..seed}, ${..fp32_lm_head}
# and ${..fp32_layer_prefix}.
seed: 42
fp32_lm_head: false
fp32_layer_prefix: lm_head
# Settings for the actor (rollout generation) job.
actor:
  log_each_n_secs: 0
  llm_max_rollouts: 256
  rollout_workers: 1
  discount_factor: 1
  # Queue sizes and the shared-memory entry size (10000000 here; the
  # preprocess section uses 100000000).
  problem_queue_size: 256
  result_queue_size: 256
  throughput_window_size: 50
  shared_memory_entry_size: 10000000
  # Dotted path to the rollout function; presumably imported by name -- confirm.
  rollout_policy: pipelinerl.domains.math.generate_math_rollout
  # Single-quoted so the backslash and braces in \boxed{} are always literal:
  # single quotes do no escape processing, and the value stays valid if it is
  # ever moved into a flow collection or re-emitted by a tool. The parsed string
  # is identical to the previous unquoted form.
  system_prompt: 'Please reason step by step, and put your final answer within \boxed{}.'
  # `{task}` looks like a format placeholder filled with the problem text --
  # confirm against the rollout code.
  task_template: '{task}'
  task_prompt: ''
# Top-level `environment` is null; environments are listed under the separate
# `environments` key in this file.
environment: null
# Settings for the preprocessor job: `input` is named actor and `output` is
# named training_data, the same name finetune.input uses.
preprocess:
  input: actor
  output: training_data
  n_workers: 8
  chunk_n_groups: 2
  # Queue and buffer sizing.
  raw_queue_size: 8
  input_queue_size: 32
  output_queue_size: 32
  dataset_buffer_size: 0
  ring_buffer_size: 128
  max_ready_samples_per_lead: 64
  # Presumably resolves to the top-level pop_old_data (true in this file) --
  # confirm.
  pop_old_data: ${..pop_old_data}
  # Ten times actor.shared_memory_entry_size (10000000).
  shared_memory_entry_size: 100000000
  log_every_n_samples: 128
# Sampling parameters for the `llm` entry. llm.parameters.temperature is the
# value referenced by finetune.rl.temperature.
llm:
  parameters:
    max_tokens: 4096
    temperature: 1.0
# Sampling parameters for the `test_llm` entry: same max_tokens and temperature
# as `llm`, plus top_p and top_k.
test_llm:
  parameters:
    max_tokens: 4096
    temperature: 1.0
    top_p: 0.95
    top_k: 50
# vLLM inference server configuration.
vllm_config:
  use_v1: false
  quantization: null
  # Keys are spelled like command-line options. Entries whose value is ''
  # presumably become value-less flags -- confirm against the launcher.
  vllm_kwargs:
    dtype: bfloat16
    gpu-memory-utilization: 0.92
    max-num-seqs: 64
    max-num-batched-tokens: 16384
    enable-chunked-prefill: ''
    return-tokens-as-token-ids: ''
    tensor-parallel-size: 1
    pipeline-parallel-size: 1
    generation-config: vllm
    # NOTE(review): this is the only key here spelled with underscores; every
    # sibling uses hyphens. Verify the consumer accepts both spellings.
    # Same value as finetune.seq_length (8192).
    max_model_len: 8192
    num-scheduler-steps: 8
    disable-log-requests: ''
    disable-frontend-multiprocessing: ''
# Resource layout for the run.
# actor_fraction : finetune_fraction is 6 : 2 (3 : 1), which matches the jobs
# list in this file: three actor_llm jobs with one GPU each and one finetune job
# with one GPU. preprocessor_fraction is 0 and the preprocessor job has no GPUs.
world:
  replicas: 1
  actor_fraction: 6
  preprocessor_fraction: 0
  finetune_fraction: 2
  env_replicas: 1
  actor_group_port: 9000
  # Same port as the single `environment` job in the jobs list (7777).
  environment_start_port: 7777
# Job table: seven entries with idx 0-6, all on node_rank 0 / localhost.
#   idx 0-2  actor_llm     GPUs 0, 1, 2; ports 8080-8082
#   idx 3    actor         no GPUs
#   idx 4    preprocessor  no GPUs
#   idx 5    environment   port 7777, environment_key math
#   idx 6    finetune      GPU 3
# Presumably generated by the launcher from `world` rather than hand-written --
# confirm before editing by hand.
jobs:
# Inference server 0 (GPU 0).
- kind: actor_llm
  idx: 0
  replica_idx: 0
  local_idx: 0
  node_rank: 0
  hostname: localhost
  port: 8080
  gpus:
  - 0
  url: http://localhost:8080
  environment_key: null
  environment_index: null
# Inference server 1 (GPU 1).
- kind: actor_llm
  idx: 1
  replica_idx: 1
  local_idx: 1
  node_rank: 0
  hostname: localhost
  port: 8081
  gpus:
  - 1
  url: http://localhost:8081
  environment_key: null
  environment_index: null
# Inference server 2 (GPU 2).
- kind: actor_llm
  idx: 2
  replica_idx: 2
  local_idx: 2
  node_rank: 0
  hostname: localhost
  port: 8082
  gpus:
  - 2
  url: http://localhost:8082
  environment_key: null
  environment_index: null
# Actor job: no port, no GPUs.
- kind: actor
  idx: 3
  replica_idx: 0
  local_idx: 0
  node_rank: 0
  hostname: localhost
  port: null
  gpus: []
  url: ''
  environment_key: null
  environment_index: null
# Preprocessor job: no port, no GPUs.
- kind: preprocessor
  idx: 4
  replica_idx: 0
  local_idx: 0
  node_rank: 0
  hostname: localhost
  port: null
  gpus: []
  url: ''
  environment_key: null
  environment_index: null
# Environment job: the only entry with a non-null environment_key / index.
- kind: environment
  idx: 5
  replica_idx: 0
  local_idx: 0
  node_rank: 0
  hostname: localhost
  port: 7777
  gpus: []
  url: ''
  environment_key: math
  environment_index: 0
# Fine-tuning job (GPU 3).
- kind: finetune
  idx: 6
  replica_idx: 0
  local_idx: 0
  node_rank: 0
  hostname: localhost
  port: null
  gpus:
  - 3
  url: ''
  environment_key: null
  environment_index: null
# Top-level run settings. Several are referenced from other sections through
# ${..name} interpolations (model_path, output_dir, force_restart,
# pop_old_data).
eval_every_n_versions: 78000
model_path: Qwen/Qwen2.5-3B
accelerate_config: null
# DeepSpeed is enabled with the config named deepspeed_stage3_bf16.
use_deepspeed: true
deepspeed_config: deepspeed_stage3_bf16
# use_fsdp is false, so the fsdp block below is presumably unused -- confirm.
use_fsdp: false
fsdp:
  param_dtype: fp32
  reduce_dtype: fp32
  buffer_dtype: fp32
# The last path component is the same string as wandb.wandb_group.
output_dir: results/qwen2.5_3b_remax_3a1f_4xh100_214753
force_restart: false
pop_old_data: true
# NOTE(review): finetune.max_lag / finetune.attempts are separate literals
# (null / 8) and do not reference these two keys -- confirm which value each
# consumer reads.
max_lag: null
attempts: 16
train_subset: null
# Debug switches. `mode` is the empty string and `streams_from` is null,
# presumably meaning no debug mode is active -- confirm.
debug:
  mode: ''
  streams_from: null
  place_inference_workers: true
  use_existing_llms: false
# Per-process identity. job_idx is null here; presumably it is filled in with an
# index into `jobs` when a job is launched -- confirm.
me:
  job_idx: null
# Weights & Biases logging settings.
wandb:
  use_wandb: true
  # Presumably: a failed init is tolerated rather than aborting the run, with
  # init_timeout being a number of seconds -- confirm.
  fail_on_init_error: false
  init_timeout: 120
  wandb_id: null
  wandb_name: null
  wandb_entity_name: jaygala24-team
  wandb_project_name: rl-post-training
  wandb_resume: always
  wandb_use_basename: true
  # Same string as the first path component of the top-level output_dir.
  wandb_workspace_root: results
  # Same string as the last path component of the top-level output_dir.
  wandb_group: qwen2.5_3b_remax_3a1f_4xh100_214753
  wandb_dir: null
  tags: []
# Environment definitions. The single entry's key (math) matches both the
# top-level environment_key below and the environment job's environment_key.
environments:
- key: math
  mode: remote
  _target_: pipelinerl.domains.math.MathEnvironment
environment_key: math
# Dotted path to the dataset loader; presumably imported by name -- confirm.
dataset_loader: pipelinerl.domains.math.load_datasets
# Dataset names; presumably resolved by dataset_loader -- confirm.
train_dataset_names:
- gsm8k_train
- math_train
test_dataset_names:
- gsm8k_test
- math_500
初始化项目，由ModelHub XC社区提供模型
Model: jaygala24/Qwen2.5-3B-ReMax-math-reasoning
Source: Original Platform
2026-04-22 01:41:44 +08:00
			`finetune:`
			`data: null`
			`model_class: causal-language-modeling`
			`config_name: ${..model_path}`
			`optim: adamw_torch`
			`load_as_bf16: true`
			`fp32_lm_head: ${..fp32_lm_head}`
			`fp32_layer_prefix: ${..fp32_layer_prefix}`
			`use_flash_attention: true`
			`attn_implementation: flash_attention_2`
			`auto_device_map: false`
			`lora:`
			`enabled: false`
			`task_type: CAUSAL_LM`
			`base_model_8bit: false`
			`base_model_4bit: false`
			`r: 16`
			`alpha: 16`
			`dropout: 0.05`
			`bias: none`
			`target_modules: []`
			`force_restart: ${..force_restart}`
			`resume_dataloader: false`
			`train_batch_size: 2`
			`valid_batch_size: 4`
			`weight_decay: 0.01`
			`learning_rate: 1.0e-06`
			`gradient_clipping_threshold: 0.3`
			`lr_scheduler_type: cosine`
			`num_warmup_steps: 25`
			`gradient_accumulation_passes: 128`
			`gradient_checkpointing: true`
			`reentrant_checkpointing: false`
			`max_train_steps: 1500`
			`interrupt_train_steps: -1`
			`max_eval_steps: -1`
			`seq_length: 8192`
			`seq_packing: true`
			`output_dir: ${..output_dir}/finetune`
			`seed: ${..seed}`
			`save_checkpoint_steps: 100`
			`keep_intermediate_checkpoints: true`
			`trust_remote_code: false`
			`cuda_empty_cache: true`
			`sft_config_name: null`
			`n_examples: 0`
			`log_each_n_steps: 1`
			`also_save_steps: []`
			`use_safetensors: true`
			`save_final_training_state: true`
			`seq_parallel: 1`
			`objective: rl`
			`input: training_data`
			`send_weight_updates: true`
			`queue_size: 32`
			`max_lag: null`
			`weight_update_interval: 1`
			`pop_old_data: ${..pop_old_data}`
			`attempts: 8`
			`eval_callback:`
			`_target_: pipelinerl.finetune.utils.dummy_eval_callback`
			`config_name: ''`
			`rl:`
			`policy_loss: ppo`
			`divide_advantage_by_std: false`
			`kl_coef: 0.0`
			`final_kl_coef: 0.0`
			`entropy_bonus: 0.0`
			`reward_minus_kl_coef: 0.0`
			`epsilon_low: 0.2`
			`epsilon_high: 0.2`
			`use_advantages: true`
			`relu_log_p_weights: false`
			`clamp_log_ratio_ref_new_value: 5`
			`temperature: ${...llm.parameters.temperature}`
			`aggregate_loss: sum`
			`overlong_filtering: false`
			`adv_estimator: remax`
			`gamma: 1.0`
			`filter_zero_advantage_groups: false`
			`rewards:`
			`correct_answer_finished: 1.0`
			`correct_answer_not_finished: 1.0`
			`wrong_answer_finished: 0`
			`wrong_answer_not_finished: 0`
			`no_answer_finished: 0`
			`no_answer_not_finished: 0`
			`unparsable_finished: 0`
			`unparsable_not_finished: 0`
			`streams:`
			`backend: files`
			`seed: 42`
			`fp32_lm_head: false`
			`fp32_layer_prefix: lm_head`
			`actor:`
			`log_each_n_secs: 0`
			`llm_max_rollouts: 256`
			`rollout_workers: 1`
			`discount_factor: 1`
			`problem_queue_size: 256`
			`result_queue_size: 256`
			`throughput_window_size: 50`
			`shared_memory_entry_size: 10000000`
			`rollout_policy: pipelinerl.domains.math.generate_math_rollout`
			`system_prompt: Please reason step by step, and put your final answer within \boxed{}.`
			`task_template: '{task}'`
			`task_prompt: ''`
			`environment: null`
			`preprocess:`
			`input: actor`
			`output: training_data`
			`n_workers: 8`
			`chunk_n_groups: 2`
			`raw_queue_size: 8`
			`input_queue_size: 32`
			`output_queue_size: 32`
			`dataset_buffer_size: 0`
			`ring_buffer_size: 128`
			`max_ready_samples_per_lead: 64`
			`pop_old_data: ${..pop_old_data}`
			`shared_memory_entry_size: 100000000`
			`log_every_n_samples: 128`
			`llm:`
			`parameters:`
			`max_tokens: 4096`
			`temperature: 1.0`
			`test_llm:`
			`parameters:`
			`max_tokens: 4096`
			`temperature: 1.0`
			`top_p: 0.95`
			`top_k: 50`
			`vllm_config:`
			`use_v1: false`
			`quantization: null`
			`vllm_kwargs:`
			`dtype: bfloat16`
			`gpu-memory-utilization: 0.92`
			`max-num-seqs: 64`
			`max-num-batched-tokens: 16384`
			`enable-chunked-prefill: ''`
			`return-tokens-as-token-ids: ''`
			`tensor-parallel-size: 1`
			`pipeline-parallel-size: 1`
			`generation-config: vllm`
			`max_model_len: 8192`
			`num-scheduler-steps: 8`
			`disable-log-requests: ''`
			`disable-frontend-multiprocessing: ''`
			`world:`
			`replicas: 1`
			`actor_fraction: 6`
			`preprocessor_fraction: 0`
			`finetune_fraction: 2`
			`env_replicas: 1`
			`actor_group_port: 9000`
			`environment_start_port: 7777`
			`jobs:`
			`- kind: actor_llm`
			`idx: 0`
			`replica_idx: 0`
			`local_idx: 0`
			`node_rank: 0`
			`hostname: localhost`
			`port: 8080`
			`gpus:`
			`- 0`
			`url: http://localhost:8080`
			`environment_key: null`
			`environment_index: null`
			`- kind: actor_llm`
			`idx: 1`
			`replica_idx: 1`
			`local_idx: 1`
			`node_rank: 0`
			`hostname: localhost`
			`port: 8081`
			`gpus:`
			`- 1`
			`url: http://localhost:8081`
			`environment_key: null`
			`environment_index: null`
			`- kind: actor_llm`
			`idx: 2`
			`replica_idx: 2`
			`local_idx: 2`
			`node_rank: 0`
			`hostname: localhost`
			`port: 8082`
			`gpus:`
			`- 2`
			`url: http://localhost:8082`
			`environment_key: null`
			`environment_index: null`
			`- kind: actor`
			`idx: 3`
			`replica_idx: 0`
			`local_idx: 0`
			`node_rank: 0`
			`hostname: localhost`
			`port: null`
			`gpus: []`
			`url: ''`
			`environment_key: null`
			`environment_index: null`
			`- kind: preprocessor`
			`idx: 4`
			`replica_idx: 0`
			`local_idx: 0`
			`node_rank: 0`
			`hostname: localhost`
			`port: null`
			`gpus: []`
			`url: ''`
			`environment_key: null`
			`environment_index: null`
			`- kind: environment`
			`idx: 5`
			`replica_idx: 0`
			`local_idx: 0`
			`node_rank: 0`
			`hostname: localhost`
			`port: 7777`
			`gpus: []`
			`url: ''`
			`environment_key: math`
			`environment_index: 0`
			`- kind: finetune`
			`idx: 6`
			`replica_idx: 0`
			`local_idx: 0`
			`node_rank: 0`
			`hostname: localhost`
			`port: null`
			`gpus:`
			`- 3`
			`url: ''`
			`environment_key: null`
			`environment_index: null`
			`eval_every_n_versions: 78000`
			`model_path: Qwen/Qwen2.5-3B`
			`accelerate_config: null`
			`use_deepspeed: true`
			`deepspeed_config: deepspeed_stage3_bf16`
			`use_fsdp: false`
			`fsdp:`
			`param_dtype: fp32`
			`reduce_dtype: fp32`
			`buffer_dtype: fp32`
			`output_dir: results/qwen2.5_3b_remax_3a1f_4xh100_214753`
			`force_restart: false`
			`pop_old_data: true`
			`max_lag: null`
			`attempts: 16`
			`train_subset: null`
			`debug:`
			`mode: ''`
			`streams_from: null`
			`place_inference_workers: true`
			`use_existing_llms: false`
			`me:`
			`job_idx: null`
			`wandb:`
			`use_wandb: true`
			`fail_on_init_error: false`
			`init_timeout: 120`
			`wandb_id: null`
			`wandb_name: null`
			`wandb_entity_name: jaygala24-team`
			`wandb_project_name: rl-post-training`
			`wandb_resume: always`
			`wandb_use_basename: true`
			`wandb_workspace_root: results`
			`wandb_group: qwen2.5_3b_remax_3a1f_4xh100_214753`
			`wandb_dir: null`
			`tags: []`
			`environments:`
			`- key: math`
			`mode: remote`
			`_target_: pipelinerl.domains.math.MathEnvironment`
			`environment_key: math`
			`dataset_loader: pipelinerl.domains.math.load_datasets`
			`train_dataset_names:`
			`- gsm8k_train`
			`- math_train`
			`test_dataset_names:`
			`- gsm8k_test`
			`- math_500`