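# This appears to be an axolotl-style training config for continued
# pretraining of a 3B Llama-2 variant. Model section: base checkpoint,
# tokenizer, and quantization settings (8-bit/4-bit loading disabled).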
base_model: winglian/Llama-2-3b-hf
base_model_config: winglian/Llama-2-3b-hf
model_type: LlamaForCausalLM
tokenizer_type: LlamaTokenizer
load_in_8bit: false
load_in_4bit: false
hub_model_id: winglian/llama-2-3b-cpt
strict: false
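# Data: the RedPajama 1T sample is consumed as a raw-text "completion" dataset
# (no prompt/response templating); the alternative pretraining_dataset key is
# left commented out, and val_set_size 0.0 means no held-out validation split.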
# pretraining_dataset: togethercomputer/RedPajama-Data-1T-Sample
push_dataset_to_hub:
datasets:
  - path: togethercomputer/RedPajama-Data-1T-Sample
    type: completion
dataset_prepared_path: last_run_prepared
val_set_size: 0.0
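# Adapter/LoRA keys are left blank, so this is a full-parameter train rather
# than a LoRA run; sequence_len caps examples at 4096 tokens and sample
# packing is not enabled.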
adapter:
lora_model_dir:
sequence_len: 4096
sample_packing:
lora_r:
lora_alpha:
lora_dropout:
lora_target_modules:
lora_target_linear:
lora_fan_in_fan_out:
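# Weights & Biases logging: only the project name is set; entity, watch mode,
# run id, and model upload fall back to defaults.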
wandb_project: llama-2-3b-cpt
wandb_entity:
wandb_watch:
wandb_run_id:
wandb_log_model:
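# Optimization: 8 sequences per micro-batch with 3 accumulation steps gives an
# effective batch of 24 per device (times the number of GPUs), trained for one
# epoch with AdamW, a cosine schedule, and a 1e-4 peak learning rate.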
output_dir: ./llama-2-3b-out
# max_steps: 4400
gradient_accumulation_steps: 3
micro_batch_size: 8
num_epochs: 1
optimizer: adamw_torch
adam_beta2: 0.95
torchdistx_path:
lr_scheduler: cosine
learning_rate: 0.0001
train_on_inputs: false
group_by_length: false
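# Precision: train in bfloat16 with TF32 matmuls allowed; fp16 autocast stays off.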
float16: true
bf16: true
fp16: false
tf32: true
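# Memory/attention: gradient checkpointing and FlashAttention are enabled;
# after a 100-step warmup, checkpoints are saved every 100 steps and only the
# 6 most recent are kept.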
gradient_checkpointing: true
early_stopping_patience:
resume_from_checkpoint:
local_rank:
logging_steps: 1
xformers_attention:
flash_attention: true
gptq_groupsize:
gptq_model_v1:
warmup_steps: 100
eval_steps:
save_steps: 100
save_total_limit: 6
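# DeepSpeed and FSDP are not configured; AdamW weight decay is set to 0.1.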
debug:
deepspeed:
weight_decay: 0.1
fsdp:
fsdp_config:
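# Special tokens pinned to the standard Llama tokenizer values.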
special_tokens:
  bos_token: "<s>"
  eos_token: "</s>"
  unk_token: "<unk>"
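# If this is an axolotl config, it would typically be launched with something
# like the following (the filename is illustrative, not from the original file):
#   accelerate launch -m axolotl.cli.train llama-2-3b-cpt.yml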