Files
ModelHub XC bd860a1a5f 初始化项目,由ModelHub XC社区提供模型
Model: LLM-Research/layerskip-llama3.2-1B
Source: Original Platform
2026-05-15 18:39:45 +08:00

283 lines
9.4 KiB
JSON

{
"alternate_pp_config": false,
"async_checkpointing": true,
"async_checkpointing_staging_method": "sync",
"async_eval_ngpus": -1,
"async_evals_params": null,
"attach_debugpy": false,
"background_nccl_init": false,
"batch_p2p_communication": true,
"batch_size": 2,
"collect_et": false,
"context_parallel_size": 1,
"data": "/fsx-onellm/data/corpora/text_only/stackexchange:4.4,/fsx-onellm/data/corpora/text_only/b3g:7.2,/fsx-onellm/data/corpora/text_only/arxiv:2.8,/fsx-onellm/data/corpora/text_only/github_oss_with_stack:23.2,/fsx-onellm/data/corpora/text_only/c4:8.4,/fsx-onellm/data/corpora/text_only/edouard_cc_20220927_new:30,/fsx-onellm/data/corpora/text_only/ccnet_new:36,/fsx-onellm/data/corpora/text_only/wikipedia:4.3,/fsx-atom/melhoushi/data/openwebmath:2.2",
"deallocate_pipeline_outputs": true,
"disable_logging": false,
"disable_workers_print": false,
"dtype": "bf16",
"dump_dir": "/fsx-atom/melhoushi/xldumps/continual_llama3.2_1B_gpus_64_shared_emb_higher_lr_ee_0.1_ld_0.1_128_gpus/continual_llama3.2_1B_gpus_64_shared_emb_higher_lr_ee_0.1_ld_0.1_128_gpus_run000",
"dump_freq": 250,
"dump_profile_traces": false,
"enable_anomaly_detection": false,
"enable_deterministic_training": false,
"enable_loss_tracker": false,
"enable_ods": false,
"enable_pynvml": false,
"et_end_itr": 15,
"et_start_itr": 12,
"eval_freq": 5000,
"exp_id": "",
"exp_name": "",
"experimental": {
"early_exit": {
"calculate_cache": true,
"criterion": "oracle",
"criterion_args_str": null,
"disable_on_prompt": false,
"end_offset": 0,
"start_offset": 0
},
"early_exit_loss": {
"e_scale": 0.1,
"freeze_backbone": false,
"layers_str": "0::1",
"scale_type": "sum_l",
"share_fc": true,
"time_enable_mod": 8
},
"layer_dropout": null
},
"finetuning_dir": "/fsx-atom/melhoushi/xldumps/continual_llama3.2_1B_gpus_64_shared_emb_higher_lr_ee_0.1_ld_0.1_96_gpus/continual_llama3.2_1B_gpus_64_shared_emb_higher_lr_ee_0.1_ld_0.1_96_gpus_run000/checkpoints/checkpoint_0100000/",
"fp32_reduce_scatter": "all",
"gc_collect_freq": 1000,
"gpu_check_level": -1,
"increase_seq": null,
"instruct": {
"no_loss_prompt": false,
"no_loss_truncated": false
},
"instruct_data": "",
"iter_gopher": {
"buffer_size": 16,
"max_precompute": 10,
"n_chars_by_tok": 15,
"n_seqs_to_concat": 10,
"num_processes": 1
},
"iter_jsonl": {
"buffer_size": 64,
"same_data": false
},
"iter_multi": {
"buffer_size": 64,
"ignore_extra_chunks": true,
"iterate_chunk_by_chunk": false,
"max_precompute": 200,
"multiprocess": true
},
"iter_text_airstore": {
"airstore_seed": 0,
"buffer_size": 64,
"dataloader_workers_per_gpu": 16,
"prefetch_factor": 20,
"snapshot_every_n_steps": null
},
"iter_type": "multi",
"keep_eval_checkpoints": false,
"keep_n_last_checkpoints": 2,
"load_optimizer_on_finetuning": false,
"log": {
"log_scalars": false,
"log_scalars_to_ods": false,
"reduce_scalars": false
},
"log_all_steps": false,
"log_freq": 10,
"log_updates": true,
"loss_logging_freq": 10,
"loss_rescaling": false,
"mb_recompute_attn": false,
"mb_recompute_fc1_fc3": false,
"mem_snapshot_max_entries": 100000,
"mem_snapshot_stop_step": 3,
"model": {
"alpha_depth": "disabled",
"alpha_lrm": 1.0,
"alpha_separate": false,
"alpha_wdm": 1.0,
"attn_bias_type": "block_causal",
"attn_to_keep": "all",
"cp_attn_save_global_kv": true,
"custom_bwd": false,
"custom_bwd_sum_first_then_comms": true,
"default_init_router": false,
"dim": 2048,
"dropout": 0.0,
"efficient_attn": "auto",
"eos_id": 128001,
"experts_choice_moe": {
"auto_scale_F": true,
"capacity_factor": 1.0,
"eval_threshold_std_mult": 0.0,
"eval_with_saved_stats": false,
"eval_with_top_k": false,
"input_scaling": true,
"input_scaling_max_clamp": 2.0,
"input_scaling_min_clamp": -2.0,
"is_enabled": false,
"num_experts": 8,
"running_stats_ema": 0.95,
"running_stats_sync_freq": 100,
"use_shared_expert": true
},
"ffn_dim_multiplier": 1.5,
"ffn_exp": 4.0,
"fp8_amax_compute_algo": "max",
"fp8_amax_history_len": 1024,
"fp8_grad_output_dynamic_scale": false,
"fp8_input_dynamic_scale": false,
"fp8_interval": 1,
"fp8_margin": 0,
"fp8_wgrad": false,
"fsdp_checkpoint_wrap_layer_frequency": 1,
"fuse_sequence_parallel": false,
"high_freq_factor": 32,
"init": {
"coeff_std": null,
"depth_last": false,
"fixed_std": null,
"no_init": false,
"use_depth": "current",
"use_gaussian": true
},
"layer_ckpt": "all",
"less_layer_first_pp_stage": 0,
"less_layer_last_pp_stage": 0,
"linear_residual_dropout": true,
"local_attention_window_len": null,
"log_ffn_stats": false,
"loss_parallel": false,
"max_length": 2048,
"multiple_of": 256,
"n_heads": 32,
"n_kv_heads": 8,
"n_layers": 16,
"non_linearity": "swiglu",
"norm_affine": true,
"norm_eps": 1e-05,
"norm_type": "rmsnorm",
"number_of_experts": 1,
"output_size": 128256,
"pre_norm": true,
"recompute_attn": false,
"recompute_fc1_out": true,
"recompute_fc3_out": true,
"residual_dropout": 0.1,
"rope_attn_scale": false,
"rope_scale_factor": 8,
"rope_theta": 500000.0,
"rope_use_fp32_in_outer_product": true,
"sequence_parallel": true,
"share_emb": true,
"use_fp8": false,
"use_rope": true,
"use_scaled_rope": true,
"use_te_layers": false,
"vocab_parallel": true,
"vocab_size": 128256
},
"model_parallel_size": 1,
"no_final_ckpt": false,
"num_layers_per_virtual_pipeline_stage": null,
"num_microbatches_with_partial_activation_checkpoints": null,
"old_mp": -1,
"old_world_size": -1,
"optim": {
"annealing_step": 10000,
"beta1": 0.9,
"beta2": 0.95,
"clip": 1.0,
"cosine_theta": 1.0,
"cycle_length": 1.0,
"decay_length_fraction": 0.1,
"epsilon": 1e-08,
"exp_factor": 0.5,
"independent_weight_decay": false,
"lr": 2e-05,
"lr_min_ratio": 0.1,
"scheduler": "cosine",
"start_annealing_step": -1,
"use_fp32_copy_optim": true,
"warmup": 2000,
"weight_decay": 0.1
},
"optimize_backward_concat": false,
"overlap_p2p_communication": false,
"periodic_gpu_check": false,
"pipeline_parallel_microbatch_size": 1,
"pipeline_parallel_size": 1,
"pipeline_strategy": "dfs",
"profile_freq": -1,
"profile_record_shapes": false,
"profile_with_stack": false,
"py_spy_args": {
"active_seconds": 600,
"format": "flamegraph",
"freq": -1,
"rank0_only": true,
"rate": 50,
"start_offset": 10
},
"reshard_after_forward": true,
"restore_dataloader_position": false,
"root_dump_dir": "/fsx-atom/melhoushi/xldumps/xldumps",
"runtime_nccl_timeout_s": 180,
"seq_len": 8192,
"skip_evals_during_training": false,
"slurm": {
"global_rank": 0,
"is_slurm_job": true,
"world_size": 128
},
"steps": 100000,
"tokenizer": {
"basic_special_tokens": [
"<|begin_of_text|>",
"<|end_of_text|>",
"<|fim_prefix|>",
"<|fim_middle|>",
"<|fim_suffix|>"
],
"directory": "/fsx-onellm/myasu/models/llama3/Meta-Llama-3-8B/original_fsdp/",
"extra_special_tokens": [],
"model": "cl_toplang_128k",
"num_reserved_special_tokens": 256,
"pat_str": "(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}{1,3}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+",
"tokenizer_cls": "TiktokenTokenizer"
},
"tokenizer_dir": "",
"torch_seed": 0,
"unlimited_steps": false,
"use_checkpointing_process": false,
"use_sum_loss": false,
"valid": {
"batch_size": 4,
"debug": false,
"majority_voting": 0,
"n_batches": 100,
"ppl_files_str": "/fsx-labs/broz/data/shuffled/stackv1/stack_v1_2023_02_27_test/stack_test_file.jsonl,/fsx-labs/broz/datasets/wikipedia/processed/en/wikipedia_en_all_maxi_2023-02.jsonl,/fsx-labs/broz/datasets/gutenberg/processed/selected_books.jsonl",
"prompt_path": "",
"random_fewshots": false,
"seed": 42,
"seq_len": 2048,
"skip_sanity_check": false,
"tasks_root_dir": "/fsx-labs/broz/data/large_experiments/theorem/datasets/eval",
"tasks_str": "human_eval,piqa,siqa,hellaswag",
"temperature": 0.0,
"top_k": 0,
"top_p": 0.0,
"use_sampling": false,
"write_eval": false
},
"z_loss_multiplier": 0.0
}