初始化项目,由ModelHub XC社区提供模型
Model: AI-ModelScope/layerskip-llama3.2-1B Source: Original Platform
This commit is contained in:
283
original/params.json
Normal file
283
original/params.json
Normal file
@@ -0,0 +1,283 @@
|
||||
{
|
||||
"alternate_pp_config": false,
|
||||
"async_checkpointing": true,
|
||||
"async_checkpointing_staging_method": "sync",
|
||||
"async_eval_ngpus": -1,
|
||||
"async_evals_params": null,
|
||||
"attach_debugpy": false,
|
||||
"background_nccl_init": false,
|
||||
"batch_p2p_communication": true,
|
||||
"batch_size": 2,
|
||||
"collect_et": false,
|
||||
"context_parallel_size": 1,
|
||||
"data": "/fsx-onellm/data/corpora/text_only/stackexchange:4.4,/fsx-onellm/data/corpora/text_only/b3g:7.2,/fsx-onellm/data/corpora/text_only/arxiv:2.8,/fsx-onellm/data/corpora/text_only/github_oss_with_stack:23.2,/fsx-onellm/data/corpora/text_only/c4:8.4,/fsx-onellm/data/corpora/text_only/edouard_cc_20220927_new:30,/fsx-onellm/data/corpora/text_only/ccnet_new:36,/fsx-onellm/data/corpora/text_only/wikipedia:4.3,/fsx-atom/melhoushi/data/openwebmath:2.2",
|
||||
"deallocate_pipeline_outputs": true,
|
||||
"disable_logging": false,
|
||||
"disable_workers_print": false,
|
||||
"dtype": "bf16",
|
||||
"dump_dir": "/fsx-atom/melhoushi/xldumps/continual_llama3.2_1B_gpus_64_shared_emb_higher_lr_ee_0.1_ld_0.1_128_gpus/continual_llama3.2_1B_gpus_64_shared_emb_higher_lr_ee_0.1_ld_0.1_128_gpus_run000",
|
||||
"dump_freq": 250,
|
||||
"dump_profile_traces": false,
|
||||
"enable_anomaly_detection": false,
|
||||
"enable_deterministic_training": false,
|
||||
"enable_loss_tracker": false,
|
||||
"enable_ods": false,
|
||||
"enable_pynvml": false,
|
||||
"et_end_itr": 15,
|
||||
"et_start_itr": 12,
|
||||
"eval_freq": 5000,
|
||||
"exp_id": "",
|
||||
"exp_name": "",
|
||||
"experimental": {
|
||||
"early_exit": {
|
||||
"calculate_cache": true,
|
||||
"criterion": "oracle",
|
||||
"criterion_args_str": null,
|
||||
"disable_on_prompt": false,
|
||||
"end_offset": 0,
|
||||
"start_offset": 0
|
||||
},
|
||||
"early_exit_loss": {
|
||||
"e_scale": 0.1,
|
||||
"freeze_backbone": false,
|
||||
"layers_str": "0::1",
|
||||
"scale_type": "sum_l",
|
||||
"share_fc": true,
|
||||
"time_enable_mod": 8
|
||||
},
|
||||
"layer_dropout": null
|
||||
},
|
||||
"finetuning_dir": "/fsx-atom/melhoushi/xldumps/continual_llama3.2_1B_gpus_64_shared_emb_higher_lr_ee_0.1_ld_0.1_96_gpus/continual_llama3.2_1B_gpus_64_shared_emb_higher_lr_ee_0.1_ld_0.1_96_gpus_run000/checkpoints/checkpoint_0100000/",
|
||||
"fp32_reduce_scatter": "all",
|
||||
"gc_collect_freq": 1000,
|
||||
"gpu_check_level": -1,
|
||||
"increase_seq": null,
|
||||
"instruct": {
|
||||
"no_loss_prompt": false,
|
||||
"no_loss_truncated": false
|
||||
},
|
||||
"instruct_data": "",
|
||||
"iter_gopher": {
|
||||
"buffer_size": 16,
|
||||
"max_precompute": 10,
|
||||
"n_chars_by_tok": 15,
|
||||
"n_seqs_to_concat": 10,
|
||||
"num_processes": 1
|
||||
},
|
||||
"iter_jsonl": {
|
||||
"buffer_size": 64,
|
||||
"same_data": false
|
||||
},
|
||||
"iter_multi": {
|
||||
"buffer_size": 64,
|
||||
"ignore_extra_chunks": true,
|
||||
"iterate_chunk_by_chunk": false,
|
||||
"max_precompute": 200,
|
||||
"multiprocess": true
|
||||
},
|
||||
"iter_text_airstore": {
|
||||
"airstore_seed": 0,
|
||||
"buffer_size": 64,
|
||||
"dataloader_workers_per_gpu": 16,
|
||||
"prefetch_factor": 20,
|
||||
"snapshot_every_n_steps": null
|
||||
},
|
||||
"iter_type": "multi",
|
||||
"keep_eval_checkpoints": false,
|
||||
"keep_n_last_checkpoints": 2,
|
||||
"load_optimizer_on_finetuning": false,
|
||||
"log": {
|
||||
"log_scalars": false,
|
||||
"log_scalars_to_ods": false,
|
||||
"reduce_scalars": false
|
||||
},
|
||||
"log_all_steps": false,
|
||||
"log_freq": 10,
|
||||
"log_updates": true,
|
||||
"loss_logging_freq": 10,
|
||||
"loss_rescaling": false,
|
||||
"mb_recompute_attn": false,
|
||||
"mb_recompute_fc1_fc3": false,
|
||||
"mem_snapshot_max_entries": 100000,
|
||||
"mem_snapshot_stop_step": 3,
|
||||
"model": {
|
||||
"alpha_depth": "disabled",
|
||||
"alpha_lrm": 1.0,
|
||||
"alpha_separate": false,
|
||||
"alpha_wdm": 1.0,
|
||||
"attn_bias_type": "block_causal",
|
||||
"attn_to_keep": "all",
|
||||
"cp_attn_save_global_kv": true,
|
||||
"custom_bwd": false,
|
||||
"custom_bwd_sum_first_then_comms": true,
|
||||
"default_init_router": false,
|
||||
"dim": 2048,
|
||||
"dropout": 0.0,
|
||||
"efficient_attn": "auto",
|
||||
"eos_id": 128001,
|
||||
"experts_choice_moe": {
|
||||
"auto_scale_F": true,
|
||||
"capacity_factor": 1.0,
|
||||
"eval_threshold_std_mult": 0.0,
|
||||
"eval_with_saved_stats": false,
|
||||
"eval_with_top_k": false,
|
||||
"input_scaling": true,
|
||||
"input_scaling_max_clamp": 2.0,
|
||||
"input_scaling_min_clamp": -2.0,
|
||||
"is_enabled": false,
|
||||
"num_experts": 8,
|
||||
"running_stats_ema": 0.95,
|
||||
"running_stats_sync_freq": 100,
|
||||
"use_shared_expert": true
|
||||
},
|
||||
"ffn_dim_multiplier": 1.5,
|
||||
"ffn_exp": 4.0,
|
||||
"fp8_amax_compute_algo": "max",
|
||||
"fp8_amax_history_len": 1024,
|
||||
"fp8_grad_output_dynamic_scale": false,
|
||||
"fp8_input_dynamic_scale": false,
|
||||
"fp8_interval": 1,
|
||||
"fp8_margin": 0,
|
||||
"fp8_wgrad": false,
|
||||
"fsdp_checkpoint_wrap_layer_frequency": 1,
|
||||
"fuse_sequence_parallel": false,
|
||||
"high_freq_factor": 32,
|
||||
"init": {
|
||||
"coeff_std": null,
|
||||
"depth_last": false,
|
||||
"fixed_std": null,
|
||||
"no_init": false,
|
||||
"use_depth": "current",
|
||||
"use_gaussian": true
|
||||
},
|
||||
"layer_ckpt": "all",
|
||||
"less_layer_first_pp_stage": 0,
|
||||
"less_layer_last_pp_stage": 0,
|
||||
"linear_residual_dropout": true,
|
||||
"local_attention_window_len": null,
|
||||
"log_ffn_stats": false,
|
||||
"loss_parallel": false,
|
||||
"max_length": 2048,
|
||||
"multiple_of": 256,
|
||||
"n_heads": 32,
|
||||
"n_kv_heads": 8,
|
||||
"n_layers": 16,
|
||||
"non_linearity": "swiglu",
|
||||
"norm_affine": true,
|
||||
"norm_eps": 1e-05,
|
||||
"norm_type": "rmsnorm",
|
||||
"number_of_experts": 1,
|
||||
"output_size": 128256,
|
||||
"pre_norm": true,
|
||||
"recompute_attn": false,
|
||||
"recompute_fc1_out": true,
|
||||
"recompute_fc3_out": true,
|
||||
"residual_dropout": 0.1,
|
||||
"rope_attn_scale": false,
|
||||
"rope_scale_factor": 8,
|
||||
"rope_theta": 500000.0,
|
||||
"rope_use_fp32_in_outer_product": true,
|
||||
"sequence_parallel": true,
|
||||
"share_emb": true,
|
||||
"use_fp8": false,
|
||||
"use_rope": true,
|
||||
"use_scaled_rope": true,
|
||||
"use_te_layers": false,
|
||||
"vocab_parallel": true,
|
||||
"vocab_size": 128256
|
||||
},
|
||||
"model_parallel_size": 1,
|
||||
"no_final_ckpt": false,
|
||||
"num_layers_per_virtual_pipeline_stage": null,
|
||||
"num_microbatches_with_partial_activation_checkpoints": null,
|
||||
"old_mp": -1,
|
||||
"old_world_size": -1,
|
||||
"optim": {
|
||||
"annealing_step": 10000,
|
||||
"beta1": 0.9,
|
||||
"beta2": 0.95,
|
||||
"clip": 1.0,
|
||||
"cosine_theta": 1.0,
|
||||
"cycle_length": 1.0,
|
||||
"decay_length_fraction": 0.1,
|
||||
"epsilon": 1e-08,
|
||||
"exp_factor": 0.5,
|
||||
"independent_weight_decay": false,
|
||||
"lr": 2e-05,
|
||||
"lr_min_ratio": 0.1,
|
||||
"scheduler": "cosine",
|
||||
"start_annealing_step": -1,
|
||||
"use_fp32_copy_optim": true,
|
||||
"warmup": 2000,
|
||||
"weight_decay": 0.1
|
||||
},
|
||||
"optimize_backward_concat": false,
|
||||
"overlap_p2p_communication": false,
|
||||
"periodic_gpu_check": false,
|
||||
"pipeline_parallel_microbatch_size": 1,
|
||||
"pipeline_parallel_size": 1,
|
||||
"pipeline_strategy": "dfs",
|
||||
"profile_freq": -1,
|
||||
"profile_record_shapes": false,
|
||||
"profile_with_stack": false,
|
||||
"py_spy_args": {
|
||||
"active_seconds": 600,
|
||||
"format": "flamegraph",
|
||||
"freq": -1,
|
||||
"rank0_only": true,
|
||||
"rate": 50,
|
||||
"start_offset": 10
|
||||
},
|
||||
"reshard_after_forward": true,
|
||||
"restore_dataloader_position": false,
|
||||
"root_dump_dir": "/fsx-atom/melhoushi/xldumps/xldumps",
|
||||
"runtime_nccl_timeout_s": 180,
|
||||
"seq_len": 8192,
|
||||
"skip_evals_during_training": false,
|
||||
"slurm": {
|
||||
"global_rank": 0,
|
||||
"is_slurm_job": true,
|
||||
"world_size": 128
|
||||
},
|
||||
"steps": 100000,
|
||||
"tokenizer": {
|
||||
"basic_special_tokens": [
|
||||
"<|begin_of_text|>",
|
||||
"<|end_of_text|>",
|
||||
"<|fim_prefix|>",
|
||||
"<|fim_middle|>",
|
||||
"<|fim_suffix|>"
|
||||
],
|
||||
"directory": "/fsx-onellm/myasu/models/llama3/Meta-Llama-3-8B/original_fsdp/",
|
||||
"extra_special_tokens": [],
|
||||
"model": "cl_toplang_128k",
|
||||
"num_reserved_special_tokens": 256,
|
||||
"pat_str": "(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}{1,3}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+",
|
||||
"tokenizer_cls": "TiktokenTokenizer"
|
||||
},
|
||||
"tokenizer_dir": "",
|
||||
"torch_seed": 0,
|
||||
"unlimited_steps": false,
|
||||
"use_checkpointing_process": false,
|
||||
"use_sum_loss": false,
|
||||
"valid": {
|
||||
"batch_size": 4,
|
||||
"debug": false,
|
||||
"majority_voting": 0,
|
||||
"n_batches": 100,
|
||||
"ppl_files_str": "/fsx-labs/broz/data/shuffled/stackv1/stack_v1_2023_02_27_test/stack_test_file.jsonl,/fsx-labs/broz/datasets/wikipedia/processed/en/wikipedia_en_all_maxi_2023-02.jsonl,/fsx-labs/broz/datasets/gutenberg/processed/selected_books.jsonl",
|
||||
"prompt_path": "",
|
||||
"random_fewshots": false,
|
||||
"seed": 42,
|
||||
"seq_len": 2048,
|
||||
"skip_sanity_check": false,
|
||||
"tasks_root_dir": "/fsx-labs/broz/data/large_experiments/theorem/datasets/eval",
|
||||
"tasks_str": "human_eval,piqa,siqa,hellaswag",
|
||||
"temperature": 0.0,
|
||||
"top_k": 0,
|
||||
"top_p": 0.0,
|
||||
"use_sampling": false,
|
||||
"write_eval": false
|
||||
},
|
||||
"z_loss_multiplier": 0.0
|
||||
}
|
||||
Reference in New Issue
Block a user