283 lines
9.4 KiB
JSON
283 lines
9.4 KiB
JSON
|
|
{
|
||
|
|
"alternate_pp_config": false,
|
||
|
|
"async_checkpointing": true,
|
||
|
|
"async_checkpointing_staging_method": "sync",
|
||
|
|
"async_eval_ngpus": -1,
|
||
|
|
"async_evals_params": null,
|
||
|
|
"attach_debugpy": false,
|
||
|
|
"background_nccl_init": false,
|
||
|
|
"batch_p2p_communication": true,
|
||
|
|
"batch_size": 2,
|
||
|
|
"collect_et": false,
|
||
|
|
"context_parallel_size": 1,
|
||
|
|
"data": "/fsx-onellm/data/corpora/text_only/stackexchange:4.4,/fsx-onellm/data/corpora/text_only/b3g:7.2,/fsx-onellm/data/corpora/text_only/arxiv:2.8,/fsx-onellm/data/corpora/text_only/github_oss_with_stack:23.2,/fsx-onellm/data/corpora/text_only/c4:8.4,/fsx-onellm/data/corpora/text_only/edouard_cc_20220927_new:30,/fsx-onellm/data/corpora/text_only/ccnet_new:36,/fsx-onellm/data/corpora/text_only/wikipedia:4.3,/fsx-atom/melhoushi/data/openwebmath:2.2",
|
||
|
|
"deallocate_pipeline_outputs": true,
|
||
|
|
"disable_logging": false,
|
||
|
|
"disable_workers_print": false,
|
||
|
|
"dtype": "bf16",
|
||
|
|
"dump_dir": "/fsx-atom/melhoushi/xldumps/continual_llama3.2_1B_gpus_64_shared_emb_higher_lr_ee_0.1_ld_0.1_128_gpus/continual_llama3.2_1B_gpus_64_shared_emb_higher_lr_ee_0.1_ld_0.1_128_gpus_run000",
|
||
|
|
"dump_freq": 250,
|
||
|
|
"dump_profile_traces": false,
|
||
|
|
"enable_anomaly_detection": false,
|
||
|
|
"enable_deterministic_training": false,
|
||
|
|
"enable_loss_tracker": false,
|
||
|
|
"enable_ods": false,
|
||
|
|
"enable_pynvml": false,
|
||
|
|
"et_end_itr": 15,
|
||
|
|
"et_start_itr": 12,
|
||
|
|
"eval_freq": 5000,
|
||
|
|
"exp_id": "",
|
||
|
|
"exp_name": "",
|
||
|
|
"experimental": {
|
||
|
|
"early_exit": {
|
||
|
|
"calculate_cache": true,
|
||
|
|
"criterion": "oracle",
|
||
|
|
"criterion_args_str": null,
|
||
|
|
"disable_on_prompt": false,
|
||
|
|
"end_offset": 0,
|
||
|
|
"start_offset": 0
|
||
|
|
},
|
||
|
|
"early_exit_loss": {
|
||
|
|
"e_scale": 0.1,
|
||
|
|
"freeze_backbone": false,
|
||
|
|
"layers_str": "0::1",
|
||
|
|
"scale_type": "sum_l",
|
||
|
|
"share_fc": true,
|
||
|
|
"time_enable_mod": 8
|
||
|
|
},
|
||
|
|
"layer_dropout": null
|
||
|
|
},
|
||
|
|
"finetuning_dir": "/fsx-atom/melhoushi/xldumps/continual_llama3.2_1B_gpus_64_shared_emb_higher_lr_ee_0.1_ld_0.1_96_gpus/continual_llama3.2_1B_gpus_64_shared_emb_higher_lr_ee_0.1_ld_0.1_96_gpus_run000/checkpoints/checkpoint_0100000/",
|
||
|
|
"fp32_reduce_scatter": "all",
|
||
|
|
"gc_collect_freq": 1000,
|
||
|
|
"gpu_check_level": -1,
|
||
|
|
"increase_seq": null,
|
||
|
|
"instruct": {
|
||
|
|
"no_loss_prompt": false,
|
||
|
|
"no_loss_truncated": false
|
||
|
|
},
|
||
|
|
"instruct_data": "",
|
||
|
|
"iter_gopher": {
|
||
|
|
"buffer_size": 16,
|
||
|
|
"max_precompute": 10,
|
||
|
|
"n_chars_by_tok": 15,
|
||
|
|
"n_seqs_to_concat": 10,
|
||
|
|
"num_processes": 1
|
||
|
|
},
|
||
|
|
"iter_jsonl": {
|
||
|
|
"buffer_size": 64,
|
||
|
|
"same_data": false
|
||
|
|
},
|
||
|
|
"iter_multi": {
|
||
|
|
"buffer_size": 64,
|
||
|
|
"ignore_extra_chunks": true,
|
||
|
|
"iterate_chunk_by_chunk": false,
|
||
|
|
"max_precompute": 200,
|
||
|
|
"multiprocess": true
|
||
|
|
},
|
||
|
|
"iter_text_airstore": {
|
||
|
|
"airstore_seed": 0,
|
||
|
|
"buffer_size": 64,
|
||
|
|
"dataloader_workers_per_gpu": 16,
|
||
|
|
"prefetch_factor": 20,
|
||
|
|
"snapshot_every_n_steps": null
|
||
|
|
},
|
||
|
|
"iter_type": "multi",
|
||
|
|
"keep_eval_checkpoints": false,
|
||
|
|
"keep_n_last_checkpoints": 2,
|
||
|
|
"load_optimizer_on_finetuning": false,
|
||
|
|
"log": {
|
||
|
|
"log_scalars": false,
|
||
|
|
"log_scalars_to_ods": false,
|
||
|
|
"reduce_scalars": false
|
||
|
|
},
|
||
|
|
"log_all_steps": false,
|
||
|
|
"log_freq": 10,
|
||
|
|
"log_updates": true,
|
||
|
|
"loss_logging_freq": 10,
|
||
|
|
"loss_rescaling": false,
|
||
|
|
"mb_recompute_attn": false,
|
||
|
|
"mb_recompute_fc1_fc3": false,
|
||
|
|
"mem_snapshot_max_entries": 100000,
|
||
|
|
"mem_snapshot_stop_step": 3,
|
||
|
|
"model": {
|
||
|
|
"alpha_depth": "disabled",
|
||
|
|
"alpha_lrm": 1.0,
|
||
|
|
"alpha_separate": false,
|
||
|
|
"alpha_wdm": 1.0,
|
||
|
|
"attn_bias_type": "block_causal",
|
||
|
|
"attn_to_keep": "all",
|
||
|
|
"cp_attn_save_global_kv": true,
|
||
|
|
"custom_bwd": false,
|
||
|
|
"custom_bwd_sum_first_then_comms": true,
|
||
|
|
"default_init_router": false,
|
||
|
|
"dim": 2048,
|
||
|
|
"dropout": 0.0,
|
||
|
|
"efficient_attn": "auto",
|
||
|
|
"eos_id": 128001,
|
||
|
|
"experts_choice_moe": {
|
||
|
|
"auto_scale_F": true,
|
||
|
|
"capacity_factor": 1.0,
|
||
|
|
"eval_threshold_std_mult": 0.0,
|
||
|
|
"eval_with_saved_stats": false,
|
||
|
|
"eval_with_top_k": false,
|
||
|
|
"input_scaling": true,
|
||
|
|
"input_scaling_max_clamp": 2.0,
|
||
|
|
"input_scaling_min_clamp": -2.0,
|
||
|
|
"is_enabled": false,
|
||
|
|
"num_experts": 8,
|
||
|
|
"running_stats_ema": 0.95,
|
||
|
|
"running_stats_sync_freq": 100,
|
||
|
|
"use_shared_expert": true
|
||
|
|
},
|
||
|
|
"ffn_dim_multiplier": 1.5,
|
||
|
|
"ffn_exp": 4.0,
|
||
|
|
"fp8_amax_compute_algo": "max",
|
||
|
|
"fp8_amax_history_len": 1024,
|
||
|
|
"fp8_grad_output_dynamic_scale": false,
|
||
|
|
"fp8_input_dynamic_scale": false,
|
||
|
|
"fp8_interval": 1,
|
||
|
|
"fp8_margin": 0,
|
||
|
|
"fp8_wgrad": false,
|
||
|
|
"fsdp_checkpoint_wrap_layer_frequency": 1,
|
||
|
|
"fuse_sequence_parallel": false,
|
||
|
|
"high_freq_factor": 32,
|
||
|
|
"init": {
|
||
|
|
"coeff_std": null,
|
||
|
|
"depth_last": false,
|
||
|
|
"fixed_std": null,
|
||
|
|
"no_init": false,
|
||
|
|
"use_depth": "current",
|
||
|
|
"use_gaussian": true
|
||
|
|
},
|
||
|
|
"layer_ckpt": "all",
|
||
|
|
"less_layer_first_pp_stage": 0,
|
||
|
|
"less_layer_last_pp_stage": 0,
|
||
|
|
"linear_residual_dropout": true,
|
||
|
|
"local_attention_window_len": null,
|
||
|
|
"log_ffn_stats": false,
|
||
|
|
"loss_parallel": false,
|
||
|
|
"max_length": 2048,
|
||
|
|
"multiple_of": 256,
|
||
|
|
"n_heads": 32,
|
||
|
|
"n_kv_heads": 8,
|
||
|
|
"n_layers": 16,
|
||
|
|
"non_linearity": "swiglu",
|
||
|
|
"norm_affine": true,
|
||
|
|
"norm_eps": 1e-05,
|
||
|
|
"norm_type": "rmsnorm",
|
||
|
|
"number_of_experts": 1,
|
||
|
|
"output_size": 128256,
|
||
|
|
"pre_norm": true,
|
||
|
|
"recompute_attn": false,
|
||
|
|
"recompute_fc1_out": true,
|
||
|
|
"recompute_fc3_out": true,
|
||
|
|
"residual_dropout": 0.1,
|
||
|
|
"rope_attn_scale": false,
|
||
|
|
"rope_scale_factor": 8,
|
||
|
|
"rope_theta": 500000.0,
|
||
|
|
"rope_use_fp32_in_outer_product": true,
|
||
|
|
"sequence_parallel": true,
|
||
|
|
"share_emb": true,
|
||
|
|
"use_fp8": false,
|
||
|
|
"use_rope": true,
|
||
|
|
"use_scaled_rope": true,
|
||
|
|
"use_te_layers": false,
|
||
|
|
"vocab_parallel": true,
|
||
|
|
"vocab_size": 128256
|
||
|
|
},
|
||
|
|
"model_parallel_size": 1,
|
||
|
|
"no_final_ckpt": false,
|
||
|
|
"num_layers_per_virtual_pipeline_stage": null,
|
||
|
|
"num_microbatches_with_partial_activation_checkpoints": null,
|
||
|
|
"old_mp": -1,
|
||
|
|
"old_world_size": -1,
|
||
|
|
"optim": {
|
||
|
|
"annealing_step": 10000,
|
||
|
|
"beta1": 0.9,
|
||
|
|
"beta2": 0.95,
|
||
|
|
"clip": 1.0,
|
||
|
|
"cosine_theta": 1.0,
|
||
|
|
"cycle_length": 1.0,
|
||
|
|
"decay_length_fraction": 0.1,
|
||
|
|
"epsilon": 1e-08,
|
||
|
|
"exp_factor": 0.5,
|
||
|
|
"independent_weight_decay": false,
|
||
|
|
"lr": 2e-05,
|
||
|
|
"lr_min_ratio": 0.1,
|
||
|
|
"scheduler": "cosine",
|
||
|
|
"start_annealing_step": -1,
|
||
|
|
"use_fp32_copy_optim": true,
|
||
|
|
"warmup": 2000,
|
||
|
|
"weight_decay": 0.1
|
||
|
|
},
|
||
|
|
"optimize_backward_concat": false,
|
||
|
|
"overlap_p2p_communication": false,
|
||
|
|
"periodic_gpu_check": false,
|
||
|
|
"pipeline_parallel_microbatch_size": 1,
|
||
|
|
"pipeline_parallel_size": 1,
|
||
|
|
"pipeline_strategy": "dfs",
|
||
|
|
"profile_freq": -1,
|
||
|
|
"profile_record_shapes": false,
|
||
|
|
"profile_with_stack": false,
|
||
|
|
"py_spy_args": {
|
||
|
|
"active_seconds": 600,
|
||
|
|
"format": "flamegraph",
|
||
|
|
"freq": -1,
|
||
|
|
"rank0_only": true,
|
||
|
|
"rate": 50,
|
||
|
|
"start_offset": 10
|
||
|
|
},
|
||
|
|
"reshard_after_forward": true,
|
||
|
|
"restore_dataloader_position": false,
|
||
|
|
"root_dump_dir": "/fsx-atom/melhoushi/xldumps/xldumps",
|
||
|
|
"runtime_nccl_timeout_s": 180,
|
||
|
|
"seq_len": 8192,
|
||
|
|
"skip_evals_during_training": false,
|
||
|
|
"slurm": {
|
||
|
|
"global_rank": 0,
|
||
|
|
"is_slurm_job": true,
|
||
|
|
"world_size": 128
|
||
|
|
},
|
||
|
|
"steps": 100000,
|
||
|
|
"tokenizer": {
|
||
|
|
"basic_special_tokens": [
|
||
|
|
"<|begin_of_text|>",
|
||
|
|
"<|end_of_text|>",
|
||
|
|
"<|fim_prefix|>",
|
||
|
|
"<|fim_middle|>",
|
||
|
|
"<|fim_suffix|>"
|
||
|
|
],
|
||
|
|
"directory": "/fsx-onellm/myasu/models/llama3/Meta-Llama-3-8B/original_fsdp/",
|
||
|
|
"extra_special_tokens": [],
|
||
|
|
"model": "cl_toplang_128k",
|
||
|
|
"num_reserved_special_tokens": 256,
|
||
|
|
"pat_str": "(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}{1,3}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+",
|
||
|
|
"tokenizer_cls": "TiktokenTokenizer"
|
||
|
|
},
|
||
|
|
"tokenizer_dir": "",
|
||
|
|
"torch_seed": 0,
|
||
|
|
"unlimited_steps": false,
|
||
|
|
"use_checkpointing_process": false,
|
||
|
|
"use_sum_loss": false,
|
||
|
|
"valid": {
|
||
|
|
"batch_size": 4,
|
||
|
|
"debug": false,
|
||
|
|
"majority_voting": 0,
|
||
|
|
"n_batches": 100,
|
||
|
|
"ppl_files_str": "/fsx-labs/broz/data/shuffled/stackv1/stack_v1_2023_02_27_test/stack_test_file.jsonl,/fsx-labs/broz/datasets/wikipedia/processed/en/wikipedia_en_all_maxi_2023-02.jsonl,/fsx-labs/broz/datasets/gutenberg/processed/selected_books.jsonl",
|
||
|
|
"prompt_path": "",
|
||
|
|
"random_fewshots": false,
|
||
|
|
"seed": 42,
|
||
|
|
"seq_len": 2048,
|
||
|
|
"skip_sanity_check": false,
|
||
|
|
"tasks_root_dir": "/fsx-labs/broz/data/large_experiments/theorem/datasets/eval",
|
||
|
|
"tasks_str": "human_eval,piqa,siqa,hellaswag",
|
||
|
|
"temperature": 0.0,
|
||
|
|
"top_k": 0,
|
||
|
|
"top_p": 0.0,
|
||
|
|
"use_sampling": false,
|
||
|
|
"write_eval": false
|
||
|
|
},
|
||
|
|
"z_loss_multiplier": 0.0
|
||
|
|
}
|