{ "alternate_pp_config": false, "async_checkpointing": true, "async_checkpointing_staging_method": "sync", "async_eval_ngpus": -1, "async_evals_params": null, "attach_debugpy": false, "background_nccl_init": false, "batch_p2p_communication": true, "batch_size": 2, "collect_et": false, "context_parallel_size": 1, "data": "/fsx-onellm/data/corpora/text_only/stackexchange:4.4,/fsx-onellm/data/corpora/text_only/b3g:7.2,/fsx-onellm/data/corpora/text_only/arxiv:2.8,/fsx-onellm/data/corpora/text_only/github_oss_with_stack:23.2,/fsx-onellm/data/corpora/text_only/c4:8.4,/fsx-onellm/data/corpora/text_only/edouard_cc_20220927_new:30,/fsx-onellm/data/corpora/text_only/ccnet_new:36,/fsx-onellm/data/corpora/text_only/wikipedia:4.3,/fsx-atom/melhoushi/data/openwebmath:2.2", "deallocate_pipeline_outputs": true, "disable_logging": false, "disable_workers_print": false, "dtype": "bf16", "dump_dir": "/fsx-atom/melhoushi/xldumps/continual_llama3.2_1B_gpus_64_shared_emb_higher_lr_ee_0.1_ld_0.1_128_gpus/continual_llama3.2_1B_gpus_64_shared_emb_higher_lr_ee_0.1_ld_0.1_128_gpus_run000", "dump_freq": 250, "dump_profile_traces": false, "enable_anomaly_detection": false, "enable_deterministic_training": false, "enable_loss_tracker": false, "enable_ods": false, "enable_pynvml": false, "et_end_itr": 15, "et_start_itr": 12, "eval_freq": 5000, "exp_id": "", "exp_name": "", "experimental": { "early_exit": { "calculate_cache": true, "criterion": "oracle", "criterion_args_str": null, "disable_on_prompt": false, "end_offset": 0, "start_offset": 0 }, "early_exit_loss": { "e_scale": 0.1, "freeze_backbone": false, "layers_str": "0::1", "scale_type": "sum_l", "share_fc": true, "time_enable_mod": 8 }, "layer_dropout": null }, "finetuning_dir": "/fsx-atom/melhoushi/xldumps/continual_llama3.2_1B_gpus_64_shared_emb_higher_lr_ee_0.1_ld_0.1_96_gpus/continual_llama3.2_1B_gpus_64_shared_emb_higher_lr_ee_0.1_ld_0.1_96_gpus_run000/checkpoints/checkpoint_0100000/", "fp32_reduce_scatter": "all", "gc_collect_freq": 1000, "gpu_check_level": -1, "increase_seq": null, "instruct": { "no_loss_prompt": false, "no_loss_truncated": false }, "instruct_data": "", "iter_gopher": { "buffer_size": 16, "max_precompute": 10, "n_chars_by_tok": 15, "n_seqs_to_concat": 10, "num_processes": 1 }, "iter_jsonl": { "buffer_size": 64, "same_data": false }, "iter_multi": { "buffer_size": 64, "ignore_extra_chunks": true, "iterate_chunk_by_chunk": false, "max_precompute": 200, "multiprocess": true }, "iter_text_airstore": { "airstore_seed": 0, "buffer_size": 64, "dataloader_workers_per_gpu": 16, "prefetch_factor": 20, "snapshot_every_n_steps": null }, "iter_type": "multi", "keep_eval_checkpoints": false, "keep_n_last_checkpoints": 2, "load_optimizer_on_finetuning": false, "log": { "log_scalars": false, "log_scalars_to_ods": false, "reduce_scalars": false }, "log_all_steps": false, "log_freq": 10, "log_updates": true, "loss_logging_freq": 10, "loss_rescaling": false, "mb_recompute_attn": false, "mb_recompute_fc1_fc3": false, "mem_snapshot_max_entries": 100000, "mem_snapshot_stop_step": 3, "model": { "alpha_depth": "disabled", "alpha_lrm": 1.0, "alpha_separate": false, "alpha_wdm": 1.0, "attn_bias_type": "block_causal", "attn_to_keep": "all", "cp_attn_save_global_kv": true, "custom_bwd": false, "custom_bwd_sum_first_then_comms": true, "default_init_router": false, "dim": 2048, "dropout": 0.0, "efficient_attn": "auto", "eos_id": 128001, "experts_choice_moe": { "auto_scale_F": true, "capacity_factor": 1.0, "eval_threshold_std_mult": 0.0, "eval_with_saved_stats": false, "eval_with_top_k": false, "input_scaling": true, "input_scaling_max_clamp": 2.0, "input_scaling_min_clamp": -2.0, "is_enabled": false, "num_experts": 8, "running_stats_ema": 0.95, "running_stats_sync_freq": 100, "use_shared_expert": true }, "ffn_dim_multiplier": 1.5, "ffn_exp": 4.0, "fp8_amax_compute_algo": "max", "fp8_amax_history_len": 1024, "fp8_grad_output_dynamic_scale": false, "fp8_input_dynamic_scale": false, "fp8_interval": 1, "fp8_margin": 0, "fp8_wgrad": false, "fsdp_checkpoint_wrap_layer_frequency": 1, "fuse_sequence_parallel": false, "high_freq_factor": 32, "init": { "coeff_std": null, "depth_last": false, "fixed_std": null, "no_init": false, "use_depth": "current", "use_gaussian": true }, "layer_ckpt": "all", "less_layer_first_pp_stage": 0, "less_layer_last_pp_stage": 0, "linear_residual_dropout": true, "local_attention_window_len": null, "log_ffn_stats": false, "loss_parallel": false, "max_length": 2048, "multiple_of": 256, "n_heads": 32, "n_kv_heads": 8, "n_layers": 16, "non_linearity": "swiglu", "norm_affine": true, "norm_eps": 1e-05, "norm_type": "rmsnorm", "number_of_experts": 1, "output_size": 128256, "pre_norm": true, "recompute_attn": false, "recompute_fc1_out": true, "recompute_fc3_out": true, "residual_dropout": 0.1, "rope_attn_scale": false, "rope_scale_factor": 8, "rope_theta": 500000.0, "rope_use_fp32_in_outer_product": true, "sequence_parallel": true, "share_emb": true, "use_fp8": false, "use_rope": true, "use_scaled_rope": true, "use_te_layers": false, "vocab_parallel": true, "vocab_size": 128256 }, "model_parallel_size": 1, "no_final_ckpt": false, "num_layers_per_virtual_pipeline_stage": null, "num_microbatches_with_partial_activation_checkpoints": null, "old_mp": -1, "old_world_size": -1, "optim": { "annealing_step": 10000, "beta1": 0.9, "beta2": 0.95, "clip": 1.0, "cosine_theta": 1.0, "cycle_length": 1.0, "decay_length_fraction": 0.1, "epsilon": 1e-08, "exp_factor": 0.5, "independent_weight_decay": false, "lr": 2e-05, "lr_min_ratio": 0.1, "scheduler": "cosine", "start_annealing_step": -1, "use_fp32_copy_optim": true, "warmup": 2000, "weight_decay": 0.1 }, "optimize_backward_concat": false, "overlap_p2p_communication": false, "periodic_gpu_check": false, "pipeline_parallel_microbatch_size": 1, "pipeline_parallel_size": 1, "pipeline_strategy": "dfs", "profile_freq": -1, "profile_record_shapes": false, "profile_with_stack": false, "py_spy_args": { "active_seconds": 600, "format": "flamegraph", "freq": -1, "rank0_only": true, "rate": 50, "start_offset": 10 }, "reshard_after_forward": true, "restore_dataloader_position": false, "root_dump_dir": "/fsx-atom/melhoushi/xldumps/xldumps", "runtime_nccl_timeout_s": 180, "seq_len": 8192, "skip_evals_during_training": false, "slurm": { "global_rank": 0, "is_slurm_job": true, "world_size": 128 }, "steps": 100000, "tokenizer": { "basic_special_tokens": [ "<|begin_of_text|>", "<|end_of_text|>", "<|fim_prefix|>", "<|fim_middle|>", "<|fim_suffix|>" ], "directory": "/fsx-onellm/myasu/models/llama3/Meta-Llama-3-8B/original_fsdp/", "extra_special_tokens": [], "model": "cl_toplang_128k", "num_reserved_special_tokens": 256, "pat_str": "(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}{1,3}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+", "tokenizer_cls": "TiktokenTokenizer" }, "tokenizer_dir": "", "torch_seed": 0, "unlimited_steps": false, "use_checkpointing_process": false, "use_sum_loss": false, "valid": { "batch_size": 4, "debug": false, "majority_voting": 0, "n_batches": 100, "ppl_files_str": "/fsx-labs/broz/data/shuffled/stackv1/stack_v1_2023_02_27_test/stack_test_file.jsonl,/fsx-labs/broz/datasets/wikipedia/processed/en/wikipedia_en_all_maxi_2023-02.jsonl,/fsx-labs/broz/datasets/gutenberg/processed/selected_books.jsonl", "prompt_path": "", "random_fewshots": false, "seed": 42, "seq_len": 2048, "skip_sanity_check": false, "tasks_root_dir": "/fsx-labs/broz/data/large_experiments/theorem/datasets/eval", "tasks_str": "human_eval,piqa,siqa,hellaswag", "temperature": 0.0, "top_k": 0, "top_p": 0.0, "use_sampling": false, "write_eval": false }, "z_loss_multiplier": 0.0 }