layerskip-llama3.2-1B/original/params.json

{
    "alternate_pp_config": false,
    "async_checkpointing": true,
    "async_checkpointing_staging_method": "sync",
    "async_eval_ngpus": -1,
    "async_evals_params": null,
    "attach_debugpy": false,
    "background_nccl_init": false,
    "batch_p2p_communication": true,
    "batch_size": 2,
    "collect_et": false,
    "context_parallel_size": 1,
    "data": "/fsx-onellm/data/corpora/text_only/stackexchange:4.4,/fsx-onellm/data/corpora/text_only/b3g:7.2,/fsx-onellm/data/corpora/text_only/arxiv:2.8,/fsx-onellm/data/corpora/text_only/github_oss_with_stack:23.2,/fsx-onellm/data/corpora/text_only/c4:8.4,/fsx-onellm/data/corpora/text_only/edouard_cc_20220927_new:30,/fsx-onellm/data/corpora/text_only/ccnet_new:36,/fsx-onellm/data/corpora/text_only/wikipedia:4.3,/fsx-atom/melhoushi/data/openwebmath:2.2",
    "deallocate_pipeline_outputs": true,
    "disable_logging": false,
    "disable_workers_print": false,
    "dtype": "bf16",
    "dump_dir": "/fsx-atom/melhoushi/xldumps/continual_llama3.2_1B_gpus_64_shared_emb_higher_lr_ee_0.1_ld_0.1_128_gpus/continual_llama3.2_1B_gpus_64_shared_emb_higher_lr_ee_0.1_ld_0.1_128_gpus_run000",
    "dump_freq": 250,
    "dump_profile_traces": false,
    "enable_anomaly_detection": false,
    "enable_deterministic_training": false,
    "enable_loss_tracker": false,
    "enable_ods": false,
    "enable_pynvml": false,
    "et_end_itr": 15,
    "et_start_itr": 12,
    "eval_freq": 5000,
    "exp_id": "",
    "exp_name": "",
    "experimental": {
        "early_exit": {
            "calculate_cache": true,
            "criterion": "oracle",
            "criterion_args_str": null,
            "disable_on_prompt": false,
            "end_offset": 0,
            "start_offset": 0
        },
        "early_exit_loss": {
            "e_scale": 0.1,
            "freeze_backbone": false,
            "layers_str": "0::1",
            "scale_type": "sum_l",
            "share_fc": true,
            "time_enable_mod": 8
        },
        "layer_dropout": null
    },
    "finetuning_dir": "/fsx-atom/melhoushi/xldumps/continual_llama3.2_1B_gpus_64_shared_emb_higher_lr_ee_0.1_ld_0.1_96_gpus/continual_llama3.2_1B_gpus_64_shared_emb_higher_lr_ee_0.1_ld_0.1_96_gpus_run000/checkpoints/checkpoint_0100000/",
    "fp32_reduce_scatter": "all",
    "gc_collect_freq": 1000,
    "gpu_check_level": -1,
    "increase_seq": null,
    "instruct": {
        "no_loss_prompt": false,
        "no_loss_truncated": false
    },
    "instruct_data": "",
    "iter_gopher": {
        "buffer_size": 16,
        "max_precompute": 10,
        "n_chars_by_tok": 15,
        "n_seqs_to_concat": 10,
        "num_processes": 1
    },
    "iter_jsonl": {
        "buffer_size": 64,
        "same_data": false
    },
    "iter_multi": {
        "buffer_size": 64,
        "ignore_extra_chunks": true,
        "iterate_chunk_by_chunk": false,
        "max_precompute": 200,
        "multiprocess": true
    },
    "iter_text_airstore": {
        "airstore_seed": 0,
        "buffer_size": 64,
        "dataloader_workers_per_gpu": 16,
        "prefetch_factor": 20,
        "snapshot_every_n_steps": null
    },
    "iter_type": "multi",
    "keep_eval_checkpoints": false,
    "keep_n_last_checkpoints": 2,
    "load_optimizer_on_finetuning": false,
    "log": {
        "log_scalars": false,
        "log_scalars_to_ods": false,
        "reduce_scalars": false
    },
    "log_all_steps": false,
    "log_freq": 10,
    "log_updates": true,
    "loss_logging_freq": 10,
    "loss_rescaling": false,
    "mb_recompute_attn": false,
    "mb_recompute_fc1_fc3": false,
    "mem_snapshot_max_entries": 100000,
    "mem_snapshot_stop_step": 3,
    "model": {
        "alpha_depth": "disabled",
        "alpha_lrm": 1.0,
        "alpha_separate": false,
        "alpha_wdm": 1.0,
        "attn_bias_type": "block_causal",
        "attn_to_keep": "all",
        "cp_attn_save_global_kv": true,
        "custom_bwd": false,
        "custom_bwd_sum_first_then_comms": true,
        "default_init_router": false,
        "dim": 2048,
        "dropout": 0.0,
        "efficient_attn": "auto",
        "eos_id": 128001,
        "experts_choice_moe": {
            "auto_scale_F": true,
            "capacity_factor": 1.0,
            "eval_threshold_std_mult": 0.0,
            "eval_with_saved_stats": false,
            "eval_with_top_k": false,
            "input_scaling": true,
            "input_scaling_max_clamp": 2.0,
            "input_scaling_min_clamp": -2.0,
            "is_enabled": false,
            "num_experts": 8,
            "running_stats_ema": 0.95,
            "running_stats_sync_freq": 100,
            "use_shared_expert": true
        },
        "ffn_dim_multiplier": 1.5,
        "ffn_exp": 4.0,
        "fp8_amax_compute_algo": "max",
        "fp8_amax_history_len": 1024,
        "fp8_grad_output_dynamic_scale": false,
        "fp8_input_dynamic_scale": false,
        "fp8_interval": 1,
        "fp8_margin": 0,
        "fp8_wgrad": false,
        "fsdp_checkpoint_wrap_layer_frequency": 1,
        "fuse_sequence_parallel": false,
        "high_freq_factor": 32,
        "init": {
            "coeff_std": null,
            "depth_last": false,
            "fixed_std": null,
            "no_init": false,
            "use_depth": "current",
            "use_gaussian": true
        },
        "layer_ckpt": "all",
        "less_layer_first_pp_stage": 0,
        "less_layer_last_pp_stage": 0,
        "linear_residual_dropout": true,
        "local_attention_window_len": null,
        "log_ffn_stats": false,
        "loss_parallel": false,
        "max_length": 2048,
        "multiple_of": 256,
        "n_heads": 32,
        "n_kv_heads": 8,
        "n_layers": 16,
        "non_linearity": "swiglu",
        "norm_affine": true,
        "norm_eps": 1e-05,
        "norm_type": "rmsnorm",
        "number_of_experts": 1,
        "output_size": 128256,
        "pre_norm": true,
        "recompute_attn": false,
        "recompute_fc1_out": true,
        "recompute_fc3_out": true,
        "residual_dropout": 0.1,
        "rope_attn_scale": false,
        "rope_scale_factor": 8,
        "rope_theta": 500000.0,
        "rope_use_fp32_in_outer_product": true,
        "sequence_parallel": true,
        "share_emb": true,
        "use_fp8": false,
        "use_rope": true,
        "use_scaled_rope": true,
        "use_te_layers": false,
        "vocab_parallel": true,
        "vocab_size": 128256
    },
    "model_parallel_size": 1,
    "no_final_ckpt": false,
    "num_layers_per_virtual_pipeline_stage": null,
    "num_microbatches_with_partial_activation_checkpoints": null,
    "old_mp": -1,
    "old_world_size": -1,
    "optim": {
        "annealing_step": 10000,
        "beta1": 0.9,
        "beta2": 0.95,
        "clip": 1.0,
        "cosine_theta": 1.0,
        "cycle_length": 1.0,
        "decay_length_fraction": 0.1,
        "epsilon": 1e-08,
        "exp_factor": 0.5,
        "independent_weight_decay": false,
        "lr": 2e-05,
        "lr_min_ratio": 0.1,
        "scheduler": "cosine",
        "start_annealing_step": -1,
        "use_fp32_copy_optim": true,
        "warmup": 2000,
        "weight_decay": 0.1
    },
    "optimize_backward_concat": false,
    "overlap_p2p_communication": false,
    "periodic_gpu_check": false,
    "pipeline_parallel_microbatch_size": 1,
    "pipeline_parallel_size": 1,
    "pipeline_strategy": "dfs",
    "profile_freq": -1,
    "profile_record_shapes": false,
    "profile_with_stack": false,
    "py_spy_args": {
        "active_seconds": 600,
        "format": "flamegraph",
        "freq": -1,
        "rank0_only": true,
        "rate": 50,
        "start_offset": 10
    },
    "reshard_after_forward": true,
    "restore_dataloader_position": false,
    "root_dump_dir": "/fsx-atom/melhoushi/xldumps/xldumps",
    "runtime_nccl_timeout_s": 180,
    "seq_len": 8192,
    "skip_evals_during_training": false,
    "slurm": {
        "global_rank": 0,
        "is_slurm_job": true,
        "world_size": 128
    },
    "steps": 100000,
    "tokenizer": {
        "basic_special_tokens": [
            "<|begin_of_text|>",
            "<|end_of_text|>",
            "<|fim_prefix|>",
            "<|fim_middle|>",
            "<|fim_suffix|>"
        ],
        "directory": "/fsx-onellm/myasu/models/llama3/Meta-Llama-3-8B/original_fsdp/",
        "extra_special_tokens": [],
        "model": "cl_toplang_128k",
        "num_reserved_special_tokens": 256,
        "pat_str": "(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}{1,3}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+",
        "tokenizer_cls": "TiktokenTokenizer"
    },
    "tokenizer_dir": "",
    "torch_seed": 0,
    "unlimited_steps": false,
    "use_checkpointing_process": false,
    "use_sum_loss": false,
    "valid": {
        "batch_size": 4,
        "debug": false,
        "majority_voting": 0,
        "n_batches": 100,
        "ppl_files_str": "/fsx-labs/broz/data/shuffled/stackv1/stack_v1_2023_02_27_test/stack_test_file.jsonl,/fsx-labs/broz/datasets/wikipedia/processed/en/wikipedia_en_all_maxi_2023-02.jsonl,/fsx-labs/broz/datasets/gutenberg/processed/selected_books.jsonl",
        "prompt_path": "",
        "random_fewshots": false,
        "seed": 42,
        "seq_len": 2048,
        "skip_sanity_check": false,
        "tasks_root_dir": "/fsx-labs/broz/data/large_experiments/theorem/datasets/eval",
        "tasks_str": "human_eval,piqa,siqa,hellaswag",
        "temperature": 0.0,
        "top_k": 0,
        "top_p": 0.0,
        "use_sampling": false,
        "write_eval": false
    },
    "z_loss_multiplier": 0.0
}