commit 66b3dd1ddbb4cb71e2aa0345a3cd39e1136c7a2d Author: ModelHub XC Date: Tue May 19 07:32:30 2026 +0800 初始化项目,由ModelHub XC社区提供模型 Model: JunHowie/Qwen3-8B-Instruct-2512-SFT Source: Original Platform diff --git a/.gitattributes b/.gitattributes new file mode 100644 index 0000000..437f1fb --- /dev/null +++ b/.gitattributes @@ -0,0 +1,56 @@ +*.7z filter=lfs diff=lfs merge=lfs -text +*.arrow filter=lfs diff=lfs merge=lfs -text + + +*.bz2 filter=lfs diff=lfs merge=lfs -text +*.ftz filter=lfs diff=lfs merge=lfs -text +*.gz filter=lfs diff=lfs merge=lfs -text +*.h5 filter=lfs diff=lfs merge=lfs -text +*.joblib filter=lfs diff=lfs merge=lfs -text +*.lfs.* filter=lfs diff=lfs merge=lfs -text +*.model filter=lfs diff=lfs merge=lfs -text +*.msgpack filter=lfs diff=lfs merge=lfs -text +*.onnx filter=lfs diff=lfs merge=lfs -text +*.ot filter=lfs diff=lfs merge=lfs -text +*.parquet filter=lfs diff=lfs merge=lfs -text +*.pb filter=lfs diff=lfs merge=lfs -text +*.pt filter=lfs diff=lfs merge=lfs -text +*.pth filter=lfs diff=lfs merge=lfs -text +*.rar filter=lfs diff=lfs merge=lfs -text +saved_model/**/* filter=lfs diff=lfs merge=lfs -text +*.tar.* filter=lfs diff=lfs merge=lfs -text +*.tflite filter=lfs diff=lfs merge=lfs -text +*.tgz filter=lfs diff=lfs merge=lfs -text +*.xz filter=lfs diff=lfs merge=lfs -text +*.zip filter=lfs diff=lfs merge=lfs -text +*.zstandard filter=lfs diff=lfs merge=lfs -text +*.tfevents* filter=lfs diff=lfs merge=lfs -text +*.db* filter=lfs diff=lfs merge=lfs -text +*.ark* filter=lfs diff=lfs merge=lfs -text +**/*ckpt*data* filter=lfs diff=lfs merge=lfs -text +**/*ckpt*.meta filter=lfs diff=lfs merge=lfs -text +**/*ckpt*.index filter=lfs diff=lfs merge=lfs -text + +*.ckpt filter=lfs diff=lfs merge=lfs -text +*.gguf* filter=lfs diff=lfs merge=lfs -text +*.ggml filter=lfs diff=lfs merge=lfs -text +*.llamafile* filter=lfs diff=lfs merge=lfs -text +*.pt2 filter=lfs diff=lfs merge=lfs -text +*.mlmodel filter=lfs diff=lfs merge=lfs -text +*.npy filter=lfs 
diff=lfs merge=lfs -text +*.npz filter=lfs diff=lfs merge=lfs -text +*.pickle filter=lfs diff=lfs merge=lfs -text +*.pkl filter=lfs diff=lfs merge=lfs -text +*.tar filter=lfs diff=lfs merge=lfs -text +*.wasm filter=lfs diff=lfs merge=lfs -text +*.zst filter=lfs diff=lfs merge=lfs -text +*tfevents* filter=lfs diff=lfs merge=lfs -text + +model-00003-of-00004.safetensors filter=lfs diff=lfs merge=lfs -text +model-00001-of-00004.safetensors filter=lfs diff=lfs merge=lfs -text +training_args.bin filter=lfs diff=lfs merge=lfs -text +vocab.json filter=lfs diff=lfs merge=lfs -text +tokenizer.json filter=lfs diff=lfs merge=lfs -text +merges.txt filter=lfs diff=lfs merge=lfs -text +model-00004-of-00004.safetensors filter=lfs diff=lfs merge=lfs -text +model-00002-of-00004.safetensors filter=lfs diff=lfs merge=lfs -text \ No newline at end of file diff --git a/README.md b/README.md new file mode 100644 index 0000000..1af1c7a --- /dev/null +++ b/README.md @@ -0,0 +1,118 @@ +--- +library_name: transformers +license: apache-2.0 +license_link: https://huggingface.co/Qwen/Qwen3-8B/blob/main/LICENSE +pipeline_tag: text-generation +base_model: +- Qwen/Qwen3-8B +--- +# Qwen3-8B-Instruct-2512-SFT + + +**NOTE: This model is the Instruct-aligned variant, and it will not generate `<think></think>` blocks in its outputs. +Additionally, there is no need to specify enable_thinking=False anymore.** + + + +Among them, the 8B and 14B SFT and DFT variants are obtained via full-parameter fine-tuning, while the 32B models are trained using LoRA due to hardware resource constraints.
+The dataset used is the Chinese Distillation Dataset based on Qwen3-235B-2507, available at: [**Chinese-Qwen3-235B-2507-Distill-data-110k**](https://www.modelscope.cn/datasets/swift/Chinese-Qwen3-235B-2507-Distill-data-110k) +
+For details and code regarding model training and quantization, please see the [Training and Quantization Guide](https://www.modelscope.cn/learn/3000). +
+Here is the list of models released in this version:
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
Model4-bit AWQ8-bit FP8GPTQNVIDIA FP4Weight-Activation
AWQAWQ-asymINT4INT8NVFP4NVFP4-A16W4A16W8A8
Qwen3-8B-Instruct-2512-DFTAWQawq-asymFP8GPTQ(int4)GPTQ(int8)NVFP4NVFP4A16W4A16W8A8
Qwen3-8B-Instruct-2512-SFTAWQawq-asymFP8GPTQ(int4)GPTQ(int8)NVFP4NVFP4A16W4A16W8A8
Qwen3-14B-Instruct-2512-DFTAWQawq-asymFP8GPTQ(int4)GPTQ(int8)NVFP4NVFP4A16W4A16W8A8
Qwen3-14B-Instruct-2512-SFTAWQawq-asymFP8GPTQ(int4)GPTQ(int8)NVFP4NVFP4A16W4A16W8A8
Qwen3-32B-Instruct-2512-DFTAWQawq-asymFP8GPTQ(int4)GPTQ(int8)NVFP4NVFP4A16W4A16W8A8
+ + + +### 【Dependencies】 +``` +vllm>=0.10.2 +transformers>=4.56.1 +``` + + + diff --git a/added_tokens.json b/added_tokens.json new file mode 100644 index 0000000..b54f913 --- /dev/null +++ b/added_tokens.json @@ -0,0 +1,28 @@ +{ + "": 151668, + "": 151658, + "": 151666, + "": 151667, + "": 151657, + "": 151665, + "<|box_end|>": 151649, + "<|box_start|>": 151648, + "<|endoftext|>": 151643, + "<|file_sep|>": 151664, + "<|fim_middle|>": 151660, + "<|fim_pad|>": 151662, + "<|fim_prefix|>": 151659, + "<|fim_suffix|>": 151661, + "<|im_end|>": 151645, + "<|im_start|>": 151644, + "<|image_pad|>": 151655, + "<|object_ref_end|>": 151647, + "<|object_ref_start|>": 151646, + "<|quad_end|>": 151651, + "<|quad_start|>": 151650, + "<|repo_name|>": 151663, + "<|video_pad|>": 151656, + "<|vision_end|>": 151653, + "<|vision_pad|>": 151654, + "<|vision_start|>": 151652 +} diff --git a/args.json b/args.json new file mode 100644 index 0000000..c1b1966 --- /dev/null +++ b/args.json @@ -0,0 +1,390 @@ +{ + "output_dir": "/root/data/output/8B-SFT-Zero3/v0-20251202-213653", + "overwrite_output_dir": false, + "do_train": false, + "do_eval": false, + "do_predict": false, + "eval_strategy": "steps", + "prediction_loss_only": false, + "per_device_train_batch_size": 4, + "per_device_eval_batch_size": 1, + "per_gpu_train_batch_size": null, + "per_gpu_eval_batch_size": null, + "gradient_accumulation_steps": 1, + "eval_accumulation_steps": null, + "eval_delay": 0, + "torch_empty_cache_steps": null, + "learning_rate": 1e-05, + "weight_decay": 0.1, + "adam_beta1": 0.9, + "adam_beta2": 0.95, + "adam_epsilon": 1e-08, + "max_grad_norm": 1.0, + "num_train_epochs": 1.0, + "max_steps": -1, + "lr_scheduler_type": "cosine", + "lr_scheduler_kwargs": null, + "warmup_ratio": 0.0, + "warmup_steps": 250, + "log_level": "passive", + "log_level_replica": "warning", + "log_on_each_node": true, + "logging_dir": "/root/data/output/8B-SFT-Zero3/v0-20251202-213653/runs", + "logging_strategy": "steps", + 
"logging_first_step": true, + "logging_steps": 10, + "logging_nan_inf_filter": true, + "save_strategy": "steps", + "save_steps": 200.0, + "save_total_limit": 5, + "save_safetensors": true, + "save_on_each_node": false, + "save_only_model": true, + "restore_callback_states_from_checkpoint": false, + "no_cuda": false, + "use_cpu": false, + "use_mps_device": false, + "seed": 42, + "data_seed": 42, + "jit_mode_eval": false, + "bf16": true, + "fp16": false, + "fp16_opt_level": "O1", + "half_precision_backend": "auto", + "bf16_full_eval": false, + "fp16_full_eval": false, + "tf32": null, + "local_rank": 0, + "ddp_backend": null, + "tpu_num_cores": null, + "tpu_metrics_debug": false, + "debug": null, + "dataloader_drop_last": false, + "eval_steps": 200.0, + "dataloader_num_workers": 4, + "dataloader_prefetch_factor": null, + "past_index": -1, + "run_name": "/root/data/output/8B-SFT-Zero3/v0-20251202-213653", + "disable_tqdm": null, + "remove_unused_columns": true, + "label_names": null, + "load_best_model_at_end": false, + "metric_for_best_model": "loss", + "greater_is_better": false, + "ignore_data_skip": false, + "fsdp": null, + "fsdp_min_num_params": 0, + "fsdp_config": null, + "fsdp_transformer_layer_cls_to_wrap": null, + "accelerator_config": { + "dispatch_batches": false + }, + "parallelism_config": null, + "deepspeed": { + "fp16": { + "enabled": "auto", + "loss_scale": 0, + "loss_scale_window": 1000, + "initial_scale_power": 16, + "hysteresis": 2, + "min_loss_scale": 1 + }, + "bf16": { + "enabled": "auto" + }, + "zero_optimization": { + "stage": 3, + "offload_optimizer": { + "device": "none", + "pin_memory": true + }, + "offload_param": { + "device": "none", + "pin_memory": true + }, + "overlap_comm": false, + "contiguous_gradients": true, + "sub_group_size": 1000000000.0, + "reduce_bucket_size": "auto", + "zero_quantized_weights": false, + "zero_quantized_gradients": false, + "stage3_prefetch_bucket_size": "auto", + "stage3_param_persistence_threshold": "auto", + 
"stage3_max_live_parameters": 1000000000.0, + "stage3_max_reuse_distance": 1000000000.0, + "stage3_gather_16bit_weights_on_model_save": true + }, + "gradient_accumulation_steps": "auto", + "gradient_clipping": "auto", + "steps_per_print": 2000, + "train_batch_size": "auto", + "train_micro_batch_size_per_gpu": "auto", + "wall_clock_breakdown": false + }, + "label_smoothing_factor": 0.0, + "optim": "adamw_torch_fused", + "optim_args": null, + "adafactor": false, + "group_by_length": true, + "length_column_name": "length", + "report_to": [ + "tensorboard" + ], + "project": "huggingface", + "trackio_space_id": "trackio", + "ddp_find_unused_parameters": false, + "ddp_bucket_cap_mb": null, + "ddp_broadcast_buffers": null, + "dataloader_pin_memory": true, + "dataloader_persistent_workers": false, + "skip_memory_metrics": true, + "use_legacy_prediction_loop": false, + "push_to_hub": false, + "resume_from_checkpoint": null, + "hub_model_id": null, + "hub_strategy": "every_save", + "hub_token": null, + "hub_private_repo": null, + "hub_always_push": false, + "hub_revision": null, + "gradient_checkpointing": true, + "gradient_checkpointing_kwargs": null, + "include_inputs_for_metrics": false, + "include_for_metrics": [], + "eval_do_concat_batches": true, + "fp16_backend": "auto", + "push_to_hub_model_id": null, + "push_to_hub_organization": null, + "push_to_hub_token": null, + "mp_parameters": "", + "auto_find_batch_size": false, + "full_determinism": false, + "torchdynamo": null, + "ray_scope": "last", + "ddp_timeout": 18000000, + "torch_compile": false, + "torch_compile_backend": null, + "torch_compile_mode": null, + "include_tokens_per_second": false, + "include_num_input_tokens_seen": false, + "neftune_noise_alpha": null, + "optim_target_modules": null, + "batch_eval_metrics": false, + "eval_on_start": false, + "use_liger_kernel": true, + "liger_kernel_config": null, + "eval_use_gather_object": false, + "average_tokens_across_devices": true, + "sortish_sampler": false, + 
"predict_with_generate": false, + "generation_max_length": null, + "generation_num_beams": null, + "generation_config": null, + "tuner_backend": "peft", + "vit_gradient_checkpointing": null, + "router_aux_loss_coef": 0.0, + "enable_dft_loss": false, + "enable_channel_loss": false, + "check_model": true, + "acc_strategy": "token", + "train_dataloader_shuffle": true, + "max_epochs": null, + "aligner_lr": null, + "vit_lr": null, + "use_logits_to_keep": null, + "ds3_gather_for_generation": true, + "resume_only_model": false, + "optimizer": null, + "loss_type": null, + "metric": null, + "eval_use_evalscope": false, + "eval_dataset": [], + "eval_dataset_args": null, + "eval_limit": null, + "eval_generation_config": null, + "extra_eval_args": null, + "use_flash_ckpt": false, + "use_ray": false, + "ray_exp_name": null, + "device_groups": null, + "model": "/share/new_models/qwen3/Qwen3-8B", + "model_type": "qwen3_nothinking", + "model_revision": null, + "task_type": "causal_lm", + "torch_dtype": "bfloat16", + "attn_impl": "flash_attn", + "new_special_tokens": [], + "num_labels": null, + "problem_type": null, + "rope_scaling": null, + "device_map": null, + "max_memory": {}, + "max_model_len": null, + "local_repo_path": null, + "init_strategy": null, + "template": "qwen3_nothinking", + "system": null, + "max_length": 1536, + "truncation_strategy": "delete", + "max_pixels": null, + "agent_template": null, + "norm_bbox": null, + "use_chat_template": true, + "padding_free": false, + "padding_side": "right", + "loss_scale": "default", + "sequence_parallel_size": 1, + "response_prefix": null, + "template_backend": "swift", + "dataset": [ + "/root/data/datasets/qwen3_235b_2507_distill_110k.jsonl" + ], + "val_dataset": [], + "split_dataset_ratio": 0.02, + "dataset_num_proc": 1, + "load_from_cache_file": true, + "dataset_shuffle": true, + "val_dataset_shuffle": false, + "streaming": false, + "interleave_prob": null, + "stopping_strategy": "first_exhausted", + "shuffle_buffer_size": 
1000, + "download_mode": "reuse_dataset_if_exists", + "columns": {}, + "strict": false, + "model_name": [ + "Qwen3-8B-SFT" + ], + "model_author": [ + "JunHowie" + ], + "custom_dataset_info": [], + "quant_method": null, + "quant_bits": null, + "hqq_axis": null, + "bnb_4bit_compute_dtype": "bfloat16", + "bnb_4bit_quant_type": "nf4", + "bnb_4bit_use_double_quant": true, + "bnb_4bit_quant_storage": null, + "max_new_tokens": 64, + "temperature": 0.0, + "top_k": null, + "top_p": null, + "repetition_penalty": null, + "num_beams": 1, + "stream": false, + "stop_words": [], + "logprobs": false, + "top_logprobs": null, + "ckpt_dir": null, + "lora_modules": [], + "train_type": "full", + "adapters": [], + "external_plugins": [], + "model_kwargs": {}, + "load_args": false, + "load_data_args": false, + "packing": false, + "packing_length": null, + "lazy_tokenize": false, + "cached_dataset": [], + "custom_register_path": [], + "use_hf": false, + "ignore_args_error": false, + "use_swift_lora": false, + "freeze_parameters": [], + "freeze_parameters_regex": null, + "freeze_parameters_ratio": 0.0, + "trainable_parameters": [], + "trainable_parameters_regex": null, + "freeze_llm": false, + "freeze_vit": true, + "freeze_aligner": true, + "target_modules": [ + "all-linear" + ], + "target_regex": null, + "target_parameters": null, + "modules_to_save": [], + "lora_rank": 8, + "lora_alpha": 32, + "lora_dropout": 0.05, + "lora_bias": "none", + "lora_dtype": null, + "lorap_lr_ratio": null, + "use_rslora": false, + "use_dora": false, + "lora_ga_batch_size": 2, + "lora_ga_iters": 2, + "lora_ga_max_length": 1024, + "lora_ga_direction": "ArB2r", + "lora_ga_scale": "stable", + "lora_ga_stable_gamma": 16, + "init_weights": true, + "fourier_n_frequency": 2000, + "fourier_scaling": 300.0, + "boft_block_size": 4, + "boft_block_num": 0, + "boft_n_butterfly_factor": 1, + "boft_dropout": 0.0, + "vera_rank": 256, + "vera_projection_prng_key": 0, + "vera_dropout": 0.0, + "vera_d_initial": 0.1, + 
"adapter_act": "gelu", + "adapter_length": 128, + "use_galore": false, + "galore_target_modules": null, + "galore_rank": 128, + "galore_update_proj_gap": 50, + "galore_scale": 1.0, + "galore_proj_type": "std", + "galore_optim_per_parameter": false, + "galore_with_embedding": false, + "galore_quantization": false, + "galore_proj_quant": false, + "galore_proj_bits": 4, + "galore_proj_group_size": 256, + "galore_cos_threshold": 0.4, + "galore_gamma_proj": 2, + "galore_queue_size": 5, + "adalora_target_r": 8, + "adalora_init_r": 12, + "adalora_tinit": 0, + "adalora_tfinal": 0, + "adalora_deltaT": 1, + "adalora_beta1": 0.85, + "adalora_beta2": 0.85, + "adalora_orth_reg_weight": 0.5, + "llamapro_num_new_blocks": 4, + "llamapro_num_groups": null, + "lisa_activated_layers": 0, + "lisa_step_interval": 20, + "reft_layer_key": null, + "reft_layers": null, + "reft_rank": 4, + "reft_intervention_type": "LoreftIntervention", + "reft_args": null, + "swanlab_token": null, + "swanlab_project": null, + "swanlab_workspace": null, + "swanlab_exp_name": null, + "swanlab_lark_webhook_url": null, + "swanlab_lark_secret": null, + "swanlab_mode": "cloud", + "add_version": true, + "create_checkpoint_symlink": false, + "zero_hpz_partition_size": null, + "deepspeed_autotp_size": null, + "early_stop_interval": null, + "rank": 0, + "global_world_size": 4, + "local_world_size": 4, + "model_suffix": "Qwen3-8B", + "model_info": "ModelInfo(model_type='qwen3_nothinking', model_dir='/share/new_models/qwen3/Qwen3-8B', torch_dtype=torch.bfloat16, max_model_len=40960, quant_method=None, quant_bits=None, rope_scaling=None, is_moe_model=False, config=None, task_type='causal_lm', num_labels=None)", + "model_meta": "ModelMeta(model_type='qwen3_nothinking', model_groups=[ModelGroup(models=[Model(ms_model_id='Qwen/Qwen3-30B-A3B-Instruct-2507', hf_model_id='Qwen/Qwen3-30B-A3B-Instruct-2507', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen3-30B-A3B-Instruct-2507-FP8', 
hf_model_id='Qwen/Qwen3-30B-A3B-Instruct-2507-FP8', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen3-235B-A22B-Instruct-2507', hf_model_id='Qwen/Qwen3-235B-A22B-Instruct-2507', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen3-235B-A22B-Instruct-2507-FP8', hf_model_id='Qwen/Qwen3-235B-A22B-Instruct-2507-FP8', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='swift/Qwen3-235B-A22B-Instruct-2507-AWQ', hf_model_id=None, model_path=None, ms_revision=None, hf_revision=None)], ignore_patterns=None, requires=None, tags=[]), ModelGroup(models=[Model(ms_model_id='Qwen/Qwen3-4B-Instruct-2507', hf_model_id='Qwen/Qwen3-4B-Instruct-2507', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen3-4B-Instruct-2507-FP8', hf_model_id='Qwen/Qwen3-4B-Instruct-2507-FP8', model_path=None, ms_revision=None, hf_revision=None)], ignore_patterns=None, requires=None, tags=[])], template='qwen3_nothinking', get_function=, model_arch=None, architectures=['Qwen3MoeForCausalLM', 'Qwen3ForCausalLM'], additional_saved_files=[], torch_dtype=None, is_multimodal=False, is_reward=False, is_reranker=False, task_type=None, ignore_patterns=None, requires=['transformers>=4.51'], tags=[])", + "model_dir": "/share/new_models/qwen3/Qwen3-8B", + "hub": "", + "evaluation_strategy": "steps", + "training_args": "Seq2SeqTrainingArguments(output_dir='/root/data/output/8B-SFT-Zero3/v0-20251202-213653', overwrite_output_dir=False, do_train=False, do_eval=True, do_predict=False, eval_strategy=, prediction_loss_only=False, per_device_train_batch_size=4, per_device_eval_batch_size=1, per_gpu_train_batch_size=None, per_gpu_eval_batch_size=None, gradient_accumulation_steps=1, eval_accumulation_steps=None, eval_delay=0, torch_empty_cache_steps=None, learning_rate=1e-05, weight_decay=0.1, adam_beta1=0.9, adam_beta2=0.95, adam_epsilon=1e-08, max_grad_norm=1.0, num_train_epochs=1.0, max_steps=-1, 
lr_scheduler_type=, lr_scheduler_kwargs=None, warmup_ratio=0.0, warmup_steps=250, log_level='passive', log_level_replica='warning', log_on_each_node=True, logging_dir='/root/data/output/8B-SFT-Zero3/v0-20251202-213653/runs', logging_strategy=, logging_first_step=True, logging_steps=10, logging_nan_inf_filter=True, save_strategy=, save_steps=200, save_total_limit=5, save_safetensors=True, save_on_each_node=False, save_only_model=True, restore_callback_states_from_checkpoint=False, no_cuda=False, use_cpu=False, use_mps_device=False, seed=42, data_seed=42, jit_mode_eval=False, bf16=True, fp16=False, fp16_opt_level='O1', half_precision_backend='auto', bf16_full_eval=False, fp16_full_eval=False, tf32=None, local_rank=0, ddp_backend=None, tpu_num_cores=None, tpu_metrics_debug=False, debug=[], dataloader_drop_last=False, eval_steps=200, dataloader_num_workers=4, dataloader_prefetch_factor=10, past_index=-1, run_name='/root/data/output/8B-SFT-Zero3/v0-20251202-213653', disable_tqdm=False, remove_unused_columns=False, label_names=None, load_best_model_at_end=False, metric_for_best_model='loss', greater_is_better=False, ignore_data_skip=False, fsdp=[], fsdp_min_num_params=0, fsdp_config={'min_num_params': 0, 'xla': False, 'xla_fsdp_v2': False, 'xla_fsdp_grad_ckpt': False}, fsdp_transformer_layer_cls_to_wrap=None, accelerator_config=AcceleratorConfig(split_batches=False, dispatch_batches=False, even_batches=True, use_seedable_sampler=True, non_blocking=False, gradient_accumulation_kwargs=None, use_configured_state=False), parallelism_config=None, deepspeed={'fp16': {'enabled': 'auto', 'loss_scale': 0, 'loss_scale_window': 1000, 'initial_scale_power': 16, 'hysteresis': 2, 'min_loss_scale': 1}, 'bf16': {'enabled': 'auto'}, 'zero_optimization': {'stage': 3, 'offload_optimizer': {'device': 'none', 'pin_memory': True}, 'offload_param': {'device': 'none', 'pin_memory': True}, 'overlap_comm': False, 'contiguous_gradients': True, 'sub_group_size': 1000000000.0, 'reduce_bucket_size': 
'auto', 'zero_quantized_weights': False, 'zero_quantized_gradients': False, 'stage3_prefetch_bucket_size': 'auto', 'stage3_param_persistence_threshold': 'auto', 'stage3_max_live_parameters': 1000000000.0, 'stage3_max_reuse_distance': 1000000000.0, 'stage3_gather_16bit_weights_on_model_save': True}, 'gradient_accumulation_steps': 'auto', 'gradient_clipping': 'auto', 'steps_per_print': 2000, 'train_batch_size': 'auto', 'train_micro_batch_size_per_gpu': 'auto', 'wall_clock_breakdown': False}, label_smoothing_factor=0.0, optim=, optim_args=None, adafactor=False, group_by_length=True, length_column_name='length', report_to=['tensorboard'], project='huggingface', trackio_space_id='trackio', ddp_find_unused_parameters=False, ddp_bucket_cap_mb=None, ddp_broadcast_buffers=None, dataloader_pin_memory=True, dataloader_persistent_workers=False, skip_memory_metrics=True, use_legacy_prediction_loop=False, push_to_hub=False, resume_from_checkpoint=None, hub_model_id=None, hub_strategy=, hub_token=None, hub_private_repo=None, hub_always_push=False, hub_revision=None, gradient_checkpointing=True, gradient_checkpointing_kwargs=None, include_inputs_for_metrics=False, include_for_metrics=[], eval_do_concat_batches=True, fp16_backend='auto', push_to_hub_model_id=None, push_to_hub_organization=None, push_to_hub_token=None, mp_parameters='', auto_find_batch_size=False, full_determinism=False, torchdynamo=None, ray_scope='last', ddp_timeout=18000000, torch_compile=False, torch_compile_backend=None, torch_compile_mode=None, include_tokens_per_second=None, include_num_input_tokens_seen=None, neftune_noise_alpha=None, optim_target_modules=None, batch_eval_metrics=False, eval_on_start=False, use_liger_kernel=True, liger_kernel_config=None, eval_use_gather_object=False, average_tokens_across_devices=None, sortish_sampler=False, predict_with_generate=False, generation_max_length=None, generation_num_beams=None, generation_config=None, tuner_backend='peft', vit_gradient_checkpointing=True, 
router_aux_loss_coef=0.0, enable_dft_loss=False, enable_channel_loss=False, check_model=True, acc_strategy='token', train_dataloader_shuffle=True, max_epochs=None, aligner_lr=None, vit_lr=None, use_logits_to_keep=None, ds3_gather_for_generation=True, resume_only_model=False, optimizer=None, loss_type=None, metric=None, eval_use_evalscope=False, eval_dataset=[], eval_dataset_args=None, eval_limit=None, eval_generation_config=None, extra_eval_args=None, use_flash_ckpt=False, sft_alpha=0, chord_sft_dataset=[], chord_sft_per_device_train_batch_size=None, chord_enable_phi_function=False, chord_mu_warmup_steps=None, chord_mu_decay_steps=None, chord_mu_peak=None, chord_mu_valley=None, train_type='full', local_repo_path=None, galore_config=None, padding_side='right', padding_free=False, task_type='causal_lm', problem_type=None)" +} \ No newline at end of file diff --git a/chat_template.jinja b/chat_template.jinja new file mode 100644 index 0000000..699ff8d --- /dev/null +++ b/chat_template.jinja @@ -0,0 +1,85 @@ +{%- if tools %} + {{- '<|im_start|>system\n' }} + {%- if messages[0].role == 'system' %} + {{- messages[0].content + '\n\n' }} + {%- endif %} + {{- "# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within XML tags:\n" }} + {%- for tool in tools %} + {{- "\n" }} + {{- tool | tojson }} + {%- endfor %} + {{- "\n\n\nFor each function call, return a json object with function name and arguments within XML tags:\n\n{\"name\": , \"arguments\": }\n<|im_end|>\n" }} +{%- else %} + {%- if messages[0].role == 'system' %} + {{- '<|im_start|>system\n' + messages[0].content + '<|im_end|>\n' }} + {%- endif %} +{%- endif %} +{%- set ns = namespace(multi_step_tool=true, last_query_index=messages|length - 1) %} +{%- for message in messages[::-1] %} + {%- set index = (messages|length - 1) - loop.index0 %} + {%- if ns.multi_step_tool and message.role == "user" and not(message.content.startswith('') and 
message.content.endswith('')) %} + {%- set ns.multi_step_tool = false %} + {%- set ns.last_query_index = index %} + {%- endif %} +{%- endfor %} +{%- for message in messages %} + {%- if (message.role == "user") or (message.role == "system" and not loop.first) %} + {{- '<|im_start|>' + message.role + '\n' + message.content + '<|im_end|>' + '\n' }} + {%- elif message.role == "assistant" %} + {%- set content = message.content %} + {%- set reasoning_content = '' %} + {%- if message.reasoning_content is defined and message.reasoning_content is not none %} + {%- set reasoning_content = message.reasoning_content %} + {%- else %} + {%- if '' in message.content %} + {%- set content = message.content.split('')[-1].lstrip('\n') %} + {%- set reasoning_content = message.content.split('')[0].rstrip('\n').split('')[-1].lstrip('\n') %} + {%- endif %} + {%- endif %} + {%- if loop.index0 > ns.last_query_index %} + {%- if loop.last or (not loop.last and reasoning_content) %} + {{- '<|im_start|>' + message.role + '\n\n' + reasoning_content.strip('\n') + '\n\n\n' + content.lstrip('\n') }} + {%- else %} + {{- '<|im_start|>' + message.role + '\n' + content }} + {%- endif %} + {%- else %} + {{- '<|im_start|>' + message.role + '\n' + content }} + {%- endif %} + {%- if message.tool_calls %} + {%- for tool_call in message.tool_calls %} + {%- if (loop.first and content) or (not loop.first) %} + {{- '\n' }} + {%- endif %} + {%- if tool_call.function %} + {%- set tool_call = tool_call.function %} + {%- endif %} + {{- '\n{"name": "' }} + {{- tool_call.name }} + {{- '", "arguments": ' }} + {%- if tool_call.arguments is string %} + {{- tool_call.arguments }} + {%- else %} + {{- tool_call.arguments | tojson }} + {%- endif %} + {{- '}\n' }} + {%- endfor %} + {%- endif %} + {{- '<|im_end|>\n' }} + {%- elif message.role == "tool" %} + {%- if loop.first or (messages[loop.index0 - 1].role != "tool") %} + {{- '<|im_start|>user' }} + {%- endif %} + {{- '\n\n' }} + {{- message.content }} + {{- '\n' }} + {%- 
if loop.last or (messages[loop.index0 + 1].role != "tool") %} + {{- '<|im_end|>\n' }} + {%- endif %} + {%- endif %} +{%- endfor %} +{%- if add_generation_prompt %} + {{- '<|im_start|>assistant\n' }} + {%- if enable_thinking is defined and enable_thinking is false %} + {{- '\n\n\n\n' }} + {%- endif %} +{%- endif %} \ No newline at end of file diff --git a/config.json b/config.json new file mode 100644 index 0000000..18a4a4d --- /dev/null +++ b/config.json @@ -0,0 +1,68 @@ +{ + "architectures": [ + "Qwen3ForCausalLM" + ], + "attention_bias": false, + "attention_dropout": 0.0, + "dtype": "bfloat16", + "eos_token_id": 151645, + "head_dim": 128, + "hidden_act": "silu", + "hidden_size": 4096, + "initializer_range": 0.02, + "intermediate_size": 12288, + "layer_types": [ + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention" + ], + "max_position_embeddings": 40960, + "max_window_layers": 36, + "model_type": "qwen3", + "num_attention_heads": 32, + "num_hidden_layers": 36, + "num_key_value_heads": 8, + "pad_token_id": 151643, + "rms_norm_eps": 1e-06, + "rope_scaling": null, + "rope_theta": 1000000, + "sliding_window": null, + "tie_word_embeddings": false, + "transformers_version": "4.57.1", + "use_cache": false, + "use_sliding_window": false, + "vocab_size": 151936 +} diff --git a/generation_config.json 
b/generation_config.json new file mode 100644 index 0000000..9e289a1 --- /dev/null +++ b/generation_config.json @@ -0,0 +1,13 @@ +{ + "bos_token_id": 151643, + "do_sample": true, + "eos_token_id": [ + 151645, + 151643 + ], + "pad_token_id": 151643, + "temperature": 0.6, + "top_k": 20, + "top_p": 0.95, + "transformers_version": "4.57.1" +} diff --git a/merges.txt b/merges.txt new file mode 100644 index 0000000..80c1a19 --- /dev/null +++ b/merges.txt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8831e4f1a044471340f7c0a83d7bd71306a5b867e95fd870f74d0c5308a904d5 +size 1671853 diff --git a/model-00001-of-00004.safetensors b/model-00001-of-00004.safetensors new file mode 100644 index 0000000..d69724f --- /dev/null +++ b/model-00001-of-00004.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fb1faf34053d1c4b2bbcfb058443aa1dd3b0cda6be16bf310f962ecd485095f2 +size 4902257696 diff --git a/model-00002-of-00004.safetensors b/model-00002-of-00004.safetensors new file mode 100644 index 0000000..4aecff5 --- /dev/null +++ b/model-00002-of-00004.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2c01af1ef673ae030502e69abda645b70164043787c1db0c2d6088faab75870a +size 4915960368 diff --git a/model-00003-of-00004.safetensors b/model-00003-of-00004.safetensors new file mode 100644 index 0000000..a8ea04f --- /dev/null +++ b/model-00003-of-00004.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:651792c03e8b92a608db7f189349acf0885a831028382f9138fe75be0531845e +size 4983068496 diff --git a/model-00004-of-00004.safetensors b/model-00004-of-00004.safetensors new file mode 100644 index 0000000..03a5962 --- /dev/null +++ b/model-00004-of-00004.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:793cdf85218ef2372ab19cd73f18e5eef726dc432b46deaf3f4a0ac4205772f0 +size 1580230264 diff --git a/model.safetensors.index.json 
b/model.safetensors.index.json new file mode 100644 index 0000000..ba886c0 --- /dev/null +++ b/model.safetensors.index.json @@ -0,0 +1,407 @@ +{ + "metadata": { + "total_parameters": 308224, + "total_size": 16381470720 + }, + "weight_map": { + "lm_head.weight": "model-00004-of-00004.safetensors", + "model.embed_tokens.weight": "model-00001-of-00004.safetensors", + "model.layers.0.input_layernorm.weight": "model-00001-of-00004.safetensors", + "model.layers.0.mlp.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.0.mlp.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.0.mlp.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.0.post_attention_layernorm.weight": "model-00001-of-00004.safetensors", + "model.layers.0.self_attn.k_norm.weight": "model-00001-of-00004.safetensors", + "model.layers.0.self_attn.k_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.0.self_attn.o_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.0.self_attn.q_norm.weight": "model-00001-of-00004.safetensors", + "model.layers.0.self_attn.q_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.0.self_attn.v_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.1.input_layernorm.weight": "model-00001-of-00004.safetensors", + "model.layers.1.mlp.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.1.mlp.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.1.mlp.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.1.post_attention_layernorm.weight": "model-00001-of-00004.safetensors", + "model.layers.1.self_attn.k_norm.weight": "model-00001-of-00004.safetensors", + "model.layers.1.self_attn.k_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.1.self_attn.o_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.1.self_attn.q_norm.weight": "model-00001-of-00004.safetensors", + 
"model.layers.1.self_attn.q_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.1.self_attn.v_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.10.input_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.10.mlp.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.10.mlp.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.10.mlp.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.10.post_attention_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.10.self_attn.k_norm.weight": "model-00002-of-00004.safetensors", + "model.layers.10.self_attn.k_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.10.self_attn.o_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.10.self_attn.q_norm.weight": "model-00002-of-00004.safetensors", + "model.layers.10.self_attn.q_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.10.self_attn.v_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.11.input_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.11.mlp.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.11.mlp.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.11.mlp.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.11.post_attention_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.11.self_attn.k_norm.weight": "model-00002-of-00004.safetensors", + "model.layers.11.self_attn.k_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.11.self_attn.o_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.11.self_attn.q_norm.weight": "model-00002-of-00004.safetensors", + "model.layers.11.self_attn.q_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.11.self_attn.v_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.12.input_layernorm.weight": 
"model-00002-of-00004.safetensors", + "model.layers.12.mlp.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.12.mlp.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.12.mlp.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.12.post_attention_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.12.self_attn.k_norm.weight": "model-00002-of-00004.safetensors", + "model.layers.12.self_attn.k_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.12.self_attn.o_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.12.self_attn.q_norm.weight": "model-00002-of-00004.safetensors", + "model.layers.12.self_attn.q_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.12.self_attn.v_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.13.input_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.13.mlp.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.13.mlp.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.13.mlp.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.13.post_attention_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.13.self_attn.k_norm.weight": "model-00002-of-00004.safetensors", + "model.layers.13.self_attn.k_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.13.self_attn.o_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.13.self_attn.q_norm.weight": "model-00002-of-00004.safetensors", + "model.layers.13.self_attn.q_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.13.self_attn.v_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.14.input_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.14.mlp.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.14.mlp.gate_proj.weight": "model-00002-of-00004.safetensors", + 
"model.layers.14.mlp.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.14.post_attention_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.14.self_attn.k_norm.weight": "model-00002-of-00004.safetensors", + "model.layers.14.self_attn.k_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.14.self_attn.o_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.14.self_attn.q_norm.weight": "model-00002-of-00004.safetensors", + "model.layers.14.self_attn.q_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.14.self_attn.v_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.15.input_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.15.mlp.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.15.mlp.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.15.mlp.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.15.post_attention_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.15.self_attn.k_norm.weight": "model-00002-of-00004.safetensors", + "model.layers.15.self_attn.k_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.15.self_attn.o_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.15.self_attn.q_norm.weight": "model-00002-of-00004.safetensors", + "model.layers.15.self_attn.q_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.15.self_attn.v_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.16.input_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.16.mlp.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.16.mlp.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.16.mlp.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.16.post_attention_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.16.self_attn.k_norm.weight": 
"model-00002-of-00004.safetensors", + "model.layers.16.self_attn.k_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.16.self_attn.o_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.16.self_attn.q_norm.weight": "model-00002-of-00004.safetensors", + "model.layers.16.self_attn.q_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.16.self_attn.v_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.17.input_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.17.mlp.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.17.mlp.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.17.mlp.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.17.post_attention_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.17.self_attn.k_norm.weight": "model-00002-of-00004.safetensors", + "model.layers.17.self_attn.k_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.17.self_attn.o_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.17.self_attn.q_norm.weight": "model-00002-of-00004.safetensors", + "model.layers.17.self_attn.q_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.17.self_attn.v_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.18.input_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.18.mlp.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.18.mlp.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.18.mlp.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.18.post_attention_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.18.self_attn.k_norm.weight": "model-00002-of-00004.safetensors", + "model.layers.18.self_attn.k_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.18.self_attn.o_proj.weight": "model-00002-of-00004.safetensors", + 
"model.layers.18.self_attn.q_norm.weight": "model-00002-of-00004.safetensors", + "model.layers.18.self_attn.q_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.18.self_attn.v_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.19.input_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.19.mlp.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.19.mlp.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.19.mlp.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.19.post_attention_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.19.self_attn.k_norm.weight": "model-00002-of-00004.safetensors", + "model.layers.19.self_attn.k_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.19.self_attn.o_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.19.self_attn.q_norm.weight": "model-00002-of-00004.safetensors", + "model.layers.19.self_attn.q_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.19.self_attn.v_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.2.input_layernorm.weight": "model-00001-of-00004.safetensors", + "model.layers.2.mlp.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.2.mlp.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.2.mlp.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.2.post_attention_layernorm.weight": "model-00001-of-00004.safetensors", + "model.layers.2.self_attn.k_norm.weight": "model-00001-of-00004.safetensors", + "model.layers.2.self_attn.k_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.2.self_attn.o_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.2.self_attn.q_norm.weight": "model-00001-of-00004.safetensors", + "model.layers.2.self_attn.q_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.2.self_attn.v_proj.weight": 
"model-00001-of-00004.safetensors", + "model.layers.20.input_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.20.mlp.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.20.mlp.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.20.mlp.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.20.post_attention_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.20.self_attn.k_norm.weight": "model-00002-of-00004.safetensors", + "model.layers.20.self_attn.k_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.20.self_attn.o_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.20.self_attn.q_norm.weight": "model-00002-of-00004.safetensors", + "model.layers.20.self_attn.q_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.20.self_attn.v_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.21.input_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.21.mlp.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.21.mlp.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.21.mlp.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.21.post_attention_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.21.self_attn.k_norm.weight": "model-00002-of-00004.safetensors", + "model.layers.21.self_attn.k_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.21.self_attn.o_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.21.self_attn.q_norm.weight": "model-00002-of-00004.safetensors", + "model.layers.21.self_attn.q_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.21.self_attn.v_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.22.input_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.22.mlp.down_proj.weight": "model-00003-of-00004.safetensors", + 
"model.layers.22.mlp.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.22.mlp.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.22.post_attention_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.22.self_attn.k_norm.weight": "model-00002-of-00004.safetensors", + "model.layers.22.self_attn.k_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.22.self_attn.o_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.22.self_attn.q_norm.weight": "model-00002-of-00004.safetensors", + "model.layers.22.self_attn.q_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.22.self_attn.v_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.23.input_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.23.mlp.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.23.mlp.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.23.mlp.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.23.post_attention_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.23.self_attn.k_norm.weight": "model-00003-of-00004.safetensors", + "model.layers.23.self_attn.k_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.23.self_attn.o_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.23.self_attn.q_norm.weight": "model-00003-of-00004.safetensors", + "model.layers.23.self_attn.q_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.23.self_attn.v_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.24.input_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.24.mlp.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.24.mlp.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.24.mlp.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.24.post_attention_layernorm.weight": 
"model-00003-of-00004.safetensors", + "model.layers.24.self_attn.k_norm.weight": "model-00003-of-00004.safetensors", + "model.layers.24.self_attn.k_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.24.self_attn.o_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.24.self_attn.q_norm.weight": "model-00003-of-00004.safetensors", + "model.layers.24.self_attn.q_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.24.self_attn.v_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.25.input_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.25.mlp.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.25.mlp.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.25.mlp.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.25.post_attention_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.25.self_attn.k_norm.weight": "model-00003-of-00004.safetensors", + "model.layers.25.self_attn.k_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.25.self_attn.o_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.25.self_attn.q_norm.weight": "model-00003-of-00004.safetensors", + "model.layers.25.self_attn.q_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.25.self_attn.v_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.26.input_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.26.mlp.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.26.mlp.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.26.mlp.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.26.post_attention_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.26.self_attn.k_norm.weight": "model-00003-of-00004.safetensors", + "model.layers.26.self_attn.k_proj.weight": "model-00003-of-00004.safetensors", + 
"model.layers.26.self_attn.o_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.26.self_attn.q_norm.weight": "model-00003-of-00004.safetensors", + "model.layers.26.self_attn.q_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.26.self_attn.v_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.27.input_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.27.mlp.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.27.mlp.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.27.mlp.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.27.post_attention_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.27.self_attn.k_norm.weight": "model-00003-of-00004.safetensors", + "model.layers.27.self_attn.k_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.27.self_attn.o_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.27.self_attn.q_norm.weight": "model-00003-of-00004.safetensors", + "model.layers.27.self_attn.q_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.27.self_attn.v_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.28.input_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.28.mlp.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.28.mlp.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.28.mlp.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.28.post_attention_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.28.self_attn.k_norm.weight": "model-00003-of-00004.safetensors", + "model.layers.28.self_attn.k_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.28.self_attn.o_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.28.self_attn.q_norm.weight": "model-00003-of-00004.safetensors", + "model.layers.28.self_attn.q_proj.weight": 
"model-00003-of-00004.safetensors", + "model.layers.28.self_attn.v_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.29.input_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.29.mlp.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.29.mlp.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.29.mlp.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.29.post_attention_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.29.self_attn.k_norm.weight": "model-00003-of-00004.safetensors", + "model.layers.29.self_attn.k_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.29.self_attn.o_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.29.self_attn.q_norm.weight": "model-00003-of-00004.safetensors", + "model.layers.29.self_attn.q_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.29.self_attn.v_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.3.input_layernorm.weight": "model-00001-of-00004.safetensors", + "model.layers.3.mlp.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.3.mlp.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.3.mlp.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.3.post_attention_layernorm.weight": "model-00001-of-00004.safetensors", + "model.layers.3.self_attn.k_norm.weight": "model-00001-of-00004.safetensors", + "model.layers.3.self_attn.k_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.3.self_attn.o_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.3.self_attn.q_norm.weight": "model-00001-of-00004.safetensors", + "model.layers.3.self_attn.q_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.3.self_attn.v_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.30.input_layernorm.weight": "model-00003-of-00004.safetensors", + 
"model.layers.30.mlp.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.30.mlp.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.30.mlp.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.30.post_attention_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.30.self_attn.k_norm.weight": "model-00003-of-00004.safetensors", + "model.layers.30.self_attn.k_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.30.self_attn.o_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.30.self_attn.q_norm.weight": "model-00003-of-00004.safetensors", + "model.layers.30.self_attn.q_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.30.self_attn.v_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.31.input_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.31.mlp.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.31.mlp.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.31.mlp.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.31.post_attention_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.31.self_attn.k_norm.weight": "model-00003-of-00004.safetensors", + "model.layers.31.self_attn.k_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.31.self_attn.o_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.31.self_attn.q_norm.weight": "model-00003-of-00004.safetensors", + "model.layers.31.self_attn.q_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.31.self_attn.v_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.32.input_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.32.mlp.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.32.mlp.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.32.mlp.up_proj.weight": 
"model-00003-of-00004.safetensors", + "model.layers.32.post_attention_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.32.self_attn.k_norm.weight": "model-00003-of-00004.safetensors", + "model.layers.32.self_attn.k_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.32.self_attn.o_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.32.self_attn.q_norm.weight": "model-00003-of-00004.safetensors", + "model.layers.32.self_attn.q_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.32.self_attn.v_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.33.input_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.33.mlp.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.33.mlp.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.33.mlp.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.33.post_attention_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.33.self_attn.k_norm.weight": "model-00003-of-00004.safetensors", + "model.layers.33.self_attn.k_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.33.self_attn.o_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.33.self_attn.q_norm.weight": "model-00003-of-00004.safetensors", + "model.layers.33.self_attn.q_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.33.self_attn.v_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.34.input_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.34.mlp.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.34.mlp.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.34.mlp.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.34.post_attention_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.34.self_attn.k_norm.weight": "model-00003-of-00004.safetensors", + 
"model.layers.34.self_attn.k_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.34.self_attn.o_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.34.self_attn.q_norm.weight": "model-00003-of-00004.safetensors", + "model.layers.34.self_attn.q_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.34.self_attn.v_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.35.input_layernorm.weight": "model-00004-of-00004.safetensors", + "model.layers.35.mlp.down_proj.weight": "model-00004-of-00004.safetensors", + "model.layers.35.mlp.gate_proj.weight": "model-00004-of-00004.safetensors", + "model.layers.35.mlp.up_proj.weight": "model-00004-of-00004.safetensors", + "model.layers.35.post_attention_layernorm.weight": "model-00004-of-00004.safetensors", + "model.layers.35.self_attn.k_norm.weight": "model-00004-of-00004.safetensors", + "model.layers.35.self_attn.k_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.35.self_attn.o_proj.weight": "model-00004-of-00004.safetensors", + "model.layers.35.self_attn.q_norm.weight": "model-00004-of-00004.safetensors", + "model.layers.35.self_attn.q_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.35.self_attn.v_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.4.input_layernorm.weight": "model-00001-of-00004.safetensors", + "model.layers.4.mlp.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.4.mlp.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.4.mlp.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.4.post_attention_layernorm.weight": "model-00001-of-00004.safetensors", + "model.layers.4.self_attn.k_norm.weight": "model-00001-of-00004.safetensors", + "model.layers.4.self_attn.k_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.4.self_attn.o_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.4.self_attn.q_norm.weight": 
"model-00001-of-00004.safetensors", + "model.layers.4.self_attn.q_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.4.self_attn.v_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.5.input_layernorm.weight": "model-00001-of-00004.safetensors", + "model.layers.5.mlp.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.5.mlp.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.5.mlp.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.5.post_attention_layernorm.weight": "model-00001-of-00004.safetensors", + "model.layers.5.self_attn.k_norm.weight": "model-00001-of-00004.safetensors", + "model.layers.5.self_attn.k_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.5.self_attn.o_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.5.self_attn.q_norm.weight": "model-00001-of-00004.safetensors", + "model.layers.5.self_attn.q_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.5.self_attn.v_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.6.input_layernorm.weight": "model-00001-of-00004.safetensors", + "model.layers.6.mlp.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.6.mlp.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.6.mlp.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.6.post_attention_layernorm.weight": "model-00001-of-00004.safetensors", + "model.layers.6.self_attn.k_norm.weight": "model-00001-of-00004.safetensors", + "model.layers.6.self_attn.k_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.6.self_attn.o_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.6.self_attn.q_norm.weight": "model-00001-of-00004.safetensors", + "model.layers.6.self_attn.q_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.6.self_attn.v_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.7.input_layernorm.weight": 
"model-00001-of-00004.safetensors", + "model.layers.7.mlp.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.7.mlp.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.7.mlp.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.7.post_attention_layernorm.weight": "model-00001-of-00004.safetensors", + "model.layers.7.self_attn.k_norm.weight": "model-00001-of-00004.safetensors", + "model.layers.7.self_attn.k_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.7.self_attn.o_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.7.self_attn.q_norm.weight": "model-00001-of-00004.safetensors", + "model.layers.7.self_attn.q_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.7.self_attn.v_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.8.input_layernorm.weight": "model-00001-of-00004.safetensors", + "model.layers.8.mlp.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.8.mlp.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.8.mlp.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.8.post_attention_layernorm.weight": "model-00001-of-00004.safetensors", + "model.layers.8.self_attn.k_norm.weight": "model-00001-of-00004.safetensors", + "model.layers.8.self_attn.k_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.8.self_attn.o_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.8.self_attn.q_norm.weight": "model-00001-of-00004.safetensors", + "model.layers.8.self_attn.q_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.8.self_attn.v_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.9.input_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.9.mlp.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.9.mlp.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.9.mlp.up_proj.weight": 
"model-00002-of-00004.safetensors", + "model.layers.9.post_attention_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.9.self_attn.k_norm.weight": "model-00001-of-00004.safetensors", + "model.layers.9.self_attn.k_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.9.self_attn.o_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.9.self_attn.q_norm.weight": "model-00001-of-00004.safetensors", + "model.layers.9.self_attn.q_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.9.self_attn.v_proj.weight": "model-00001-of-00004.safetensors", + "model.norm.weight": "model-00004-of-00004.safetensors" + } +} diff --git a/special_tokens_map.json b/special_tokens_map.json new file mode 100644 index 0000000..ac23c0a --- /dev/null +++ b/special_tokens_map.json @@ -0,0 +1,31 @@ +{ + "additional_special_tokens": [ + "<|im_start|>", + "<|im_end|>", + "<|object_ref_start|>", + "<|object_ref_end|>", + "<|box_start|>", + "<|box_end|>", + "<|quad_start|>", + "<|quad_end|>", + "<|vision_start|>", + "<|vision_end|>", + "<|vision_pad|>", + "<|image_pad|>", + "<|video_pad|>" + ], + "eos_token": { + "content": "<|im_end|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "pad_token": { + "content": "<|endoftext|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + } +} diff --git a/tokenizer.json b/tokenizer.json new file mode 100644 index 0000000..cd71f61 --- /dev/null +++ b/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:aeb13307a71acd8fe81861d94ad54ab689df773318809eed3cbe794b4492dae4 +size 11422654 diff --git a/tokenizer_config.json b/tokenizer_config.json new file mode 100644 index 0000000..ddaf698 --- /dev/null +++ b/tokenizer_config.json @@ -0,0 +1,239 @@ +{ + "add_bos_token": false, + "add_prefix_space": false, + "added_tokens_decoder": { + "151643": { + "content": "<|endoftext|>", + "lstrip": false, + 
"normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "151644": { + "content": "<|im_start|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "151645": { + "content": "<|im_end|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "151646": { + "content": "<|object_ref_start|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "151647": { + "content": "<|object_ref_end|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "151648": { + "content": "<|box_start|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "151649": { + "content": "<|box_end|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "151650": { + "content": "<|quad_start|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "151651": { + "content": "<|quad_end|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "151652": { + "content": "<|vision_start|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "151653": { + "content": "<|vision_end|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "151654": { + "content": "<|vision_pad|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "151655": { + "content": "<|image_pad|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "151656": { + "content": "<|video_pad|>", + "lstrip": false, + 
"normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "151657": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "151658": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "151659": { + "content": "<|fim_prefix|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "151660": { + "content": "<|fim_middle|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "151661": { + "content": "<|fim_suffix|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "151662": { + "content": "<|fim_pad|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "151663": { + "content": "<|repo_name|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "151664": { + "content": "<|file_sep|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "151665": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "151666": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "151667": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "151668": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + } + }, + "additional_special_tokens": [ + "<|im_start|>", + "<|im_end|>", + "<|object_ref_start|>", + "<|object_ref_end|>", + "<|box_start|>", + 
"<|box_end|>", + "<|quad_start|>", + "<|quad_end|>", + "<|vision_start|>", + "<|vision_end|>", + "<|vision_pad|>", + "<|image_pad|>", + "<|video_pad|>" + ], + "bos_token": null, + "clean_up_tokenization_spaces": false, + "eos_token": "<|im_end|>", + "errors": "replace", + "extra_special_tokens": {}, + "model_max_length": 131072, + "pad_token": "<|endoftext|>", + "split_special_tokens": false, + "tokenizer_class": "Qwen2Tokenizer", + "unk_token": null +} diff --git a/trainer_state.json b/trainer_state.json new file mode 100644 index 0000000..cd92f18 --- /dev/null +++ b/trainer_state.json @@ -0,0 +1,4511 @@ +{ + "best_global_step": 5800, + "best_metric": 0.63199037, + "best_model_checkpoint": "/root/data/output/8B-SFT-Zero3/v0-20251202-213653/checkpoint-5800", + "epoch": 0.9633911368015414, + "eval_steps": 200, + "global_step": 6000, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.00016056518946692356, + "grad_norm": 5.045594701496255, + "learning_rate": 4e-08, + "loss": 0.9851051568984985, + "step": 1 + }, + { + "epoch": 0.0016056518946692357, + "grad_norm": 5.125985791135598, + "learning_rate": 4.0000000000000003e-07, + "loss": 0.98167970445421, + "step": 10 + }, + { + "epoch": 0.0032113037893384713, + "grad_norm": 4.641912421373809, + "learning_rate": 8.000000000000001e-07, + "loss": 0.9206424713134765, + "step": 20 + }, + { + "epoch": 0.004816955684007707, + "grad_norm": 2.7941687987616666, + "learning_rate": 1.2000000000000002e-06, + "loss": 0.8770656585693359, + "step": 30 + }, + { + "epoch": 0.006422607578676943, + "grad_norm": 2.353368758760998, + "learning_rate": 1.6000000000000001e-06, + "loss": 0.9011880874633789, + "step": 40 + }, + { + "epoch": 0.00802825947334618, + "grad_norm": 2.239598951932816, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7988590240478516, + "step": 50 + }, + { + "epoch": 0.009633911368015413, + "grad_norm": 2.674482927730846, + 
"learning_rate": 2.4000000000000003e-06, + "loss": 0.8681858062744141, + "step": 60 + }, + { + "epoch": 0.01123956326268465, + "grad_norm": 1.998306304918899, + "learning_rate": 2.8000000000000003e-06, + "loss": 0.7784456729888916, + "step": 70 + }, + { + "epoch": 0.012845215157353885, + "grad_norm": 1.839932484836926, + "learning_rate": 3.2000000000000003e-06, + "loss": 0.7746688842773437, + "step": 80 + }, + { + "epoch": 0.014450867052023121, + "grad_norm": 2.321512760194257, + "learning_rate": 3.6000000000000003e-06, + "loss": 0.7785279273986816, + "step": 90 + }, + { + "epoch": 0.01605651894669236, + "grad_norm": 2.179078311231977, + "learning_rate": 4.000000000000001e-06, + "loss": 0.7911964893341065, + "step": 100 + }, + { + "epoch": 0.017662170841361593, + "grad_norm": 2.1384126935619725, + "learning_rate": 4.4e-06, + "loss": 0.6902450561523438, + "step": 110 + }, + { + "epoch": 0.019267822736030827, + "grad_norm": 2.5664033239134767, + "learning_rate": 4.800000000000001e-06, + "loss": 0.7887882709503173, + "step": 120 + }, + { + "epoch": 0.020873474630700065, + "grad_norm": 1.8552462640900655, + "learning_rate": 5.2e-06, + "loss": 0.729155158996582, + "step": 130 + }, + { + "epoch": 0.0224791265253693, + "grad_norm": 2.125884253152752, + "learning_rate": 5.600000000000001e-06, + "loss": 0.7969522953033448, + "step": 140 + }, + { + "epoch": 0.024084778420038536, + "grad_norm": 2.1733323665476583, + "learning_rate": 6e-06, + "loss": 0.7695069313049316, + "step": 150 + }, + { + "epoch": 0.02569043031470777, + "grad_norm": 1.9452374095451752, + "learning_rate": 6.4000000000000006e-06, + "loss": 0.7883370399475098, + "step": 160 + }, + { + "epoch": 0.027296082209377008, + "grad_norm": 2.049305323206828, + "learning_rate": 6.800000000000001e-06, + "loss": 0.6908586502075196, + "step": 170 + }, + { + "epoch": 0.028901734104046242, + "grad_norm": 1.9905937382445111, + "learning_rate": 7.2000000000000005e-06, + "loss": 0.7423882484436035, + "step": 180 + }, + { + 
"epoch": 0.03050738599871548, + "grad_norm": 2.067670072267235, + "learning_rate": 7.600000000000001e-06, + "loss": 0.7752325057983398, + "step": 190 + }, + { + "epoch": 0.03211303789338472, + "grad_norm": 1.7901830862145491, + "learning_rate": 8.000000000000001e-06, + "loss": 0.753516435623169, + "step": 200 + }, + { + "epoch": 0.03211303789338472, + "eval_loss": 0.7076867818832397, + "eval_runtime": 99.7317, + "eval_samples_per_second": 20.284, + "eval_steps_per_second": 5.074, + "eval_token_acc": 0.7709792922559945, + "step": 200 + }, + { + "epoch": 0.03371868978805395, + "grad_norm": 2.1021202279756883, + "learning_rate": 8.400000000000001e-06, + "loss": 0.7589893817901612, + "step": 210 + }, + { + "epoch": 0.035324341682723186, + "grad_norm": 1.8535429586983847, + "learning_rate": 8.8e-06, + "loss": 0.7508197784423828, + "step": 220 + }, + { + "epoch": 0.03692999357739242, + "grad_norm": 2.0946882996421876, + "learning_rate": 9.200000000000002e-06, + "loss": 0.7733804225921631, + "step": 230 + }, + { + "epoch": 0.038535645472061654, + "grad_norm": 1.8769331743438782, + "learning_rate": 9.600000000000001e-06, + "loss": 0.8231748580932617, + "step": 240 + }, + { + "epoch": 0.040141297366730895, + "grad_norm": 1.9051022771627888, + "learning_rate": 1e-05, + "loss": 0.8040414810180664, + "step": 250 + }, + { + "epoch": 0.04174694926140013, + "grad_norm": 1.8256531327814847, + "learning_rate": 9.99993095584273e-06, + "loss": 0.7720019817352295, + "step": 260 + }, + { + "epoch": 0.04335260115606936, + "grad_norm": 1.7654139413134322, + "learning_rate": 9.999723825277754e-06, + "loss": 0.7307815074920654, + "step": 270 + }, + { + "epoch": 0.0449582530507386, + "grad_norm": 2.1357010540003203, + "learning_rate": 9.999378614025538e-06, + "loss": 0.7429679870605469, + "step": 280 + }, + { + "epoch": 0.04656390494540784, + "grad_norm": 2.3921459236651454, + "learning_rate": 9.998895331620009e-06, + "loss": 0.7576918601989746, + "step": 290 + }, + { + "epoch": 
0.04816955684007707, + "grad_norm": 1.8796286801473356, + "learning_rate": 9.998273991408293e-06, + "loss": 0.7387456893920898, + "step": 300 + }, + { + "epoch": 0.04977520873474631, + "grad_norm": 1.9563558062770663, + "learning_rate": 9.997514610550363e-06, + "loss": 0.841197395324707, + "step": 310 + }, + { + "epoch": 0.05138086062941554, + "grad_norm": 1.9499020656946842, + "learning_rate": 9.996617210018536e-06, + "loss": 0.7673286437988281, + "step": 320 + }, + { + "epoch": 0.052986512524084775, + "grad_norm": 2.0448234673688934, + "learning_rate": 9.995581814596923e-06, + "loss": 0.8508127212524415, + "step": 330 + }, + { + "epoch": 0.054592164418754016, + "grad_norm": 1.8756180216303968, + "learning_rate": 9.99440845288072e-06, + "loss": 0.7628500938415528, + "step": 340 + }, + { + "epoch": 0.05619781631342325, + "grad_norm": 2.041528608451536, + "learning_rate": 9.99309715727544e-06, + "loss": 0.7440057754516601, + "step": 350 + }, + { + "epoch": 0.057803468208092484, + "grad_norm": 1.8006559726861797, + "learning_rate": 9.991647963996001e-06, + "loss": 0.7254949569702148, + "step": 360 + }, + { + "epoch": 0.05940912010276172, + "grad_norm": 1.7855335690115415, + "learning_rate": 9.990060913065735e-06, + "loss": 0.7758376121520996, + "step": 370 + }, + { + "epoch": 0.06101477199743096, + "grad_norm": 2.0412805758933388, + "learning_rate": 9.988336048315278e-06, + "loss": 0.7547982215881348, + "step": 380 + }, + { + "epoch": 0.0626204238921002, + "grad_norm": 1.8153150010996724, + "learning_rate": 9.986473417381366e-06, + "loss": 0.75426025390625, + "step": 390 + }, + { + "epoch": 0.06422607578676943, + "grad_norm": 1.8236902944999438, + "learning_rate": 9.984473071705512e-06, + "loss": 0.7518160343170166, + "step": 400 + }, + { + "epoch": 0.06422607578676943, + "eval_loss": 0.7055317759513855, + "eval_runtime": 101.0735, + "eval_samples_per_second": 20.015, + "eval_steps_per_second": 5.006, + "eval_token_acc": 0.7710310157322793, + "step": 400 + }, + { + 
"epoch": 0.06583172768143866, + "grad_norm": 1.981984502463804, + "learning_rate": 9.982335066532586e-06, + "loss": 0.724842882156372, + "step": 410 + }, + { + "epoch": 0.0674373795761079, + "grad_norm": 2.00462113974283, + "learning_rate": 9.980059460909298e-06, + "loss": 0.7751227378845215, + "step": 420 + }, + { + "epoch": 0.06904303147077713, + "grad_norm": 1.9164446754319746, + "learning_rate": 9.977646317682553e-06, + "loss": 0.7310826301574707, + "step": 430 + }, + { + "epoch": 0.07064868336544637, + "grad_norm": 1.425780494666241, + "learning_rate": 9.975095703497727e-06, + "loss": 0.7348933696746827, + "step": 440 + }, + { + "epoch": 0.07225433526011561, + "grad_norm": 1.7264282435567964, + "learning_rate": 9.972407688796827e-06, + "loss": 0.7918214321136474, + "step": 450 + }, + { + "epoch": 0.07385998715478484, + "grad_norm": 1.7873690443121253, + "learning_rate": 9.969582347816534e-06, + "loss": 0.7410657405853271, + "step": 460 + }, + { + "epoch": 0.07546563904945408, + "grad_norm": 1.916709071222249, + "learning_rate": 9.966619758586164e-06, + "loss": 0.7694530487060547, + "step": 470 + }, + { + "epoch": 0.07707129094412331, + "grad_norm": 1.7832878680515183, + "learning_rate": 9.963520002925506e-06, + "loss": 0.747270917892456, + "step": 480 + }, + { + "epoch": 0.07867694283879255, + "grad_norm": 1.999021678268013, + "learning_rate": 9.960283166442569e-06, + "loss": 0.7553834438323974, + "step": 490 + }, + { + "epoch": 0.08028259473346179, + "grad_norm": 1.9891718192525643, + "learning_rate": 9.956909338531211e-06, + "loss": 0.7506345748901367, + "step": 500 + }, + { + "epoch": 0.08188824662813102, + "grad_norm": 2.0785214210367076, + "learning_rate": 9.953398612368673e-06, + "loss": 0.7921933174133301, + "step": 510 + }, + { + "epoch": 0.08349389852280026, + "grad_norm": 1.7387864788088594, + "learning_rate": 9.949751084913008e-06, + "loss": 0.8228381156921387, + "step": 520 + }, + { + "epoch": 0.0850995504174695, + "grad_norm": 1.6647874896219872, 
+ "learning_rate": 9.9459668569004e-06, + "loss": 0.7765993118286133, + "step": 530 + }, + { + "epoch": 0.08670520231213873, + "grad_norm": 1.783168935050446, + "learning_rate": 9.942046032842381e-06, + "loss": 0.7280902862548828, + "step": 540 + }, + { + "epoch": 0.08831085420680797, + "grad_norm": 1.7261682650429326, + "learning_rate": 9.937988721022948e-06, + "loss": 0.7468975067138672, + "step": 550 + }, + { + "epoch": 0.0899165061014772, + "grad_norm": 1.98380263385464, + "learning_rate": 9.933795033495575e-06, + "loss": 0.7684798240661621, + "step": 560 + }, + { + "epoch": 0.09152215799614644, + "grad_norm": 2.0917455900276916, + "learning_rate": 9.929465086080106e-06, + "loss": 0.7396535396575927, + "step": 570 + }, + { + "epoch": 0.09312780989081568, + "grad_norm": 2.040496130596098, + "learning_rate": 9.924998998359571e-06, + "loss": 0.7308344841003418, + "step": 580 + }, + { + "epoch": 0.0947334617854849, + "grad_norm": 1.5962826247391504, + "learning_rate": 9.920396893676875e-06, + "loss": 0.7247919082641602, + "step": 590 + }, + { + "epoch": 0.09633911368015415, + "grad_norm": 2.287295954105862, + "learning_rate": 9.915658899131393e-06, + "loss": 0.7654624938964844, + "step": 600 + }, + { + "epoch": 0.09633911368015415, + "eval_loss": 0.6988159418106079, + "eval_runtime": 100.6058, + "eval_samples_per_second": 20.108, + "eval_steps_per_second": 5.03, + "eval_token_acc": 0.7711052639482365, + "step": 600 + }, + { + "epoch": 0.09794476557482337, + "grad_norm": 1.863642076813837, + "learning_rate": 9.910785145575464e-06, + "loss": 0.7458448886871338, + "step": 610 + }, + { + "epoch": 0.09955041746949261, + "grad_norm": 1.8059006554878292, + "learning_rate": 9.905775767610767e-06, + "loss": 0.7553767204284668, + "step": 620 + }, + { + "epoch": 0.10115606936416185, + "grad_norm": 1.8021678660460414, + "learning_rate": 9.900630903584616e-06, + "loss": 0.8086840629577636, + "step": 630 + }, + { + "epoch": 0.10276172125883108, + "grad_norm": 1.86266598368208, + 
"learning_rate": 9.895350695586133e-06, + "loss": 0.7645243644714356, + "step": 640 + }, + { + "epoch": 0.10436737315350032, + "grad_norm": 1.9757822930978346, + "learning_rate": 9.889935289442318e-06, + "loss": 0.7408963203430176, + "step": 650 + }, + { + "epoch": 0.10597302504816955, + "grad_norm": 2.056921894533938, + "learning_rate": 9.884384834714038e-06, + "loss": 0.7500060081481934, + "step": 660 + }, + { + "epoch": 0.10757867694283879, + "grad_norm": 1.6383333188595506, + "learning_rate": 9.878699484691876e-06, + "loss": 0.6806066989898681, + "step": 670 + }, + { + "epoch": 0.10918432883750803, + "grad_norm": 2.0688309441664456, + "learning_rate": 9.872879396391915e-06, + "loss": 0.7850825309753418, + "step": 680 + }, + { + "epoch": 0.11078998073217726, + "grad_norm": 2.0032148038902897, + "learning_rate": 9.866924730551391e-06, + "loss": 0.737500810623169, + "step": 690 + }, + { + "epoch": 0.1123956326268465, + "grad_norm": 2.0476297529312664, + "learning_rate": 9.860835651624259e-06, + "loss": 0.7644984245300293, + "step": 700 + }, + { + "epoch": 0.11400128452151574, + "grad_norm": 1.9631657998239447, + "learning_rate": 9.854612327776644e-06, + "loss": 0.7839177131652832, + "step": 710 + }, + { + "epoch": 0.11560693641618497, + "grad_norm": 1.6570213251759018, + "learning_rate": 9.848254930882214e-06, + "loss": 0.753380823135376, + "step": 720 + }, + { + "epoch": 0.11721258831085421, + "grad_norm": 2.164985354398442, + "learning_rate": 9.841763636517406e-06, + "loss": 0.8096098899841309, + "step": 730 + }, + { + "epoch": 0.11881824020552344, + "grad_norm": 1.7563983933390181, + "learning_rate": 9.835138623956603e-06, + "loss": 0.7301701068878174, + "step": 740 + }, + { + "epoch": 0.12042389210019268, + "grad_norm": 1.7499502476904967, + "learning_rate": 9.828380076167167e-06, + "loss": 0.7721126079559326, + "step": 750 + }, + { + "epoch": 0.12202954399486192, + "grad_norm": 1.825643887765283, + "learning_rate": 9.821488179804394e-06, + "loss": 
0.7608715057373047, + "step": 760 + }, + { + "epoch": 0.12363519588953115, + "grad_norm": 2.0344498074032096, + "learning_rate": 9.814463125206356e-06, + "loss": 0.8143133163452149, + "step": 770 + }, + { + "epoch": 0.1252408477842004, + "grad_norm": 1.8599734868121924, + "learning_rate": 9.80730510638864e-06, + "loss": 0.7370705127716064, + "step": 780 + }, + { + "epoch": 0.12684649967886963, + "grad_norm": 2.401539645903579, + "learning_rate": 9.800014321038998e-06, + "loss": 0.7864031791687012, + "step": 790 + }, + { + "epoch": 0.12845215157353887, + "grad_norm": 1.9156614624885597, + "learning_rate": 9.792590970511882e-06, + "loss": 0.7531948089599609, + "step": 800 + }, + { + "epoch": 0.12845215157353887, + "eval_loss": 0.6976503133773804, + "eval_runtime": 100.9577, + "eval_samples_per_second": 20.038, + "eval_steps_per_second": 5.012, + "eval_token_acc": 0.7718560886039834, + "step": 800 + }, + { + "epoch": 0.13005780346820808, + "grad_norm": 2.0518814451795673, + "learning_rate": 9.785035259822884e-06, + "loss": 0.7535972595214844, + "step": 810 + }, + { + "epoch": 0.13166345536287732, + "grad_norm": 1.8413763571861954, + "learning_rate": 9.777347397643075e-06, + "loss": 0.7913750171661377, + "step": 820 + }, + { + "epoch": 0.13326910725754657, + "grad_norm": 1.8730208265946418, + "learning_rate": 9.769527596293242e-06, + "loss": 0.7386277198791504, + "step": 830 + }, + { + "epoch": 0.1348747591522158, + "grad_norm": 1.659408810434294, + "learning_rate": 9.761576071738023e-06, + "loss": 0.7074491500854492, + "step": 840 + }, + { + "epoch": 0.13648041104688505, + "grad_norm": 1.8873087614580026, + "learning_rate": 9.753493043579942e-06, + "loss": 0.7388912200927734, + "step": 850 + }, + { + "epoch": 0.13808606294155426, + "grad_norm": 1.6722890579957244, + "learning_rate": 9.745278735053345e-06, + "loss": 0.7493368148803711, + "step": 860 + }, + { + "epoch": 0.1396917148362235, + "grad_norm": 1.6772342194345173, + "learning_rate": 9.736933373018236e-06, + 
"loss": 0.6926981449127197, + "step": 870 + }, + { + "epoch": 0.14129736673089274, + "grad_norm": 1.7437024472242095, + "learning_rate": 9.728457187954013e-06, + "loss": 0.769747257232666, + "step": 880 + }, + { + "epoch": 0.14290301862556198, + "grad_norm": 1.785712412031916, + "learning_rate": 9.719850413953095e-06, + "loss": 0.7651779174804687, + "step": 890 + }, + { + "epoch": 0.14450867052023122, + "grad_norm": 1.7302496278883281, + "learning_rate": 9.711113288714466e-06, + "loss": 0.7735454559326171, + "step": 900 + }, + { + "epoch": 0.14611432241490044, + "grad_norm": 1.8272087036997788, + "learning_rate": 9.702246053537108e-06, + "loss": 0.7874812126159668, + "step": 910 + }, + { + "epoch": 0.14771997430956968, + "grad_norm": 1.804466644550254, + "learning_rate": 9.69324895331333e-06, + "loss": 0.7952823638916016, + "step": 920 + }, + { + "epoch": 0.14932562620423892, + "grad_norm": 1.6692779796502772, + "learning_rate": 9.684122236522014e-06, + "loss": 0.760723876953125, + "step": 930 + }, + { + "epoch": 0.15093127809890816, + "grad_norm": 2.279602144252399, + "learning_rate": 9.674866155221745e-06, + "loss": 0.75501708984375, + "step": 940 + }, + { + "epoch": 0.1525369299935774, + "grad_norm": 2.2519054003919377, + "learning_rate": 9.665480965043862e-06, + "loss": 0.7569370269775391, + "step": 950 + }, + { + "epoch": 0.15414258188824662, + "grad_norm": 1.639554654895838, + "learning_rate": 9.655966925185381e-06, + "loss": 0.7236834526062011, + "step": 960 + }, + { + "epoch": 0.15574823378291586, + "grad_norm": 1.743926286057347, + "learning_rate": 9.646324298401849e-06, + "loss": 0.8129490852355957, + "step": 970 + }, + { + "epoch": 0.1573538856775851, + "grad_norm": 2.053932657043873, + "learning_rate": 9.636553351000077e-06, + "loss": 0.8245757102966309, + "step": 980 + }, + { + "epoch": 0.15895953757225434, + "grad_norm": 1.7670536328368698, + "learning_rate": 9.626654352830801e-06, + "loss": 0.7364852905273438, + "step": 990 + }, + { + "epoch": 
0.16056518946692358, + "grad_norm": 1.997570888195663, + "learning_rate": 9.616627577281217e-06, + "loss": 0.8088130950927734, + "step": 1000 + }, + { + "epoch": 0.16056518946692358, + "eval_loss": 0.6925203204154968, + "eval_runtime": 100.0123, + "eval_samples_per_second": 20.228, + "eval_steps_per_second": 5.059, + "eval_token_acc": 0.7732542909629075, + "step": 1000 + }, + { + "epoch": 0.1621708413615928, + "grad_norm": 2.303872077571802, + "learning_rate": 9.606473301267427e-06, + "loss": 0.7984984397888184, + "step": 1010 + }, + { + "epoch": 0.16377649325626203, + "grad_norm": 1.7949507187260336, + "learning_rate": 9.59619180522681e-06, + "loss": 0.7323333740234375, + "step": 1020 + }, + { + "epoch": 0.16538214515093128, + "grad_norm": 1.9305044307817225, + "learning_rate": 9.585783373110248e-06, + "loss": 0.767444372177124, + "step": 1030 + }, + { + "epoch": 0.16698779704560052, + "grad_norm": 1.940504218877435, + "learning_rate": 9.575248292374322e-06, + "loss": 0.7465566635131836, + "step": 1040 + }, + { + "epoch": 0.16859344894026976, + "grad_norm": 1.974154937597088, + "learning_rate": 9.564586853973332e-06, + "loss": 0.8003168106079102, + "step": 1050 + }, + { + "epoch": 0.170199100834939, + "grad_norm": 1.8080878076418148, + "learning_rate": 9.553799352351293e-06, + "loss": 0.7629599094390869, + "step": 1060 + }, + { + "epoch": 0.1718047527296082, + "grad_norm": 1.870487361356404, + "learning_rate": 9.54288608543379e-06, + "loss": 0.7689233779907226, + "step": 1070 + }, + { + "epoch": 0.17341040462427745, + "grad_norm": 1.8275349692113991, + "learning_rate": 9.531847354619745e-06, + "loss": 0.704521894454956, + "step": 1080 + }, + { + "epoch": 0.1750160565189467, + "grad_norm": 1.7015288397617692, + "learning_rate": 9.52068346477311e-06, + "loss": 0.7523484230041504, + "step": 1090 + }, + { + "epoch": 0.17662170841361594, + "grad_norm": 1.7290639256028824, + "learning_rate": 9.509394724214428e-06, + "loss": 0.7474178791046142, + "step": 1100 + }, + { + 
"epoch": 0.17822736030828518, + "grad_norm": 1.8497366081907418, + "learning_rate": 9.497981444712332e-06, + "loss": 0.7826375007629395, + "step": 1110 + }, + { + "epoch": 0.1798330122029544, + "grad_norm": 1.8815163506102677, + "learning_rate": 9.486443941474928e-06, + "loss": 0.7807172775268555, + "step": 1120 + }, + { + "epoch": 0.18143866409762363, + "grad_norm": 2.001875209196253, + "learning_rate": 9.47478253314109e-06, + "loss": 0.7732196807861328, + "step": 1130 + }, + { + "epoch": 0.18304431599229287, + "grad_norm": 1.7043428423094649, + "learning_rate": 9.462997541771664e-06, + "loss": 0.7617592811584473, + "step": 1140 + }, + { + "epoch": 0.1846499678869621, + "grad_norm": 1.7130988725260645, + "learning_rate": 9.451089292840569e-06, + "loss": 0.7022255420684814, + "step": 1150 + }, + { + "epoch": 0.18625561978163135, + "grad_norm": 1.7793067950964665, + "learning_rate": 9.439058115225808e-06, + "loss": 0.767824363708496, + "step": 1160 + }, + { + "epoch": 0.18786127167630057, + "grad_norm": 1.8447543701250206, + "learning_rate": 9.42690434120039e-06, + "loss": 0.7075161933898926, + "step": 1170 + }, + { + "epoch": 0.1894669235709698, + "grad_norm": 2.1540810301213305, + "learning_rate": 9.414628306423148e-06, + "loss": 0.7819767475128174, + "step": 1180 + }, + { + "epoch": 0.19107257546563905, + "grad_norm": 1.5815294669707292, + "learning_rate": 9.402230349929475e-06, + "loss": 0.7533804893493652, + "step": 1190 + }, + { + "epoch": 0.1926782273603083, + "grad_norm": 1.7999543693842177, + "learning_rate": 9.389710814121951e-06, + "loss": 0.7735315322875976, + "step": 1200 + }, + { + "epoch": 0.1926782273603083, + "eval_loss": 0.6868460774421692, + "eval_runtime": 100.3485, + "eval_samples_per_second": 20.16, + "eval_steps_per_second": 5.042, + "eval_token_acc": 0.77447312965407, + "step": 1200 + }, + { + "epoch": 0.19428387925497753, + "grad_norm": 1.9775051957951533, + "learning_rate": 9.377070044760899e-06, + "loss": 0.7410672187805176, + "step": 1210 
+ }, + { + "epoch": 0.19588953114964675, + "grad_norm": 1.8830123288540028, + "learning_rate": 9.364308390954823e-06, + "loss": 0.7257200241088867, + "step": 1220 + }, + { + "epoch": 0.197495183044316, + "grad_norm": 1.923119464740565, + "learning_rate": 9.351426205150778e-06, + "loss": 0.7510544300079346, + "step": 1230 + }, + { + "epoch": 0.19910083493898523, + "grad_norm": 1.884018657813029, + "learning_rate": 9.338423843124627e-06, + "loss": 0.7750972747802735, + "step": 1240 + }, + { + "epoch": 0.20070648683365447, + "grad_norm": 1.6459258264733365, + "learning_rate": 9.325301663971222e-06, + "loss": 0.7792768478393555, + "step": 1250 + }, + { + "epoch": 0.2023121387283237, + "grad_norm": 2.3093451268783265, + "learning_rate": 9.312060030094487e-06, + "loss": 0.7784916400909424, + "step": 1260 + }, + { + "epoch": 0.20391779062299292, + "grad_norm": 2.017607836003892, + "learning_rate": 9.298699307197398e-06, + "loss": 0.707589054107666, + "step": 1270 + }, + { + "epoch": 0.20552344251766216, + "grad_norm": 2.16235384945215, + "learning_rate": 9.2852198642719e-06, + "loss": 0.6854191780090332, + "step": 1280 + }, + { + "epoch": 0.2071290944123314, + "grad_norm": 1.7496835798170465, + "learning_rate": 9.271622073588699e-06, + "loss": 0.7528693675994873, + "step": 1290 + }, + { + "epoch": 0.20873474630700065, + "grad_norm": 1.6805965965166436, + "learning_rate": 9.257906310686999e-06, + "loss": 0.7310030937194825, + "step": 1300 + }, + { + "epoch": 0.2103403982016699, + "grad_norm": 1.7240696394841069, + "learning_rate": 9.244072954364116e-06, + "loss": 0.7570672035217285, + "step": 1310 + }, + { + "epoch": 0.2119460500963391, + "grad_norm": 1.696536924694626, + "learning_rate": 9.23012238666502e-06, + "loss": 0.7422749996185303, + "step": 1320 + }, + { + "epoch": 0.21355170199100834, + "grad_norm": 1.8201920770430506, + "learning_rate": 9.216054992871787e-06, + "loss": 0.7625796318054199, + "step": 1330 + }, + { + "epoch": 0.21515735388567758, + "grad_norm": 
1.7512686774637143, + "learning_rate": 9.201871161492957e-06, + "loss": 0.7356629848480225, + "step": 1340 + }, + { + "epoch": 0.21676300578034682, + "grad_norm": 1.6301937120337884, + "learning_rate": 9.187571284252806e-06, + "loss": 0.728066349029541, + "step": 1350 + }, + { + "epoch": 0.21836865767501606, + "grad_norm": 1.8986829697456375, + "learning_rate": 9.17315575608052e-06, + "loss": 0.7538561820983887, + "step": 1360 + }, + { + "epoch": 0.2199743095696853, + "grad_norm": 1.9496897623998384, + "learning_rate": 9.158624975099299e-06, + "loss": 0.7733088493347168, + "step": 1370 + }, + { + "epoch": 0.22157996146435452, + "grad_norm": 1.733812640841246, + "learning_rate": 9.143979342615354e-06, + "loss": 0.7938045501708985, + "step": 1380 + }, + { + "epoch": 0.22318561335902376, + "grad_norm": 1.8703147098290802, + "learning_rate": 9.129219263106825e-06, + "loss": 0.7479697227478027, + "step": 1390 + }, + { + "epoch": 0.224791265253693, + "grad_norm": 1.7131842139401694, + "learning_rate": 9.11434514421261e-06, + "loss": 0.7115351676940918, + "step": 1400 + }, + { + "epoch": 0.224791265253693, + "eval_loss": 0.6884077787399292, + "eval_runtime": 101.5323, + "eval_samples_per_second": 19.925, + "eval_steps_per_second": 4.984, + "eval_token_acc": 0.774602438344782, + "step": 1400 + }, + { + "epoch": 0.22639691714836224, + "grad_norm": 1.581417818250803, + "learning_rate": 9.099357396721117e-06, + "loss": 0.6985572814941406, + "step": 1410 + }, + { + "epoch": 0.22800256904303148, + "grad_norm": 2.0581793655158918, + "learning_rate": 9.084256434558898e-06, + "loss": 0.7135414123535156, + "step": 1420 + }, + { + "epoch": 0.2296082209377007, + "grad_norm": 1.895110040798408, + "learning_rate": 9.069042674779238e-06, + "loss": 0.7391388893127442, + "step": 1430 + }, + { + "epoch": 0.23121387283236994, + "grad_norm": 1.8028664147595719, + "learning_rate": 9.053716537550627e-06, + "loss": 0.7372079372406006, + "step": 1440 + }, + { + "epoch": 0.23281952472703918, + 
"grad_norm": 1.5474434923515745, + "learning_rate": 9.038278446145155e-06, + "loss": 0.8027336120605468, + "step": 1450 + }, + { + "epoch": 0.23442517662170842, + "grad_norm": 1.6608147397642952, + "learning_rate": 9.022728826926825e-06, + "loss": 0.7410070419311523, + "step": 1460 + }, + { + "epoch": 0.23603082851637766, + "grad_norm": 2.0544380178655457, + "learning_rate": 9.007068109339783e-06, + "loss": 0.7138307094573975, + "step": 1470 + }, + { + "epoch": 0.23763648041104687, + "grad_norm": 1.9265320027175405, + "learning_rate": 8.991296725896449e-06, + "loss": 0.777537488937378, + "step": 1480 + }, + { + "epoch": 0.23924213230571612, + "grad_norm": 1.8444266047774585, + "learning_rate": 8.975415112165566e-06, + "loss": 0.7667001724243164, + "step": 1490 + }, + { + "epoch": 0.24084778420038536, + "grad_norm": 1.539393309355395, + "learning_rate": 8.959423706760197e-06, + "loss": 0.7157824039459229, + "step": 1500 + }, + { + "epoch": 0.2424534360950546, + "grad_norm": 1.9224139568424825, + "learning_rate": 8.943322951325583e-06, + "loss": 0.7478031158447266, + "step": 1510 + }, + { + "epoch": 0.24405908798972384, + "grad_norm": 1.739352000191543, + "learning_rate": 8.927113290526961e-06, + "loss": 0.7133482456207275, + "step": 1520 + }, + { + "epoch": 0.24566473988439305, + "grad_norm": 2.0313357914665, + "learning_rate": 8.910795172037278e-06, + "loss": 0.724417781829834, + "step": 1530 + }, + { + "epoch": 0.2472703917790623, + "grad_norm": 1.824835287137536, + "learning_rate": 8.894369046524829e-06, + "loss": 0.702476167678833, + "step": 1540 + }, + { + "epoch": 0.24887604367373153, + "grad_norm": 1.7069567462050508, + "learning_rate": 8.877835367640813e-06, + "loss": 0.7205674648284912, + "step": 1550 + }, + { + "epoch": 0.2504816955684008, + "grad_norm": 1.6643061973982913, + "learning_rate": 8.861194592006798e-06, + "loss": 0.751797866821289, + "step": 1560 + }, + { + "epoch": 0.25208734746307, + "grad_norm": 1.8641371048435398, + "learning_rate": 
8.844447179202119e-06, + "loss": 0.7409004688262939, + "step": 1570 + }, + { + "epoch": 0.25369299935773926, + "grad_norm": 1.6197221278271672, + "learning_rate": 8.827593591751172e-06, + "loss": 0.7813194274902344, + "step": 1580 + }, + { + "epoch": 0.25529865125240847, + "grad_norm": 1.8401298655390084, + "learning_rate": 8.810634295110661e-06, + "loss": 0.724795913696289, + "step": 1590 + }, + { + "epoch": 0.25690430314707774, + "grad_norm": 1.7223908388688454, + "learning_rate": 8.793569757656718e-06, + "loss": 0.7601978302001953, + "step": 1600 + }, + { + "epoch": 0.25690430314707774, + "eval_loss": 0.6818346977233887, + "eval_runtime": 100.0754, + "eval_samples_per_second": 20.215, + "eval_steps_per_second": 5.056, + "eval_token_acc": 0.7759555912243614, + "step": 1600 + }, + { + "epoch": 0.25850995504174695, + "grad_norm": 2.051065991460817, + "learning_rate": 8.77640045067199e-06, + "loss": 0.7404222011566162, + "step": 1610 + }, + { + "epoch": 0.26011560693641617, + "grad_norm": 2.0856367857981732, + "learning_rate": 8.759126848332608e-06, + "loss": 0.7499489784240723, + "step": 1620 + }, + { + "epoch": 0.26172125883108543, + "grad_norm": 1.776943450899247, + "learning_rate": 8.7417494276951e-06, + "loss": 0.7666402816772461, + "step": 1630 + }, + { + "epoch": 0.26332691072575465, + "grad_norm": 1.8417418661119802, + "learning_rate": 8.724268668683207e-06, + "loss": 0.7746978282928467, + "step": 1640 + }, + { + "epoch": 0.2649325626204239, + "grad_norm": 1.7767374838825565, + "learning_rate": 8.706685054074644e-06, + "loss": 0.6751203536987305, + "step": 1650 + }, + { + "epoch": 0.26653821451509313, + "grad_norm": 1.8890767025158177, + "learning_rate": 8.688999069487749e-06, + "loss": 0.7576641082763672, + "step": 1660 + }, + { + "epoch": 0.26814386640976234, + "grad_norm": 1.6503160358499191, + "learning_rate": 8.671211203368083e-06, + "loss": 0.7491856575012207, + "step": 1670 + }, + { + "epoch": 0.2697495183044316, + "grad_norm": 1.7619578916338219, + 
"learning_rate": 8.653321946974939e-06, + "loss": 0.7653478145599365, + "step": 1680 + }, + { + "epoch": 0.2713551701991008, + "grad_norm": 1.6161841088316613, + "learning_rate": 8.635331794367766e-06, + "loss": 0.7169931411743165, + "step": 1690 + }, + { + "epoch": 0.2729608220937701, + "grad_norm": 1.862893489220715, + "learning_rate": 8.617241242392535e-06, + "loss": 0.709630298614502, + "step": 1700 + }, + { + "epoch": 0.2745664739884393, + "grad_norm": 2.0942702006851284, + "learning_rate": 8.599050790668016e-06, + "loss": 0.732765007019043, + "step": 1710 + }, + { + "epoch": 0.2761721258831085, + "grad_norm": 1.909209797536119, + "learning_rate": 8.580760941571968e-06, + "loss": 0.7709503173828125, + "step": 1720 + }, + { + "epoch": 0.2777777777777778, + "grad_norm": 1.8776515240203724, + "learning_rate": 8.56237220022728e-06, + "loss": 0.7700500965118409, + "step": 1730 + }, + { + "epoch": 0.279383429672447, + "grad_norm": 2.055122504726026, + "learning_rate": 8.543885074488012e-06, + "loss": 0.7499999523162841, + "step": 1740 + }, + { + "epoch": 0.2809890815671163, + "grad_norm": 1.8264371508475254, + "learning_rate": 8.525300074925371e-06, + "loss": 0.7511765003204346, + "step": 1750 + }, + { + "epoch": 0.2825947334617855, + "grad_norm": 1.9987659171231758, + "learning_rate": 8.50661771481361e-06, + "loss": 0.7588027954101563, + "step": 1760 + }, + { + "epoch": 0.2842003853564547, + "grad_norm": 1.8223934722762116, + "learning_rate": 8.48783851011585e-06, + "loss": 0.7207003116607666, + "step": 1770 + }, + { + "epoch": 0.28580603725112397, + "grad_norm": 1.8043617826293, + "learning_rate": 8.468962979469841e-06, + "loss": 0.7769267082214355, + "step": 1780 + }, + { + "epoch": 0.2874116891457932, + "grad_norm": 1.5780943217315104, + "learning_rate": 8.449991644173624e-06, + "loss": 0.7341270446777344, + "step": 1790 + }, + { + "epoch": 0.28901734104046245, + "grad_norm": 1.7203708079691964, + "learning_rate": 8.43092502817114e-06, + "loss": 
0.7272850036621094, + "step": 1800 + }, + { + "epoch": 0.28901734104046245, + "eval_loss": 0.6783972978591919, + "eval_runtime": 100.8991, + "eval_samples_per_second": 20.05, + "eval_steps_per_second": 5.015, + "eval_token_acc": 0.7769258235295099, + "step": 1800 + }, + { + "epoch": 0.29062299293513166, + "grad_norm": 1.889553755438213, + "learning_rate": 8.411763658037764e-06, + "loss": 0.7512543201446533, + "step": 1810 + }, + { + "epoch": 0.2922286448298009, + "grad_norm": 1.7574412886035682, + "learning_rate": 8.392508062965758e-06, + "loss": 0.7729681968688965, + "step": 1820 + }, + { + "epoch": 0.29383429672447015, + "grad_norm": 1.5623367854652537, + "learning_rate": 8.373158774749654e-06, + "loss": 0.7248388767242432, + "step": 1830 + }, + { + "epoch": 0.29543994861913936, + "grad_norm": 1.641798516178298, + "learning_rate": 8.353716327771572e-06, + "loss": 0.7033072948455811, + "step": 1840 + }, + { + "epoch": 0.2970456005138086, + "grad_norm": 1.834462270850742, + "learning_rate": 8.33418125898646e-06, + "loss": 0.7425559997558594, + "step": 1850 + }, + { + "epoch": 0.29865125240847784, + "grad_norm": 1.8200586992341523, + "learning_rate": 8.314554107907262e-06, + "loss": 0.7474506855010986, + "step": 1860 + }, + { + "epoch": 0.30025690430314705, + "grad_norm": 1.9230698549313547, + "learning_rate": 8.294835416590019e-06, + "loss": 0.7160479068756104, + "step": 1870 + }, + { + "epoch": 0.3018625561978163, + "grad_norm": 1.8857497922783588, + "learning_rate": 8.275025729618902e-06, + "loss": 0.7675430297851562, + "step": 1880 + }, + { + "epoch": 0.30346820809248554, + "grad_norm": 1.7590784658550567, + "learning_rate": 8.255125594091169e-06, + "loss": 0.6951826095581055, + "step": 1890 + }, + { + "epoch": 0.3050738599871548, + "grad_norm": 2.070729187194605, + "learning_rate": 8.235135559602055e-06, + "loss": 0.6935332298278809, + "step": 1900 + }, + { + "epoch": 0.306679511881824, + "grad_norm": 1.9490809465048364, + "learning_rate": 8.21505617822959e-06, 
+ "loss": 0.7839535713195801, + "step": 1910 + }, + { + "epoch": 0.30828516377649323, + "grad_norm": 1.9244151039820374, + "learning_rate": 8.194888004519365e-06, + "loss": 0.77308669090271, + "step": 1920 + }, + { + "epoch": 0.3098908156711625, + "grad_norm": 1.7027702709888077, + "learning_rate": 8.1746315954692e-06, + "loss": 0.7465029716491699, + "step": 1930 + }, + { + "epoch": 0.3114964675658317, + "grad_norm": 1.8566176904812088, + "learning_rate": 8.154287510513773e-06, + "loss": 0.7425686836242675, + "step": 1940 + }, + { + "epoch": 0.313102119460501, + "grad_norm": 1.6606391878700422, + "learning_rate": 8.133856311509165e-06, + "loss": 0.738848876953125, + "step": 1950 + }, + { + "epoch": 0.3147077713551702, + "grad_norm": 1.739030610713329, + "learning_rate": 8.113338562717341e-06, + "loss": 0.6945414543151855, + "step": 1960 + }, + { + "epoch": 0.3163134232498394, + "grad_norm": 1.7105926585715472, + "learning_rate": 8.092734830790575e-06, + "loss": 0.6944126129150391, + "step": 1970 + }, + { + "epoch": 0.3179190751445087, + "grad_norm": 2.0738901086385337, + "learning_rate": 8.072045684755783e-06, + "loss": 0.7403692245483399, + "step": 1980 + }, + { + "epoch": 0.3195247270391779, + "grad_norm": 1.8644608268454954, + "learning_rate": 8.051271695998832e-06, + "loss": 0.747812557220459, + "step": 1990 + }, + { + "epoch": 0.32113037893384716, + "grad_norm": 2.0091042407307476, + "learning_rate": 8.03041343824874e-06, + "loss": 0.7711987972259522, + "step": 2000 + }, + { + "epoch": 0.32113037893384716, + "eval_loss": 0.6775335669517517, + "eval_runtime": 99.8279, + "eval_samples_per_second": 20.265, + "eval_steps_per_second": 5.069, + "eval_token_acc": 0.7775732012326872, + "step": 2000 + }, + { + "epoch": 0.3227360308285164, + "grad_norm": 1.7263752186897543, + "learning_rate": 8.009471487561837e-06, + "loss": 0.7122695922851563, + "step": 2010 + }, + { + "epoch": 0.3243416827231856, + "grad_norm": 2.3515395561571615, + "learning_rate": 
7.988446422305857e-06, + "loss": 0.7937726020812989, + "step": 2020 + }, + { + "epoch": 0.32594733461785486, + "grad_norm": 1.6016251213715271, + "learning_rate": 7.967338823143967e-06, + "loss": 0.7619183540344239, + "step": 2030 + }, + { + "epoch": 0.32755298651252407, + "grad_norm": 1.9134390634488312, + "learning_rate": 7.946149273018723e-06, + "loss": 0.7309248924255372, + "step": 2040 + }, + { + "epoch": 0.32915863840719334, + "grad_norm": 1.8217084821661345, + "learning_rate": 7.92487835713598e-06, + "loss": 0.7465410232543945, + "step": 2050 + }, + { + "epoch": 0.33076429030186255, + "grad_norm": 1.6321933700829891, + "learning_rate": 7.903526662948721e-06, + "loss": 0.7252727508544922, + "step": 2060 + }, + { + "epoch": 0.33236994219653176, + "grad_norm": 2.062142440094139, + "learning_rate": 7.882094780140838e-06, + "loss": 0.7518548965454102, + "step": 2070 + }, + { + "epoch": 0.33397559409120103, + "grad_norm": 1.8038702247848208, + "learning_rate": 7.860583300610849e-06, + "loss": 0.6832700252532959, + "step": 2080 + }, + { + "epoch": 0.33558124598587025, + "grad_norm": 1.6414105854167742, + "learning_rate": 7.838992818455542e-06, + "loss": 0.7141282081604003, + "step": 2090 + }, + { + "epoch": 0.3371868978805395, + "grad_norm": 1.918967202852482, + "learning_rate": 7.817323929953575e-06, + "loss": 0.7613588333129883, + "step": 2100 + }, + { + "epoch": 0.33879254977520873, + "grad_norm": 1.7884788315718811, + "learning_rate": 7.795577233549006e-06, + "loss": 0.7314887046813965, + "step": 2110 + }, + { + "epoch": 0.340398201669878, + "grad_norm": 1.7578597871238686, + "learning_rate": 7.773753329834767e-06, + "loss": 0.6956823348999024, + "step": 2120 + }, + { + "epoch": 0.3420038535645472, + "grad_norm": 1.7192989942916452, + "learning_rate": 7.751852821536073e-06, + "loss": 0.7687590599060059, + "step": 2130 + }, + { + "epoch": 0.3436095054592164, + "grad_norm": 1.897575905000496, + "learning_rate": 7.729876313493781e-06, + "loss": 0.7444051265716553, 
+ "step": 2140 + }, + { + "epoch": 0.3452151573538857, + "grad_norm": 1.9737469804350343, + "learning_rate": 7.70782441264768e-06, + "loss": 0.7778015613555909, + "step": 2150 + }, + { + "epoch": 0.3468208092485549, + "grad_norm": 1.7624130678825956, + "learning_rate": 7.68569772801974e-06, + "loss": 0.7128086090087891, + "step": 2160 + }, + { + "epoch": 0.3484264611432242, + "grad_norm": 2.1270906968507917, + "learning_rate": 7.663496870697267e-06, + "loss": 0.7526941299438477, + "step": 2170 + }, + { + "epoch": 0.3500321130378934, + "grad_norm": 1.7446416063759722, + "learning_rate": 7.641222453816064e-06, + "loss": 0.7122650146484375, + "step": 2180 + }, + { + "epoch": 0.3516377649325626, + "grad_norm": 1.8945875657421267, + "learning_rate": 7.618875092543467e-06, + "loss": 0.727197265625, + "step": 2190 + }, + { + "epoch": 0.35324341682723187, + "grad_norm": 1.9106945656360363, + "learning_rate": 7.596455404061365e-06, + "loss": 0.7072073459625244, + "step": 2200 + }, + { + "epoch": 0.35324341682723187, + "eval_loss": 0.6732829213142395, + "eval_runtime": 100.1797, + "eval_samples_per_second": 20.194, + "eval_steps_per_second": 5.051, + "eval_token_acc": 0.778239766677067, + "step": 2200 + }, + { + "epoch": 0.3548490687219011, + "grad_norm": 1.8773845986707667, + "learning_rate": 7.5739640075491546e-06, + "loss": 0.7187977313995362, + "step": 2210 + }, + { + "epoch": 0.35645472061657035, + "grad_norm": 1.9889746803856923, + "learning_rate": 7.551401524166646e-06, + "loss": 0.720254373550415, + "step": 2220 + }, + { + "epoch": 0.35806037251123957, + "grad_norm": 1.6923801552458178, + "learning_rate": 7.5287685770369e-06, + "loss": 0.6862672805786133, + "step": 2230 + }, + { + "epoch": 0.3596660244059088, + "grad_norm": 1.9583643264209265, + "learning_rate": 7.506065791229018e-06, + "loss": 0.7031614780426025, + "step": 2240 + }, + { + "epoch": 0.36127167630057805, + "grad_norm": 1.9020847905208893, + "learning_rate": 7.48329379374089e-06, + "loss": 
0.7478256702423096, + "step": 2250 + }, + { + "epoch": 0.36287732819524726, + "grad_norm": 2.1005274782078276, + "learning_rate": 7.460453213481862e-06, + "loss": 0.7472408294677735, + "step": 2260 + }, + { + "epoch": 0.36448298008991653, + "grad_norm": 1.8211786447513192, + "learning_rate": 7.437544681255383e-06, + "loss": 0.7363036632537842, + "step": 2270 + }, + { + "epoch": 0.36608863198458574, + "grad_norm": 1.7715962280794173, + "learning_rate": 7.414568829741572e-06, + "loss": 0.6726340293884278, + "step": 2280 + }, + { + "epoch": 0.36769428387925496, + "grad_norm": 1.8917230150289452, + "learning_rate": 7.3915262934797525e-06, + "loss": 0.7265505790710449, + "step": 2290 + }, + { + "epoch": 0.3692999357739242, + "grad_norm": 2.70306165575983, + "learning_rate": 7.368417708850923e-06, + "loss": 0.7766015529632568, + "step": 2300 + }, + { + "epoch": 0.37090558766859344, + "grad_norm": 1.9625809458137666, + "learning_rate": 7.3452437140601855e-06, + "loss": 0.7961873531341552, + "step": 2310 + }, + { + "epoch": 0.3725112395632627, + "grad_norm": 2.3354598106113844, + "learning_rate": 7.322004949119114e-06, + "loss": 0.7435796737670899, + "step": 2320 + }, + { + "epoch": 0.3741168914579319, + "grad_norm": 1.7052628525681701, + "learning_rate": 7.298702055828086e-06, + "loss": 0.7392897605895996, + "step": 2330 + }, + { + "epoch": 0.37572254335260113, + "grad_norm": 1.7874835493922498, + "learning_rate": 7.275335677758553e-06, + "loss": 0.7336819648742676, + "step": 2340 + }, + { + "epoch": 0.3773281952472704, + "grad_norm": 1.4173055115054254, + "learning_rate": 7.251906460235268e-06, + "loss": 0.6930481433868408, + "step": 2350 + }, + { + "epoch": 0.3789338471419396, + "grad_norm": 1.6721852496901906, + "learning_rate": 7.228415050318463e-06, + "loss": 0.7872378826141357, + "step": 2360 + }, + { + "epoch": 0.3805394990366089, + "grad_norm": 1.9633631265419778, + "learning_rate": 7.204862096785978e-06, + "loss": 0.7082842826843262, + "step": 2370 + }, + { + 
"epoch": 0.3821451509312781, + "grad_norm": 1.8911957209638606, + "learning_rate": 7.181248250115346e-06, + "loss": 0.7364721775054932, + "step": 2380 + }, + { + "epoch": 0.3837508028259473, + "grad_norm": 1.832997056599628, + "learning_rate": 7.1575741624658215e-06, + "loss": 0.7178568840026855, + "step": 2390 + }, + { + "epoch": 0.3853564547206166, + "grad_norm": 1.797539058974665, + "learning_rate": 7.1338404876603784e-06, + "loss": 0.7448820114135742, + "step": 2400 + }, + { + "epoch": 0.3853564547206166, + "eval_loss": 0.6699023246765137, + "eval_runtime": 101.0776, + "eval_samples_per_second": 20.014, + "eval_steps_per_second": 5.006, + "eval_token_acc": 0.7787461561948874, + "step": 2400 + }, + { + "epoch": 0.3869621066152858, + "grad_norm": 1.6601258550350135, + "learning_rate": 7.110047881167647e-06, + "loss": 0.7200119972229004, + "step": 2410 + }, + { + "epoch": 0.38856775850995506, + "grad_norm": 1.8606082672724402, + "learning_rate": 7.086197000083812e-06, + "loss": 0.7550562858581543, + "step": 2420 + }, + { + "epoch": 0.3901734104046243, + "grad_norm": 1.926057437824125, + "learning_rate": 7.0622885031144685e-06, + "loss": 0.7396778583526611, + "step": 2430 + }, + { + "epoch": 0.3917790622992935, + "grad_norm": 2.053918702058126, + "learning_rate": 7.038323050556426e-06, + "loss": 0.7883121490478515, + "step": 2440 + }, + { + "epoch": 0.39338471419396276, + "grad_norm": 1.793367615238582, + "learning_rate": 7.014301304279476e-06, + "loss": 0.7374165534973145, + "step": 2450 + }, + { + "epoch": 0.394990366088632, + "grad_norm": 2.202257715478466, + "learning_rate": 6.990223927708107e-06, + "loss": 0.7440276145935059, + "step": 2460 + }, + { + "epoch": 0.39659601798330124, + "grad_norm": 1.9298723179304524, + "learning_rate": 6.966091585803191e-06, + "loss": 0.7077909946441651, + "step": 2470 + }, + { + "epoch": 0.39820166987797045, + "grad_norm": 2.0285592686096656, + "learning_rate": 6.94190494504361e-06, + "loss": 0.7101529598236084, + "step": 2480 
+ }, + { + "epoch": 0.39980732177263967, + "grad_norm": 1.9995862014508614, + "learning_rate": 6.917664673407858e-06, + "loss": 0.7186386108398437, + "step": 2490 + }, + { + "epoch": 0.40141297366730894, + "grad_norm": 1.7943762448203864, + "learning_rate": 6.893371440355585e-06, + "loss": 0.716456413269043, + "step": 2500 + }, + { + "epoch": 0.40301862556197815, + "grad_norm": 1.8687538472669623, + "learning_rate": 6.8690259168091115e-06, + "loss": 0.7763209342956543, + "step": 2510 + }, + { + "epoch": 0.4046242774566474, + "grad_norm": 1.6838226140000359, + "learning_rate": 6.8446287751349e-06, + "loss": 0.711727523803711, + "step": 2520 + }, + { + "epoch": 0.40622992935131663, + "grad_norm": 1.6964778939304066, + "learning_rate": 6.820180689124984e-06, + "loss": 0.6991249084472656, + "step": 2530 + }, + { + "epoch": 0.40783558124598585, + "grad_norm": 1.7927684693226813, + "learning_rate": 6.795682333978365e-06, + "loss": 0.7027994155883789, + "step": 2540 + }, + { + "epoch": 0.4094412331406551, + "grad_norm": 2.013968267642278, + "learning_rate": 6.771134386282355e-06, + "loss": 0.7337832450866699, + "step": 2550 + }, + { + "epoch": 0.4110468850353243, + "grad_norm": 1.6812600519591554, + "learning_rate": 6.7465375239939e-06, + "loss": 0.7464197635650635, + "step": 2560 + }, + { + "epoch": 0.4126525369299936, + "grad_norm": 1.9037670571068173, + "learning_rate": 6.721892426420851e-06, + "loss": 0.6945043563842773, + "step": 2570 + }, + { + "epoch": 0.4142581888246628, + "grad_norm": 1.8058567638830525, + "learning_rate": 6.697199774203203e-06, + "loss": 0.6616389751434326, + "step": 2580 + }, + { + "epoch": 0.415863840719332, + "grad_norm": 1.924596069745487, + "learning_rate": 6.6724602492943035e-06, + "loss": 0.725306510925293, + "step": 2590 + }, + { + "epoch": 0.4174694926140013, + "grad_norm": 1.824557569415237, + "learning_rate": 6.64767453494201e-06, + "loss": 0.726342248916626, + "step": 2600 + }, + { + "epoch": 0.4174694926140013, + "eval_loss": 
0.665752649307251, + "eval_runtime": 100.2911, + "eval_samples_per_second": 20.171, + "eval_steps_per_second": 5.045, + "eval_token_acc": 0.7796020963024388, + "step": 2600 + }, + { + "epoch": 0.4190751445086705, + "grad_norm": 2.124267190283648, + "learning_rate": 6.6228433156698295e-06, + "loss": 0.7493174552917481, + "step": 2610 + }, + { + "epoch": 0.4206807964033398, + "grad_norm": 1.7913149024168675, + "learning_rate": 6.597967277258003e-06, + "loss": 0.7273980617523194, + "step": 2620 + }, + { + "epoch": 0.422286448298009, + "grad_norm": 1.8889920586323337, + "learning_rate": 6.573047106724574e-06, + "loss": 0.7230459690093994, + "step": 2630 + }, + { + "epoch": 0.4238921001926782, + "grad_norm": 2.366710188379232, + "learning_rate": 6.548083492306413e-06, + "loss": 0.7138511657714843, + "step": 2640 + }, + { + "epoch": 0.42549775208734747, + "grad_norm": 1.6217613788371101, + "learning_rate": 6.523077123440207e-06, + "loss": 0.7306194305419922, + "step": 2650 + }, + { + "epoch": 0.4271034039820167, + "grad_norm": 1.9317008242725322, + "learning_rate": 6.498028690743422e-06, + "loss": 0.7434864997863769, + "step": 2660 + }, + { + "epoch": 0.42870905587668595, + "grad_norm": 1.9815053943156722, + "learning_rate": 6.472938885995229e-06, + "loss": 0.6782712936401367, + "step": 2670 + }, + { + "epoch": 0.43031470777135516, + "grad_norm": 1.7541812536369152, + "learning_rate": 6.447808402117399e-06, + "loss": 0.680427074432373, + "step": 2680 + }, + { + "epoch": 0.4319203596660244, + "grad_norm": 1.7689207888279326, + "learning_rate": 6.4226379331551625e-06, + "loss": 0.7533660888671875, + "step": 2690 + }, + { + "epoch": 0.43352601156069365, + "grad_norm": 1.8560483228499987, + "learning_rate": 6.397428174258048e-06, + "loss": 0.6594399452209473, + "step": 2700 + }, + { + "epoch": 0.43513166345536286, + "grad_norm": 2.0233467391422835, + "learning_rate": 6.372179821660678e-06, + "loss": 0.7649170875549316, + "step": 2710 + }, + { + "epoch": 0.43673731535003213, 
+ "grad_norm": 2.030975023954992, + "learning_rate": 6.346893572663544e-06, + "loss": 0.7382317543029785, + "step": 2720 + }, + { + "epoch": 0.43834296724470134, + "grad_norm": 1.9123338195053914, + "learning_rate": 6.321570125613744e-06, + "loss": 0.6933588981628418, + "step": 2730 + }, + { + "epoch": 0.4399486191393706, + "grad_norm": 2.0971123140916306, + "learning_rate": 6.296210179885708e-06, + "loss": 0.7823019981384277, + "step": 2740 + }, + { + "epoch": 0.4415542710340398, + "grad_norm": 1.5531867347941812, + "learning_rate": 6.270814435861864e-06, + "loss": 0.7157551765441894, + "step": 2750 + }, + { + "epoch": 0.44315992292870904, + "grad_norm": 1.5844075638735038, + "learning_rate": 6.245383594913312e-06, + "loss": 0.7315412521362304, + "step": 2760 + }, + { + "epoch": 0.4447655748233783, + "grad_norm": 1.8782313508796873, + "learning_rate": 6.219918359380444e-06, + "loss": 0.7463915348052979, + "step": 2770 + }, + { + "epoch": 0.4463712267180475, + "grad_norm": 1.6919689969132032, + "learning_rate": 6.19441943255355e-06, + "loss": 0.6843451499938965, + "step": 2780 + }, + { + "epoch": 0.4479768786127168, + "grad_norm": 1.7577177879387673, + "learning_rate": 6.1688875186533955e-06, + "loss": 0.7416125774383545, + "step": 2790 + }, + { + "epoch": 0.449582530507386, + "grad_norm": 1.8152525178335799, + "learning_rate": 6.143323322811776e-06, + "loss": 0.7234923362731933, + "step": 2800 + }, + { + "epoch": 0.449582530507386, + "eval_loss": 0.6638072729110718, + "eval_runtime": 99.5953, + "eval_samples_per_second": 20.312, + "eval_steps_per_second": 5.081, + "eval_token_acc": 0.7802753357437585, + "step": 2800 + }, + { + "epoch": 0.4511881824020552, + "grad_norm": 1.8407066956007139, + "learning_rate": 6.11772755105203e-06, + "loss": 0.7649462699890137, + "step": 2810 + }, + { + "epoch": 0.4527938342967245, + "grad_norm": 2.025366690651478, + "learning_rate": 6.092100910269556e-06, + "loss": 0.7118297576904297, + "step": 2820 + }, + { + "epoch": 
0.4543994861913937, + "grad_norm": 1.882535993483909, + "learning_rate": 6.06644410821228e-06, + "loss": 0.7083555221557617, + "step": 2830 + }, + { + "epoch": 0.45600513808606297, + "grad_norm": 1.9132786155674317, + "learning_rate": 6.040757853461113e-06, + "loss": 0.7021913528442383, + "step": 2840 + }, + { + "epoch": 0.4576107899807322, + "grad_norm": 1.9130646960075248, + "learning_rate": 6.015042855410379e-06, + "loss": 0.7390027046203613, + "step": 2850 + }, + { + "epoch": 0.4592164418754014, + "grad_norm": 1.5813838739698574, + "learning_rate": 5.989299824248227e-06, + "loss": 0.6849452972412109, + "step": 2860 + }, + { + "epoch": 0.46082209377007066, + "grad_norm": 1.9649959319170844, + "learning_rate": 5.963529470937015e-06, + "loss": 0.7207717895507812, + "step": 2870 + }, + { + "epoch": 0.4624277456647399, + "grad_norm": 1.671436841803361, + "learning_rate": 5.937732507193671e-06, + "loss": 0.7367728233337403, + "step": 2880 + }, + { + "epoch": 0.46403339755940914, + "grad_norm": 1.5219254190640068, + "learning_rate": 5.911909645470045e-06, + "loss": 0.7643050193786621, + "step": 2890 + }, + { + "epoch": 0.46563904945407836, + "grad_norm": 1.6651969418471841, + "learning_rate": 5.886061598933228e-06, + "loss": 0.7690213203430176, + "step": 2900 + }, + { + "epoch": 0.46724470134874757, + "grad_norm": 2.035256573817491, + "learning_rate": 5.860189081445854e-06, + "loss": 0.751746940612793, + "step": 2910 + }, + { + "epoch": 0.46885035324341684, + "grad_norm": 1.6085243690709197, + "learning_rate": 5.834292807546392e-06, + "loss": 0.746894359588623, + "step": 2920 + }, + { + "epoch": 0.47045600513808605, + "grad_norm": 1.7904211777209753, + "learning_rate": 5.808373492429405e-06, + "loss": 0.7455621719360351, + "step": 2930 + }, + { + "epoch": 0.4720616570327553, + "grad_norm": 2.1015882721546486, + "learning_rate": 5.782431851925801e-06, + "loss": 0.710727596282959, + "step": 2940 + }, + { + "epoch": 0.47366730892742454, + "grad_norm": 1.8851211128040317, 
+ "learning_rate": 5.75646860248306e-06, + "loss": 0.7258530616760254, + "step": 2950 + }, + { + "epoch": 0.47527296082209375, + "grad_norm": 1.7596039756078887, + "learning_rate": 5.730484461145455e-06, + "loss": 0.7183322429656982, + "step": 2960 + }, + { + "epoch": 0.476878612716763, + "grad_norm": 1.7585093428217082, + "learning_rate": 5.704480145534243e-06, + "loss": 0.6958836078643799, + "step": 2970 + }, + { + "epoch": 0.47848426461143223, + "grad_norm": 1.7759136501368777, + "learning_rate": 5.678456373827843e-06, + "loss": 0.6852969169616699, + "step": 2980 + }, + { + "epoch": 0.4800899165061015, + "grad_norm": 1.7327874227415498, + "learning_rate": 5.652413864742016e-06, + "loss": 0.7584549427032471, + "step": 2990 + }, + { + "epoch": 0.4816955684007707, + "grad_norm": 1.8579814597845732, + "learning_rate": 5.626353337509994e-06, + "loss": 0.7292413711547852, + "step": 3000 + }, + { + "epoch": 0.4816955684007707, + "eval_loss": 0.6603939533233643, + "eval_runtime": 99.8487, + "eval_samples_per_second": 20.261, + "eval_steps_per_second": 5.068, + "eval_token_acc": 0.7815250416707684, + "step": 3000 + }, + { + "epoch": 0.4833012202954399, + "grad_norm": 1.6620639526626975, + "learning_rate": 5.600275511862636e-06, + "loss": 0.6874475479125977, + "step": 3010 + }, + { + "epoch": 0.4849068721901092, + "grad_norm": 1.863548208870452, + "learning_rate": 5.574181108008539e-06, + "loss": 0.7283189296722412, + "step": 3020 + }, + { + "epoch": 0.4865125240847784, + "grad_norm": 1.7019535500961747, + "learning_rate": 5.548070846614153e-06, + "loss": 0.7144774436950684, + "step": 3030 + }, + { + "epoch": 0.4881181759794477, + "grad_norm": 1.8180271114688507, + "learning_rate": 5.521945448783874e-06, + "loss": 0.7144730091094971, + "step": 3040 + }, + { + "epoch": 0.4897238278741169, + "grad_norm": 1.6469354529009128, + "learning_rate": 5.495805636040135e-06, + "loss": 0.702541732788086, + "step": 3050 + }, + { + "epoch": 0.4913294797687861, + "grad_norm": 
1.4253159494568708, + "learning_rate": 5.469652130303471e-06, + "loss": 0.7350476264953614, + "step": 3060 + }, + { + "epoch": 0.4929351316634554, + "grad_norm": 1.6761584639579492, + "learning_rate": 5.443485653872589e-06, + "loss": 0.7056559085845947, + "step": 3070 + }, + { + "epoch": 0.4945407835581246, + "grad_norm": 2.091157632155003, + "learning_rate": 5.417306929404413e-06, + "loss": 0.7697329044342041, + "step": 3080 + }, + { + "epoch": 0.49614643545279385, + "grad_norm": 1.8425062990475631, + "learning_rate": 5.391116679894131e-06, + "loss": 0.7427218914031982, + "step": 3090 + }, + { + "epoch": 0.49775208734746307, + "grad_norm": 1.7957934599197594, + "learning_rate": 5.364915628655227e-06, + "loss": 0.6918140888214112, + "step": 3100 + }, + { + "epoch": 0.4993577392421323, + "grad_norm": 1.9779972048893961, + "learning_rate": 5.3387044992995e-06, + "loss": 0.7020491123199463, + "step": 3110 + }, + { + "epoch": 0.5009633911368016, + "grad_norm": 1.9981792268408507, + "learning_rate": 5.312484015717087e-06, + "loss": 0.7167632102966308, + "step": 3120 + }, + { + "epoch": 0.5025690430314708, + "grad_norm": 1.8909911302926332, + "learning_rate": 5.286254902056462e-06, + "loss": 0.7625884532928466, + "step": 3130 + }, + { + "epoch": 0.50417469492614, + "grad_norm": 1.61762455157073, + "learning_rate": 5.2600178827044476e-06, + "loss": 0.6554212093353271, + "step": 3140 + }, + { + "epoch": 0.5057803468208093, + "grad_norm": 1.9936400724678622, + "learning_rate": 5.233773682266198e-06, + "loss": 0.7177192211151123, + "step": 3150 + }, + { + "epoch": 0.5073859987154785, + "grad_norm": 2.0016810272501364, + "learning_rate": 5.2075230255451924e-06, + "loss": 0.6999452114105225, + "step": 3160 + }, + { + "epoch": 0.5089916506101477, + "grad_norm": 1.4929333492848669, + "learning_rate": 5.181266637523225e-06, + "loss": 0.7226743698120117, + "step": 3170 + }, + { + "epoch": 0.5105973025048169, + "grad_norm": 1.7985727217161158, + "learning_rate": 
5.155005243340364e-06, + "loss": 0.6844250679016113, + "step": 3180 + }, + { + "epoch": 0.5122029543994862, + "grad_norm": 1.732864365144089, + "learning_rate": 5.1287395682749444e-06, + "loss": 0.708368968963623, + "step": 3190 + }, + { + "epoch": 0.5138086062941555, + "grad_norm": 1.6036953471683275, + "learning_rate": 5.102470337723524e-06, + "loss": 0.7594008445739746, + "step": 3200 + }, + { + "epoch": 0.5138086062941555, + "eval_loss": 0.6590485572814941, + "eval_runtime": 99.4458, + "eval_samples_per_second": 20.343, + "eval_steps_per_second": 5.088, + "eval_token_acc": 0.7822341538456404, + "step": 3200 + }, + { + "epoch": 0.5154142581888247, + "grad_norm": 1.9434983597979074, + "learning_rate": 5.0761982771808595e-06, + "loss": 0.690369701385498, + "step": 3210 + }, + { + "epoch": 0.5170199100834939, + "grad_norm": 1.6748286006153636, + "learning_rate": 5.049924112219859e-06, + "loss": 0.744829797744751, + "step": 3220 + }, + { + "epoch": 0.5186255619781631, + "grad_norm": 1.5457805731661085, + "learning_rate": 5.023648568471559e-06, + "loss": 0.6961235046386719, + "step": 3230 + }, + { + "epoch": 0.5202312138728323, + "grad_norm": 1.6660310158387102, + "learning_rate": 4.997372371605066e-06, + "loss": 0.7534513473510742, + "step": 3240 + }, + { + "epoch": 0.5218368657675017, + "grad_norm": 1.7718731852467215, + "learning_rate": 4.971096247307528e-06, + "loss": 0.7069045066833496, + "step": 3250 + }, + { + "epoch": 0.5234425176621709, + "grad_norm": 1.6283766654055158, + "learning_rate": 4.944820921264089e-06, + "loss": 0.6802540302276612, + "step": 3260 + }, + { + "epoch": 0.5250481695568401, + "grad_norm": 2.2262011526125107, + "learning_rate": 4.918547119137846e-06, + "loss": 0.7522598266601562, + "step": 3270 + }, + { + "epoch": 0.5266538214515093, + "grad_norm": 1.74540928610349, + "learning_rate": 4.89227556654981e-06, + "loss": 0.7161440372467041, + "step": 3280 + }, + { + "epoch": 0.5282594733461785, + "grad_norm": 1.4513773030560617, + 
"learning_rate": 4.866006989058862e-06, + "loss": 0.6914768218994141, + "step": 3290 + }, + { + "epoch": 0.5298651252408478, + "grad_norm": 1.7847841689336374, + "learning_rate": 4.839742112141725e-06, + "loss": 0.6942119598388672, + "step": 3300 + }, + { + "epoch": 0.531470777135517, + "grad_norm": 1.7175053440669847, + "learning_rate": 4.813481661172912e-06, + "loss": 0.6722054481506348, + "step": 3310 + }, + { + "epoch": 0.5330764290301863, + "grad_norm": 1.7006060538832268, + "learning_rate": 4.787226361404706e-06, + "loss": 0.7729241371154785, + "step": 3320 + }, + { + "epoch": 0.5346820809248555, + "grad_norm": 1.9551619500224633, + "learning_rate": 4.760976937947128e-06, + "loss": 0.7557572364807129, + "step": 3330 + }, + { + "epoch": 0.5362877328195247, + "grad_norm": 1.9346894609017689, + "learning_rate": 4.7347341157479055e-06, + "loss": 0.7043597221374511, + "step": 3340 + }, + { + "epoch": 0.537893384714194, + "grad_norm": 2.0892624262979878, + "learning_rate": 4.708498619572455e-06, + "loss": 0.6981153011322021, + "step": 3350 + }, + { + "epoch": 0.5394990366088632, + "grad_norm": 1.8011714452824432, + "learning_rate": 4.682271173983865e-06, + "loss": 0.6955915451049804, + "step": 3360 + }, + { + "epoch": 0.5411046885035324, + "grad_norm": 1.8385763388931964, + "learning_rate": 4.6560525033228885e-06, + "loss": 0.7396990776062011, + "step": 3370 + }, + { + "epoch": 0.5427103403982017, + "grad_norm": 1.9241555680433726, + "learning_rate": 4.629843331687935e-06, + "loss": 0.7320691108703613, + "step": 3380 + }, + { + "epoch": 0.5443159922928709, + "grad_norm": 1.8609663841971344, + "learning_rate": 4.603644382915069e-06, + "loss": 0.7013369560241699, + "step": 3390 + }, + { + "epoch": 0.5459216441875402, + "grad_norm": 1.5651123605862947, + "learning_rate": 4.577456380558028e-06, + "loss": 0.7569509506225586, + "step": 3400 + }, + { + "epoch": 0.5459216441875402, + "eval_loss": 0.6562788486480713, + "eval_runtime": 101.0782, + "eval_samples_per_second": 
20.014, + "eval_steps_per_second": 5.006, + "eval_token_acc": 0.7831568339225916, + "step": 3400 + }, + { + "epoch": 0.5475272960822094, + "grad_norm": 1.5797728712399979, + "learning_rate": 4.551280047868233e-06, + "loss": 0.703862476348877, + "step": 3410 + }, + { + "epoch": 0.5491329479768786, + "grad_norm": 2.104046311869008, + "learning_rate": 4.525116107774815e-06, + "loss": 0.6947150230407715, + "step": 3420 + }, + { + "epoch": 0.5507385998715478, + "grad_norm": 1.9353364330514562, + "learning_rate": 4.498965282864654e-06, + "loss": 0.7750646591186523, + "step": 3430 + }, + { + "epoch": 0.552344251766217, + "grad_norm": 1.6516572356278183, + "learning_rate": 4.472828295362417e-06, + "loss": 0.7100794315338135, + "step": 3440 + }, + { + "epoch": 0.5539499036608864, + "grad_norm": 1.8878923547505169, + "learning_rate": 4.446705867110613e-06, + "loss": 0.7117058753967285, + "step": 3450 + }, + { + "epoch": 0.5555555555555556, + "grad_norm": 1.7084225805175084, + "learning_rate": 4.420598719549661e-06, + "loss": 0.6922745704650879, + "step": 3460 + }, + { + "epoch": 0.5571612074502248, + "grad_norm": 1.8216538078021467, + "learning_rate": 4.39450757369796e-06, + "loss": 0.6797658920288085, + "step": 3470 + }, + { + "epoch": 0.558766859344894, + "grad_norm": 1.6982254987436856, + "learning_rate": 4.368433150131983e-06, + "loss": 0.6903456211090088, + "step": 3480 + }, + { + "epoch": 0.5603725112395632, + "grad_norm": 1.8338849318375383, + "learning_rate": 4.342376168966368e-06, + "loss": 0.6925700187683106, + "step": 3490 + }, + { + "epoch": 0.5619781631342325, + "grad_norm": 1.6205656689270516, + "learning_rate": 4.316337349834041e-06, + "loss": 0.6894615650177002, + "step": 3500 + }, + { + "epoch": 0.5635838150289018, + "grad_norm": 1.8627217085763261, + "learning_rate": 4.290317411866329e-06, + "loss": 0.7587858200073242, + "step": 3510 + }, + { + "epoch": 0.565189466923571, + "grad_norm": 1.7262427202144772, + "learning_rate": 4.264317073673108e-06, + "loss": 
0.6634279727935791, + "step": 3520 + }, + { + "epoch": 0.5667951188182402, + "grad_norm": 1.7942547624076908, + "learning_rate": 4.238337053322954e-06, + "loss": 0.6912859916687012, + "step": 3530 + }, + { + "epoch": 0.5684007707129094, + "grad_norm": 1.9726052068008915, + "learning_rate": 4.212378068323312e-06, + "loss": 0.7470963001251221, + "step": 3540 + }, + { + "epoch": 0.5700064226075787, + "grad_norm": 1.8122951442259065, + "learning_rate": 4.186440835600677e-06, + "loss": 0.708991813659668, + "step": 3550 + }, + { + "epoch": 0.5716120745022479, + "grad_norm": 1.9588586663070653, + "learning_rate": 4.1605260714808e-06, + "loss": 0.7249481201171875, + "step": 3560 + }, + { + "epoch": 0.5732177263969171, + "grad_norm": 1.810489889692105, + "learning_rate": 4.134634491668903e-06, + "loss": 0.680173110961914, + "step": 3570 + }, + { + "epoch": 0.5748233782915864, + "grad_norm": 1.6521777772159827, + "learning_rate": 4.108766811229906e-06, + "loss": 0.7033731937408447, + "step": 3580 + }, + { + "epoch": 0.5764290301862556, + "grad_norm": 1.7201532835806859, + "learning_rate": 4.0829237445686895e-06, + "loss": 0.7144006729125977, + "step": 3590 + }, + { + "epoch": 0.5780346820809249, + "grad_norm": 1.8473455193169876, + "learning_rate": 4.057106005410356e-06, + "loss": 0.7390274047851563, + "step": 3600 + }, + { + "epoch": 0.5780346820809249, + "eval_loss": 0.6529011726379395, + "eval_runtime": 100.1569, + "eval_samples_per_second": 20.198, + "eval_steps_per_second": 5.052, + "eval_token_acc": 0.7836457041984446, + "step": 3600 + }, + { + "epoch": 0.5796403339755941, + "grad_norm": 1.6568429454392342, + "learning_rate": 4.0313143067805255e-06, + "loss": 0.7353650093078613, + "step": 3610 + }, + { + "epoch": 0.5812459858702633, + "grad_norm": 1.7711489931478204, + "learning_rate": 4.005549360985633e-06, + "loss": 0.652749490737915, + "step": 3620 + }, + { + "epoch": 0.5828516377649325, + "grad_norm": 1.757927815054455, + "learning_rate": 3.979811879593269e-06, + 
"loss": 0.7223974227905273, + "step": 3630 + }, + { + "epoch": 0.5844572896596018, + "grad_norm": 2.066360457008701, + "learning_rate": 3.954102573412517e-06, + "loss": 0.7130667209625244, + "step": 3640 + }, + { + "epoch": 0.5860629415542711, + "grad_norm": 1.9898510462446428, + "learning_rate": 3.9284221524743285e-06, + "loss": 0.6991080284118653, + "step": 3650 + }, + { + "epoch": 0.5876685934489403, + "grad_norm": 1.7615438213992136, + "learning_rate": 3.902771326011914e-06, + "loss": 0.6893427848815918, + "step": 3660 + }, + { + "epoch": 0.5892742453436095, + "grad_norm": 1.982320389173454, + "learning_rate": 3.877150802441151e-06, + "loss": 0.7484220027923584, + "step": 3670 + }, + { + "epoch": 0.5908798972382787, + "grad_norm": 1.741693118016323, + "learning_rate": 3.851561289341023e-06, + "loss": 0.6734917640686036, + "step": 3680 + }, + { + "epoch": 0.5924855491329479, + "grad_norm": 1.7966100597081194, + "learning_rate": 3.8260034934340774e-06, + "loss": 0.6977890014648438, + "step": 3690 + }, + { + "epoch": 0.5940912010276173, + "grad_norm": 1.8214854618786471, + "learning_rate": 3.800478120566906e-06, + "loss": 0.6718869209289551, + "step": 3700 + }, + { + "epoch": 0.5956968529222865, + "grad_norm": 1.936538410584516, + "learning_rate": 3.7749858756906516e-06, + "loss": 0.7291570663452148, + "step": 3710 + }, + { + "epoch": 0.5973025048169557, + "grad_norm": 1.9259131709896102, + "learning_rate": 3.749527462841539e-06, + "loss": 0.7349955558776855, + "step": 3720 + }, + { + "epoch": 0.5989081567116249, + "grad_norm": 1.7114063191334197, + "learning_rate": 3.724103585121436e-06, + "loss": 0.6840395927429199, + "step": 3730 + }, + { + "epoch": 0.6005138086062941, + "grad_norm": 1.8004891247262185, + "learning_rate": 3.698714944678424e-06, + "loss": 0.7046355724334716, + "step": 3740 + }, + { + "epoch": 0.6021194605009634, + "grad_norm": 1.9635884438112328, + "learning_rate": 3.6733622426874184e-06, + "loss": 0.7282588005065918, + "step": 3750 + }, + { + 
"epoch": 0.6037251123956326, + "grad_norm": 1.8288920625641636, + "learning_rate": 3.648046179330796e-06, + "loss": 0.6897663116455078, + "step": 3760 + }, + { + "epoch": 0.6053307642903019, + "grad_norm": 1.956581186568473, + "learning_rate": 3.62276745377906e-06, + "loss": 0.7430720806121827, + "step": 3770 + }, + { + "epoch": 0.6069364161849711, + "grad_norm": 1.905669012280911, + "learning_rate": 3.597526764171532e-06, + "loss": 0.763132905960083, + "step": 3780 + }, + { + "epoch": 0.6085420680796403, + "grad_norm": 1.692613025987985, + "learning_rate": 3.5723248075970684e-06, + "loss": 0.7154220104217529, + "step": 3790 + }, + { + "epoch": 0.6101477199743096, + "grad_norm": 1.6188001979454867, + "learning_rate": 3.547162280074813e-06, + "loss": 0.6731904983520508, + "step": 3800 + }, + { + "epoch": 0.6101477199743096, + "eval_loss": 0.6499109268188477, + "eval_runtime": 101.607, + "eval_samples_per_second": 19.91, + "eval_steps_per_second": 4.98, + "eval_token_acc": 0.7846401297425005, + "step": 3800 + }, + { + "epoch": 0.6117533718689788, + "grad_norm": 1.6466933977964522, + "learning_rate": 3.5220398765349662e-06, + "loss": 0.6557781219482421, + "step": 3810 + }, + { + "epoch": 0.613359023763648, + "grad_norm": 1.650956939754262, + "learning_rate": 3.4969582907996015e-06, + "loss": 0.7916486740112305, + "step": 3820 + }, + { + "epoch": 0.6149646756583173, + "grad_norm": 1.9499600064872187, + "learning_rate": 3.471918215563499e-06, + "loss": 0.7331917762756348, + "step": 3830 + }, + { + "epoch": 0.6165703275529865, + "grad_norm": 1.7488554020922835, + "learning_rate": 3.4469203423750152e-06, + "loss": 0.7139426708221436, + "step": 3840 + }, + { + "epoch": 0.6181759794476558, + "grad_norm": 1.9481794866010824, + "learning_rate": 3.421965361616985e-06, + "loss": 0.7002578258514405, + "step": 3850 + }, + { + "epoch": 0.619781631342325, + "grad_norm": 1.8164493133989255, + "learning_rate": 3.3970539624876565e-06, + "loss": 0.7944469451904297, + "step": 3860 + }, 
+ { + "epoch": 0.6213872832369942, + "grad_norm": 1.7973574254384317, + "learning_rate": 3.372186832981652e-06, + "loss": 0.7096256256103516, + "step": 3870 + }, + { + "epoch": 0.6229929351316634, + "grad_norm": 1.8905305819711578, + "learning_rate": 3.3473646598709724e-06, + "loss": 0.7385225772857666, + "step": 3880 + }, + { + "epoch": 0.6245985870263326, + "grad_norm": 1.9720483336100236, + "learning_rate": 3.322588128686027e-06, + "loss": 0.631558609008789, + "step": 3890 + }, + { + "epoch": 0.626204238921002, + "grad_norm": 1.9199256983971729, + "learning_rate": 3.297857923696702e-06, + "loss": 0.7101581573486329, + "step": 3900 + }, + { + "epoch": 0.6278098908156712, + "grad_norm": 1.7281699563667379, + "learning_rate": 3.273174727893463e-06, + "loss": 0.6946653366088867, + "step": 3910 + }, + { + "epoch": 0.6294155427103404, + "grad_norm": 1.8488768775022235, + "learning_rate": 3.248539222968489e-06, + "loss": 0.7121337413787842, + "step": 3920 + }, + { + "epoch": 0.6310211946050096, + "grad_norm": 1.9382810286305994, + "learning_rate": 3.223952089296854e-06, + "loss": 0.6842686653137207, + "step": 3930 + }, + { + "epoch": 0.6326268464996788, + "grad_norm": 1.6643793135842617, + "learning_rate": 3.199414005917726e-06, + "loss": 0.7061540603637695, + "step": 3940 + }, + { + "epoch": 0.6342324983943481, + "grad_norm": 2.724263331256564, + "learning_rate": 3.1749256505156203e-06, + "loss": 0.7035871505737304, + "step": 3950 + }, + { + "epoch": 0.6358381502890174, + "grad_norm": 2.0469158236441265, + "learning_rate": 3.150487699401681e-06, + "loss": 0.7187200546264648, + "step": 3960 + }, + { + "epoch": 0.6374438021836866, + "grad_norm": 2.0083569748830437, + "learning_rate": 3.1261008274950045e-06, + "loss": 0.7021552085876465, + "step": 3970 + }, + { + "epoch": 0.6390494540783558, + "grad_norm": 1.9963871377571232, + "learning_rate": 3.1017657083039974e-06, + "loss": 0.7082595825195312, + "step": 3980 + }, + { + "epoch": 0.640655105973025, + "grad_norm": 
1.680927867718523, + "learning_rate": 3.0774830139077816e-06, + "loss": 0.6650783061981201, + "step": 3990 + }, + { + "epoch": 0.6422607578676943, + "grad_norm": 1.59265243871285, + "learning_rate": 3.0532534149376225e-06, + "loss": 0.6559285163879395, + "step": 4000 + }, + { + "epoch": 0.6422607578676943, + "eval_loss": 0.6435865759849548, + "eval_runtime": 100.3623, + "eval_samples_per_second": 20.157, + "eval_steps_per_second": 5.042, + "eval_token_acc": 0.7852274414732181, + "step": 4000 + }, + { + "epoch": 0.6438664097623635, + "grad_norm": 2.059823477222312, + "learning_rate": 3.0290775805584182e-06, + "loss": 0.6783754348754882, + "step": 4010 + }, + { + "epoch": 0.6454720616570327, + "grad_norm": 1.8047856556885493, + "learning_rate": 3.0049561784502125e-06, + "loss": 0.6974565029144287, + "step": 4020 + }, + { + "epoch": 0.647077713551702, + "grad_norm": 1.8332131891460988, + "learning_rate": 2.980889874789758e-06, + "loss": 0.7201285839080811, + "step": 4030 + }, + { + "epoch": 0.6486833654463712, + "grad_norm": 1.4349168104380714, + "learning_rate": 2.956879334232117e-06, + "loss": 0.669098949432373, + "step": 4040 + }, + { + "epoch": 0.6502890173410405, + "grad_norm": 1.6713525304929475, + "learning_rate": 2.9329252198923026e-06, + "loss": 0.6994924545288086, + "step": 4050 + }, + { + "epoch": 0.6518946692357097, + "grad_norm": 1.9182171362094331, + "learning_rate": 2.909028193326974e-06, + "loss": 0.7437381744384766, + "step": 4060 + }, + { + "epoch": 0.6535003211303789, + "grad_norm": 1.9925290937878184, + "learning_rate": 2.8851889145161515e-06, + "loss": 0.7246116161346435, + "step": 4070 + }, + { + "epoch": 0.6551059730250481, + "grad_norm": 2.0862100251659523, + "learning_rate": 2.861408041845002e-06, + "loss": 0.6847538948059082, + "step": 4080 + }, + { + "epoch": 0.6567116249197174, + "grad_norm": 1.8109480499644828, + "learning_rate": 2.8376862320856524e-06, + "loss": 0.6831855773925781, + "step": 4090 + }, + { + "epoch": 0.6583172768143867, + 
"grad_norm": 1.976494032777444, + "learning_rate": 2.814024140379048e-06, + "loss": 0.6985796451568603, + "step": 4100 + }, + { + "epoch": 0.6599229287090559, + "grad_norm": 1.9685296222085586, + "learning_rate": 2.7904224202168608e-06, + "loss": 0.6664264678955079, + "step": 4110 + }, + { + "epoch": 0.6615285806037251, + "grad_norm": 1.9281066229028763, + "learning_rate": 2.766881723423441e-06, + "loss": 0.719045352935791, + "step": 4120 + }, + { + "epoch": 0.6631342324983943, + "grad_norm": 1.6502284540715824, + "learning_rate": 2.7434027001378194e-06, + "loss": 0.6993152141571045, + "step": 4130 + }, + { + "epoch": 0.6647398843930635, + "grad_norm": 1.6697487416381616, + "learning_rate": 2.719985998795747e-06, + "loss": 0.7472553253173828, + "step": 4140 + }, + { + "epoch": 0.6663455362877329, + "grad_norm": 1.6636753892060745, + "learning_rate": 2.696632266111784e-06, + "loss": 0.7038356781005859, + "step": 4150 + }, + { + "epoch": 0.6679511881824021, + "grad_norm": 1.9803593784420075, + "learning_rate": 2.67334214706145e-06, + "loss": 0.7307730197906495, + "step": 4160 + }, + { + "epoch": 0.6695568400770713, + "grad_norm": 1.767255994774783, + "learning_rate": 2.6501162848634023e-06, + "loss": 0.7219781875610352, + "step": 4170 + }, + { + "epoch": 0.6711624919717405, + "grad_norm": 1.901874438910319, + "learning_rate": 2.6269553209616705e-06, + "loss": 0.6743021965026855, + "step": 4180 + }, + { + "epoch": 0.6727681438664097, + "grad_norm": 2.0677163684150965, + "learning_rate": 2.603859895007953e-06, + "loss": 0.7430953502655029, + "step": 4190 + }, + { + "epoch": 0.674373795761079, + "grad_norm": 1.6171922112559496, + "learning_rate": 2.5808306448439363e-06, + "loss": 0.6823254585266113, + "step": 4200 + }, + { + "epoch": 0.674373795761079, + "eval_loss": 0.6416329741477966, + "eval_runtime": 102.1529, + "eval_samples_per_second": 19.804, + "eval_steps_per_second": 4.953, + "eval_token_acc": 0.7860491773464522, + "step": 4200 + }, + { + "epoch": 
0.6759794476557482, + "grad_norm": 2.156271432064396, + "learning_rate": 2.557868206483689e-06, + "loss": 0.7033274650573731, + "step": 4210 + }, + { + "epoch": 0.6775850995504175, + "grad_norm": 1.8682650555990705, + "learning_rate": 2.5349732140960924e-06, + "loss": 0.706195592880249, + "step": 4220 + }, + { + "epoch": 0.6791907514450867, + "grad_norm": 1.8511909561934217, + "learning_rate": 2.5121462999873304e-06, + "loss": 0.6629680633544922, + "step": 4230 + }, + { + "epoch": 0.680796403339756, + "grad_norm": 1.7968251606366754, + "learning_rate": 2.48938809458342e-06, + "loss": 0.6985820770263672, + "step": 4240 + }, + { + "epoch": 0.6824020552344252, + "grad_norm": 1.5956361541126427, + "learning_rate": 2.466699226412807e-06, + "loss": 0.6692060470581055, + "step": 4250 + }, + { + "epoch": 0.6840077071290944, + "grad_norm": 1.8716762576484385, + "learning_rate": 2.4440803220890054e-06, + "loss": 0.6867996215820312, + "step": 4260 + }, + { + "epoch": 0.6856133590237636, + "grad_norm": 1.937863972461209, + "learning_rate": 2.4215320062932884e-06, + "loss": 0.679615592956543, + "step": 4270 + }, + { + "epoch": 0.6872190109184328, + "grad_norm": 1.779698511514955, + "learning_rate": 2.399054901757442e-06, + "loss": 0.7149673461914062, + "step": 4280 + }, + { + "epoch": 0.6888246628131022, + "grad_norm": 1.9594648302014621, + "learning_rate": 2.3766496292465626e-06, + "loss": 0.6776654243469238, + "step": 4290 + }, + { + "epoch": 0.6904303147077714, + "grad_norm": 2.164984986049087, + "learning_rate": 2.3543168075419128e-06, + "loss": 0.7544286727905274, + "step": 4300 + }, + { + "epoch": 0.6920359666024406, + "grad_norm": 1.7026126548400184, + "learning_rate": 2.3320570534238333e-06, + "loss": 0.6883758544921875, + "step": 4310 + }, + { + "epoch": 0.6936416184971098, + "grad_norm": 1.894526210798688, + "learning_rate": 2.3098709816547126e-06, + "loss": 0.669807243347168, + "step": 4320 + }, + { + "epoch": 0.695247270391779, + "grad_norm": 1.7104031052730917, + 
"learning_rate": 2.2877592049620013e-06, + "loss": 0.6752557277679443, + "step": 4330 + }, + { + "epoch": 0.6968529222864484, + "grad_norm": 2.0154910175662177, + "learning_rate": 2.2657223340212937e-06, + "loss": 0.7199252128601075, + "step": 4340 + }, + { + "epoch": 0.6984585741811176, + "grad_norm": 1.6326002815510572, + "learning_rate": 2.243760977439463e-06, + "loss": 0.6786823749542237, + "step": 4350 + }, + { + "epoch": 0.7000642260757868, + "grad_norm": 1.8278106499138467, + "learning_rate": 2.2218757417378524e-06, + "loss": 0.7039000034332276, + "step": 4360 + }, + { + "epoch": 0.701669877970456, + "grad_norm": 1.5150426633022813, + "learning_rate": 2.2000672313355243e-06, + "loss": 0.6673988342285156, + "step": 4370 + }, + { + "epoch": 0.7032755298651252, + "grad_norm": 1.814010824208995, + "learning_rate": 2.178336048532567e-06, + "loss": 0.6703363418579101, + "step": 4380 + }, + { + "epoch": 0.7048811817597945, + "grad_norm": 1.932034342856998, + "learning_rate": 2.1566827934934625e-06, + "loss": 0.7365949630737305, + "step": 4390 + }, + { + "epoch": 0.7064868336544637, + "grad_norm": 1.583750341584408, + "learning_rate": 2.1351080642305087e-06, + "loss": 0.7208436012268067, + "step": 4400 + }, + { + "epoch": 0.7064868336544637, + "eval_loss": 0.6407771110534668, + "eval_runtime": 100.9662, + "eval_samples_per_second": 20.036, + "eval_steps_per_second": 5.012, + "eval_token_acc": 0.7865872683497375, + "step": 4400 + }, + { + "epoch": 0.708092485549133, + "grad_norm": 1.7120130450491946, + "learning_rate": 2.1136124565873067e-06, + "loss": 0.6615838050842285, + "step": 4410 + }, + { + "epoch": 0.7096981374438022, + "grad_norm": 1.8942066282925965, + "learning_rate": 2.092196564222301e-06, + "loss": 0.7101615905761719, + "step": 4420 + }, + { + "epoch": 0.7113037893384714, + "grad_norm": 2.0967864377771313, + "learning_rate": 2.070860978592389e-06, + "loss": 0.760159969329834, + "step": 4430 + }, + { + "epoch": 0.7129094412331407, + "grad_norm": 
1.87264295172251, + "learning_rate": 2.04960628893658e-06, + "loss": 0.6784673690795898, + "step": 4440 + }, + { + "epoch": 0.7145150931278099, + "grad_norm": 1.5829797358177315, + "learning_rate": 2.0284330822597328e-06, + "loss": 0.697022819519043, + "step": 4450 + }, + { + "epoch": 0.7161207450224791, + "grad_norm": 1.954361401728327, + "learning_rate": 2.0073419433163287e-06, + "loss": 0.6933704376220703, + "step": 4460 + }, + { + "epoch": 0.7177263969171483, + "grad_norm": 1.8847843880125525, + "learning_rate": 1.9863334545943346e-06, + "loss": 0.6825516223907471, + "step": 4470 + }, + { + "epoch": 0.7193320488118176, + "grad_norm": 1.80323508483505, + "learning_rate": 1.96540819629911e-06, + "loss": 0.71434326171875, + "step": 4480 + }, + { + "epoch": 0.7209377007064869, + "grad_norm": 1.5642782564942102, + "learning_rate": 1.944566746337384e-06, + "loss": 0.6642905712127686, + "step": 4490 + }, + { + "epoch": 0.7225433526011561, + "grad_norm": 1.678835575797861, + "learning_rate": 1.9238096803012977e-06, + "loss": 0.7029307842254638, + "step": 4500 + }, + { + "epoch": 0.7241490044958253, + "grad_norm": 1.7368484560789226, + "learning_rate": 1.9031375714525024e-06, + "loss": 0.6675965309143066, + "step": 4510 + }, + { + "epoch": 0.7257546563904945, + "grad_norm": 2.0308775134008976, + "learning_rate": 1.8825509907063328e-06, + "loss": 0.7120641231536865, + "step": 4520 + }, + { + "epoch": 0.7273603082851637, + "grad_norm": 1.632678314506496, + "learning_rate": 1.862050506616036e-06, + "loss": 0.699639892578125, + "step": 4530 + }, + { + "epoch": 0.7289659601798331, + "grad_norm": 1.7984916892790255, + "learning_rate": 1.841636685357071e-06, + "loss": 0.7304025650024414, + "step": 4540 + }, + { + "epoch": 0.7305716120745023, + "grad_norm": 1.8169707359801452, + "learning_rate": 1.8213100907114723e-06, + "loss": 0.6680910110473632, + "step": 4550 + }, + { + "epoch": 0.7321772639691715, + "grad_norm": 1.9689432226586039, + "learning_rate": 
1.8010712840522787e-06, + "loss": 0.7318731307983398, + "step": 4560 + }, + { + "epoch": 0.7337829158638407, + "grad_norm": 1.9672898212636953, + "learning_rate": 1.7809208243280295e-06, + "loss": 0.6793680191040039, + "step": 4570 + }, + { + "epoch": 0.7353885677585099, + "grad_norm": 1.7885095729577234, + "learning_rate": 1.7608592680473286e-06, + "loss": 0.6592792510986328, + "step": 4580 + }, + { + "epoch": 0.7369942196531792, + "grad_norm": 1.7762408489737018, + "learning_rate": 1.740887169263477e-06, + "loss": 0.7005180358886719, + "step": 4590 + }, + { + "epoch": 0.7385998715478485, + "grad_norm": 1.7404523046374716, + "learning_rate": 1.7210050795591659e-06, + "loss": 0.6815290451049805, + "step": 4600 + }, + { + "epoch": 0.7385998715478485, + "eval_loss": 0.6378750205039978, + "eval_runtime": 101.5529, + "eval_samples_per_second": 19.921, + "eval_steps_per_second": 4.983, + "eval_token_acc": 0.7871428785949902, + "step": 4600 + }, + { + "epoch": 0.7402055234425177, + "grad_norm": 2.1390203235709064, + "learning_rate": 1.7012135480312453e-06, + "loss": 0.773406982421875, + "step": 4610 + }, + { + "epoch": 0.7418111753371869, + "grad_norm": 1.8394303048257645, + "learning_rate": 1.681513121275562e-06, + "loss": 0.6579567432403565, + "step": 4620 + }, + { + "epoch": 0.7434168272318561, + "grad_norm": 4.00766593758118, + "learning_rate": 1.6619043433718618e-06, + "loss": 0.6489538192749024, + "step": 4630 + }, + { + "epoch": 0.7450224791265254, + "grad_norm": 2.004303828563679, + "learning_rate": 1.6423877558687618e-06, + "loss": 0.7187282562255859, + "step": 4640 + }, + { + "epoch": 0.7466281310211946, + "grad_norm": 1.8011662444901102, + "learning_rate": 1.6229638977687978e-06, + "loss": 0.6926092624664306, + "step": 4650 + }, + { + "epoch": 0.7482337829158638, + "grad_norm": 1.857568101595114, + "learning_rate": 1.6036333055135345e-06, + "loss": 0.7091545104980469, + "step": 4660 + }, + { + "epoch": 0.7498394348105331, + "grad_norm": 1.5863365553976057, + 
"learning_rate": 1.5843965129687534e-06, + "loss": 0.6575593948364258, + "step": 4670 + }, + { + "epoch": 0.7514450867052023, + "grad_norm": 2.0852974372506323, + "learning_rate": 1.5652540514097053e-06, + "loss": 0.7059723854064941, + "step": 4680 + }, + { + "epoch": 0.7530507385998716, + "grad_norm": 1.6022043007467883, + "learning_rate": 1.5462064495064422e-06, + "loss": 0.7094513893127441, + "step": 4690 + }, + { + "epoch": 0.7546563904945408, + "grad_norm": 1.9880232886802098, + "learning_rate": 1.5272542333092111e-06, + "loss": 0.712027645111084, + "step": 4700 + }, + { + "epoch": 0.75626204238921, + "grad_norm": 1.8966672269171247, + "learning_rate": 1.5083979262339299e-06, + "loss": 0.7096713066101075, + "step": 4710 + }, + { + "epoch": 0.7578676942838792, + "grad_norm": 2.2560082777752926, + "learning_rate": 1.4896380490477336e-06, + "loss": 0.7394931793212891, + "step": 4720 + }, + { + "epoch": 0.7594733461785484, + "grad_norm": 1.6889658107554155, + "learning_rate": 1.4709751198545858e-06, + "loss": 0.6921211242675781, + "step": 4730 + }, + { + "epoch": 0.7610789980732178, + "grad_norm": 1.8873972073347112, + "learning_rate": 1.4524096540809746e-06, + "loss": 0.6788894653320312, + "step": 4740 + }, + { + "epoch": 0.762684649967887, + "grad_norm": 1.7372990869576996, + "learning_rate": 1.4339421644616723e-06, + "loss": 0.6684075355529785, + "step": 4750 + }, + { + "epoch": 0.7642903018625562, + "grad_norm": 1.8153713813071988, + "learning_rate": 1.415573161025584e-06, + "loss": 0.6402009010314942, + "step": 4760 + }, + { + "epoch": 0.7658959537572254, + "grad_norm": 1.659182519397587, + "learning_rate": 1.3973031510816542e-06, + "loss": 0.6899700164794922, + "step": 4770 + }, + { + "epoch": 0.7675016056518946, + "grad_norm": 1.871741870915225, + "learning_rate": 1.3791326392048593e-06, + "loss": 0.7124199867248535, + "step": 4780 + }, + { + "epoch": 0.769107257546564, + "grad_norm": 1.9475717396141488, + "learning_rate": 1.3610621272222713e-06, + "loss": 
0.6798200607299805, + "step": 4790 + }, + { + "epoch": 0.7707129094412332, + "grad_norm": 1.609026845213952, + "learning_rate": 1.3430921141991977e-06, + "loss": 0.701236629486084, + "step": 4800 + }, + { + "epoch": 0.7707129094412332, + "eval_loss": 0.637983500957489, + "eval_runtime": 101.3114, + "eval_samples_per_second": 19.968, + "eval_steps_per_second": 4.995, + "eval_token_acc": 0.7876934833425379, + "step": 4800 + }, + { + "epoch": 0.7723185613359024, + "grad_norm": 1.9809938227068873, + "learning_rate": 1.3252230964253998e-06, + "loss": 0.6687247276306152, + "step": 4810 + }, + { + "epoch": 0.7739242132305716, + "grad_norm": 1.5879163475967482, + "learning_rate": 1.3074555674013901e-06, + "loss": 0.6856508255004883, + "step": 4820 + }, + { + "epoch": 0.7755298651252408, + "grad_norm": 1.9328904935598028, + "learning_rate": 1.2897900178247945e-06, + "loss": 0.7378098487854003, + "step": 4830 + }, + { + "epoch": 0.7771355170199101, + "grad_norm": 1.8830227485438022, + "learning_rate": 1.2722269355768058e-06, + "loss": 0.6985198020935058, + "step": 4840 + }, + { + "epoch": 0.7787411689145793, + "grad_norm": 1.848123076439394, + "learning_rate": 1.2547668057087097e-06, + "loss": 0.6710779666900635, + "step": 4850 + }, + { + "epoch": 0.7803468208092486, + "grad_norm": 2.004806413254504, + "learning_rate": 1.237410110428487e-06, + "loss": 0.711176586151123, + "step": 4860 + }, + { + "epoch": 0.7819524727039178, + "grad_norm": 2.2065298885710942, + "learning_rate": 1.2201573290874963e-06, + "loss": 0.6620469093322754, + "step": 4870 + }, + { + "epoch": 0.783558124598587, + "grad_norm": 1.8575907718766689, + "learning_rate": 1.2030089381672384e-06, + "loss": 0.7247062206268311, + "step": 4880 + }, + { + "epoch": 0.7851637764932563, + "grad_norm": 1.8829745520117143, + "learning_rate": 1.1859654112661923e-06, + "loss": 0.7227104187011719, + "step": 4890 + }, + { + "epoch": 0.7867694283879255, + "grad_norm": 1.9172890092540606, + "learning_rate": 
1.169027219086739e-06, + "loss": 0.7319454669952392, + "step": 4900 + }, + { + "epoch": 0.7883750802825947, + "grad_norm": 1.6495086584014267, + "learning_rate": 1.1521948294221603e-06, + "loss": 0.7280515193939209, + "step": 4910 + }, + { + "epoch": 0.789980732177264, + "grad_norm": 1.8581360269695162, + "learning_rate": 1.1354687071437197e-06, + "loss": 0.6312118530273437, + "step": 4920 + }, + { + "epoch": 0.7915863840719332, + "grad_norm": 1.7869920126685614, + "learning_rate": 1.1188493141878248e-06, + "loss": 0.6938999176025391, + "step": 4930 + }, + { + "epoch": 0.7931920359666025, + "grad_norm": 1.6368096604874645, + "learning_rate": 1.1023371095432656e-06, + "loss": 0.7159996032714844, + "step": 4940 + }, + { + "epoch": 0.7947976878612717, + "grad_norm": 1.8355082872813364, + "learning_rate": 1.085932549238547e-06, + "loss": 0.7142012596130372, + "step": 4950 + }, + { + "epoch": 0.7964033397559409, + "grad_norm": 1.7194268581182632, + "learning_rate": 1.0696360863292842e-06, + "loss": 0.7176544189453125, + "step": 4960 + }, + { + "epoch": 0.7980089916506101, + "grad_norm": 2.2633388706554216, + "learning_rate": 1.053448170885697e-06, + "loss": 0.6689523220062256, + "step": 4970 + }, + { + "epoch": 0.7996146435452793, + "grad_norm": 1.877683615965777, + "learning_rate": 1.0373692499801763e-06, + "loss": 0.7301874160766602, + "step": 4980 + }, + { + "epoch": 0.8012202954399487, + "grad_norm": 1.5723248207516158, + "learning_rate": 1.021399767674941e-06, + "loss": 0.6963142395019531, + "step": 4990 + }, + { + "epoch": 0.8028259473346179, + "grad_norm": 2.210852669764262, + "learning_rate": 1.0055401650097685e-06, + "loss": 0.7632877349853515, + "step": 5000 + }, + { + "epoch": 0.8028259473346179, + "eval_loss": 0.6379601955413818, + "eval_runtime": 100.435, + "eval_samples_per_second": 20.142, + "eval_steps_per_second": 5.038, + "eval_token_acc": 0.7878169522859274, + "step": 5000 + }, + { + "epoch": 0.8044315992292871, + "grad_norm": 1.6585137655893314, + 
"learning_rate": 9.89790879989821e-07, + "loss": 0.7174354553222656, + "step": 5010 + }, + { + "epoch": 0.8060372511239563, + "grad_norm": 1.794298397140435, + "learning_rate": 9.741523475735414e-07, + "loss": 0.7475096702575683, + "step": 5020 + }, + { + "epoch": 0.8076429030186255, + "grad_norm": 1.7840894537770773, + "learning_rate": 9.586249996606473e-07, + "loss": 0.6916313648223877, + "step": 5030 + }, + { + "epoch": 0.8092485549132948, + "grad_norm": 1.6625161110070892, + "learning_rate": 9.432092650801994e-07, + "loss": 0.7148942470550537, + "step": 5040 + }, + { + "epoch": 0.810854206807964, + "grad_norm": 1.8916186866947355, + "learning_rate": 9.279055695787582e-07, + "loss": 0.6605861186981201, + "step": 5050 + }, + { + "epoch": 0.8124598587026333, + "grad_norm": 1.7413837106014958, + "learning_rate": 9.127143358086277e-07, + "loss": 0.6902444362640381, + "step": 5060 + }, + { + "epoch": 0.8140655105973025, + "grad_norm": 1.5421682897627211, + "learning_rate": 8.976359833161796e-07, + "loss": 0.6847622871398926, + "step": 5070 + }, + { + "epoch": 0.8156711624919717, + "grad_norm": 1.6388970285941844, + "learning_rate": 8.826709285302737e-07, + "loss": 0.7044576644897461, + "step": 5080 + }, + { + "epoch": 0.817276814386641, + "grad_norm": 2.1860191201925963, + "learning_rate": 8.678195847507464e-07, + "loss": 0.7263412475585938, + "step": 5090 + }, + { + "epoch": 0.8188824662813102, + "grad_norm": 1.840780268433954, + "learning_rate": 8.530823621370043e-07, + "loss": 0.6977847576141357, + "step": 5100 + }, + { + "epoch": 0.8204881181759794, + "grad_norm": 1.681597730618285, + "learning_rate": 8.384596676966938e-07, + "loss": 0.6181282997131348, + "step": 5110 + }, + { + "epoch": 0.8220937700706487, + "grad_norm": 1.8111683293074319, + "learning_rate": 8.239519052744605e-07, + "loss": 0.6957583427429199, + "step": 5120 + }, + { + "epoch": 0.8236994219653179, + "grad_norm": 2.018101916346725, + "learning_rate": 8.095594755407971e-07, + "loss": 
0.7490185260772705, + "step": 5130 + }, + { + "epoch": 0.8253050738599872, + "grad_norm": 1.512552912962594, + "learning_rate": 7.952827759809756e-07, + "loss": 0.6976777076721191, + "step": 5140 + }, + { + "epoch": 0.8269107257546564, + "grad_norm": 1.8005082781446453, + "learning_rate": 7.811222008840719e-07, + "loss": 0.7236599922180176, + "step": 5150 + }, + { + "epoch": 0.8285163776493256, + "grad_norm": 1.9947903621763263, + "learning_rate": 7.670781413320766e-07, + "loss": 0.6453072547912597, + "step": 5160 + }, + { + "epoch": 0.8301220295439948, + "grad_norm": 1.749286146781592, + "learning_rate": 7.531509851890911e-07, + "loss": 0.6826435089111328, + "step": 5170 + }, + { + "epoch": 0.831727681438664, + "grad_norm": 1.8798572056617036, + "learning_rate": 7.393411170906201e-07, + "loss": 0.6848629474639892, + "step": 5180 + }, + { + "epoch": 0.8333333333333334, + "grad_norm": 1.6109675391338243, + "learning_rate": 7.256489184329452e-07, + "loss": 0.6714759349822998, + "step": 5190 + }, + { + "epoch": 0.8349389852280026, + "grad_norm": 1.6184688330019217, + "learning_rate": 7.120747673625916e-07, + "loss": 0.6852529525756836, + "step": 5200 + }, + { + "epoch": 0.8349389852280026, + "eval_loss": 0.6380971074104309, + "eval_runtime": 100.8172, + "eval_samples_per_second": 20.066, + "eval_steps_per_second": 5.019, + "eval_token_acc": 0.7881940331130358, + "step": 5200 + }, + { + "epoch": 0.8365446371226718, + "grad_norm": 1.7911508916416556, + "learning_rate": 6.986190387658909e-07, + "loss": 0.7391942024230957, + "step": 5210 + }, + { + "epoch": 0.838150289017341, + "grad_norm": 1.7945846037138906, + "learning_rate": 6.852821042586183e-07, + "loss": 0.7381104469299317, + "step": 5220 + }, + { + "epoch": 0.8397559409120102, + "grad_norm": 1.7286241666325148, + "learning_rate": 6.720643321757348e-07, + "loss": 0.7324903011322021, + "step": 5230 + }, + { + "epoch": 0.8413615928066795, + "grad_norm": 1.8638958372473948, + "learning_rate": 6.589660875612147e-07, + 
"loss": 0.720820140838623, + "step": 5240 + }, + { + "epoch": 0.8429672447013488, + "grad_norm": 2.2946259368571535, + "learning_rate": 6.459877321579628e-07, + "loss": 0.7330772876739502, + "step": 5250 + }, + { + "epoch": 0.844572896596018, + "grad_norm": 2.024802800575958, + "learning_rate": 6.33129624397823e-07, + "loss": 0.7107767581939697, + "step": 5260 + }, + { + "epoch": 0.8461785484906872, + "grad_norm": 1.700416273773183, + "learning_rate": 6.203921193916812e-07, + "loss": 0.637398099899292, + "step": 5270 + }, + { + "epoch": 0.8477842003853564, + "grad_norm": 2.0030457735835228, + "learning_rate": 6.077755689196574e-07, + "loss": 0.714143180847168, + "step": 5280 + }, + { + "epoch": 0.8493898522800257, + "grad_norm": 1.9186452668781833, + "learning_rate": 5.952803214213887e-07, + "loss": 0.6865103721618653, + "step": 5290 + }, + { + "epoch": 0.8509955041746949, + "grad_norm": 1.77904181814012, + "learning_rate": 5.829067219864099e-07, + "loss": 0.7384478092193604, + "step": 5300 + }, + { + "epoch": 0.8526011560693642, + "grad_norm": 1.5991572147517608, + "learning_rate": 5.706551123446175e-07, + "loss": 0.6412908554077148, + "step": 5310 + }, + { + "epoch": 0.8542068079640334, + "grad_norm": 1.8024184059984696, + "learning_rate": 5.585258308568381e-07, + "loss": 0.6989157676696778, + "step": 5320 + }, + { + "epoch": 0.8558124598587026, + "grad_norm": 2.0451623196060917, + "learning_rate": 5.465192125054769e-07, + "loss": 0.6701858043670654, + "step": 5330 + }, + { + "epoch": 0.8574181117533719, + "grad_norm": 1.8934593514894456, + "learning_rate": 5.346355888852767e-07, + "loss": 0.6798998832702636, + "step": 5340 + }, + { + "epoch": 0.8590237636480411, + "grad_norm": 1.9778835857420947, + "learning_rate": 5.22875288194144e-07, + "loss": 0.657193374633789, + "step": 5350 + }, + { + "epoch": 0.8606294155427103, + "grad_norm": 1.7955267532960553, + "learning_rate": 5.112386352241017e-07, + "loss": 0.6918414115905762, + "step": 5360 + }, + { + "epoch": 
0.8622350674373795, + "grad_norm": 1.6988172442673055, + "learning_rate": 4.997259513523079e-07, + "loss": 0.7405441284179688, + "step": 5370 + }, + { + "epoch": 0.8638407193320488, + "grad_norm": 1.9188503717325887, + "learning_rate": 4.883375545321845e-07, + "loss": 0.6795600891113281, + "step": 5380 + }, + { + "epoch": 0.8654463712267181, + "grad_norm": 1.5981461088093833, + "learning_rate": 4.770737592846375e-07, + "loss": 0.6697923183441162, + "step": 5390 + }, + { + "epoch": 0.8670520231213873, + "grad_norm": 1.8377885982974524, + "learning_rate": 4.6593487668936565e-07, + "loss": 0.7350438117980957, + "step": 5400 + }, + { + "epoch": 0.8670520231213873, + "eval_loss": 0.6342610716819763, + "eval_runtime": 100.1583, + "eval_samples_per_second": 20.198, + "eval_steps_per_second": 5.052, + "eval_token_acc": 0.7883325185495402, + "step": 5400 + }, + { + "epoch": 0.8686576750160565, + "grad_norm": 1.6968512863196625, + "learning_rate": 4.5492121437627433e-07, + "loss": 0.6635457038879394, + "step": 5410 + }, + { + "epoch": 0.8702633269107257, + "grad_norm": 2.0071517773652663, + "learning_rate": 4.440330765169765e-07, + "loss": 0.6638803958892823, + "step": 5420 + }, + { + "epoch": 0.871868978805395, + "grad_norm": 1.731964996230591, + "learning_rate": 4.3327076381639357e-07, + "loss": 0.7365228176116944, + "step": 5430 + }, + { + "epoch": 0.8734746307000643, + "grad_norm": 2.0180621500310316, + "learning_rate": 4.226345735044485e-07, + "loss": 0.7146649837493897, + "step": 5440 + }, + { + "epoch": 0.8750802825947335, + "grad_norm": 1.852487136991446, + "learning_rate": 4.121247993278621e-07, + "loss": 0.7220353603363037, + "step": 5450 + }, + { + "epoch": 0.8766859344894027, + "grad_norm": 1.7475004626282986, + "learning_rate": 4.0174173154203356e-07, + "loss": 0.7063849449157715, + "step": 5460 + }, + { + "epoch": 0.8782915863840719, + "grad_norm": 1.722362531304578, + "learning_rate": 3.9148565690302896e-07, + "loss": 0.6868960857391357, + "step": 5470 + }, + 
{ + "epoch": 0.8798972382787412, + "grad_norm": 1.7102032863217258, + "learning_rate": 3.813568586596611e-07, + "loss": 0.6587272644042969, + "step": 5480 + }, + { + "epoch": 0.8815028901734104, + "grad_norm": 1.4768286346198138, + "learning_rate": 3.7135561654566497e-07, + "loss": 0.7307673454284668, + "step": 5490 + }, + { + "epoch": 0.8831085420680796, + "grad_norm": 1.7825391189854223, + "learning_rate": 3.6148220677197364e-07, + "loss": 0.6861472129821777, + "step": 5500 + }, + { + "epoch": 0.8847141939627489, + "grad_norm": 1.5463273855512805, + "learning_rate": 3.5173690201909084e-07, + "loss": 0.6802690029144287, + "step": 5510 + }, + { + "epoch": 0.8863198458574181, + "grad_norm": 1.8670581209467214, + "learning_rate": 3.4211997142955756e-07, + "loss": 0.7045941352844238, + "step": 5520 + }, + { + "epoch": 0.8879254977520874, + "grad_norm": 2.0779947935631022, + "learning_rate": 3.326316806005209e-07, + "loss": 0.6813518524169921, + "step": 5530 + }, + { + "epoch": 0.8895311496467566, + "grad_norm": 1.95188637541602, + "learning_rate": 3.2327229157639915e-07, + "loss": 0.671013069152832, + "step": 5540 + }, + { + "epoch": 0.8911368015414258, + "grad_norm": 1.7012921225510371, + "learning_rate": 3.1404206284164295e-07, + "loss": 0.695673131942749, + "step": 5550 + }, + { + "epoch": 0.892742453436095, + "grad_norm": 2.0319087769759774, + "learning_rate": 3.0494124931359834e-07, + "loss": 0.673918628692627, + "step": 5560 + }, + { + "epoch": 0.8943481053307643, + "grad_norm": 1.7787149579377735, + "learning_rate": 2.959701023354644e-07, + "loss": 0.7056601524353028, + "step": 5570 + }, + { + "epoch": 0.8959537572254336, + "grad_norm": 1.6803005634091956, + "learning_rate": 2.871288696693564e-07, + "loss": 0.711387300491333, + "step": 5580 + }, + { + "epoch": 0.8975594091201028, + "grad_norm": 2.038872846650325, + "learning_rate": 2.7841779548945626e-07, + "loss": 0.643738079071045, + "step": 5590 + }, + { + "epoch": 0.899165061014772, + "grad_norm": 
1.7323673485398343, + "learning_rate": 2.698371203752753e-07, + "loss": 0.6891765594482422, + "step": 5600 + }, + { + "epoch": 0.899165061014772, + "eval_loss": 0.6369297504425049, + "eval_runtime": 100.171, + "eval_samples_per_second": 20.195, + "eval_steps_per_second": 5.051, + "eval_token_acc": 0.7884993684730396, + "step": 5600 + }, + { + "epoch": 0.9007707129094412, + "grad_norm": 1.7595790189861171, + "learning_rate": 2.613870813050051e-07, + "loss": 0.6577945232391358, + "step": 5610 + }, + { + "epoch": 0.9023763648041104, + "grad_norm": 1.9701548509943287, + "learning_rate": 2.53067911648977e-07, + "loss": 0.7041536331176758, + "step": 5620 + }, + { + "epoch": 0.9039820166987798, + "grad_norm": 1.8393776355225353, + "learning_rate": 2.4487984116321474e-07, + "loss": 0.7007421493530274, + "step": 5630 + }, + { + "epoch": 0.905587668593449, + "grad_norm": 1.8071384165730564, + "learning_rate": 2.368230959830875e-07, + "loss": 0.6663236618041992, + "step": 5640 + }, + { + "epoch": 0.9071933204881182, + "grad_norm": 1.7242592136590722, + "learning_rate": 2.2889789861706868e-07, + "loss": 0.6813517570495605, + "step": 5650 + }, + { + "epoch": 0.9087989723827874, + "grad_norm": 3.112061318957083, + "learning_rate": 2.211044679405877e-07, + "loss": 0.663045072555542, + "step": 5660 + }, + { + "epoch": 0.9104046242774566, + "grad_norm": 1.615986400574688, + "learning_rate": 2.1344301918998555e-07, + "loss": 0.6766707420349121, + "step": 5670 + }, + { + "epoch": 0.9120102761721259, + "grad_norm": 1.7528874633834992, + "learning_rate": 2.059137639565717e-07, + "loss": 0.6877583503723145, + "step": 5680 + }, + { + "epoch": 0.9136159280667951, + "grad_norm": 1.7796225000122403, + "learning_rate": 1.9851691018077824e-07, + "loss": 0.6933162689208985, + "step": 5690 + }, + { + "epoch": 0.9152215799614644, + "grad_norm": 1.7019095020573654, + "learning_rate": 1.9125266214642e-07, + "loss": 0.7472042083740235, + "step": 5700 + }, + { + "epoch": 0.9168272318561336, + 
"grad_norm": 1.9587898193737927, + "learning_rate": 1.8412122047505032e-07, + "loss": 0.6880541801452636, + "step": 5710 + }, + { + "epoch": 0.9184328837508028, + "grad_norm": 1.6775853695124985, + "learning_rate": 1.7712278212042134e-07, + "loss": 0.7018757820129394, + "step": 5720 + }, + { + "epoch": 0.9200385356454721, + "grad_norm": 1.7798533051734553, + "learning_rate": 1.7025754036304466e-07, + "loss": 0.691845703125, + "step": 5730 + }, + { + "epoch": 0.9216441875401413, + "grad_norm": 1.7420651980828974, + "learning_rate": 1.6352568480485277e-07, + "loss": 0.6812302112579346, + "step": 5740 + }, + { + "epoch": 0.9232498394348105, + "grad_norm": 1.9239749814632983, + "learning_rate": 1.5692740136396324e-07, + "loss": 0.6581586837768555, + "step": 5750 + }, + { + "epoch": 0.9248554913294798, + "grad_norm": 1.9082994117549892, + "learning_rate": 1.5046287226954394e-07, + "loss": 0.731188154220581, + "step": 5760 + }, + { + "epoch": 0.926461143224149, + "grad_norm": 1.7926613214649838, + "learning_rate": 1.4413227605677983e-07, + "loss": 0.6892642021179199, + "step": 5770 + }, + { + "epoch": 0.9280667951188183, + "grad_norm": 1.771249909731025, + "learning_rate": 1.379357875619436e-07, + "loss": 0.6990791320800781, + "step": 5780 + }, + { + "epoch": 0.9296724470134875, + "grad_norm": 1.8356150776446425, + "learning_rate": 1.3187357791756504e-07, + "loss": 0.6910379886627197, + "step": 5790 + }, + { + "epoch": 0.9312780989081567, + "grad_norm": 1.705632884206152, + "learning_rate": 1.2594581454770772e-07, + "loss": 0.6509772300720215, + "step": 5800 + }, + { + "epoch": 0.9312780989081567, + "eval_loss": 0.631990373134613, + "eval_runtime": 101.722, + "eval_samples_per_second": 19.888, + "eval_steps_per_second": 4.974, + "eval_token_acc": 0.7884676669875746, + "step": 5800 + }, + { + "epoch": 0.9328837508028259, + "grad_norm": 1.7844388380004357, + "learning_rate": 1.2015266116334135e-07, + "loss": 0.668633222579956, + "step": 5810 + }, + { + "epoch": 
0.9344894026974951, + "grad_norm": 1.733170437170936, + "learning_rate": 1.1449427775782396e-07, + "loss": 0.655826187133789, + "step": 5820 + }, + { + "epoch": 0.9360950545921645, + "grad_norm": 2.358368796318552, + "learning_rate": 1.0897082060247976e-07, + "loss": 0.7064967155456543, + "step": 5830 + }, + { + "epoch": 0.9377007064868337, + "grad_norm": 1.729089971309778, + "learning_rate": 1.0358244224228764e-07, + "loss": 0.6829689979553223, + "step": 5840 + }, + { + "epoch": 0.9393063583815029, + "grad_norm": 1.6455732105763972, + "learning_rate": 9.832929149166503e-08, + "loss": 0.6565438747406006, + "step": 5850 + }, + { + "epoch": 0.9409120102761721, + "grad_norm": 1.760683614917597, + "learning_rate": 9.32115134303574e-08, + "loss": 0.6851202487945557, + "step": 5860 + }, + { + "epoch": 0.9425176621708413, + "grad_norm": 1.7874937383563934, + "learning_rate": 8.822924939943523e-08, + "loss": 0.7376208782196045, + "step": 5870 + }, + { + "epoch": 0.9441233140655106, + "grad_norm": 1.6510863458242004, + "learning_rate": 8.338263699738668e-08, + "loss": 0.6026273250579834, + "step": 5880 + }, + { + "epoch": 0.9457289659601799, + "grad_norm": 1.9834215383922407, + "learning_rate": 7.867181007631897e-08, + "loss": 0.7061845779418945, + "step": 5890 + }, + { + "epoch": 0.9473346178548491, + "grad_norm": 1.9131535301068874, + "learning_rate": 7.409689873826232e-08, + "loss": 0.670206880569458, + "step": 5900 + }, + { + "epoch": 0.9489402697495183, + "grad_norm": 2.0738608371934237, + "learning_rate": 6.965802933157573e-08, + "loss": 0.708193302154541, + "step": 5910 + }, + { + "epoch": 0.9505459216441875, + "grad_norm": 1.8177322993411624, + "learning_rate": 6.535532444745862e-08, + "loss": 0.7078551292419434, + "step": 5920 + }, + { + "epoch": 0.9521515735388568, + "grad_norm": 2.082436198696705, + "learning_rate": 6.118890291656355e-08, + "loss": 0.7242929458618164, + "step": 5930 + }, + { + "epoch": 0.953757225433526, + "grad_norm": 1.849544928484393, + 
"learning_rate": 5.7158879805716e-08, + "loss": 0.6365905284881592, + "step": 5940 + }, + { + "epoch": 0.9553628773281952, + "grad_norm": 1.9831271095863907, + "learning_rate": 5.32653664147359e-08, + "loss": 0.6500433444976806, + "step": 5950 + }, + { + "epoch": 0.9569685292228645, + "grad_norm": 1.823333486059779, + "learning_rate": 4.950847027336336e-08, + "loss": 0.6756103038787842, + "step": 5960 + }, + { + "epoch": 0.9585741811175337, + "grad_norm": 1.8028664132327312, + "learning_rate": 4.588829513828996e-08, + "loss": 0.7307151794433594, + "step": 5970 + }, + { + "epoch": 0.960179833012203, + "grad_norm": 1.656799725101945, + "learning_rate": 4.2404940990292135e-08, + "loss": 0.6212584495544433, + "step": 5980 + }, + { + "epoch": 0.9617854849068722, + "grad_norm": 1.9150059741320609, + "learning_rate": 3.90585040314706e-08, + "loss": 0.6227765560150147, + "step": 5990 + }, + { + "epoch": 0.9633911368015414, + "grad_norm": 1.880228879847369, + "learning_rate": 3.584907668259308e-08, + "loss": 0.7474319934844971, + "step": 6000 + }, + { + "epoch": 0.9633911368015414, + "eval_loss": 0.6360629200935364, + "eval_runtime": 100.1142, + "eval_samples_per_second": 20.207, + "eval_steps_per_second": 5.054, + "eval_token_acc": 0.7885460864516194, + "step": 6000 + } + ], + "logging_steps": 10, + "max_steps": 6228, + "num_input_tokens_seen": 0, + "num_train_epochs": 1, + "save_steps": 200, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 185011340083200.0, + "train_batch_size": 4, + "trial_name": null, + "trial_params": null +} diff --git a/training_args.bin b/training_args.bin new file mode 100644 index 0000000..707fb79 --- /dev/null +++ b/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:ee0c1d8e961c2ab4483ff0e3d8e1be0df562d9d975ba10fd1335bea196addbec +size 9425 diff --git a/vocab.json b/vocab.json new file mode 100644 index 0000000..6c49fc6 --- /dev/null +++ b/vocab.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ca10d7e9fb3ed18575dd1e277a2579c16d108e32f27439684afa0e10b1440910 +size 2776833