commit a304e4639d7ba7a93f7c647b832c2880540a3ead Author: ModelHub XC Date: Fri May 22 20:30:13 2026 +0800 初始化项目,由ModelHub XC社区提供模型 Model: mlfoundations-cua-dev/qwen2_5vl_3b_sft_unified_idm_data Source: Original Platform diff --git a/.gitattributes b/.gitattributes new file mode 100644 index 0000000..01dbbdc --- /dev/null +++ b/.gitattributes @@ -0,0 +1,55 @@ +*.7z filter=lfs diff=lfs merge=lfs -text +*.arrow filter=lfs diff=lfs merge=lfs -text + + +*.bz2 filter=lfs diff=lfs merge=lfs -text +*.ftz filter=lfs diff=lfs merge=lfs -text +*.gz filter=lfs diff=lfs merge=lfs -text +*.h5 filter=lfs diff=lfs merge=lfs -text +*.joblib filter=lfs diff=lfs merge=lfs -text +*.lfs.* filter=lfs diff=lfs merge=lfs -text +*.model filter=lfs diff=lfs merge=lfs -text +*.msgpack filter=lfs diff=lfs merge=lfs -text +*.onnx filter=lfs diff=lfs merge=lfs -text +*.ot filter=lfs diff=lfs merge=lfs -text +*.parquet filter=lfs diff=lfs merge=lfs -text +*.pb filter=lfs diff=lfs merge=lfs -text +*.pt filter=lfs diff=lfs merge=lfs -text +*.pth filter=lfs diff=lfs merge=lfs -text +*.rar filter=lfs diff=lfs merge=lfs -text +saved_model/**/* filter=lfs diff=lfs merge=lfs -text +*.tar.* filter=lfs diff=lfs merge=lfs -text +*.tflite filter=lfs diff=lfs merge=lfs -text +*.tgz filter=lfs diff=lfs merge=lfs -text +*.xz filter=lfs diff=lfs merge=lfs -text +*.zip filter=lfs diff=lfs merge=lfs -text +*.zstandard filter=lfs diff=lfs merge=lfs -text +*.tfevents* filter=lfs diff=lfs merge=lfs -text +*.db* filter=lfs diff=lfs merge=lfs -text +*.ark* filter=lfs diff=lfs merge=lfs -text +**/*ckpt*data* filter=lfs diff=lfs merge=lfs -text +**/*ckpt*.meta filter=lfs diff=lfs merge=lfs -text +**/*ckpt*.index filter=lfs diff=lfs merge=lfs -text + +*.ckpt filter=lfs diff=lfs merge=lfs -text +*.gguf* filter=lfs diff=lfs merge=lfs -text +*.ggml filter=lfs diff=lfs merge=lfs -text +*.llamafile* filter=lfs diff=lfs merge=lfs -text +*.pt2 filter=lfs diff=lfs merge=lfs -text +*.mlmodel filter=lfs diff=lfs merge=lfs -text +*.npy filter=lfs diff=lfs merge=lfs -text +*.npz filter=lfs diff=lfs merge=lfs -text +*.pickle filter=lfs diff=lfs merge=lfs -text +*.pkl filter=lfs diff=lfs merge=lfs -text +*.tar filter=lfs diff=lfs merge=lfs -text +*.wasm filter=lfs diff=lfs merge=lfs -text +*.zst filter=lfs diff=lfs merge=lfs -text +*tfevents* filter=lfs diff=lfs merge=lfs -text + +model-00003-of-00003.safetensors filter=lfs diff=lfs merge=lfs -text +merges.txt filter=lfs diff=lfs merge=lfs -text +vocab.json filter=lfs diff=lfs merge=lfs -text +training_args.bin filter=lfs diff=lfs merge=lfs -text +tokenizer.json filter=lfs diff=lfs merge=lfs -text +model-00001-of-00003.safetensors filter=lfs diff=lfs merge=lfs -text +model-00002-of-00003.safetensors filter=lfs diff=lfs merge=lfs -text \ No newline at end of file diff --git a/README.md b/README.md new file mode 100644 index 0000000..f051e7d --- /dev/null +++ b/README.md @@ -0,0 +1,60 @@ +--- +library_name: transformers +license: other +base_model: Qwen/Qwen2.5-VL-3B-Instruct +tags: +- llama-factory +- full +- generated_from_trainer +model-index: +- name: qwen2_5vl_3b_sft_unified_idm_data + results: [] +--- + + + +# qwen2_5vl_3b_sft_unified_idm_data + +This model is a fine-tuned version of [Qwen/Qwen2.5-VL-3B-Instruct](https://huggingface.co/Qwen/Qwen2.5-VL-3B-Instruct) on the unified_idm_data dataset. + +## Model description + +More information needed + +## Intended uses & limitations + +More information needed + +## Training and evaluation data + +More information needed + +## Training procedure + +### Training hyperparameters + +The following hyperparameters were used during training: +- learning_rate: 1e-05 +- train_batch_size: 4 +- eval_batch_size: 8 +- seed: 42 +- distributed_type: multi-GPU +- num_devices: 8 +- total_train_batch_size: 32 +- total_eval_batch_size: 64 +- optimizer: Use adamw_torch with betas=(0.9,0.999) and epsilon=1e-08 and optimizer_args=No additional optimizer arguments +- lr_scheduler_type: cosine +- lr_scheduler_warmup_ratio: 0.1 +- num_epochs: 1.0 + +### Training results + + + +### Framework versions + +- Transformers 4.52.4 +- Pytorch 2.5.1+cu121 +- Datasets 3.6.0 +- Tokenizers 0.21.1 diff --git a/added_tokens.json b/added_tokens.json new file mode 100644 index 0000000..482ced4 --- /dev/null +++ b/added_tokens.json @@ -0,0 +1,24 @@ +{ + "": 151658, + "": 151657, + "<|box_end|>": 151649, + "<|box_start|>": 151648, + "<|endoftext|>": 151643, + "<|file_sep|>": 151664, + "<|fim_middle|>": 151660, + "<|fim_pad|>": 151662, + "<|fim_prefix|>": 151659, + "<|fim_suffix|>": 151661, + "<|im_end|>": 151645, + "<|im_start|>": 151644, + "<|image_pad|>": 151655, + "<|object_ref_end|>": 151647, + "<|object_ref_start|>": 151646, + "<|quad_end|>": 151651, + "<|quad_start|>": 151650, + "<|repo_name|>": 151663, + "<|video_pad|>": 151656, + "<|vision_end|>": 151653, + "<|vision_pad|>": 151654, + "<|vision_start|>": 151652 +} diff --git a/all_results.json b/all_results.json new file mode 100644 index 0000000..87976ac --- /dev/null +++ b/all_results.json @@ -0,0 +1,8 @@ +{ + "epoch": 1.0, + "total_flos": 1.9483786471612088e+18, + "train_loss": 0.10519510776023655, + "train_runtime": 10705.8335, + "train_samples_per_second": 3.25, + "train_steps_per_second": 0.102 +} \ No newline at end of file diff --git a/chat_template.jinja b/chat_template.jinja new file mode 100644 index 0000000..6c22663 --- /dev/null +++ b/chat_template.jinja @@ -0,0 +1,7 @@ +{% set image_count = namespace(value=0) %}{% set video_count = namespace(value=0) %}{% for message in messages %}{% if loop.first and message['role'] != 'system' %}<|im_start|>system +You are a helpful assistant.<|im_end|> +{% endif %}<|im_start|>{{ message['role'] }} +{% if message['content'] is string %}{{ message['content'] }}<|im_end|> +{% else %}{% for content in message['content'] %}{% if content['type'] == 'image' or 'image' in content or 'image_url' in content %}{% set image_count.value = image_count.value + 1 %}{% if add_vision_id %}Picture {{ image_count.value }}: {% endif %}<|vision_start|><|image_pad|><|vision_end|>{% elif content['type'] == 'video' or 'video' in content %}{% set video_count.value = video_count.value + 1 %}{% if add_vision_id %}Video {{ video_count.value }}: {% endif %}<|vision_start|><|video_pad|><|vision_end|>{% elif 'text' in content %}{{ content['text'] }}{% endif %}{% endfor %}<|im_end|> +{% endif %}{% endfor %}{% if add_generation_prompt %}<|im_start|>assistant +{% endif %} \ No newline at end of file diff --git a/config.json b/config.json new file mode 100644 index 0000000..d68b631 --- /dev/null +++ b/config.json @@ -0,0 +1,105 @@ +{ + "architectures": [ + "Qwen2_5_VLForConditionalGeneration" + ], + "attention_dropout": 0.0, + "bos_token_id": 151643, + "eos_token_id": 151645, + "hidden_act": "silu", + "hidden_size": 2048, + "image_token_id": 151655, + "initializer_range": 0.02, + "intermediate_size": 11008, + "max_position_embeddings": 128000, + "max_window_layers": 70, + "model_type": "qwen2_5_vl", + "num_attention_heads": 16, + "num_hidden_layers": 36, + "num_key_value_heads": 2, + "rms_norm_eps": 1e-06, + "rope_scaling": { + "mrope_section": [ + 16, + 24, + 24 + ], + "rope_type": "default", + "type": "default" + }, + "rope_theta": 1000000.0, + "sliding_window": 32768, + "text_config": { + "architectures": [ + "Qwen2_5_VLForConditionalGeneration" + ], + "attention_dropout": 0.0, + "bos_token_id": 151643, + "eos_token_id": 151645, + "hidden_act": "silu", + "hidden_size": 2048, + "image_token_id": null, + "initializer_range": 0.02, + "intermediate_size": 11008, + "max_position_embeddings": 128000, + "max_window_layers": 70, + "model_type": "qwen2_5_vl_text", + "num_attention_heads": 16, + "num_hidden_layers": 36, + "num_key_value_heads": 2, + "rms_norm_eps": 1e-06, + "rope_scaling": { + "mrope_section": [ + 16, + 24, + 24 + ], + "rope_type": "default", + "type": "default" + }, + "rope_theta": 1000000.0, + "sliding_window": 32768, + "tie_word_embeddings": true, + "torch_dtype": "bfloat16", + "use_cache": false, + "use_sliding_window": false, + "video_token_id": null, + "vision_end_token_id": 151653, + "vision_start_token_id": 151652, + "vision_token_id": 151654, + "vocab_size": 151936 + }, + "torch_dtype": "bfloat16", + "transformers_version": "4.52.4", + "use_cache": false, + "use_sliding_window": false, + "video_token_id": 151656, + "vision_config": { + "depth": 32, + "fullatt_block_indexes": [ + 7, + 15, + 23, + 31 + ], + "hidden_act": "silu", + "hidden_size": 1280, + "in_channels": 3, + "in_chans": 3, + "initializer_range": 0.02, + "intermediate_size": 3420, + "model_type": "qwen2_5_vl", + "num_heads": 16, + "out_hidden_size": 2048, + "patch_size": 14, + "spatial_merge_size": 2, + "spatial_patch_size": 14, + "temporal_patch_size": 2, + "tokens_per_second": 2, + "torch_dtype": "bfloat16", + "window_size": 112 + }, + "vision_end_token_id": 151653, + "vision_start_token_id": 151652, + "vision_token_id": 151654, + "vocab_size": 151936 +} diff --git a/configuration.json b/configuration.json new file mode 100644 index 0000000..273dbe0 --- /dev/null +++ b/configuration.json @@ -0,0 +1 @@ +{"framework": "pytorch", "task": "image-to-text", "allow_remote": true} \ No newline at end of file diff --git a/generation_config.json b/generation_config.json new file mode 100644 index 0000000..c110271 --- /dev/null +++ b/generation_config.json @@ -0,0 +1,12 @@ +{ + "bos_token_id": 151643, + "do_sample": true, + "eos_token_id": [ + 151645, + 151643 + ], + "pad_token_id": 151643, + "repetition_penalty": 1.05, + "temperature": 1e-06, + "transformers_version": "4.52.4" +} diff --git a/merges.txt b/merges.txt new file mode 100644 index 0000000..80c1a19 --- /dev/null +++ b/merges.txt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8831e4f1a044471340f7c0a83d7bd71306a5b867e95fd870f74d0c5308a904d5 +size 1671853 diff --git a/model-00001-of-00003.safetensors b/model-00001-of-00003.safetensors new file mode 100644 index 0000000..7844ff5 --- /dev/null +++ b/model-00001-of-00003.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:29c93956f0ce5e0e76e85ef4d0cab2affd2e4f692a2113eddc5e2b45187ac866 +size 4958351144 diff --git a/model-00002-of-00003.safetensors b/model-00002-of-00003.safetensors new file mode 100644 index 0000000..610620a --- /dev/null +++ b/model-00002-of-00003.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:762ff8824f954f7cc0884a3e1d0ae361b0b3edc427a30488a0ff4cd807fc5371 +size 4932949296 diff --git a/model-00003-of-00003.safetensors b/model-00003-of-00003.safetensors new file mode 100644 index 0000000..13107da --- /dev/null +++ b/model-00003-of-00003.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:db99657557b789ec2586688ffaaafa4aea784a95c89c2b531aa9b9183be87122 +size 3789914528 diff --git a/model.safetensors.index.json b/model.safetensors.index.json new file mode 100644 index 0000000..007c03a --- /dev/null +++ b/model.safetensors.index.json @@ -0,0 +1,831 @@ +{ + "metadata": { + "total_size": 13681123328 + }, + "weight_map": { + "model.embed_tokens.weight": "model-00001-of-00003.safetensors", + "model.layers.0.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.0.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.0.mlp.gate_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.0.mlp.up_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.0.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.0.self_attn.k_proj.bias": "model-00001-of-00003.safetensors", + "model.layers.0.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.0.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.0.self_attn.q_proj.bias": "model-00001-of-00003.safetensors", + "model.layers.0.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.0.self_attn.v_proj.bias": "model-00001-of-00003.safetensors", + "model.layers.0.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.1.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.1.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.1.mlp.gate_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.1.mlp.up_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.1.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.1.self_attn.k_proj.bias": "model-00001-of-00003.safetensors", + "model.layers.1.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.1.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.1.self_attn.q_proj.bias": "model-00001-of-00003.safetensors", + "model.layers.1.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.1.self_attn.v_proj.bias": "model-00001-of-00003.safetensors", + "model.layers.1.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.10.input_layernorm.weight": "model-00002-of-00003.safetensors", + "model.layers.10.mlp.down_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.10.mlp.gate_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.10.mlp.up_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.10.post_attention_layernorm.weight": "model-00002-of-00003.safetensors", + "model.layers.10.self_attn.k_proj.bias": "model-00002-of-00003.safetensors", + "model.layers.10.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.10.self_attn.o_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.10.self_attn.q_proj.bias": "model-00002-of-00003.safetensors", + "model.layers.10.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.10.self_attn.v_proj.bias": "model-00002-of-00003.safetensors", + "model.layers.10.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.11.input_layernorm.weight": "model-00002-of-00003.safetensors", + "model.layers.11.mlp.down_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.11.mlp.gate_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.11.mlp.up_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.11.post_attention_layernorm.weight": "model-00002-of-00003.safetensors", + "model.layers.11.self_attn.k_proj.bias": "model-00002-of-00003.safetensors", + "model.layers.11.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.11.self_attn.o_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.11.self_attn.q_proj.bias": "model-00002-of-00003.safetensors", + "model.layers.11.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.11.self_attn.v_proj.bias": "model-00002-of-00003.safetensors", + "model.layers.11.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.12.input_layernorm.weight": "model-00002-of-00003.safetensors", + "model.layers.12.mlp.down_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.12.mlp.gate_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.12.mlp.up_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.12.post_attention_layernorm.weight": "model-00002-of-00003.safetensors", + "model.layers.12.self_attn.k_proj.bias": "model-00002-of-00003.safetensors", + "model.layers.12.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.12.self_attn.o_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.12.self_attn.q_proj.bias": "model-00002-of-00003.safetensors", + "model.layers.12.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.12.self_attn.v_proj.bias": "model-00002-of-00003.safetensors", + "model.layers.12.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.13.input_layernorm.weight": "model-00002-of-00003.safetensors", + "model.layers.13.mlp.down_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.13.mlp.gate_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.13.mlp.up_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.13.post_attention_layernorm.weight": "model-00002-of-00003.safetensors", + "model.layers.13.self_attn.k_proj.bias": "model-00002-of-00003.safetensors", + "model.layers.13.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.13.self_attn.o_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.13.self_attn.q_proj.bias": "model-00002-of-00003.safetensors", + "model.layers.13.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.13.self_attn.v_proj.bias": "model-00002-of-00003.safetensors", + "model.layers.13.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.14.input_layernorm.weight": "model-00002-of-00003.safetensors", + "model.layers.14.mlp.down_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.14.mlp.gate_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.14.mlp.up_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.14.post_attention_layernorm.weight": "model-00002-of-00003.safetensors", + "model.layers.14.self_attn.k_proj.bias": "model-00002-of-00003.safetensors", + "model.layers.14.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.14.self_attn.o_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.14.self_attn.q_proj.bias": "model-00002-of-00003.safetensors", + "model.layers.14.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.14.self_attn.v_proj.bias": "model-00002-of-00003.safetensors", + "model.layers.14.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.15.input_layernorm.weight": "model-00002-of-00003.safetensors", + "model.layers.15.mlp.down_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.15.mlp.gate_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.15.mlp.up_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.15.post_attention_layernorm.weight": "model-00002-of-00003.safetensors", + "model.layers.15.self_attn.k_proj.bias": "model-00002-of-00003.safetensors", + "model.layers.15.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.15.self_attn.o_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.15.self_attn.q_proj.bias": "model-00002-of-00003.safetensors", + "model.layers.15.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.15.self_attn.v_proj.bias": "model-00002-of-00003.safetensors", + "model.layers.15.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.16.input_layernorm.weight": "model-00002-of-00003.safetensors", + "model.layers.16.mlp.down_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.16.mlp.gate_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.16.mlp.up_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.16.post_attention_layernorm.weight": "model-00002-of-00003.safetensors", + "model.layers.16.self_attn.k_proj.bias": "model-00002-of-00003.safetensors", + "model.layers.16.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.16.self_attn.o_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.16.self_attn.q_proj.bias": "model-00002-of-00003.safetensors", + "model.layers.16.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.16.self_attn.v_proj.bias": "model-00002-of-00003.safetensors", + "model.layers.16.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.17.input_layernorm.weight": "model-00002-of-00003.safetensors", + "model.layers.17.mlp.down_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.17.mlp.gate_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.17.mlp.up_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.17.post_attention_layernorm.weight": "model-00002-of-00003.safetensors", + "model.layers.17.self_attn.k_proj.bias": "model-00002-of-00003.safetensors", + "model.layers.17.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.17.self_attn.o_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.17.self_attn.q_proj.bias": "model-00002-of-00003.safetensors", + "model.layers.17.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.17.self_attn.v_proj.bias": "model-00002-of-00003.safetensors", + "model.layers.17.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.18.input_layernorm.weight": "model-00002-of-00003.safetensors", + "model.layers.18.mlp.down_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.18.mlp.gate_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.18.mlp.up_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.18.post_attention_layernorm.weight": "model-00002-of-00003.safetensors", + "model.layers.18.self_attn.k_proj.bias": "model-00002-of-00003.safetensors", + "model.layers.18.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.18.self_attn.o_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.18.self_attn.q_proj.bias": "model-00002-of-00003.safetensors", + "model.layers.18.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.18.self_attn.v_proj.bias": "model-00002-of-00003.safetensors", + "model.layers.18.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.19.input_layernorm.weight": "model-00002-of-00003.safetensors", + "model.layers.19.mlp.down_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.19.mlp.gate_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.19.mlp.up_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.19.post_attention_layernorm.weight": "model-00002-of-00003.safetensors", + "model.layers.19.self_attn.k_proj.bias": "model-00002-of-00003.safetensors", + "model.layers.19.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.19.self_attn.o_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.19.self_attn.q_proj.bias": "model-00002-of-00003.safetensors", + "model.layers.19.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.19.self_attn.v_proj.bias": "model-00002-of-00003.safetensors", + "model.layers.19.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.2.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.2.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.2.mlp.gate_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.2.mlp.up_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.2.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.2.self_attn.k_proj.bias": "model-00001-of-00003.safetensors", + "model.layers.2.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.2.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.2.self_attn.q_proj.bias": "model-00001-of-00003.safetensors", + "model.layers.2.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.2.self_attn.v_proj.bias": "model-00001-of-00003.safetensors", + "model.layers.2.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.20.input_layernorm.weight": "model-00002-of-00003.safetensors", + "model.layers.20.mlp.down_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.20.mlp.gate_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.20.mlp.up_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.20.post_attention_layernorm.weight": "model-00002-of-00003.safetensors", + "model.layers.20.self_attn.k_proj.bias": "model-00002-of-00003.safetensors", + "model.layers.20.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.20.self_attn.o_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.20.self_attn.q_proj.bias": "model-00002-of-00003.safetensors", + "model.layers.20.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.20.self_attn.v_proj.bias": "model-00002-of-00003.safetensors", + "model.layers.20.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.21.input_layernorm.weight": "model-00002-of-00003.safetensors", + "model.layers.21.mlp.down_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.21.mlp.gate_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.21.mlp.up_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.21.post_attention_layernorm.weight": "model-00002-of-00003.safetensors", + "model.layers.21.self_attn.k_proj.bias": "model-00002-of-00003.safetensors", + "model.layers.21.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.21.self_attn.o_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.21.self_attn.q_proj.bias": "model-00002-of-00003.safetensors", + "model.layers.21.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.21.self_attn.v_proj.bias": "model-00002-of-00003.safetensors", + "model.layers.21.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.22.input_layernorm.weight": "model-00002-of-00003.safetensors", + "model.layers.22.mlp.down_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.22.mlp.gate_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.22.mlp.up_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.22.post_attention_layernorm.weight": "model-00002-of-00003.safetensors", + "model.layers.22.self_attn.k_proj.bias": "model-00002-of-00003.safetensors", + "model.layers.22.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.22.self_attn.o_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.22.self_attn.q_proj.bias": "model-00002-of-00003.safetensors", + "model.layers.22.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.22.self_attn.v_proj.bias": "model-00002-of-00003.safetensors", + "model.layers.22.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.23.input_layernorm.weight": "model-00003-of-00003.safetensors", + "model.layers.23.mlp.down_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.23.mlp.gate_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.23.mlp.up_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.23.post_attention_layernorm.weight": "model-00003-of-00003.safetensors", + "model.layers.23.self_attn.k_proj.bias": "model-00002-of-00003.safetensors", + "model.layers.23.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.23.self_attn.o_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.23.self_attn.q_proj.bias": "model-00002-of-00003.safetensors", + "model.layers.23.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.23.self_attn.v_proj.bias": "model-00002-of-00003.safetensors", + "model.layers.23.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.24.input_layernorm.weight": "model-00003-of-00003.safetensors", + "model.layers.24.mlp.down_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.24.mlp.gate_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.24.mlp.up_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.24.post_attention_layernorm.weight": "model-00003-of-00003.safetensors", + "model.layers.24.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.layers.24.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.24.self_attn.o_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.24.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.layers.24.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.24.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.layers.24.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.25.input_layernorm.weight": "model-00003-of-00003.safetensors", + "model.layers.25.mlp.down_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.25.mlp.gate_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.25.mlp.up_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.25.post_attention_layernorm.weight": "model-00003-of-00003.safetensors", + "model.layers.25.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.layers.25.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.25.self_attn.o_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.25.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.layers.25.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.25.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.layers.25.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.26.input_layernorm.weight": "model-00003-of-00003.safetensors", + "model.layers.26.mlp.down_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.26.mlp.gate_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.26.mlp.up_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.26.post_attention_layernorm.weight": "model-00003-of-00003.safetensors", + "model.layers.26.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.layers.26.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.26.self_attn.o_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.26.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.layers.26.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.26.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.layers.26.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.27.input_layernorm.weight": "model-00003-of-00003.safetensors", + "model.layers.27.mlp.down_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.27.mlp.gate_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.27.mlp.up_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.27.post_attention_layernorm.weight": "model-00003-of-00003.safetensors", + "model.layers.27.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.layers.27.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.27.self_attn.o_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.27.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.layers.27.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.27.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.layers.27.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.28.input_layernorm.weight": "model-00003-of-00003.safetensors", + "model.layers.28.mlp.down_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.28.mlp.gate_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.28.mlp.up_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.28.post_attention_layernorm.weight": "model-00003-of-00003.safetensors", + "model.layers.28.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.layers.28.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.28.self_attn.o_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.28.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.layers.28.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.28.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.layers.28.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.29.input_layernorm.weight": "model-00003-of-00003.safetensors", + "model.layers.29.mlp.down_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.29.mlp.gate_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.29.mlp.up_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.29.post_attention_layernorm.weight": "model-00003-of-00003.safetensors", + "model.layers.29.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.layers.29.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.29.self_attn.o_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.29.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.layers.29.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.29.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.layers.29.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.3.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.3.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.3.mlp.gate_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.3.mlp.up_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.3.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.3.self_attn.k_proj.bias": "model-00001-of-00003.safetensors", + "model.layers.3.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.3.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.3.self_attn.q_proj.bias": "model-00001-of-00003.safetensors", + "model.layers.3.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.3.self_attn.v_proj.bias": "model-00001-of-00003.safetensors", + "model.layers.3.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.30.input_layernorm.weight": "model-00003-of-00003.safetensors", + "model.layers.30.mlp.down_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.30.mlp.gate_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.30.mlp.up_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.30.post_attention_layernorm.weight": "model-00003-of-00003.safetensors", + "model.layers.30.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.layers.30.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.30.self_attn.o_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.30.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.layers.30.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.30.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.layers.30.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.31.input_layernorm.weight": "model-00003-of-00003.safetensors", + "model.layers.31.mlp.down_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.31.mlp.gate_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.31.mlp.up_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.31.post_attention_layernorm.weight": "model-00003-of-00003.safetensors", + "model.layers.31.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.layers.31.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.31.self_attn.o_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.31.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.layers.31.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.31.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.layers.31.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.32.input_layernorm.weight": "model-00003-of-00003.safetensors", + "model.layers.32.mlp.down_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.32.mlp.gate_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.32.mlp.up_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.32.post_attention_layernorm.weight": "model-00003-of-00003.safetensors", + "model.layers.32.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.layers.32.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.32.self_attn.o_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.32.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.layers.32.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.32.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.layers.32.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.33.input_layernorm.weight": "model-00003-of-00003.safetensors", + "model.layers.33.mlp.down_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.33.mlp.gate_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.33.mlp.up_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.33.post_attention_layernorm.weight": "model-00003-of-00003.safetensors", + "model.layers.33.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.layers.33.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.33.self_attn.o_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.33.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.layers.33.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.33.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.layers.33.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.34.input_layernorm.weight": "model-00003-of-00003.safetensors", + "model.layers.34.mlp.down_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.34.mlp.gate_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.34.mlp.up_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.34.post_attention_layernorm.weight": "model-00003-of-00003.safetensors", + "model.layers.34.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.layers.34.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.34.self_attn.o_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.34.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.layers.34.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.34.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.layers.34.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.35.input_layernorm.weight": "model-00003-of-00003.safetensors", + "model.layers.35.mlp.down_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.35.mlp.gate_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.35.mlp.up_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.35.post_attention_layernorm.weight": "model-00003-of-00003.safetensors", + "model.layers.35.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.layers.35.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.35.self_attn.o_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.35.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.layers.35.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.35.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.layers.35.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.4.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.4.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.4.mlp.gate_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.4.mlp.up_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.4.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.4.self_attn.k_proj.bias": "model-00001-of-00003.safetensors", + "model.layers.4.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.4.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.4.self_attn.q_proj.bias": "model-00001-of-00003.safetensors", + "model.layers.4.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.4.self_attn.v_proj.bias": "model-00001-of-00003.safetensors", + "model.layers.4.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.5.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.5.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.5.mlp.gate_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.5.mlp.up_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.5.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.5.self_attn.k_proj.bias": "model-00001-of-00003.safetensors", + "model.layers.5.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.5.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.5.self_attn.q_proj.bias": "model-00001-of-00003.safetensors", + "model.layers.5.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.5.self_attn.v_proj.bias": "model-00001-of-00003.safetensors", + "model.layers.5.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.6.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.6.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.6.mlp.gate_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.6.mlp.up_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.6.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.6.self_attn.k_proj.bias": "model-00001-of-00003.safetensors", + "model.layers.6.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.6.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.6.self_attn.q_proj.bias": "model-00001-of-00003.safetensors", + "model.layers.6.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.6.self_attn.v_proj.bias": "model-00001-of-00003.safetensors", + "model.layers.6.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.7.input_layernorm.weight": "model-00002-of-00003.safetensors", + "model.layers.7.mlp.down_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.7.mlp.gate_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.7.mlp.up_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.7.post_attention_layernorm.weight": "model-00002-of-00003.safetensors", + "model.layers.7.self_attn.k_proj.bias": "model-00001-of-00003.safetensors", + "model.layers.7.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.7.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.7.self_attn.q_proj.bias": "model-00001-of-00003.safetensors", + "model.layers.7.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.7.self_attn.v_proj.bias": "model-00001-of-00003.safetensors", + "model.layers.7.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.8.input_layernorm.weight": "model-00002-of-00003.safetensors", + "model.layers.8.mlp.down_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.8.mlp.gate_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.8.mlp.up_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.8.post_attention_layernorm.weight": "model-00002-of-00003.safetensors", + "model.layers.8.self_attn.k_proj.bias": "model-00002-of-00003.safetensors", + "model.layers.8.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.8.self_attn.o_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.8.self_attn.q_proj.bias": "model-00002-of-00003.safetensors", + "model.layers.8.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.8.self_attn.v_proj.bias": "model-00002-of-00003.safetensors", + "model.layers.8.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.9.input_layernorm.weight": "model-00002-of-00003.safetensors", + "model.layers.9.mlp.down_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.9.mlp.gate_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.9.mlp.up_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.9.post_attention_layernorm.weight": "model-00002-of-00003.safetensors", + "model.layers.9.self_attn.k_proj.bias": "model-00002-of-00003.safetensors", + "model.layers.9.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.9.self_attn.o_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.9.self_attn.q_proj.bias": "model-00002-of-00003.safetensors", + "model.layers.9.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.9.self_attn.v_proj.bias": "model-00002-of-00003.safetensors", + "model.layers.9.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.norm.weight": "model-00003-of-00003.safetensors", + "visual.blocks.0.attn.proj.bias": "model-00001-of-00003.safetensors", + "visual.blocks.0.attn.proj.weight": "model-00001-of-00003.safetensors", + "visual.blocks.0.attn.qkv.bias": "model-00001-of-00003.safetensors", + "visual.blocks.0.attn.qkv.weight": "model-00001-of-00003.safetensors", + "visual.blocks.0.mlp.down_proj.bias": "model-00001-of-00003.safetensors", + "visual.blocks.0.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "visual.blocks.0.mlp.gate_proj.bias": "model-00001-of-00003.safetensors", + "visual.blocks.0.mlp.gate_proj.weight": "model-00001-of-00003.safetensors", + "visual.blocks.0.mlp.up_proj.bias": "model-00001-of-00003.safetensors", + "visual.blocks.0.mlp.up_proj.weight": "model-00001-of-00003.safetensors", + "visual.blocks.0.norm1.weight": "model-00001-of-00003.safetensors", + "visual.blocks.0.norm2.weight": "model-00001-of-00003.safetensors", + "visual.blocks.1.attn.proj.bias": "model-00001-of-00003.safetensors", + "visual.blocks.1.attn.proj.weight": "model-00001-of-00003.safetensors", + "visual.blocks.1.attn.qkv.bias": "model-00001-of-00003.safetensors", + "visual.blocks.1.attn.qkv.weight": "model-00001-of-00003.safetensors", + "visual.blocks.1.mlp.down_proj.bias": "model-00001-of-00003.safetensors", + "visual.blocks.1.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "visual.blocks.1.mlp.gate_proj.bias": "model-00001-of-00003.safetensors", + "visual.blocks.1.mlp.gate_proj.weight": "model-00001-of-00003.safetensors", + "visual.blocks.1.mlp.up_proj.bias": "model-00001-of-00003.safetensors", + "visual.blocks.1.mlp.up_proj.weight": "model-00001-of-00003.safetensors", + "visual.blocks.1.norm1.weight": "model-00001-of-00003.safetensors", + "visual.blocks.1.norm2.weight": "model-00001-of-00003.safetensors", + "visual.blocks.10.attn.proj.bias": "model-00001-of-00003.safetensors", + "visual.blocks.10.attn.proj.weight": "model-00001-of-00003.safetensors", + "visual.blocks.10.attn.qkv.bias": "model-00001-of-00003.safetensors", + "visual.blocks.10.attn.qkv.weight": "model-00001-of-00003.safetensors", + "visual.blocks.10.mlp.down_proj.bias": "model-00001-of-00003.safetensors", + "visual.blocks.10.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "visual.blocks.10.mlp.gate_proj.bias": "model-00001-of-00003.safetensors", + "visual.blocks.10.mlp.gate_proj.weight": "model-00001-of-00003.safetensors", + "visual.blocks.10.mlp.up_proj.bias": "model-00001-of-00003.safetensors", + "visual.blocks.10.mlp.up_proj.weight": "model-00001-of-00003.safetensors", + "visual.blocks.10.norm1.weight": "model-00001-of-00003.safetensors", + "visual.blocks.10.norm2.weight": "model-00001-of-00003.safetensors", + "visual.blocks.11.attn.proj.bias": "model-00001-of-00003.safetensors", + "visual.blocks.11.attn.proj.weight": "model-00001-of-00003.safetensors", + "visual.blocks.11.attn.qkv.bias": "model-00001-of-00003.safetensors", + "visual.blocks.11.attn.qkv.weight": "model-00001-of-00003.safetensors", + "visual.blocks.11.mlp.down_proj.bias": "model-00001-of-00003.safetensors", + "visual.blocks.11.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "visual.blocks.11.mlp.gate_proj.bias": "model-00001-of-00003.safetensors", + "visual.blocks.11.mlp.gate_proj.weight": "model-00001-of-00003.safetensors", + "visual.blocks.11.mlp.up_proj.bias": "model-00001-of-00003.safetensors", + "visual.blocks.11.mlp.up_proj.weight": "model-00001-of-00003.safetensors", + "visual.blocks.11.norm1.weight": "model-00001-of-00003.safetensors", + "visual.blocks.11.norm2.weight": "model-00001-of-00003.safetensors", + "visual.blocks.12.attn.proj.bias": "model-00001-of-00003.safetensors", + "visual.blocks.12.attn.proj.weight": "model-00001-of-00003.safetensors", + "visual.blocks.12.attn.qkv.bias": "model-00001-of-00003.safetensors", + "visual.blocks.12.attn.qkv.weight": "model-00001-of-00003.safetensors", + "visual.blocks.12.mlp.down_proj.bias": "model-00001-of-00003.safetensors", + "visual.blocks.12.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "visual.blocks.12.mlp.gate_proj.bias": "model-00001-of-00003.safetensors", + "visual.blocks.12.mlp.gate_proj.weight": "model-00001-of-00003.safetensors", + "visual.blocks.12.mlp.up_proj.bias": "model-00001-of-00003.safetensors", + "visual.blocks.12.mlp.up_proj.weight": "model-00001-of-00003.safetensors", + "visual.blocks.12.norm1.weight": "model-00001-of-00003.safetensors", + "visual.blocks.12.norm2.weight": "model-00001-of-00003.safetensors", + "visual.blocks.13.attn.proj.bias": "model-00001-of-00003.safetensors", + "visual.blocks.13.attn.proj.weight": "model-00001-of-00003.safetensors", + "visual.blocks.13.attn.qkv.bias": "model-00001-of-00003.safetensors", + "visual.blocks.13.attn.qkv.weight": "model-00001-of-00003.safetensors", + "visual.blocks.13.mlp.down_proj.bias": "model-00001-of-00003.safetensors", + "visual.blocks.13.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "visual.blocks.13.mlp.gate_proj.bias": "model-00001-of-00003.safetensors", + "visual.blocks.13.mlp.gate_proj.weight": "model-00001-of-00003.safetensors", + "visual.blocks.13.mlp.up_proj.bias": "model-00001-of-00003.safetensors", + "visual.blocks.13.mlp.up_proj.weight": "model-00001-of-00003.safetensors", + "visual.blocks.13.norm1.weight": "model-00001-of-00003.safetensors", + "visual.blocks.13.norm2.weight": "model-00001-of-00003.safetensors", + "visual.blocks.14.attn.proj.bias": "model-00001-of-00003.safetensors", + "visual.blocks.14.attn.proj.weight": "model-00001-of-00003.safetensors", + "visual.blocks.14.attn.qkv.bias": "model-00001-of-00003.safetensors", + "visual.blocks.14.attn.qkv.weight": "model-00001-of-00003.safetensors", + "visual.blocks.14.mlp.down_proj.bias": "model-00001-of-00003.safetensors", + "visual.blocks.14.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "visual.blocks.14.mlp.gate_proj.bias": "model-00001-of-00003.safetensors", + "visual.blocks.14.mlp.gate_proj.weight": "model-00001-of-00003.safetensors", + "visual.blocks.14.mlp.up_proj.bias": "model-00001-of-00003.safetensors", + "visual.blocks.14.mlp.up_proj.weight": "model-00001-of-00003.safetensors", + "visual.blocks.14.norm1.weight": "model-00001-of-00003.safetensors", + "visual.blocks.14.norm2.weight": "model-00001-of-00003.safetensors", + "visual.blocks.15.attn.proj.bias": "model-00001-of-00003.safetensors", + "visual.blocks.15.attn.proj.weight": "model-00001-of-00003.safetensors", + "visual.blocks.15.attn.qkv.bias": "model-00001-of-00003.safetensors", + "visual.blocks.15.attn.qkv.weight": "model-00001-of-00003.safetensors", + "visual.blocks.15.mlp.down_proj.bias": "model-00001-of-00003.safetensors", + "visual.blocks.15.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "visual.blocks.15.mlp.gate_proj.bias": "model-00001-of-00003.safetensors", + "visual.blocks.15.mlp.gate_proj.weight": "model-00001-of-00003.safetensors", + "visual.blocks.15.mlp.up_proj.bias": "model-00001-of-00003.safetensors", + "visual.blocks.15.mlp.up_proj.weight": "model-00001-of-00003.safetensors", + "visual.blocks.15.norm1.weight": "model-00001-of-00003.safetensors", + "visual.blocks.15.norm2.weight": "model-00001-of-00003.safetensors", + "visual.blocks.16.attn.proj.bias": "model-00001-of-00003.safetensors", + "visual.blocks.16.attn.proj.weight": "model-00001-of-00003.safetensors", + "visual.blocks.16.attn.qkv.bias": "model-00001-of-00003.safetensors", + "visual.blocks.16.attn.qkv.weight": "model-00001-of-00003.safetensors", + "visual.blocks.16.mlp.down_proj.bias": "model-00001-of-00003.safetensors", + "visual.blocks.16.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "visual.blocks.16.mlp.gate_proj.bias": "model-00001-of-00003.safetensors", + "visual.blocks.16.mlp.gate_proj.weight": "model-00001-of-00003.safetensors", + "visual.blocks.16.mlp.up_proj.bias": "model-00001-of-00003.safetensors", + "visual.blocks.16.mlp.up_proj.weight": "model-00001-of-00003.safetensors", + "visual.blocks.16.norm1.weight": "model-00001-of-00003.safetensors", + "visual.blocks.16.norm2.weight": "model-00001-of-00003.safetensors", + "visual.blocks.17.attn.proj.bias": "model-00001-of-00003.safetensors", + "visual.blocks.17.attn.proj.weight": "model-00001-of-00003.safetensors", + "visual.blocks.17.attn.qkv.bias": "model-00001-of-00003.safetensors", + "visual.blocks.17.attn.qkv.weight": "model-00001-of-00003.safetensors", + "visual.blocks.17.mlp.down_proj.bias": "model-00001-of-00003.safetensors", + "visual.blocks.17.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "visual.blocks.17.mlp.gate_proj.bias": "model-00001-of-00003.safetensors", + "visual.blocks.17.mlp.gate_proj.weight": "model-00001-of-00003.safetensors", + "visual.blocks.17.mlp.up_proj.bias": "model-00001-of-00003.safetensors", + "visual.blocks.17.mlp.up_proj.weight": "model-00001-of-00003.safetensors", + "visual.blocks.17.norm1.weight": "model-00001-of-00003.safetensors", + "visual.blocks.17.norm2.weight": "model-00001-of-00003.safetensors", + "visual.blocks.18.attn.proj.bias": "model-00001-of-00003.safetensors", + "visual.blocks.18.attn.proj.weight": "model-00001-of-00003.safetensors", + "visual.blocks.18.attn.qkv.bias": "model-00001-of-00003.safetensors", + "visual.blocks.18.attn.qkv.weight": "model-00001-of-00003.safetensors", + "visual.blocks.18.mlp.down_proj.bias": "model-00001-of-00003.safetensors", + "visual.blocks.18.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "visual.blocks.18.mlp.gate_proj.bias": "model-00001-of-00003.safetensors", + "visual.blocks.18.mlp.gate_proj.weight": "model-00001-of-00003.safetensors", + "visual.blocks.18.mlp.up_proj.bias": "model-00001-of-00003.safetensors", + "visual.blocks.18.mlp.up_proj.weight": "model-00001-of-00003.safetensors", + "visual.blocks.18.norm1.weight": "model-00001-of-00003.safetensors", + "visual.blocks.18.norm2.weight": "model-00001-of-00003.safetensors", + "visual.blocks.19.attn.proj.bias": "model-00001-of-00003.safetensors", + "visual.blocks.19.attn.proj.weight": "model-00001-of-00003.safetensors", + "visual.blocks.19.attn.qkv.bias": "model-00001-of-00003.safetensors", + "visual.blocks.19.attn.qkv.weight": "model-00001-of-00003.safetensors", + "visual.blocks.19.mlp.down_proj.bias": "model-00001-of-00003.safetensors", + "visual.blocks.19.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "visual.blocks.19.mlp.gate_proj.bias": "model-00001-of-00003.safetensors", + "visual.blocks.19.mlp.gate_proj.weight": "model-00001-of-00003.safetensors", + "visual.blocks.19.mlp.up_proj.bias": "model-00001-of-00003.safetensors", + "visual.blocks.19.mlp.up_proj.weight": "model-00001-of-00003.safetensors", + "visual.blocks.19.norm1.weight": "model-00001-of-00003.safetensors", + "visual.blocks.19.norm2.weight": "model-00001-of-00003.safetensors", + "visual.blocks.2.attn.proj.bias": "model-00001-of-00003.safetensors", + "visual.blocks.2.attn.proj.weight": "model-00001-of-00003.safetensors", + "visual.blocks.2.attn.qkv.bias": "model-00001-of-00003.safetensors", + "visual.blocks.2.attn.qkv.weight": "model-00001-of-00003.safetensors", + "visual.blocks.2.mlp.down_proj.bias": "model-00001-of-00003.safetensors", + "visual.blocks.2.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "visual.blocks.2.mlp.gate_proj.bias": "model-00001-of-00003.safetensors", + "visual.blocks.2.mlp.gate_proj.weight": "model-00001-of-00003.safetensors", + "visual.blocks.2.mlp.up_proj.bias": "model-00001-of-00003.safetensors", + "visual.blocks.2.mlp.up_proj.weight": "model-00001-of-00003.safetensors", + "visual.blocks.2.norm1.weight": "model-00001-of-00003.safetensors", + "visual.blocks.2.norm2.weight": "model-00001-of-00003.safetensors", + "visual.blocks.20.attn.proj.bias": "model-00001-of-00003.safetensors", + "visual.blocks.20.attn.proj.weight": "model-00001-of-00003.safetensors", + "visual.blocks.20.attn.qkv.bias": "model-00001-of-00003.safetensors", + "visual.blocks.20.attn.qkv.weight": "model-00001-of-00003.safetensors", + "visual.blocks.20.mlp.down_proj.bias": "model-00001-of-00003.safetensors", + "visual.blocks.20.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "visual.blocks.20.mlp.gate_proj.bias": "model-00001-of-00003.safetensors", + "visual.blocks.20.mlp.gate_proj.weight": "model-00001-of-00003.safetensors", + "visual.blocks.20.mlp.up_proj.bias": "model-00001-of-00003.safetensors", + "visual.blocks.20.mlp.up_proj.weight": "model-00001-of-00003.safetensors", + "visual.blocks.20.norm1.weight": "model-00001-of-00003.safetensors", + "visual.blocks.20.norm2.weight": "model-00001-of-00003.safetensors", + "visual.blocks.21.attn.proj.bias": "model-00001-of-00003.safetensors", + "visual.blocks.21.attn.proj.weight": "model-00001-of-00003.safetensors", + "visual.blocks.21.attn.qkv.bias": "model-00001-of-00003.safetensors", + "visual.blocks.21.attn.qkv.weight": "model-00001-of-00003.safetensors", + "visual.blocks.21.mlp.down_proj.bias": "model-00001-of-00003.safetensors", + "visual.blocks.21.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "visual.blocks.21.mlp.gate_proj.bias": "model-00001-of-00003.safetensors", + "visual.blocks.21.mlp.gate_proj.weight": "model-00001-of-00003.safetensors", + "visual.blocks.21.mlp.up_proj.bias": "model-00001-of-00003.safetensors", + "visual.blocks.21.mlp.up_proj.weight": "model-00001-of-00003.safetensors", + "visual.blocks.21.norm1.weight": "model-00001-of-00003.safetensors", + "visual.blocks.21.norm2.weight": "model-00001-of-00003.safetensors", + "visual.blocks.22.attn.proj.bias": "model-00001-of-00003.safetensors", + "visual.blocks.22.attn.proj.weight": "model-00001-of-00003.safetensors", + "visual.blocks.22.attn.qkv.bias": "model-00001-of-00003.safetensors", + "visual.blocks.22.attn.qkv.weight": "model-00001-of-00003.safetensors", + "visual.blocks.22.mlp.down_proj.bias": "model-00001-of-00003.safetensors", + "visual.blocks.22.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "visual.blocks.22.mlp.gate_proj.bias": "model-00001-of-00003.safetensors", + "visual.blocks.22.mlp.gate_proj.weight": "model-00001-of-00003.safetensors", + "visual.blocks.22.mlp.up_proj.bias": "model-00001-of-00003.safetensors", + "visual.blocks.22.mlp.up_proj.weight": "model-00001-of-00003.safetensors", + "visual.blocks.22.norm1.weight": "model-00001-of-00003.safetensors", + "visual.blocks.22.norm2.weight": "model-00001-of-00003.safetensors", + "visual.blocks.23.attn.proj.bias": "model-00001-of-00003.safetensors", + "visual.blocks.23.attn.proj.weight": "model-00001-of-00003.safetensors", + "visual.blocks.23.attn.qkv.bias": "model-00001-of-00003.safetensors", + "visual.blocks.23.attn.qkv.weight": "model-00001-of-00003.safetensors", + "visual.blocks.23.mlp.down_proj.bias": "model-00001-of-00003.safetensors", + "visual.blocks.23.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "visual.blocks.23.mlp.gate_proj.bias": "model-00001-of-00003.safetensors", + "visual.blocks.23.mlp.gate_proj.weight": "model-00001-of-00003.safetensors", + "visual.blocks.23.mlp.up_proj.bias": "model-00001-of-00003.safetensors", + "visual.blocks.23.mlp.up_proj.weight": "model-00001-of-00003.safetensors", + "visual.blocks.23.norm1.weight": "model-00001-of-00003.safetensors", + "visual.blocks.23.norm2.weight": "model-00001-of-00003.safetensors", + "visual.blocks.24.attn.proj.bias": "model-00001-of-00003.safetensors", + "visual.blocks.24.attn.proj.weight": "model-00001-of-00003.safetensors", + "visual.blocks.24.attn.qkv.bias": "model-00001-of-00003.safetensors", + "visual.blocks.24.attn.qkv.weight": "model-00001-of-00003.safetensors", + "visual.blocks.24.mlp.down_proj.bias": "model-00001-of-00003.safetensors", + "visual.blocks.24.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "visual.blocks.24.mlp.gate_proj.bias": "model-00001-of-00003.safetensors", + "visual.blocks.24.mlp.gate_proj.weight": "model-00001-of-00003.safetensors", + "visual.blocks.24.mlp.up_proj.bias": "model-00001-of-00003.safetensors", + "visual.blocks.24.mlp.up_proj.weight": "model-00001-of-00003.safetensors", + "visual.blocks.24.norm1.weight": "model-00001-of-00003.safetensors", + "visual.blocks.24.norm2.weight": "model-00001-of-00003.safetensors", + "visual.blocks.25.attn.proj.bias": "model-00001-of-00003.safetensors", + "visual.blocks.25.attn.proj.weight": "model-00001-of-00003.safetensors", + "visual.blocks.25.attn.qkv.bias": "model-00001-of-00003.safetensors", + "visual.blocks.25.attn.qkv.weight": "model-00001-of-00003.safetensors", + "visual.blocks.25.mlp.down_proj.bias": "model-00001-of-00003.safetensors", + "visual.blocks.25.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "visual.blocks.25.mlp.gate_proj.bias": "model-00001-of-00003.safetensors", + "visual.blocks.25.mlp.gate_proj.weight": "model-00001-of-00003.safetensors", + "visual.blocks.25.mlp.up_proj.bias": "model-00001-of-00003.safetensors", + "visual.blocks.25.mlp.up_proj.weight": "model-00001-of-00003.safetensors", + "visual.blocks.25.norm1.weight": "model-00001-of-00003.safetensors", + "visual.blocks.25.norm2.weight": "model-00001-of-00003.safetensors", + "visual.blocks.26.attn.proj.bias": "model-00001-of-00003.safetensors", + "visual.blocks.26.attn.proj.weight": "model-00001-of-00003.safetensors", + "visual.blocks.26.attn.qkv.bias": "model-00001-of-00003.safetensors", + "visual.blocks.26.attn.qkv.weight": "model-00001-of-00003.safetensors", + "visual.blocks.26.mlp.down_proj.bias": "model-00001-of-00003.safetensors", + "visual.blocks.26.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "visual.blocks.26.mlp.gate_proj.bias": "model-00001-of-00003.safetensors", + "visual.blocks.26.mlp.gate_proj.weight": "model-00001-of-00003.safetensors", + "visual.blocks.26.mlp.up_proj.bias": "model-00001-of-00003.safetensors", + "visual.blocks.26.mlp.up_proj.weight": "model-00001-of-00003.safetensors", + "visual.blocks.26.norm1.weight": "model-00001-of-00003.safetensors", + "visual.blocks.26.norm2.weight": "model-00001-of-00003.safetensors", + "visual.blocks.27.attn.proj.bias": "model-00001-of-00003.safetensors", + "visual.blocks.27.attn.proj.weight": "model-00001-of-00003.safetensors", + "visual.blocks.27.attn.qkv.bias": "model-00001-of-00003.safetensors", + "visual.blocks.27.attn.qkv.weight": "model-00001-of-00003.safetensors", + "visual.blocks.27.mlp.down_proj.bias": "model-00001-of-00003.safetensors", + "visual.blocks.27.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "visual.blocks.27.mlp.gate_proj.bias": "model-00001-of-00003.safetensors", + "visual.blocks.27.mlp.gate_proj.weight": "model-00001-of-00003.safetensors", + "visual.blocks.27.mlp.up_proj.bias": "model-00001-of-00003.safetensors", + "visual.blocks.27.mlp.up_proj.weight": "model-00001-of-00003.safetensors", + "visual.blocks.27.norm1.weight": "model-00001-of-00003.safetensors", + "visual.blocks.27.norm2.weight": "model-00001-of-00003.safetensors", + "visual.blocks.28.attn.proj.bias": "model-00001-of-00003.safetensors", + "visual.blocks.28.attn.proj.weight": "model-00001-of-00003.safetensors", + "visual.blocks.28.attn.qkv.bias": "model-00001-of-00003.safetensors", + "visual.blocks.28.attn.qkv.weight": "model-00001-of-00003.safetensors", + "visual.blocks.28.mlp.down_proj.bias": "model-00001-of-00003.safetensors", + "visual.blocks.28.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "visual.blocks.28.mlp.gate_proj.bias": "model-00001-of-00003.safetensors", + "visual.blocks.28.mlp.gate_proj.weight": "model-00001-of-00003.safetensors", + "visual.blocks.28.mlp.up_proj.bias": "model-00001-of-00003.safetensors", + "visual.blocks.28.mlp.up_proj.weight": "model-00001-of-00003.safetensors", + "visual.blocks.28.norm1.weight": "model-00001-of-00003.safetensors", + "visual.blocks.28.norm2.weight": "model-00001-of-00003.safetensors", + "visual.blocks.29.attn.proj.bias": "model-00001-of-00003.safetensors", + "visual.blocks.29.attn.proj.weight": "model-00001-of-00003.safetensors", + "visual.blocks.29.attn.qkv.bias": "model-00001-of-00003.safetensors", + "visual.blocks.29.attn.qkv.weight": "model-00001-of-00003.safetensors", + "visual.blocks.29.mlp.down_proj.bias": "model-00001-of-00003.safetensors", + "visual.blocks.29.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "visual.blocks.29.mlp.gate_proj.bias": "model-00001-of-00003.safetensors", + "visual.blocks.29.mlp.gate_proj.weight": "model-00001-of-00003.safetensors", + "visual.blocks.29.mlp.up_proj.bias": "model-00001-of-00003.safetensors", + "visual.blocks.29.mlp.up_proj.weight": "model-00001-of-00003.safetensors", + "visual.blocks.29.norm1.weight": "model-00001-of-00003.safetensors", + "visual.blocks.29.norm2.weight": "model-00001-of-00003.safetensors", + "visual.blocks.3.attn.proj.bias": "model-00001-of-00003.safetensors", + "visual.blocks.3.attn.proj.weight": "model-00001-of-00003.safetensors", + "visual.blocks.3.attn.qkv.bias": "model-00001-of-00003.safetensors", + "visual.blocks.3.attn.qkv.weight": "model-00001-of-00003.safetensors", + "visual.blocks.3.mlp.down_proj.bias": "model-00001-of-00003.safetensors", + "visual.blocks.3.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "visual.blocks.3.mlp.gate_proj.bias": "model-00001-of-00003.safetensors", + "visual.blocks.3.mlp.gate_proj.weight": "model-00001-of-00003.safetensors", + "visual.blocks.3.mlp.up_proj.bias": "model-00001-of-00003.safetensors", + "visual.blocks.3.mlp.up_proj.weight": "model-00001-of-00003.safetensors", + "visual.blocks.3.norm1.weight": "model-00001-of-00003.safetensors", + "visual.blocks.3.norm2.weight": "model-00001-of-00003.safetensors", + "visual.blocks.30.attn.proj.bias": "model-00001-of-00003.safetensors", + "visual.blocks.30.attn.proj.weight": "model-00001-of-00003.safetensors", + "visual.blocks.30.attn.qkv.bias": "model-00001-of-00003.safetensors", + "visual.blocks.30.attn.qkv.weight": "model-00001-of-00003.safetensors", + "visual.blocks.30.mlp.down_proj.bias": "model-00001-of-00003.safetensors", + "visual.blocks.30.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "visual.blocks.30.mlp.gate_proj.bias": "model-00001-of-00003.safetensors", + "visual.blocks.30.mlp.gate_proj.weight": "model-00001-of-00003.safetensors", + "visual.blocks.30.mlp.up_proj.bias": "model-00001-of-00003.safetensors", + "visual.blocks.30.mlp.up_proj.weight": "model-00001-of-00003.safetensors", + "visual.blocks.30.norm1.weight": "model-00001-of-00003.safetensors", + "visual.blocks.30.norm2.weight": "model-00001-of-00003.safetensors", + "visual.blocks.31.attn.proj.bias": "model-00001-of-00003.safetensors", + "visual.blocks.31.attn.proj.weight": "model-00001-of-00003.safetensors", + "visual.blocks.31.attn.qkv.bias": "model-00001-of-00003.safetensors", + "visual.blocks.31.attn.qkv.weight": "model-00001-of-00003.safetensors", + "visual.blocks.31.mlp.down_proj.bias": "model-00001-of-00003.safetensors", + "visual.blocks.31.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "visual.blocks.31.mlp.gate_proj.bias": "model-00001-of-00003.safetensors", + "visual.blocks.31.mlp.gate_proj.weight": "model-00001-of-00003.safetensors", + "visual.blocks.31.mlp.up_proj.bias": "model-00001-of-00003.safetensors", + "visual.blocks.31.mlp.up_proj.weight": "model-00001-of-00003.safetensors", + "visual.blocks.31.norm1.weight": "model-00001-of-00003.safetensors", + "visual.blocks.31.norm2.weight": "model-00001-of-00003.safetensors", + "visual.blocks.4.attn.proj.bias": "model-00001-of-00003.safetensors", + "visual.blocks.4.attn.proj.weight": "model-00001-of-00003.safetensors", + "visual.blocks.4.attn.qkv.bias": "model-00001-of-00003.safetensors", + "visual.blocks.4.attn.qkv.weight": "model-00001-of-00003.safetensors", + "visual.blocks.4.mlp.down_proj.bias": "model-00001-of-00003.safetensors", + "visual.blocks.4.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "visual.blocks.4.mlp.gate_proj.bias": "model-00001-of-00003.safetensors", + "visual.blocks.4.mlp.gate_proj.weight": "model-00001-of-00003.safetensors", + "visual.blocks.4.mlp.up_proj.bias": "model-00001-of-00003.safetensors", + "visual.blocks.4.mlp.up_proj.weight": "model-00001-of-00003.safetensors", + "visual.blocks.4.norm1.weight": "model-00001-of-00003.safetensors", + "visual.blocks.4.norm2.weight": "model-00001-of-00003.safetensors", + "visual.blocks.5.attn.proj.bias": "model-00001-of-00003.safetensors", + "visual.blocks.5.attn.proj.weight": "model-00001-of-00003.safetensors", + "visual.blocks.5.attn.qkv.bias": "model-00001-of-00003.safetensors", + "visual.blocks.5.attn.qkv.weight": "model-00001-of-00003.safetensors", + "visual.blocks.5.mlp.down_proj.bias": "model-00001-of-00003.safetensors", + "visual.blocks.5.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "visual.blocks.5.mlp.gate_proj.bias": "model-00001-of-00003.safetensors", + "visual.blocks.5.mlp.gate_proj.weight": "model-00001-of-00003.safetensors", + "visual.blocks.5.mlp.up_proj.bias": "model-00001-of-00003.safetensors", + "visual.blocks.5.mlp.up_proj.weight": "model-00001-of-00003.safetensors", + "visual.blocks.5.norm1.weight": "model-00001-of-00003.safetensors", + "visual.blocks.5.norm2.weight": "model-00001-of-00003.safetensors", + "visual.blocks.6.attn.proj.bias": "model-00001-of-00003.safetensors", + "visual.blocks.6.attn.proj.weight": "model-00001-of-00003.safetensors", + "visual.blocks.6.attn.qkv.bias": "model-00001-of-00003.safetensors", + "visual.blocks.6.attn.qkv.weight": "model-00001-of-00003.safetensors", + "visual.blocks.6.mlp.down_proj.bias": "model-00001-of-00003.safetensors", + "visual.blocks.6.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "visual.blocks.6.mlp.gate_proj.bias": "model-00001-of-00003.safetensors", + "visual.blocks.6.mlp.gate_proj.weight": "model-00001-of-00003.safetensors", + "visual.blocks.6.mlp.up_proj.bias": "model-00001-of-00003.safetensors", + "visual.blocks.6.mlp.up_proj.weight": "model-00001-of-00003.safetensors", + "visual.blocks.6.norm1.weight": "model-00001-of-00003.safetensors", + "visual.blocks.6.norm2.weight": "model-00001-of-00003.safetensors", + "visual.blocks.7.attn.proj.bias": "model-00001-of-00003.safetensors", + "visual.blocks.7.attn.proj.weight": "model-00001-of-00003.safetensors", + "visual.blocks.7.attn.qkv.bias": "model-00001-of-00003.safetensors", + "visual.blocks.7.attn.qkv.weight": "model-00001-of-00003.safetensors", + "visual.blocks.7.mlp.down_proj.bias": "model-00001-of-00003.safetensors", + "visual.blocks.7.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "visual.blocks.7.mlp.gate_proj.bias": "model-00001-of-00003.safetensors", + "visual.blocks.7.mlp.gate_proj.weight": "model-00001-of-00003.safetensors", + "visual.blocks.7.mlp.up_proj.bias": "model-00001-of-00003.safetensors", + "visual.blocks.7.mlp.up_proj.weight": "model-00001-of-00003.safetensors", + "visual.blocks.7.norm1.weight": "model-00001-of-00003.safetensors", + "visual.blocks.7.norm2.weight": "model-00001-of-00003.safetensors", + "visual.blocks.8.attn.proj.bias": "model-00001-of-00003.safetensors", + "visual.blocks.8.attn.proj.weight": "model-00001-of-00003.safetensors", + "visual.blocks.8.attn.qkv.bias": "model-00001-of-00003.safetensors", + "visual.blocks.8.attn.qkv.weight": "model-00001-of-00003.safetensors", + "visual.blocks.8.mlp.down_proj.bias": "model-00001-of-00003.safetensors", + "visual.blocks.8.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "visual.blocks.8.mlp.gate_proj.bias": "model-00001-of-00003.safetensors", + "visual.blocks.8.mlp.gate_proj.weight": "model-00001-of-00003.safetensors", + "visual.blocks.8.mlp.up_proj.bias": "model-00001-of-00003.safetensors", + "visual.blocks.8.mlp.up_proj.weight": "model-00001-of-00003.safetensors", + "visual.blocks.8.norm1.weight": "model-00001-of-00003.safetensors", + "visual.blocks.8.norm2.weight": "model-00001-of-00003.safetensors", + "visual.blocks.9.attn.proj.bias": "model-00001-of-00003.safetensors", + "visual.blocks.9.attn.proj.weight": "model-00001-of-00003.safetensors", + "visual.blocks.9.attn.qkv.bias": "model-00001-of-00003.safetensors", + "visual.blocks.9.attn.qkv.weight": "model-00001-of-00003.safetensors", + "visual.blocks.9.mlp.down_proj.bias": "model-00001-of-00003.safetensors", + "visual.blocks.9.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "visual.blocks.9.mlp.gate_proj.bias": "model-00001-of-00003.safetensors", + "visual.blocks.9.mlp.gate_proj.weight": "model-00001-of-00003.safetensors", + "visual.blocks.9.mlp.up_proj.bias": "model-00001-of-00003.safetensors", + "visual.blocks.9.mlp.up_proj.weight": "model-00001-of-00003.safetensors", + "visual.blocks.9.norm1.weight": "model-00001-of-00003.safetensors", + "visual.blocks.9.norm2.weight": "model-00001-of-00003.safetensors", + "visual.merger.ln_q.weight": "model-00001-of-00003.safetensors", + "visual.merger.mlp.0.bias": "model-00001-of-00003.safetensors", + "visual.merger.mlp.0.weight": "model-00001-of-00003.safetensors", + "visual.merger.mlp.2.bias": "model-00001-of-00003.safetensors", + "visual.merger.mlp.2.weight": "model-00001-of-00003.safetensors", + "visual.patch_embed.proj.weight": "model-00001-of-00003.safetensors" + } +} diff --git a/preprocessor_config.json b/preprocessor_config.json new file mode 100644 index 0000000..1c234b7 --- /dev/null +++ b/preprocessor_config.json @@ -0,0 +1,36 @@ +{ + "crop_size": null, + "data_format": "channels_first", + "default_to_square": true, + "device": null, + "do_center_crop": null, + "do_convert_rgb": true, + "do_normalize": true, + "do_rescale": true, + "do_resize": true, + "image_mean": [ + 0.48145466, + 0.4578275, + 0.40821073 + ], + "image_processor_type": "Qwen2VLImageProcessorFast", + "image_std": [ + 0.26862954, + 0.26130258, + 0.27577711 + ], + "input_data_format": null, + "max_pixels": 12845056, + "merge_size": 2, + "min_pixels": 3136, + "patch_size": 14, + "processor_class": "Qwen2_5_VLProcessor", + "resample": 3, + "rescale_factor": 0.00392156862745098, + "return_tensors": null, + "size": { + "longest_edge": 12845056, + "shortest_edge": 3136 + }, + "temporal_patch_size": 2 +} diff --git a/special_tokens_map.json b/special_tokens_map.json new file mode 100644 index 0000000..ac23c0a --- /dev/null +++ b/special_tokens_map.json @@ -0,0 +1,31 @@ +{ + "additional_special_tokens": [ + "<|im_start|>", + "<|im_end|>", + "<|object_ref_start|>", + "<|object_ref_end|>", + "<|box_start|>", + "<|box_end|>", + "<|quad_start|>", + "<|quad_end|>", + "<|vision_start|>", + "<|vision_end|>", + "<|vision_pad|>", + "<|image_pad|>", + "<|video_pad|>" + ], + "eos_token": { + "content": "<|im_end|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "pad_token": { + "content": "<|endoftext|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + } +} diff --git a/tokenizer.json b/tokenizer.json new file mode 100644 index 0000000..51ebb3b --- /dev/null +++ b/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9c5ae00e602b8860cbd784ba82a8aa14e8feecec692e7076590d014d7b7fdafa +size 11421896 diff --git a/tokenizer_config.json b/tokenizer_config.json new file mode 100644 index 0000000..230f071 --- /dev/null +++ b/tokenizer_config.json @@ -0,0 +1,209 @@ +{ + "add_bos_token": false, + "add_prefix_space": false, + "added_tokens_decoder": { + "151643": { + "content": "<|endoftext|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "151644": { + "content": "<|im_start|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "151645": { + "content": "<|im_end|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "151646": { + "content": "<|object_ref_start|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "151647": { + "content": "<|object_ref_end|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "151648": { + "content": "<|box_start|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "151649": { + "content": "<|box_end|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "151650": { + "content": "<|quad_start|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "151651": { + "content": "<|quad_end|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "151652": { + "content": "<|vision_start|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "151653": { + "content": "<|vision_end|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "151654": { + "content": "<|vision_pad|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "151655": { + "content": "<|image_pad|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "151656": { + "content": "<|video_pad|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "151657": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "151658": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "151659": { + "content": "<|fim_prefix|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "151660": { + "content": "<|fim_middle|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "151661": { + "content": "<|fim_suffix|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "151662": { + "content": "<|fim_pad|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "151663": { + "content": "<|repo_name|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "151664": { + "content": "<|file_sep|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + } + }, + "additional_special_tokens": [ + "<|im_start|>", + "<|im_end|>", + "<|object_ref_start|>", + "<|object_ref_end|>", + "<|box_start|>", + "<|box_end|>", + "<|quad_start|>", + "<|quad_end|>", + "<|vision_start|>", + "<|vision_end|>", + "<|vision_pad|>", + "<|image_pad|>", + "<|video_pad|>" + ], + "bos_token": null, + "clean_up_tokenization_spaces": false, + "eos_token": "<|im_end|>", + "errors": "replace", + "extra_special_tokens": {}, + "model_max_length": 131072, + "pad_token": "<|endoftext|>", + "padding_side": "right", + "processor_class": "Qwen2_5_VLProcessor", + "split_special_tokens": false, + "tokenizer_class": "Qwen2Tokenizer", + "unk_token": null +} diff --git a/train_results.json b/train_results.json new file mode 100644 index 0000000..87976ac --- /dev/null +++ b/train_results.json @@ -0,0 +1,8 @@ +{ + "epoch": 1.0, + "total_flos": 1.9483786471612088e+18, + "train_loss": 0.10519510776023655, + "train_runtime": 10705.8335, + "train_samples_per_second": 3.25, + "train_steps_per_second": 0.102 +} \ No newline at end of file diff --git a/trainer_log.jsonl b/trainer_log.jsonl new file mode 100644 index 0000000..71d5e2e --- /dev/null +++ b/trainer_log.jsonl @@ -0,0 +1,109 @@ +{"current_steps": 10, "total_steps": 1088, "loss": 0.578, "lr": 8.256880733944956e-07, "epoch": 0.009191176470588236, "percentage": 0.92, "elapsed_time": "0:02:06", "remaining_time": "3:46:44"} +{"current_steps": 20, "total_steps": 1088, "loss": 0.4698, "lr": 1.743119266055046e-06, "epoch": 0.01838235294117647, "percentage": 1.84, "elapsed_time": "0:03:46", "remaining_time": "3:21:24"} +{"current_steps": 30, "total_steps": 1088, "loss": 0.254, "lr": 2.6605504587155968e-06, "epoch": 0.027573529411764705, "percentage": 2.76, "elapsed_time": "0:05:26", "remaining_time": "3:12:05"} +{"current_steps": 40, "total_steps": 1088, "loss": 0.1899, "lr": 3.5779816513761473e-06, "epoch": 0.03676470588235294, "percentage": 3.68, "elapsed_time": "0:07:05", "remaining_time": "3:05:44"} +{"current_steps": 50, "total_steps": 1088, "loss": 0.1712, "lr": 4.4954128440366975e-06, "epoch": 0.04595588235294118, "percentage": 4.6, "elapsed_time": "0:08:46", "remaining_time": "3:02:04"} +{"current_steps": 60, "total_steps": 1088, "loss": 0.1554, "lr": 5.412844036697248e-06, "epoch": 0.05514705882352941, "percentage": 5.51, "elapsed_time": "0:10:23", "remaining_time": "2:58:02"} +{"current_steps": 70, "total_steps": 1088, "loss": 0.1509, "lr": 6.330275229357799e-06, "epoch": 0.06433823529411764, "percentage": 6.43, "elapsed_time": "0:11:59", "remaining_time": "2:54:29"} +{"current_steps": 80, "total_steps": 1088, "loss": 0.144, "lr": 7.247706422018349e-06, "epoch": 0.07352941176470588, "percentage": 7.35, "elapsed_time": "0:13:36", "remaining_time": "2:51:26"} +{"current_steps": 90, "total_steps": 1088, "loss": 0.1377, "lr": 8.1651376146789e-06, "epoch": 0.08272058823529412, "percentage": 8.27, "elapsed_time": "0:15:15", "remaining_time": "2:49:07"} +{"current_steps": 100, "total_steps": 1088, "loss": 0.128, "lr": 9.08256880733945e-06, "epoch": 0.09191176470588236, "percentage": 9.19, "elapsed_time": "0:16:51", "remaining_time": "2:46:35"} +{"current_steps": 110, "total_steps": 1088, "loss": 0.1237, "lr": 1e-05, "epoch": 0.10110294117647059, "percentage": 10.11, "elapsed_time": "0:18:28", "remaining_time": "2:44:15"} +{"current_steps": 120, "total_steps": 1088, "loss": 0.1317, "lr": 9.99742583072674e-06, "epoch": 0.11029411764705882, "percentage": 11.03, "elapsed_time": "0:20:04", "remaining_time": "2:41:58"} +{"current_steps": 130, "total_steps": 1088, "loss": 0.1258, "lr": 9.98970597344593e-06, "epoch": 0.11948529411764706, "percentage": 11.95, "elapsed_time": "0:21:41", "remaining_time": "2:39:51"} +{"current_steps": 140, "total_steps": 1088, "loss": 0.1259, "lr": 9.976848377045343e-06, "epoch": 0.12867647058823528, "percentage": 12.87, "elapsed_time": "0:23:24", "remaining_time": "2:38:28"} +{"current_steps": 150, "total_steps": 1088, "loss": 0.119, "lr": 9.958866280576803e-06, "epoch": 0.13786764705882354, "percentage": 13.79, "elapsed_time": "0:25:00", "remaining_time": "2:36:24"} +{"current_steps": 160, "total_steps": 1088, "loss": 0.1193, "lr": 9.935778199624394e-06, "epoch": 0.14705882352941177, "percentage": 14.71, "elapsed_time": "0:26:37", "remaining_time": "2:34:25"} +{"current_steps": 170, "total_steps": 1088, "loss": 0.1159, "lr": 9.90760790723954e-06, "epoch": 0.15625, "percentage": 15.62, "elapsed_time": "0:28:14", "remaining_time": "2:32:27"} +{"current_steps": 180, "total_steps": 1088, "loss": 0.1198, "lr": 9.874384409462673e-06, "epoch": 0.16544117647058823, "percentage": 16.54, "elapsed_time": "0:29:50", "remaining_time": "2:30:32"} +{"current_steps": 190, "total_steps": 1088, "loss": 0.1109, "lr": 9.836141915456646e-06, "epoch": 0.17463235294117646, "percentage": 17.46, "elapsed_time": "0:31:27", "remaining_time": "2:28:38"} +{"current_steps": 200, "total_steps": 1088, "loss": 0.1158, "lr": 9.792919802282656e-06, "epoch": 0.18382352941176472, "percentage": 18.38, "elapsed_time": "0:33:03", "remaining_time": "2:26:46"} +{"current_steps": 210, "total_steps": 1088, "loss": 0.1148, "lr": 9.744762574354967e-06, "epoch": 0.19301470588235295, "percentage": 19.3, "elapsed_time": "0:34:39", "remaining_time": "2:24:55"} +{"current_steps": 220, "total_steps": 1088, "loss": 0.1054, "lr": 9.691719817616148e-06, "epoch": 0.20220588235294118, "percentage": 20.22, "elapsed_time": "0:36:16", "remaining_time": "2:23:07"} +{"current_steps": 230, "total_steps": 1088, "loss": 0.1097, "lr": 9.633846148480024e-06, "epoch": 0.2113970588235294, "percentage": 21.14, "elapsed_time": "0:37:52", "remaining_time": "2:21:18"} +{"current_steps": 240, "total_steps": 1088, "loss": 0.1055, "lr": 9.571201157594925e-06, "epoch": 0.22058823529411764, "percentage": 22.06, "elapsed_time": "0:39:29", "remaining_time": "2:19:30"} +{"current_steps": 250, "total_steps": 1088, "loss": 0.1046, "lr": 9.503849348485112e-06, "epoch": 0.22977941176470587, "percentage": 22.98, "elapsed_time": "0:41:05", "remaining_time": "2:17:44"} +{"current_steps": 260, "total_steps": 1088, "loss": 0.0973, "lr": 9.431860071133592e-06, "epoch": 0.23897058823529413, "percentage": 23.9, "elapsed_time": "0:42:42", "remaining_time": "2:15:59"} +{"current_steps": 270, "total_steps": 1088, "loss": 0.1032, "lr": 9.355307450574666e-06, "epoch": 0.24816176470588236, "percentage": 24.82, "elapsed_time": "0:44:24", "remaining_time": "2:14:33"} +{"current_steps": 280, "total_steps": 1088, "loss": 0.1057, "lr": 9.27427031056979e-06, "epoch": 0.25735294117647056, "percentage": 25.74, "elapsed_time": "0:46:01", "remaining_time": "2:12:50"} +{"current_steps": 290, "total_steps": 1088, "loss": 0.105, "lr": 9.188832092445281e-06, "epoch": 0.2665441176470588, "percentage": 26.65, "elapsed_time": "0:47:38", "remaining_time": "2:11:06"} +{"current_steps": 300, "total_steps": 1088, "loss": 0.1048, "lr": 9.09908076917548e-06, "epoch": 0.2757352941176471, "percentage": 27.57, "elapsed_time": "0:49:15", "remaining_time": "2:09:22"} +{"current_steps": 310, "total_steps": 1088, "loss": 0.1001, "lr": 9.00510875479983e-06, "epoch": 0.2849264705882353, "percentage": 28.49, "elapsed_time": "0:50:51", "remaining_time": "2:07:38"} +{"current_steps": 320, "total_steps": 1088, "loss": 0.1045, "lr": 8.907012809267107e-06, "epoch": 0.29411764705882354, "percentage": 29.41, "elapsed_time": "0:52:33", "remaining_time": "2:06:08"} +{"current_steps": 330, "total_steps": 1088, "loss": 0.1029, "lr": 8.804893938804839e-06, "epoch": 0.30330882352941174, "percentage": 30.33, "elapsed_time": "0:54:10", "remaining_time": "2:04:25"} +{"current_steps": 340, "total_steps": 1088, "loss": 0.1, "lr": 8.698857291916456e-06, "epoch": 0.3125, "percentage": 31.25, "elapsed_time": "0:55:46", "remaining_time": "2:02:42"} +{"current_steps": 350, "total_steps": 1088, "loss": 0.0923, "lr": 8.58901205111326e-06, "epoch": 0.32169117647058826, "percentage": 32.17, "elapsed_time": "0:57:25", "remaining_time": "2:01:05"} +{"current_steps": 360, "total_steps": 1088, "loss": 0.1128, "lr": 8.475471320492728e-06, "epoch": 0.33088235294117646, "percentage": 33.09, "elapsed_time": "0:59:02", "remaining_time": "1:59:24"} +{"current_steps": 370, "total_steps": 1088, "loss": 0.1044, "lr": 8.35835200927887e-06, "epoch": 0.3400735294117647, "percentage": 34.01, "elapsed_time": "1:00:39", "remaining_time": "1:57:42"} +{"current_steps": 380, "total_steps": 1088, "loss": 0.0983, "lr": 8.237774711444575e-06, "epoch": 0.3492647058823529, "percentage": 34.93, "elapsed_time": "1:02:15", "remaining_time": "1:56:00"} +{"current_steps": 390, "total_steps": 1088, "loss": 0.0908, "lr": 8.113863581539905e-06, "epoch": 0.3584558823529412, "percentage": 35.85, "elapsed_time": "1:03:52", "remaining_time": "1:54:18"} +{"current_steps": 400, "total_steps": 1088, "loss": 0.0939, "lr": 7.986746206854143e-06, "epoch": 0.36764705882352944, "percentage": 36.76, "elapsed_time": "1:05:28", "remaining_time": "1:52:37"} +{"current_steps": 410, "total_steps": 1088, "loss": 0.0967, "lr": 7.856553476043294e-06, "epoch": 0.37683823529411764, "percentage": 37.68, "elapsed_time": "1:07:07", "remaining_time": "1:50:59"} +{"current_steps": 420, "total_steps": 1088, "loss": 0.0953, "lr": 7.723419444358261e-06, "epoch": 0.3860294117647059, "percentage": 38.6, "elapsed_time": "1:08:43", "remaining_time": "1:49:18"} +{"current_steps": 430, "total_steps": 1088, "loss": 0.0928, "lr": 7.5874811956124805e-06, "epoch": 0.3952205882352941, "percentage": 39.52, "elapsed_time": "1:10:20", "remaining_time": "1:47:37"} +{"current_steps": 440, "total_steps": 1088, "loss": 0.0938, "lr": 7.4488787010311425e-06, "epoch": 0.40441176470588236, "percentage": 40.44, "elapsed_time": "1:11:56", "remaining_time": "1:45:57"} +{"current_steps": 450, "total_steps": 1088, "loss": 0.0925, "lr": 7.3077546751273494e-06, "epoch": 0.41360294117647056, "percentage": 41.36, "elapsed_time": "1:13:33", "remaining_time": "1:44:16"} +{"current_steps": 460, "total_steps": 1088, "loss": 0.0953, "lr": 7.164254428753581e-06, "epoch": 0.4227941176470588, "percentage": 42.28, "elapsed_time": "1:15:09", "remaining_time": "1:42:36"} +{"current_steps": 470, "total_steps": 1088, "loss": 0.0913, "lr": 7.018525719479805e-06, "epoch": 0.4319852941176471, "percentage": 43.2, "elapsed_time": "1:16:45", "remaining_time": "1:40:56"} +{"current_steps": 480, "total_steps": 1088, "loss": 0.0974, "lr": 6.870718599452279e-06, "epoch": 0.4411764705882353, "percentage": 44.12, "elapsed_time": "1:18:22", "remaining_time": "1:39:16"} +{"current_steps": 490, "total_steps": 1088, "loss": 0.0901, "lr": 6.7209852608897005e-06, "epoch": 0.45036764705882354, "percentage": 45.04, "elapsed_time": "1:19:58", "remaining_time": "1:37:36"} +{"current_steps": 500, "total_steps": 1088, "loss": 0.0936, "lr": 6.569479879375795e-06, "epoch": 0.45955882352941174, "percentage": 45.96, "elapsed_time": "1:21:35", "remaining_time": "1:35:56"} +{"current_steps": 510, "total_steps": 1088, "loss": 0.0933, "lr": 6.416358455109695e-06, "epoch": 0.46875, "percentage": 46.88, "elapsed_time": "1:24:06", "remaining_time": "1:35:19"} +{"current_steps": 520, "total_steps": 1088, "loss": 0.093, "lr": 6.261778652277565e-06, "epoch": 0.47794117647058826, "percentage": 47.79, "elapsed_time": "1:25:43", "remaining_time": "1:33:38"} +{"current_steps": 530, "total_steps": 1088, "loss": 0.0879, "lr": 6.105899636710895e-06, "epoch": 0.48713235294117646, "percentage": 48.71, "elapsed_time": "1:27:19", "remaining_time": "1:31:56"} +{"current_steps": 540, "total_steps": 1088, "loss": 0.0904, "lr": 5.948881911998572e-06, "epoch": 0.4963235294117647, "percentage": 49.63, "elapsed_time": "1:28:57", "remaining_time": "1:30:16"} +{"current_steps": 550, "total_steps": 1088, "loss": 0.0883, "lr": 5.790887154221521e-06, "epoch": 0.5055147058823529, "percentage": 50.55, "elapsed_time": "1:30:33", "remaining_time": "1:28:35"} +{"current_steps": 560, "total_steps": 1088, "loss": 0.0883, "lr": 5.632078045480065e-06, "epoch": 0.5147058823529411, "percentage": 51.47, "elapsed_time": "1:32:10", "remaining_time": "1:26:54"} +{"current_steps": 570, "total_steps": 1088, "loss": 0.0814, "lr": 5.472618106385415e-06, "epoch": 0.5238970588235294, "percentage": 52.39, "elapsed_time": "1:33:46", "remaining_time": "1:25:13"} +{"current_steps": 580, "total_steps": 1088, "loss": 0.0884, "lr": 5.31267152768779e-06, "epoch": 0.5330882352941176, "percentage": 53.31, "elapsed_time": "1:35:23", "remaining_time": "1:23:32"} +{"current_steps": 590, "total_steps": 1088, "loss": 0.0882, "lr": 5.152403001214483e-06, "epoch": 0.5422794117647058, "percentage": 54.23, "elapsed_time": "1:36:59", "remaining_time": "1:21:52"} +{"current_steps": 600, "total_steps": 1088, "loss": 0.0873, "lr": 4.991977550292028e-06, "epoch": 0.5514705882352942, "percentage": 55.15, "elapsed_time": "1:38:36", "remaining_time": "1:20:12"} +{"current_steps": 610, "total_steps": 1088, "loss": 0.086, "lr": 4.831560359826985e-06, "epoch": 0.5606617647058824, "percentage": 56.07, "elapsed_time": "1:40:12", "remaining_time": "1:18:31"} +{"current_steps": 620, "total_steps": 1088, "loss": 0.0905, "lr": 4.671316606220394e-06, "epoch": 0.5698529411764706, "percentage": 56.99, "elapsed_time": "1:41:49", "remaining_time": "1:16:51"} +{"current_steps": 630, "total_steps": 1088, "loss": 0.0881, "lr": 4.511411287290964e-06, "epoch": 0.5790441176470589, "percentage": 57.9, "elapsed_time": "1:43:25", "remaining_time": "1:15:11"} +{"current_steps": 640, "total_steps": 1088, "loss": 0.088, "lr": 4.35200905238214e-06, "epoch": 0.5882352941176471, "percentage": 58.82, "elapsed_time": "1:45:02", "remaining_time": "1:13:31"} +{"current_steps": 650, "total_steps": 1088, "loss": 0.0873, "lr": 4.193274032828e-06, "epoch": 0.5974264705882353, "percentage": 59.74, "elapsed_time": "1:46:38", "remaining_time": "1:11:51"} +{"current_steps": 660, "total_steps": 1088, "loss": 0.0878, "lr": 4.035369672952516e-06, "epoch": 0.6066176470588235, "percentage": 60.66, "elapsed_time": "1:48:15", "remaining_time": "1:10:12"} +{"current_steps": 670, "total_steps": 1088, "loss": 0.0849, "lr": 3.8784585617762084e-06, "epoch": 0.6158088235294118, "percentage": 61.58, "elapsed_time": "1:49:51", "remaining_time": "1:08:32"} +{"current_steps": 680, "total_steps": 1088, "loss": 0.0867, "lr": 3.7227022656034873e-06, "epoch": 0.625, "percentage": 62.5, "elapsed_time": "1:51:28", "remaining_time": "1:06:52"} +{"current_steps": 690, "total_steps": 1088, "loss": 0.0896, "lr": 3.568261161663042e-06, "epoch": 0.6341911764705882, "percentage": 63.42, "elapsed_time": "1:53:05", "remaining_time": "1:05:14"} +{"current_steps": 700, "total_steps": 1088, "loss": 0.0893, "lr": 3.4152942729725896e-06, "epoch": 0.6433823529411765, "percentage": 64.34, "elapsed_time": "1:54:42", "remaining_time": "1:03:35"} +{"current_steps": 710, "total_steps": 1088, "loss": 0.0861, "lr": 3.263959104598009e-06, "epoch": 0.6525735294117647, "percentage": 65.26, "elapsed_time": "1:56:19", "remaining_time": "1:01:55"} +{"current_steps": 720, "total_steps": 1088, "loss": 0.0776, "lr": 3.114411481475455e-06, "epoch": 0.6617647058823529, "percentage": 66.18, "elapsed_time": "1:57:55", "remaining_time": "1:00:16"} +{"current_steps": 730, "total_steps": 1088, "loss": 0.084, "lr": 2.966805387963463e-06, "epoch": 0.6709558823529411, "percentage": 67.1, "elapsed_time": "1:59:32", "remaining_time": "0:58:37"} +{"current_steps": 740, "total_steps": 1088, "loss": 0.0834, "lr": 2.821292809290217e-06, "epoch": 0.6801470588235294, "percentage": 68.01, "elapsed_time": "2:01:08", "remaining_time": "0:56:58"} +{"current_steps": 750, "total_steps": 1088, "loss": 0.083, "lr": 2.678023575059274e-06, "epoch": 0.6893382352941176, "percentage": 68.93, "elapsed_time": "2:02:45", "remaining_time": "0:55:19"} +{"current_steps": 760, "total_steps": 1088, "loss": 0.0842, "lr": 2.5371452049748603e-06, "epoch": 0.6985294117647058, "percentage": 69.85, "elapsed_time": "2:04:21", "remaining_time": "0:53:40"} +{"current_steps": 770, "total_steps": 1088, "loss": 0.0874, "lr": 2.3988027569455895e-06, "epoch": 0.7077205882352942, "percentage": 70.77, "elapsed_time": "2:05:58", "remaining_time": "0:52:01"} +{"current_steps": 780, "total_steps": 1088, "loss": 0.0815, "lr": 2.2631386777230248e-06, "epoch": 0.7169117647058824, "percentage": 71.69, "elapsed_time": "2:07:34", "remaining_time": "0:50:22"} +{"current_steps": 790, "total_steps": 1088, "loss": 0.0814, "lr": 2.130292656228856e-06, "epoch": 0.7261029411764706, "percentage": 72.61, "elapsed_time": "2:09:11", "remaining_time": "0:48:43"} +{"current_steps": 800, "total_steps": 1088, "loss": 0.0826, "lr": 2.0004014797217207e-06, "epoch": 0.7352941176470589, "percentage": 73.53, "elapsed_time": "2:10:47", "remaining_time": "0:47:05"} +{"current_steps": 810, "total_steps": 1088, "loss": 0.0851, "lr": 1.873598892951795e-06, "epoch": 0.7444852941176471, "percentage": 74.45, "elapsed_time": "2:12:23", "remaining_time": "0:45:26"} +{"current_steps": 820, "total_steps": 1088, "loss": 0.0804, "lr": 1.7500154604481312e-06, "epoch": 0.7536764705882353, "percentage": 75.37, "elapsed_time": "2:14:00", "remaining_time": "0:43:47"} +{"current_steps": 830, "total_steps": 1088, "loss": 0.084, "lr": 1.629778432080586e-06, "epoch": 0.7628676470588235, "percentage": 76.29, "elapsed_time": "2:15:37", "remaining_time": "0:42:09"} +{"current_steps": 840, "total_steps": 1088, "loss": 0.0776, "lr": 1.513011612034726e-06, "epoch": 0.7720588235294118, "percentage": 77.21, "elapsed_time": "2:17:13", "remaining_time": "0:40:30"} +{"current_steps": 850, "total_steps": 1088, "loss": 0.0797, "lr": 1.3998352313346768e-06, "epoch": 0.78125, "percentage": 78.12, "elapsed_time": "2:18:50", "remaining_time": "0:38:52"} +{"current_steps": 860, "total_steps": 1088, "loss": 0.0792, "lr": 1.2903658240450989e-06, "epoch": 0.7904411764705882, "percentage": 79.04, "elapsed_time": "2:20:26", "remaining_time": "0:37:14"} +{"current_steps": 870, "total_steps": 1088, "loss": 0.0804, "lr": 1.184716107279837e-06, "epoch": 0.7996323529411765, "percentage": 79.96, "elapsed_time": "2:22:03", "remaining_time": "0:35:35"} +{"current_steps": 880, "total_steps": 1088, "loss": 0.0817, "lr": 1.0829948651407374e-06, "epoch": 0.8088235294117647, "percentage": 80.88, "elapsed_time": "2:23:39", "remaining_time": "0:33:57"} +{"current_steps": 890, "total_steps": 1088, "loss": 0.0798, "lr": 9.85306836706184e-07, "epoch": 0.8180147058823529, "percentage": 81.8, "elapsed_time": "2:25:16", "remaining_time": "0:32:19"} +{"current_steps": 900, "total_steps": 1088, "loss": 0.0772, "lr": 8.917526081846411e-07, "epoch": 0.8272058823529411, "percentage": 82.72, "elapsed_time": "2:26:53", "remaining_time": "0:30:40"} +{"current_steps": 910, "total_steps": 1088, "loss": 0.0796, "lr": 8.024285093442874e-07, "epoch": 0.8363970588235294, "percentage": 83.64, "elapsed_time": "2:28:30", "remaining_time": "0:29:02"} +{"current_steps": 920, "total_steps": 1088, "loss": 0.0754, "lr": 7.17426514325359e-07, "epoch": 0.8455882352941176, "percentage": 84.56, "elapsed_time": "2:30:06", "remaining_time": "0:27:24"} +{"current_steps": 930, "total_steps": 1088, "loss": 0.0816, "lr": 6.36834146937354e-07, "epoch": 0.8547794117647058, "percentage": 85.48, "elapsed_time": "2:31:43", "remaining_time": "0:25:46"} +{"current_steps": 940, "total_steps": 1088, "loss": 0.077, "lr": 5.607343905385898e-07, "epoch": 0.8639705882352942, "percentage": 86.4, "elapsed_time": "2:33:19", "remaining_time": "0:24:08"} +{"current_steps": 950, "total_steps": 1088, "loss": 0.0788, "lr": 4.892056025909148e-07, "epoch": 0.8731617647058824, "percentage": 87.32, "elapsed_time": "2:34:56", "remaining_time": "0:22:30"} +{"current_steps": 960, "total_steps": 1088, "loss": 0.0761, "lr": 4.2232143397756607e-07, "epoch": 0.8823529411764706, "percentage": 88.24, "elapsed_time": "2:36:32", "remaining_time": "0:20:52"} +{"current_steps": 970, "total_steps": 1088, "loss": 0.0762, "lr": 3.6015075316722605e-07, "epoch": 0.8915441176470589, "percentage": 89.15, "elapsed_time": "2:38:09", "remaining_time": "0:19:14"} +{"current_steps": 980, "total_steps": 1088, "loss": 0.0827, "lr": 3.02757575302392e-07, "epoch": 0.9007352941176471, "percentage": 90.07, "elapsed_time": "2:39:45", "remaining_time": "0:17:36"} +{"current_steps": 990, "total_steps": 1088, "loss": 0.0786, "lr": 2.5020099628504603e-07, "epoch": 0.9099264705882353, "percentage": 90.99, "elapsed_time": "2:41:22", "remaining_time": "0:15:58"} +{"current_steps": 1000, "total_steps": 1088, "loss": 0.0713, "lr": 2.0253513192751374e-07, "epoch": 0.9191176470588235, "percentage": 91.91, "elapsed_time": "2:42:58", "remaining_time": "0:14:20"} +{"current_steps": 1010, "total_steps": 1088, "loss": 0.0823, "lr": 1.5980906223115933e-07, "epoch": 0.9283088235294118, "percentage": 92.83, "elapsed_time": "2:45:14", "remaining_time": "0:12:45"} +{"current_steps": 1020, "total_steps": 1088, "loss": 0.0797, "lr": 1.220667808502951e-07, "epoch": 0.9375, "percentage": 93.75, "elapsed_time": "2:46:51", "remaining_time": "0:11:07"} +{"current_steps": 1030, "total_steps": 1088, "loss": 0.0813, "lr": 8.934714979333403e-08, "epoch": 0.9466911764705882, "percentage": 94.67, "elapsed_time": "2:48:27", "remaining_time": "0:09:29"} +{"current_steps": 1040, "total_steps": 1088, "loss": 0.0745, "lr": 6.168385940783727e-08, "epoch": 0.9558823529411765, "percentage": 95.59, "elapsed_time": "2:50:04", "remaining_time": "0:07:50"} +{"current_steps": 1050, "total_steps": 1088, "loss": 0.0794, "lr": 3.910539369064603e-08, "epoch": 0.9650735294117647, "percentage": 96.51, "elapsed_time": "2:51:40", "remaining_time": "0:06:12"} +{"current_steps": 1060, "total_steps": 1088, "loss": 0.0788, "lr": 2.1635000958836748e-08, "epoch": 0.9742647058823529, "percentage": 97.43, "elapsed_time": "2:53:16", "remaining_time": "0:04:34"} +{"current_steps": 1070, "total_steps": 1088, "loss": 0.0792, "lr": 9.290669911672934e-09, "epoch": 0.9834558823529411, "percentage": 98.35, "elapsed_time": "2:54:53", "remaining_time": "0:02:56"} +{"current_steps": 1080, "total_steps": 1088, "loss": 0.0762, "lr": 2.085111108227067e-09, "epoch": 0.9926470588235294, "percentage": 99.26, "elapsed_time": "2:56:29", "remaining_time": "0:01:18"} +{"current_steps": 1088, "total_steps": 1088, "epoch": 1.0, "percentage": 100.0, "elapsed_time": "2:58:23", "remaining_time": "0:00:00"} diff --git a/trainer_state.json b/trainer_state.json new file mode 100644 index 0000000..c0b1d60 --- /dev/null +++ b/trainer_state.json @@ -0,0 +1,799 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 1.0, + "eval_steps": 500, + "global_step": 1088, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.009191176470588236, + "grad_norm": 11.266937255859375, + "learning_rate": 8.256880733944956e-07, + "loss": 0.578, + "step": 10 + }, + { + "epoch": 0.01838235294117647, + "grad_norm": 4.71966552734375, + "learning_rate": 1.743119266055046e-06, + "loss": 0.4698, + "step": 20 + }, + { + "epoch": 0.027573529411764705, + "grad_norm": 2.152991771697998, + "learning_rate": 2.6605504587155968e-06, + "loss": 0.254, + "step": 30 + }, + { + "epoch": 0.03676470588235294, + "grad_norm": 1.5794391632080078, + "learning_rate": 3.5779816513761473e-06, + "loss": 0.1899, + "step": 40 + }, + { + "epoch": 0.04595588235294118, + "grad_norm": 2.2544736862182617, + "learning_rate": 4.4954128440366975e-06, + "loss": 0.1712, + "step": 50 + }, + { + "epoch": 0.05514705882352941, + "grad_norm": 1.6429979801177979, + "learning_rate": 5.412844036697248e-06, + "loss": 0.1554, + "step": 60 + }, + { + "epoch": 0.06433823529411764, + "grad_norm": 1.428564190864563, + "learning_rate": 6.330275229357799e-06, + "loss": 0.1509, + "step": 70 + }, + { + "epoch": 0.07352941176470588, + "grad_norm": 1.1606788635253906, + "learning_rate": 7.247706422018349e-06, + "loss": 0.144, + "step": 80 + }, + { + "epoch": 0.08272058823529412, + "grad_norm": 1.4222913980484009, + "learning_rate": 8.1651376146789e-06, + "loss": 0.1377, + "step": 90 + }, + { + "epoch": 0.09191176470588236, + "grad_norm": 1.059951663017273, + "learning_rate": 9.08256880733945e-06, + "loss": 0.128, + "step": 100 + }, + { + "epoch": 0.10110294117647059, + "grad_norm": 1.0378493070602417, + "learning_rate": 1e-05, + "loss": 0.1237, + "step": 110 + }, + { + "epoch": 0.11029411764705882, + "grad_norm": 1.9791302680969238, + "learning_rate": 9.99742583072674e-06, + "loss": 0.1317, + "step": 120 + }, + { + "epoch": 0.11948529411764706, + "grad_norm": 1.28194260597229, + "learning_rate": 9.98970597344593e-06, + "loss": 0.1258, + "step": 130 + }, + { + "epoch": 0.12867647058823528, + "grad_norm": 1.228013515472412, + "learning_rate": 9.976848377045343e-06, + "loss": 0.1259, + "step": 140 + }, + { + "epoch": 0.13786764705882354, + "grad_norm": 0.9104840755462646, + "learning_rate": 9.958866280576803e-06, + "loss": 0.119, + "step": 150 + }, + { + "epoch": 0.14705882352941177, + "grad_norm": 0.957346498966217, + "learning_rate": 9.935778199624394e-06, + "loss": 0.1193, + "step": 160 + }, + { + "epoch": 0.15625, + "grad_norm": 1.0997158288955688, + "learning_rate": 9.90760790723954e-06, + "loss": 0.1159, + "step": 170 + }, + { + "epoch": 0.16544117647058823, + "grad_norm": 1.1610311269760132, + "learning_rate": 9.874384409462673e-06, + "loss": 0.1198, + "step": 180 + }, + { + "epoch": 0.17463235294117646, + "grad_norm": 0.9462277889251709, + "learning_rate": 9.836141915456646e-06, + "loss": 0.1109, + "step": 190 + }, + { + "epoch": 0.18382352941176472, + "grad_norm": 0.793471097946167, + "learning_rate": 9.792919802282656e-06, + "loss": 0.1158, + "step": 200 + }, + { + "epoch": 0.19301470588235295, + "grad_norm": 1.2811907529830933, + "learning_rate": 9.744762574354967e-06, + "loss": 0.1148, + "step": 210 + }, + { + "epoch": 0.20220588235294118, + "grad_norm": 0.8056549429893494, + "learning_rate": 9.691719817616148e-06, + "loss": 0.1054, + "step": 220 + }, + { + "epoch": 0.2113970588235294, + "grad_norm": 0.9176375269889832, + "learning_rate": 9.633846148480024e-06, + "loss": 0.1097, + "step": 230 + }, + { + "epoch": 0.22058823529411764, + "grad_norm": 0.8464528918266296, + "learning_rate": 9.571201157594925e-06, + "loss": 0.1055, + "step": 240 + }, + { + "epoch": 0.22977941176470587, + "grad_norm": 0.8142296671867371, + "learning_rate": 9.503849348485112e-06, + "loss": 0.1046, + "step": 250 + }, + { + "epoch": 0.23897058823529413, + "grad_norm": 0.8536105751991272, + "learning_rate": 9.431860071133592e-06, + "loss": 0.0973, + "step": 260 + }, + { + "epoch": 0.24816176470588236, + "grad_norm": 1.1437208652496338, + "learning_rate": 9.355307450574666e-06, + "loss": 0.1032, + "step": 270 + }, + { + "epoch": 0.25735294117647056, + "grad_norm": 1.4142391681671143, + "learning_rate": 9.27427031056979e-06, + "loss": 0.1057, + "step": 280 + }, + { + "epoch": 0.2665441176470588, + "grad_norm": 0.7978008389472961, + "learning_rate": 9.188832092445281e-06, + "loss": 0.105, + "step": 290 + }, + { + "epoch": 0.2757352941176471, + "grad_norm": 1.3588639497756958, + "learning_rate": 9.09908076917548e-06, + "loss": 0.1048, + "step": 300 + }, + { + "epoch": 0.2849264705882353, + "grad_norm": 0.6924364566802979, + "learning_rate": 9.00510875479983e-06, + "loss": 0.1001, + "step": 310 + }, + { + "epoch": 0.29411764705882354, + "grad_norm": 0.7341586947441101, + "learning_rate": 8.907012809267107e-06, + "loss": 0.1045, + "step": 320 + }, + { + "epoch": 0.30330882352941174, + "grad_norm": 0.703032910823822, + "learning_rate": 8.804893938804839e-06, + "loss": 0.1029, + "step": 330 + }, + { + "epoch": 0.3125, + "grad_norm": 0.8211079239845276, + "learning_rate": 8.698857291916456e-06, + "loss": 0.1, + "step": 340 + }, + { + "epoch": 0.32169117647058826, + "grad_norm": 0.7473457455635071, + "learning_rate": 8.58901205111326e-06, + "loss": 0.0923, + "step": 350 + }, + { + "epoch": 0.33088235294117646, + "grad_norm": 1.1483430862426758, + "learning_rate": 8.475471320492728e-06, + "loss": 0.1128, + "step": 360 + }, + { + "epoch": 0.3400735294117647, + "grad_norm": 0.8971941471099854, + "learning_rate": 8.35835200927887e-06, + "loss": 0.1044, + "step": 370 + }, + { + "epoch": 0.3492647058823529, + "grad_norm": 0.7846776247024536, + "learning_rate": 8.237774711444575e-06, + "loss": 0.0983, + "step": 380 + }, + { + "epoch": 0.3584558823529412, + "grad_norm": 0.7909674644470215, + "learning_rate": 8.113863581539905e-06, + "loss": 0.0908, + "step": 390 + }, + { + "epoch": 0.36764705882352944, + "grad_norm": 0.8026263117790222, + "learning_rate": 7.986746206854143e-06, + "loss": 0.0939, + "step": 400 + }, + { + "epoch": 0.37683823529411764, + "grad_norm": 0.6986457705497742, + "learning_rate": 7.856553476043294e-06, + "loss": 0.0967, + "step": 410 + }, + { + "epoch": 0.3860294117647059, + "grad_norm": 0.7079197764396667, + "learning_rate": 7.723419444358261e-06, + "loss": 0.0953, + "step": 420 + }, + { + "epoch": 0.3952205882352941, + "grad_norm": 1.0171252489089966, + "learning_rate": 7.5874811956124805e-06, + "loss": 0.0928, + "step": 430 + }, + { + "epoch": 0.40441176470588236, + "grad_norm": 0.8423835635185242, + "learning_rate": 7.4488787010311425e-06, + "loss": 0.0938, + "step": 440 + }, + { + "epoch": 0.41360294117647056, + "grad_norm": 1.0884760618209839, + "learning_rate": 7.3077546751273494e-06, + "loss": 0.0925, + "step": 450 + }, + { + "epoch": 0.4227941176470588, + "grad_norm": 0.7641892433166504, + "learning_rate": 7.164254428753581e-06, + "loss": 0.0953, + "step": 460 + }, + { + "epoch": 0.4319852941176471, + "grad_norm": 0.8474388122558594, + "learning_rate": 7.018525719479805e-06, + "loss": 0.0913, + "step": 470 + }, + { + "epoch": 0.4411764705882353, + "grad_norm": 0.8151172399520874, + "learning_rate": 6.870718599452279e-06, + "loss": 0.0974, + "step": 480 + }, + { + "epoch": 0.45036764705882354, + "grad_norm": 0.5582044124603271, + "learning_rate": 6.7209852608897005e-06, + "loss": 0.0901, + "step": 490 + }, + { + "epoch": 0.45955882352941174, + "grad_norm": 0.9465530514717102, + "learning_rate": 6.569479879375795e-06, + "loss": 0.0936, + "step": 500 + }, + { + "epoch": 0.46875, + "grad_norm": 0.7654430270195007, + "learning_rate": 6.416358455109695e-06, + "loss": 0.0933, + "step": 510 + }, + { + "epoch": 0.47794117647058826, + "grad_norm": 0.7761212587356567, + "learning_rate": 6.261778652277565e-06, + "loss": 0.093, + "step": 520 + }, + { + "epoch": 0.48713235294117646, + "grad_norm": 0.5924690365791321, + "learning_rate": 6.105899636710895e-06, + "loss": 0.0879, + "step": 530 + }, + { + "epoch": 0.4963235294117647, + "grad_norm": 0.591857373714447, + "learning_rate": 5.948881911998572e-06, + "loss": 0.0904, + "step": 540 + }, + { + "epoch": 0.5055147058823529, + "grad_norm": 0.7316629886627197, + "learning_rate": 5.790887154221521e-06, + "loss": 0.0883, + "step": 550 + }, + { + "epoch": 0.5147058823529411, + "grad_norm": 0.8223432302474976, + "learning_rate": 5.632078045480065e-06, + "loss": 0.0883, + "step": 560 + }, + { + "epoch": 0.5238970588235294, + "grad_norm": 0.7148771286010742, + "learning_rate": 5.472618106385415e-06, + "loss": 0.0814, + "step": 570 + }, + { + "epoch": 0.5330882352941176, + "grad_norm": 0.6548059582710266, + "learning_rate": 5.31267152768779e-06, + "loss": 0.0884, + "step": 580 + }, + { + "epoch": 0.5422794117647058, + "grad_norm": 0.7036318778991699, + "learning_rate": 5.152403001214483e-06, + "loss": 0.0882, + "step": 590 + }, + { + "epoch": 0.5514705882352942, + "grad_norm": 0.8155568242073059, + "learning_rate": 4.991977550292028e-06, + "loss": 0.0873, + "step": 600 + }, + { + "epoch": 0.5606617647058824, + "grad_norm": 0.654234766960144, + "learning_rate": 4.831560359826985e-06, + "loss": 0.086, + "step": 610 + }, + { + "epoch": 0.5698529411764706, + "grad_norm": 0.8868378400802612, + "learning_rate": 4.671316606220394e-06, + "loss": 0.0905, + "step": 620 + }, + { + "epoch": 0.5790441176470589, + "grad_norm": 0.5300816297531128, + "learning_rate": 4.511411287290964e-06, + "loss": 0.0881, + "step": 630 + }, + { + "epoch": 0.5882352941176471, + "grad_norm": 0.9155387282371521, + "learning_rate": 4.35200905238214e-06, + "loss": 0.088, + "step": 640 + }, + { + "epoch": 0.5974264705882353, + "grad_norm": 0.7309725880622864, + "learning_rate": 4.193274032828e-06, + "loss": 0.0873, + "step": 650 + }, + { + "epoch": 0.6066176470588235, + "grad_norm": 0.5966044664382935, + "learning_rate": 4.035369672952516e-06, + "loss": 0.0878, + "step": 660 + }, + { + "epoch": 0.6158088235294118, + "grad_norm": 0.924511194229126, + "learning_rate": 3.8784585617762084e-06, + "loss": 0.0849, + "step": 670 + }, + { + "epoch": 0.625, + "grad_norm": 1.1838688850402832, + "learning_rate": 3.7227022656034873e-06, + "loss": 0.0867, + "step": 680 + }, + { + "epoch": 0.6341911764705882, + "grad_norm": 0.7663735747337341, + "learning_rate": 3.568261161663042e-06, + "loss": 0.0896, + "step": 690 + }, + { + "epoch": 0.6433823529411765, + "grad_norm": 0.7709589600563049, + "learning_rate": 3.4152942729725896e-06, + "loss": 0.0893, + "step": 700 + }, + { + "epoch": 0.6525735294117647, + "grad_norm": 0.7545693516731262, + "learning_rate": 3.263959104598009e-06, + "loss": 0.0861, + "step": 710 + }, + { + "epoch": 0.6617647058823529, + "grad_norm": 0.6413795948028564, + "learning_rate": 3.114411481475455e-06, + "loss": 0.0776, + "step": 720 + }, + { + "epoch": 0.6709558823529411, + "grad_norm": 0.7467756271362305, + "learning_rate": 2.966805387963463e-06, + "loss": 0.084, + "step": 730 + }, + { + "epoch": 0.6801470588235294, + "grad_norm": 0.6672917008399963, + "learning_rate": 2.821292809290217e-06, + "loss": 0.0834, + "step": 740 + }, + { + "epoch": 0.6893382352941176, + "grad_norm": 0.7798132300376892, + "learning_rate": 2.678023575059274e-06, + "loss": 0.083, + "step": 750 + }, + { + "epoch": 0.6985294117647058, + "grad_norm": 0.729383647441864, + "learning_rate": 2.5371452049748603e-06, + "loss": 0.0842, + "step": 760 + }, + { + "epoch": 0.7077205882352942, + "grad_norm": 0.8981873989105225, + "learning_rate": 2.3988027569455895e-06, + "loss": 0.0874, + "step": 770 + }, + { + "epoch": 0.7169117647058824, + "grad_norm": 0.7742981314659119, + "learning_rate": 2.2631386777230248e-06, + "loss": 0.0815, + "step": 780 + }, + { + "epoch": 0.7261029411764706, + "grad_norm": 0.6819539070129395, + "learning_rate": 2.130292656228856e-06, + "loss": 0.0814, + "step": 790 + }, + { + "epoch": 0.7352941176470589, + "grad_norm": 0.7190724611282349, + "learning_rate": 2.0004014797217207e-06, + "loss": 0.0826, + "step": 800 + }, + { + "epoch": 0.7444852941176471, + "grad_norm": 0.674633800983429, + "learning_rate": 1.873598892951795e-06, + "loss": 0.0851, + "step": 810 + }, + { + "epoch": 0.7536764705882353, + "grad_norm": 0.7690839767456055, + "learning_rate": 1.7500154604481312e-06, + "loss": 0.0804, + "step": 820 + }, + { + "epoch": 0.7628676470588235, + "grad_norm": 0.6933445930480957, + "learning_rate": 1.629778432080586e-06, + "loss": 0.084, + "step": 830 + }, + { + "epoch": 0.7720588235294118, + "grad_norm": 0.944789707660675, + "learning_rate": 1.513011612034726e-06, + "loss": 0.0776, + "step": 840 + }, + { + "epoch": 0.78125, + "grad_norm": 0.645876944065094, + "learning_rate": 1.3998352313346768e-06, + "loss": 0.0797, + "step": 850 + }, + { + "epoch": 0.7904411764705882, + "grad_norm": 0.72840416431427, + "learning_rate": 1.2903658240450989e-06, + "loss": 0.0792, + "step": 860 + }, + { + "epoch": 0.7996323529411765, + "grad_norm": 0.8772525191307068, + "learning_rate": 1.184716107279837e-06, + "loss": 0.0804, + "step": 870 + }, + { + "epoch": 0.8088235294117647, + "grad_norm": 0.8419274091720581, + "learning_rate": 1.0829948651407374e-06, + "loss": 0.0817, + "step": 880 + }, + { + "epoch": 0.8180147058823529, + "grad_norm": 0.6790868043899536, + "learning_rate": 9.85306836706184e-07, + "loss": 0.0798, + "step": 890 + }, + { + "epoch": 0.8272058823529411, + "grad_norm": 0.7450520992279053, + "learning_rate": 8.917526081846411e-07, + "loss": 0.0772, + "step": 900 + }, + { + "epoch": 0.8363970588235294, + "grad_norm": 0.9222913980484009, + "learning_rate": 8.024285093442874e-07, + "loss": 0.0796, + "step": 910 + }, + { + "epoch": 0.8455882352941176, + "grad_norm": 0.5793068408966064, + "learning_rate": 7.17426514325359e-07, + "loss": 0.0754, + "step": 920 + }, + { + "epoch": 0.8547794117647058, + "grad_norm": 0.8552721738815308, + "learning_rate": 6.36834146937354e-07, + "loss": 0.0816, + "step": 930 + }, + { + "epoch": 0.8639705882352942, + "grad_norm": 0.6834543943405151, + "learning_rate": 5.607343905385898e-07, + "loss": 0.077, + "step": 940 + }, + { + "epoch": 0.8731617647058824, + "grad_norm": 0.8566415309906006, + "learning_rate": 4.892056025909148e-07, + "loss": 0.0788, + "step": 950 + }, + { + "epoch": 0.8823529411764706, + "grad_norm": 0.6394233107566833, + "learning_rate": 4.2232143397756607e-07, + "loss": 0.0761, + "step": 960 + }, + { + "epoch": 0.8915441176470589, + "grad_norm": 0.6848531365394592, + "learning_rate": 3.6015075316722605e-07, + "loss": 0.0762, + "step": 970 + }, + { + "epoch": 0.9007352941176471, + "grad_norm": 0.7035109996795654, + "learning_rate": 3.02757575302392e-07, + "loss": 0.0827, + "step": 980 + }, + { + "epoch": 0.9099264705882353, + "grad_norm": 1.1025725603103638, + "learning_rate": 2.5020099628504603e-07, + "loss": 0.0786, + "step": 990 + }, + { + "epoch": 0.9191176470588235, + "grad_norm": 0.9067655801773071, + "learning_rate": 2.0253513192751374e-07, + "loss": 0.0713, + "step": 1000 + }, + { + "epoch": 0.9283088235294118, + "grad_norm": 2.0745654106140137, + "learning_rate": 1.5980906223115933e-07, + "loss": 0.0823, + "step": 1010 + }, + { + "epoch": 0.9375, + "grad_norm": 0.77626633644104, + "learning_rate": 1.220667808502951e-07, + "loss": 0.0797, + "step": 1020 + }, + { + "epoch": 0.9466911764705882, + "grad_norm": 0.6542201042175293, + "learning_rate": 8.934714979333403e-08, + "loss": 0.0813, + "step": 1030 + }, + { + "epoch": 0.9558823529411765, + "grad_norm": 0.7697402238845825, + "learning_rate": 6.168385940783727e-08, + "loss": 0.0745, + "step": 1040 + }, + { + "epoch": 0.9650735294117647, + "grad_norm": 1.0100551843643188, + "learning_rate": 3.910539369064603e-08, + "loss": 0.0794, + "step": 1050 + }, + { + "epoch": 0.9742647058823529, + "grad_norm": 0.5885272026062012, + "learning_rate": 2.1635000958836748e-08, + "loss": 0.0788, + "step": 1060 + }, + { + "epoch": 0.9834558823529411, + "grad_norm": 0.6646335124969482, + "learning_rate": 9.290669911672934e-09, + "loss": 0.0792, + "step": 1070 + }, + { + "epoch": 0.9926470588235294, + "grad_norm": 0.5683983564376831, + "learning_rate": 2.085111108227067e-09, + "loss": 0.0762, + "step": 1080 + }, + { + "epoch": 1.0, + "step": 1088, + "total_flos": 1.9483786471612088e+18, + "train_loss": 0.10519510776023655, + "train_runtime": 10705.8335, + "train_samples_per_second": 3.25, + "train_steps_per_second": 0.102 + } + ], + "logging_steps": 10, + "max_steps": 1088, + "num_input_tokens_seen": 0, + "num_train_epochs": 1, + "save_steps": 500, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 1.9483786471612088e+18, + "train_batch_size": 4, + "trial_name": null, + "trial_params": null +} diff --git a/training_args.bin b/training_args.bin new file mode 100644 index 0000000..316ba0e --- /dev/null +++ b/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:816f02fc6899b6403a33b78c123fbc889abfbe709e82cc155e969b284d634abd +size 5688 diff --git a/training_loss.png b/training_loss.png new file mode 100644 index 0000000..52a7891 Binary files /dev/null and b/training_loss.png differ diff --git a/video_preprocessor_config.json b/video_preprocessor_config.json new file mode 100644 index 0000000..b64d80b --- /dev/null +++ b/video_preprocessor_config.json @@ -0,0 +1,86 @@ +{ + "_valid_kwargs_names": [ + "do_convert_rgb", + "do_resize", + "size", + "size_divisor", + "default_to_square", + "resample", + "do_rescale", + "rescale_factor", + "do_normalize", + "image_mean", + "image_std", + "do_pad", + "do_center_crop", + "crop_size", + "data_format", + "input_data_format", + "device", + "min_pixels", + "max_pixels", + "patch_size", + "temporal_patch_size", + "merge_size" + ], + "crop_size": null, + "data_format": "channels_first", + "default_to_square": true, + "device": null, + "do_center_crop": null, + "do_convert_rgb": true, + "do_normalize": true, + "do_pad": null, + "do_rescale": true, + "do_resize": true, + "image_mean": [ + 0.48145466, + 0.4578275, + 0.40821073 + ], + "image_processor_type": "Qwen2VLImageProcessor", + "image_std": [ + 0.26862954, + 0.26130258, + 0.27577711 + ], + "input_data_format": null, + "max_pixels": 12845056, + "merge_size": 2, + "min_pixels": 3136, + "model_valid_processing_keys": [ + "do_convert_rgb", + "do_resize", + "size", + "size_divisor", + "default_to_square", + "resample", + "do_rescale", + "rescale_factor", + "do_normalize", + "image_mean", + "image_std", + "do_pad", + "do_center_crop", + "crop_size", + "data_format", + "input_data_format", + "device", + "min_pixels", + "max_pixels", + "patch_size", + "temporal_patch_size", + "merge_size" + ], + "patch_size": 14, + "processor_class": "Qwen2_5_VLProcessor", + "resample": 3, + "rescale_factor": 0.00392156862745098, + "size": { + "longest_edge": 12845056, + "shortest_edge": 3136 + }, + "size_divisor": null, + "temporal_patch_size": 2, + "video_processor_type": "Qwen2VLVideoProcessor" +} diff --git a/vocab.json b/vocab.json new file mode 100644 index 0000000..6c49fc6 --- /dev/null +++ b/vocab.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ca10d7e9fb3ed18575dd1e277a2579c16d108e32f27439684afa0e10b1440910 +size 2776833