commit 0fd06ce448866d820fb2dce71aa5ac156ebaefea Author: ModelHub XC Date: Sat Apr 18 14:08:36 2026 +0800 初始化项目,由ModelHub XC社区提供模型 Model: kth8/Llama-3.3-8B-Instruct-SuperGPQA-Classifier Source: Original Platform diff --git a/.gitattributes b/.gitattributes new file mode 100644 index 0000000..52373fe --- /dev/null +++ b/.gitattributes @@ -0,0 +1,36 @@ +*.7z filter=lfs diff=lfs merge=lfs -text +*.arrow filter=lfs diff=lfs merge=lfs -text +*.bin filter=lfs diff=lfs merge=lfs -text +*.bz2 filter=lfs diff=lfs merge=lfs -text +*.ckpt filter=lfs diff=lfs merge=lfs -text +*.ftz filter=lfs diff=lfs merge=lfs -text +*.gz filter=lfs diff=lfs merge=lfs -text +*.h5 filter=lfs diff=lfs merge=lfs -text +*.joblib filter=lfs diff=lfs merge=lfs -text +*.lfs.* filter=lfs diff=lfs merge=lfs -text +*.mlmodel filter=lfs diff=lfs merge=lfs -text +*.model filter=lfs diff=lfs merge=lfs -text +*.msgpack filter=lfs diff=lfs merge=lfs -text +*.npy filter=lfs diff=lfs merge=lfs -text +*.npz filter=lfs diff=lfs merge=lfs -text +*.onnx filter=lfs diff=lfs merge=lfs -text +*.ot filter=lfs diff=lfs merge=lfs -text +*.parquet filter=lfs diff=lfs merge=lfs -text +*.pb filter=lfs diff=lfs merge=lfs -text +*.pickle filter=lfs diff=lfs merge=lfs -text +*.pkl filter=lfs diff=lfs merge=lfs -text +*.pt filter=lfs diff=lfs merge=lfs -text +*.pth filter=lfs diff=lfs merge=lfs -text +*.rar filter=lfs diff=lfs merge=lfs -text +*.safetensors filter=lfs diff=lfs merge=lfs -text +saved_model/**/* filter=lfs diff=lfs merge=lfs -text +*.tar.* filter=lfs diff=lfs merge=lfs -text +*.tar filter=lfs diff=lfs merge=lfs -text +*.tflite filter=lfs diff=lfs merge=lfs -text +*.tgz filter=lfs diff=lfs merge=lfs -text +*.wasm filter=lfs diff=lfs merge=lfs -text +*.xz filter=lfs diff=lfs merge=lfs -text +*.zip filter=lfs diff=lfs merge=lfs -text +*.zst filter=lfs diff=lfs merge=lfs -text +*tfevents* filter=lfs diff=lfs merge=lfs -text +tokenizer.json filter=lfs diff=lfs merge=lfs -text diff --git a/README.md b/README.md new 
file mode 100644 index 0000000..6c3785e --- /dev/null +++ b/README.md @@ -0,0 +1,98 @@ +--- +license: llama3.2 +language: +- en +base_model: allura-forge/Llama-3.3-8B-Instruct +datasets: +- m-a-p/SuperGPQA +pipeline_tag: text-generation +library_name: transformers +tags: +- sft +- trl +- unsloth +- llama +- llama3 +- llama3.3 +--- +![logo](https://i.imgur.com/iuUzm7L.jpeg) +A fine-tune of [allura-forge/Llama-3.3-8B-Instruct](https://huggingface.co/allura-forge/Llama-3.3-8B-Instruct) on the [m-a-p/SuperGPQA](https://huggingface.co/datasets/m-a-p/SuperGPQA) dataset. + +## Usage example +Set the temperature to 0.0 for best results. + +**System prompt** +``` +You are a classifier. Categorize the following problem into discipline, field, and subfield in JSON format. +``` +**User prompt** +``` +Cotton and linen both readily catch fire. A batch of towels is composed of both cotton and linen, and is known to have caught fire. If it is known that the towels were ignited by a lit cigarette, which of the following arguments utilizes the most appropriate form of reasoning? 
+``` +**Assistant response** +``` +{"discipline": "Philosophy", "field": "Philosophy", "subfield": "Logic"} +``` +# Possible output options +Discipline +``` +['Medicine', 'Literature and Arts', 'History', 'Science', 'Philosophy', 'Law', 'Engineering', 'Management', 'Agronomy', 'Economics', 'Military Science', 'Sociology', 'Education'] +``` +Field +``` +['Animal Husbandry', 'Political Science', 'Civil Engineering', 'Materials Science and Engineering', 'Weapon Science and Technology', 'History', 'Stomatology', 'Agricultural Engineering', 'Mechanical Engineering', 'Astronomy', 'Nuclear Science and Technology', 'Language and Literature', 'Forestry Engineering', 'Geology', 'Basic Medicine', 'Crop Science', 'Electronic Science and Technology', 'Military Science', 'Petroleum and Natural Gas Engineering', 'Metallurgical Engineering', 'Management Science and Engineering', 'Library, Information and Archival Management', 'Clinical Medicine', 'Art Studies', 'Food Science and Engineering', 'Systems Science', 'Aquaculture', 'Business Administration', 'Computer Science and Technology', 'Electrical Engineering', 'Forestry', 'Textile Science and Engineering', 'Physical Education', 'Oceanography', 'Musicology', 'Traditional Chinese Medicine', 'Mining Engineering', 'Psychology', 'Law', 'Control Science and Engineering', 'Chemistry', 'Hydraulic Engineering', 'Public Administration', 'Chemical Engineering and Technology', 'Geography', 'Optical Engineering', 'Applied Economics', 'Architecture', 'Power Engineering and Engineering Thermophysics', 'Education', 'Journalism and Communication', 'Aeronautical and Astronautical Science and Technology', 'Veterinary Medicine', 'Geophysics', 'Instrument Science and Technology', 'Mathematics', 'Information and Communication Engineering', 'Physical Oceanography', 'Theoretical Economics', 'Mechanics', 'Philosophy', 'Geological Resources and Geological Engineering', 'Physics', 'Pharmacy', 'Environmental Science and Engineering', 'Transportation 
Engineering', 'Biology', 'Naval Architecture and Ocean Engineering', 'Atmospheric Science', 'Sociology', 'Public Health and Preventive Medicine', 'Surveying and Mapping Science and Technology'] +``` +Subfield +``` +['Political Science', 'Social Medicine and Health Management', 'Preschool Education', 'Geriatric Medicine', 'Civil and Commercial Law', 'Biophysics', 'Rigid Body Mechanics', 'Cartography and Geographic Information Engineering', 'Anesthesiology', 'Stellar and Interstellar Evolution', 'Chemical Transport Engineering', 'Structural Geology', 'Contract Law', 'Obstetrics and Gynecology', 'Pathology and Pathophysiology', 'Harmony', 'Aquaculture', 'Pharmaceutics', 'Vehicle Operation Engineering', 'Circuits and Systems', 'Solid State Physics', 'Theoretical Fluid Mechanics', 'Mineral Processing Engineering', 'Functions of Real Variables', 'Signal and Information Processing', 'Pathogen Biology', 'Computer Networks', 'Optical Fiber Communication', 'Genetics', 'Architectural History', 'Oil and Gas Field Development and Storage & Transportation Engineering', 'Tourism Management and Technological Economics Management', 'Drama and Opera Studies', 'Polynomials and Series Expansions', 'Cryptography', 'Polymer Chemistry and Physics', 'Principles of Seismic Exploration', 'Fuzzy Mathematics', 'Physiology', 'Pitch and Scales', 'Heat Transfer', 'Operating Systems', 'Fluid Physics', 'Microelectronics and Solid-State Electronics', 'Non-ferrous Metallurgy', 'Environmental Science', 'Power Electronics and Electrical Drives', 'Communication and Information Systems', 'Oncology', 'Military Thought and History', 'Procedural Law', 'Group Theory', 'Fine Arts', 'Transportation Planning and Management', 'Physical Chemistry', 'Physical Oceanography', 'Sports Science and Medicine', 'Animal Nutrition and Feed Science', 'Urban Planning and Design', 'Space physics', 'Electrical Theory and New Technologies', 'Economic History', 'Geotechnical Engineering', 'Ecology', 'Theory of Curriculum and 
Instruction', 'Radiation Medicine', 'Information Management Science', 'Functions of Complex Variables', 'Computer Software and Theory', 'Nursing and Rehabilitation Medicine', 'Wood Science and Technology', 'Mass Transport and Separation Process in Chemical Engineering', 'Religious Studies', 'Mineralogy, Petrology, and Economic Geology', 'Thermodynamics and Statistical Physics', 'Structural Engineering', 'Demography and Anthropology', 'Philology and Bibliography', 'Databases', 'Textile Materials Science', 'Textile Chemistry and Dyeing Engineering', 'Physical Chemistry of Metallurgical Process', 'Ethics', 'Internal Combustion Engineering', 'Design Arts', 'Refrigeration and Cryogenic Engineering', 'Mechatronic Engineering', 'Dermatology and Venereology', 'Economic Statistics', 'Applied Optics', 'Systems Science', 'Particle and Nuclear Physics', 'Information Management and Communication', 'French Language and Literature', 'Labor Economics', 'Medicinal Chemistry', 'Literary Theory', 'Microbiology', 'Physical Education and Training', 'Internal Medicine', 'Computer Architecture', 'Operations Research and Cybernetics', 'Dynamic Meteorology', 'Industrial Economics', 'Literary History', 'Marine Engineering', 'Optoelectronic Technology', 'Combinatorial Mathematics', 'Theoretical Optics', 'Materials Processing Engineering', 'Nutrition and Food Hygiene', 'Theoretical Mechanics', 'Graph Theory', 'Quantum Mechanics', 'Materials Physics and Chemistry', 'Marine Biology', 'Forest Cultivation and Genetic Breeding', 'National and Defense Economics', 'Poromechanics and Reservoir Physics', 'Road and Railway Engineering', 'Aeronautical and Astronautical Science and Technology', 'Data Structures', 'Historical Geography', 'Analytical Chemistry', 'Military Law', 'Pharmaceutical Analysis', 'Polymer Physics', 'Atmospheric Physics and Atmospheric Environment', 'Communication Principles', 'Underwater Acoustics', 'Journalism and News Practice', 'Water conservancy and Hydropower Engineering', 
'Inorganic Chemistry', 'Animal Rearing and Breeding', 'Educational Technology and Principles', 'High Voltage and Insulation Technology', 'Advanced Algebra', 'Food Biochemistry', 'Philosophy of Science and Technology', 'Logic', 'Film Studies', 'Military Command and Information Systems', 'Fundamentals of Dynamics and Control', 'Neurology', 'Cosmology', 'Forest Engineering', 'Ophthalmology', 'Agricultural Environment and Soil-Water Engineering', 'Crop Science', 'Human Anatomy and Histology-Embryology', 'Probability and Statistics', 'Communication and Broadcasting', 'Maternal, Child and Adolescent Health', 'Thermodynamics', 'Surgery', 'Architectural Design and Theory', 'Western Economics', 'Ordinary Differential Equations', 'Management Science and Engineering', 'Military Logistics and Equipment', 'Discrete Mathematics', 'Mathematical Analysis', 'Astrophysics', 'Linguistics and Applied Linguistics', 'Quantitative Economics', 'Cell Biology', 'Urban Infrastructure Engineering', 'Pattern Recognition', 'Forensic Medicine', 'Antenna and Radio Communication', 'Constitutional and Administrative Law', 'Laser Technology', 'Traditional Chinese Medicine Theory', 'Biochemistry and Molecular Biology', 'Mining and Safety Engineering', 'Geometry and Topology', 'Dance Studies', 'Semiconductor Physics', 'Computational Mathematics', 'Fluid Machinery and Engineering', 'Philosophical Aesthetics', 'Engineering Fluid Mechanics', 'Stochastic Processes', 'Psychology', 'Traditional Chinese Pharmacy', 'Environmental and Resource Protection', 'Physical Geography', 'Archaeology and Museology', 'Power Systems and Automation', 'Music History, Education, and Technology', 'Pediatrics', 'Modern and Contemporary Chinese Literature', 'Geodesy and Surveying Engineering', 'Zoology', 'Military Management', 'Iron and Steel Metallurgy', 'Epidemiology and Health Statistics', 'Weapon Systems Science and Engineering', 'Military Chemistry and Pyrotechnics', 'World History', 'Meteorology', 'Finance', 
'Electrodynamics', 'Organic Chemistry', 'History and Theory of Journalism and Media Management', 'Solar System Science', 'Geochemistry', 'Veterinary Medicine', 'Geological Resources and Geological Engineering', 'Psychiatry and Mental Health', 'Manufacturing Automation', 'Traditional Chinese Health Preservation', 'Business and Accounting Management', 'Atomic and Molecular Physics', 'Education Economics, Management and Social Security', 'Hydraulics and Hydrology', 'Otorhinolaryngology', 'Number Theory', 'Principles of Metallurgy', 'Radiation Protection and Nuclear Technology Applications', 'Principles of Computer Organization', 'Special Education', 'Digital Surveying and Remote Sensing Applications', 'Marine Chemistry', 'Composition', 'Traffic Information Engineering and Control', 'Emergency Medicine', 'Fluid Flow and Heat Transfer in Chemical Engineering', 'Criminal Law', 'Classical Chinese Literature', 'Numerical Analysis', 'Food Processing and Storage Engineering', 'Electrochemistry', 'Russian Language and Literature', 'Special Number Theory', 'Political Economy', 'Imaging and Nuclear Medicine', 'Fundamental Mathematics', 'Pharmacology', 'Astronomical Observation and Technology', 'Human Geography', 'Subatomic and Atomic Physics', 'Bridge and Tunnel Engineering', 'Acoustics', 'Social and Folklore Studies', 'Radiochemistry', 'Musical Forms and Analysis', 'Control Theory and Control Engineering', 'Engineering Thermophysics', 'Health Toxicology and Environmental Health', 'Legal Theory and Legal History', 'Relativity', 'Nuclear Energy and Reactor Technology', 'Guidance, Navigation and Control', 'Library and Archival Science', 'Sports Humanities and Sociology', 'Agricultural Mechanization Engineering', 'Formal Languages', 'Thermal Energy Engineering', 'Law and Social Governance', 'International Trade', 'Electromagnetic Field and Microwave Technology', 'Instrumentation and Performance', 'Broadcasting and Television Art', 'Solid Mechanics', 'Environmental Engineering', 
'Advanced Programming Languages', 'Solid Earth Geophysics', 'Statistical Mechanics', 'Hydrogeology', 'Land Resource Management and Administrative Management', 'Power Machinery and Engineering', 'Paleontology and Stratigraphy', 'Landscape Plants and Ornamental Horticulture', 'Instrument Science and Technology', 'Immunology', 'Clinical Stomatology', 'Clinical Laboratory Diagnostics', 'Botany', 'Microbiology and Biochemical Pharmacy', 'Ship Mechanics and Design Principles', 'International Law', 'Elements of Chemical Reaction Engineering', 'Basic Stomatology', 'Public Finance'] +``` +## Model Details +- Base Model: `allura-forge/Llama-3.3-8B-Instruct` +- Parameter Count: 8,030,261,248 +- Precision: torch.bfloat16 + +## Hardware +- GPU: NVIDIA RTX PRO 6000 Blackwell Server Edition +- Announced: Mar 17th, 2025 +- Release Date: Mar 18th, 2025 +- Memory Type: GDDR7 +- Bandwidth: 1.79 TB/s +- Memory Size: 96 GB +- Memory Bus: 512 bit +- Shading Units: 24064 +- TDP: 600W + +## Training Settings +### PEFT +- Rank: 32 +- LoRA alpha: 64 +- Modules: q_proj, k_proj, v_proj, o_proj, gate_proj, up_proj, down_proj +- Gradient checkpointing: unsloth + +### SFT +- Epoch: 2 +- Batch size: 32 +- Gradient Accumulation steps: 1 +- Warmup ratio: 0.05 +- Learning rate: 0.0002 +- Optimizer: adamw_torch_fused +- Learning rate scheduler: cosine + +## Training stats +- Date: 2026-03-26T03:53:29.234881 +- Peak VRAM usage: 32.135 GB +- Global step: 1576 +- Training runtime (seconds): 2681.8444 +- Average training loss: 0.06838441643920647 +- Final validation loss: 0.0504293330013752 + +## Framework versions +- Unsloth: 2026.3.15 +- TRL: 0.22.2 +- Transformers: 4.56.2 +- Pytorch: 2.10.0+cu128 +- Datasets: 4.8.4 +- Tokenizers: 0.22.2 + +## License +This model is released under the Llama3 license. See the [Terms of Use](https://www.llama.com/llama3/license/) for details. 
\ No newline at end of file diff --git a/chat_template.jinja b/chat_template.jinja new file mode 100644 index 0000000..39bd0c9 --- /dev/null +++ b/chat_template.jinja @@ -0,0 +1,5 @@ +{% set loop_messages = messages %}{% for message in loop_messages %}{% set content = '<|start_header_id|>' + message['role'] + '<|end_header_id|> + +'+ message['content'] | trim + '<|eot_id|>' %}{% if loop.index0 == 0 %}{% set content = bos_token + content %}{% endif %}{{ content }}{% endfor %}{% if add_generation_prompt %}{{ '<|start_header_id|>assistant<|end_header_id|> + +' }}{% endif %} \ No newline at end of file diff --git a/config.json b/config.json new file mode 100644 index 0000000..5d9fb1e --- /dev/null +++ b/config.json @@ -0,0 +1,29 @@ +{ + "architectures": [ + "LlamaForCausalLM" + ], + "attention_bias": false, + "attention_dropout": 0.0, + "bos_token_id": 128000, + "dtype": "bfloat16", + "eos_token_id": 128009, + "head_dim": 128, + "hidden_act": "silu", + "hidden_size": 4096, + "initializer_range": 0.02, + "intermediate_size": 14336, + "max_position_embeddings": 8192, + "mlp_bias": false, + "model_type": "llama", + "num_attention_heads": 32, + "num_hidden_layers": 32, + "num_key_value_heads": 8, + "pretraining_tp": 1, + "rms_norm_eps": 1e-05, + "rope_scaling": null, + "rope_theta": 500000.0, + "tie_word_embeddings": false, + "transformers_version": "4.56.2", + "use_cache": true, + "vocab_size": 128256 +} diff --git a/generation_config.json b/generation_config.json new file mode 100644 index 0000000..416142d --- /dev/null +++ b/generation_config.json @@ -0,0 +1,12 @@ +{ + "bos_token_id": 128000, + "do_sample": true, + "eos_token_id": [ + 128001, + 128009 + ], + "max_length": 4096, + "temperature": 0.6, + "top_p": 0.9, + "transformers_version": "4.56.2" +} diff --git a/model-00001-of-00004.safetensors b/model-00001-of-00004.safetensors new file mode 100644 index 0000000..11b4c36 --- /dev/null +++ b/model-00001-of-00004.safetensors @@ -0,0 +1,3 @@ +version 
https://git-lfs.github.com/spec/v1 +oid sha256:60d42e32f551f12ce79b67b7d6a3959ae9367ab673298ecddd32778e8b8b8f75 +size 4976698672 diff --git a/model-00002-of-00004.safetensors b/model-00002-of-00004.safetensors new file mode 100644 index 0000000..81e069f --- /dev/null +++ b/model-00002-of-00004.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7e86540e301b30ce4d7315cbc99b835852b14405901816fcc96b8c60e3426aab +size 4999802720 diff --git a/model-00003-of-00004.safetensors b/model-00003-of-00004.safetensors new file mode 100644 index 0000000..85c233d --- /dev/null +++ b/model-00003-of-00004.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0cc9b6bc2d9e7c7f3e8f19292c857ea68b600326ca602945a63505764bae0fd7 +size 4915916176 diff --git a/model-00004-of-00004.safetensors b/model-00004-of-00004.safetensors new file mode 100644 index 0000000..71a491e --- /dev/null +++ b/model-00004-of-00004.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ce3f749a1e7d3a08afe65b6f7beeffa960918d97b4e54449ae5af1ac77a71b49 +size 1168138808 diff --git a/model.safetensors.index.json b/model.safetensors.index.json new file mode 100644 index 0000000..5c64f1e --- /dev/null +++ b/model.safetensors.index.json @@ -0,0 +1,299 @@ +{ + "metadata": { + "total_parameters": 8030261248, + "total_size": 16060522496 + }, + "weight_map": { + "lm_head.weight": "model-00004-of-00004.safetensors", + "model.embed_tokens.weight": "model-00001-of-00004.safetensors", + "model.layers.0.input_layernorm.weight": "model-00001-of-00004.safetensors", + "model.layers.0.mlp.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.0.mlp.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.0.mlp.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.0.post_attention_layernorm.weight": "model-00001-of-00004.safetensors", + "model.layers.0.self_attn.k_proj.weight": 
"model-00001-of-00004.safetensors", + "model.layers.0.self_attn.o_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.0.self_attn.q_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.0.self_attn.v_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.1.input_layernorm.weight": "model-00001-of-00004.safetensors", + "model.layers.1.mlp.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.1.mlp.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.1.mlp.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.1.post_attention_layernorm.weight": "model-00001-of-00004.safetensors", + "model.layers.1.self_attn.k_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.1.self_attn.o_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.1.self_attn.q_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.1.self_attn.v_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.10.input_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.10.mlp.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.10.mlp.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.10.mlp.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.10.post_attention_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.10.self_attn.k_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.10.self_attn.o_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.10.self_attn.q_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.10.self_attn.v_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.11.input_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.11.mlp.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.11.mlp.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.11.mlp.up_proj.weight": 
"model-00002-of-00004.safetensors", + "model.layers.11.post_attention_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.11.self_attn.k_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.11.self_attn.o_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.11.self_attn.q_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.11.self_attn.v_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.12.input_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.12.mlp.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.12.mlp.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.12.mlp.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.12.post_attention_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.12.self_attn.k_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.12.self_attn.o_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.12.self_attn.q_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.12.self_attn.v_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.13.input_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.13.mlp.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.13.mlp.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.13.mlp.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.13.post_attention_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.13.self_attn.k_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.13.self_attn.o_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.13.self_attn.q_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.13.self_attn.v_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.14.input_layernorm.weight": "model-00002-of-00004.safetensors", + 
"model.layers.14.mlp.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.14.mlp.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.14.mlp.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.14.post_attention_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.14.self_attn.k_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.14.self_attn.o_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.14.self_attn.q_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.14.self_attn.v_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.15.input_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.15.mlp.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.15.mlp.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.15.mlp.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.15.post_attention_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.15.self_attn.k_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.15.self_attn.o_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.15.self_attn.q_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.15.self_attn.v_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.16.input_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.16.mlp.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.16.mlp.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.16.mlp.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.16.post_attention_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.16.self_attn.k_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.16.self_attn.o_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.16.self_attn.q_proj.weight": 
"model-00002-of-00004.safetensors", + "model.layers.16.self_attn.v_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.17.input_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.17.mlp.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.17.mlp.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.17.mlp.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.17.post_attention_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.17.self_attn.k_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.17.self_attn.o_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.17.self_attn.q_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.17.self_attn.v_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.18.input_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.18.mlp.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.18.mlp.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.18.mlp.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.18.post_attention_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.18.self_attn.k_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.18.self_attn.o_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.18.self_attn.q_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.18.self_attn.v_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.19.input_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.19.mlp.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.19.mlp.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.19.mlp.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.19.post_attention_layernorm.weight": "model-00002-of-00004.safetensors", + 
"model.layers.19.self_attn.k_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.19.self_attn.o_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.19.self_attn.q_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.19.self_attn.v_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.2.input_layernorm.weight": "model-00001-of-00004.safetensors", + "model.layers.2.mlp.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.2.mlp.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.2.mlp.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.2.post_attention_layernorm.weight": "model-00001-of-00004.safetensors", + "model.layers.2.self_attn.k_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.2.self_attn.o_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.2.self_attn.q_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.2.self_attn.v_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.20.input_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.20.mlp.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.20.mlp.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.20.mlp.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.20.post_attention_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.20.self_attn.k_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.20.self_attn.o_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.20.self_attn.q_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.20.self_attn.v_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.21.input_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.21.mlp.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.21.mlp.gate_proj.weight": "model-00003-of-00004.safetensors", 
+ "model.layers.21.mlp.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.21.post_attention_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.21.self_attn.k_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.21.self_attn.o_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.21.self_attn.q_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.21.self_attn.v_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.22.input_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.22.mlp.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.22.mlp.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.22.mlp.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.22.post_attention_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.22.self_attn.k_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.22.self_attn.o_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.22.self_attn.q_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.22.self_attn.v_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.23.input_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.23.mlp.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.23.mlp.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.23.mlp.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.23.post_attention_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.23.self_attn.k_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.23.self_attn.o_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.23.self_attn.q_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.23.self_attn.v_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.24.input_layernorm.weight": 
"model-00003-of-00004.safetensors", + "model.layers.24.mlp.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.24.mlp.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.24.mlp.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.24.post_attention_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.24.self_attn.k_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.24.self_attn.o_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.24.self_attn.q_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.24.self_attn.v_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.25.input_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.25.mlp.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.25.mlp.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.25.mlp.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.25.post_attention_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.25.self_attn.k_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.25.self_attn.o_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.25.self_attn.q_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.25.self_attn.v_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.26.input_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.26.mlp.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.26.mlp.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.26.mlp.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.26.post_attention_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.26.self_attn.k_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.26.self_attn.o_proj.weight": "model-00003-of-00004.safetensors", + 
"model.layers.26.self_attn.q_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.26.self_attn.v_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.27.input_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.27.mlp.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.27.mlp.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.27.mlp.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.27.post_attention_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.27.self_attn.k_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.27.self_attn.o_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.27.self_attn.q_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.27.self_attn.v_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.28.input_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.28.mlp.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.28.mlp.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.28.mlp.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.28.post_attention_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.28.self_attn.k_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.28.self_attn.o_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.28.self_attn.q_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.28.self_attn.v_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.29.input_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.29.mlp.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.29.mlp.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.29.mlp.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.29.post_attention_layernorm.weight": 
"model-00003-of-00004.safetensors", + "model.layers.29.self_attn.k_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.29.self_attn.o_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.29.self_attn.q_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.29.self_attn.v_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.3.input_layernorm.weight": "model-00001-of-00004.safetensors", + "model.layers.3.mlp.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.3.mlp.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.3.mlp.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.3.post_attention_layernorm.weight": "model-00001-of-00004.safetensors", + "model.layers.3.self_attn.k_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.3.self_attn.o_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.3.self_attn.q_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.3.self_attn.v_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.30.input_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.30.mlp.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.30.mlp.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.30.mlp.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.30.post_attention_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.30.self_attn.k_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.30.self_attn.o_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.30.self_attn.q_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.30.self_attn.v_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.31.input_layernorm.weight": "model-00004-of-00004.safetensors", + "model.layers.31.mlp.down_proj.weight": "model-00004-of-00004.safetensors", + 
"model.layers.31.mlp.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.31.mlp.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.31.post_attention_layernorm.weight": "model-00004-of-00004.safetensors", + "model.layers.31.self_attn.k_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.31.self_attn.o_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.31.self_attn.q_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.31.self_attn.v_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.4.input_layernorm.weight": "model-00001-of-00004.safetensors", + "model.layers.4.mlp.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.4.mlp.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.4.mlp.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.4.post_attention_layernorm.weight": "model-00001-of-00004.safetensors", + "model.layers.4.self_attn.k_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.4.self_attn.o_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.4.self_attn.q_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.4.self_attn.v_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.5.input_layernorm.weight": "model-00001-of-00004.safetensors", + "model.layers.5.mlp.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.5.mlp.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.5.mlp.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.5.post_attention_layernorm.weight": "model-00001-of-00004.safetensors", + "model.layers.5.self_attn.k_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.5.self_attn.o_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.5.self_attn.q_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.5.self_attn.v_proj.weight": "model-00001-of-00004.safetensors", + 
"model.layers.6.input_layernorm.weight": "model-00001-of-00004.safetensors", + "model.layers.6.mlp.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.6.mlp.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.6.mlp.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.6.post_attention_layernorm.weight": "model-00001-of-00004.safetensors", + "model.layers.6.self_attn.k_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.6.self_attn.o_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.6.self_attn.q_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.6.self_attn.v_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.7.input_layernorm.weight": "model-00001-of-00004.safetensors", + "model.layers.7.mlp.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.7.mlp.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.7.mlp.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.7.post_attention_layernorm.weight": "model-00001-of-00004.safetensors", + "model.layers.7.self_attn.k_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.7.self_attn.o_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.7.self_attn.q_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.7.self_attn.v_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.8.input_layernorm.weight": "model-00001-of-00004.safetensors", + "model.layers.8.mlp.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.8.mlp.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.8.mlp.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.8.post_attention_layernorm.weight": "model-00001-of-00004.safetensors", + "model.layers.8.self_attn.k_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.8.self_attn.o_proj.weight": "model-00001-of-00004.safetensors", + 
"model.layers.8.self_attn.q_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.8.self_attn.v_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.9.input_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.9.mlp.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.9.mlp.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.9.mlp.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.9.post_attention_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.9.self_attn.k_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.9.self_attn.o_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.9.self_attn.q_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.9.self_attn.v_proj.weight": "model-00002-of-00004.safetensors", + "model.norm.weight": "model-00004-of-00004.safetensors" + } +} diff --git a/special_tokens_map.json b/special_tokens_map.json new file mode 100644 index 0000000..348af68 --- /dev/null +++ b/special_tokens_map.json @@ -0,0 +1,17 @@ +{ + "bos_token": { + "content": "<|begin_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "<|eot_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "pad_token": "<|reserved_special_token_250|>" +} diff --git a/tokenizer.json b/tokenizer.json new file mode 100644 index 0000000..86a3394 --- /dev/null +++ b/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3c5cf44023714fb39b05e71e425f8d7b92805ff73f7988b083b8c87f0bf87393 +size 17209961 diff --git a/tokenizer_config.json b/tokenizer_config.json new file mode 100644 index 0000000..ad03d76 --- /dev/null +++ b/tokenizer_config.json @@ -0,0 +1,2064 @@ +{ + "added_tokens_decoder": { + "128000": { + "content": "<|begin_of_text|>", + "lstrip": false, + "normalized": false, + 
"rstrip": false, + "single_word": false, + "special": true + }, + "128001": { + "content": "<|end_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128002": { + "content": "<|reserved_special_token_0|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128003": { + "content": "<|reserved_special_token_1|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128004": { + "content": "<|reserved_special_token_2|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128005": { + "content": "<|reserved_special_token_3|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128006": { + "content": "<|start_header_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128007": { + "content": "<|end_header_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128008": { + "content": "<|reserved_special_token_4|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128009": { + "content": "<|eot_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128010": { + "content": "<|reserved_special_token_5|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128011": { + "content": "<|reserved_special_token_6|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128012": { + "content": "<|reserved_special_token_7|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + 
"special": true + }, + "128013": { + "content": "<|reserved_special_token_8|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128014": { + "content": "<|reserved_special_token_9|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128015": { + "content": "<|reserved_special_token_10|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128016": { + "content": "<|reserved_special_token_11|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128017": { + "content": "<|reserved_special_token_12|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128018": { + "content": "<|reserved_special_token_13|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128019": { + "content": "<|reserved_special_token_14|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128020": { + "content": "<|reserved_special_token_15|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128021": { + "content": "<|reserved_special_token_16|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128022": { + "content": "<|reserved_special_token_17|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128023": { + "content": "<|reserved_special_token_18|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128024": { + "content": "<|reserved_special_token_19|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + 
"single_word": false, + "special": true + }, + "128025": { + "content": "<|reserved_special_token_20|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128026": { + "content": "<|reserved_special_token_21|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128027": { + "content": "<|reserved_special_token_22|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128028": { + "content": "<|reserved_special_token_23|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128029": { + "content": "<|reserved_special_token_24|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128030": { + "content": "<|reserved_special_token_25|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128031": { + "content": "<|reserved_special_token_26|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128032": { + "content": "<|reserved_special_token_27|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128033": { + "content": "<|reserved_special_token_28|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128034": { + "content": "<|reserved_special_token_29|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128035": { + "content": "<|reserved_special_token_30|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128036": { + "content": "<|reserved_special_token_31|>", + "lstrip": false, + "normalized": false, + 
"rstrip": false, + "single_word": false, + "special": true + }, + "128037": { + "content": "<|reserved_special_token_32|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128038": { + "content": "<|reserved_special_token_33|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128039": { + "content": "<|reserved_special_token_34|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128040": { + "content": "<|reserved_special_token_35|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128041": { + "content": "<|reserved_special_token_36|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128042": { + "content": "<|reserved_special_token_37|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128043": { + "content": "<|reserved_special_token_38|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128044": { + "content": "<|reserved_special_token_39|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128045": { + "content": "<|reserved_special_token_40|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128046": { + "content": "<|reserved_special_token_41|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128047": { + "content": "<|reserved_special_token_42|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128048": { + "content": "<|reserved_special_token_43|>", + "lstrip": false, + 
"normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128049": { + "content": "<|reserved_special_token_44|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128050": { + "content": "<|reserved_special_token_45|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128051": { + "content": "<|reserved_special_token_46|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128052": { + "content": "<|reserved_special_token_47|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128053": { + "content": "<|reserved_special_token_48|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128054": { + "content": "<|reserved_special_token_49|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128055": { + "content": "<|reserved_special_token_50|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128056": { + "content": "<|reserved_special_token_51|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128057": { + "content": "<|reserved_special_token_52|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128058": { + "content": "<|reserved_special_token_53|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128059": { + "content": "<|reserved_special_token_54|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128060": { + "content": "<|reserved_special_token_55|>", + 
"lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128061": { + "content": "<|reserved_special_token_56|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128062": { + "content": "<|reserved_special_token_57|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128063": { + "content": "<|reserved_special_token_58|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128064": { + "content": "<|reserved_special_token_59|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128065": { + "content": "<|reserved_special_token_60|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128066": { + "content": "<|reserved_special_token_61|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128067": { + "content": "<|reserved_special_token_62|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128068": { + "content": "<|reserved_special_token_63|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128069": { + "content": "<|reserved_special_token_64|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128070": { + "content": "<|reserved_special_token_65|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128071": { + "content": "<|reserved_special_token_66|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128072": { + "content": 
"<|reserved_special_token_67|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128073": { + "content": "<|reserved_special_token_68|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128074": { + "content": "<|reserved_special_token_69|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128075": { + "content": "<|reserved_special_token_70|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128076": { + "content": "<|reserved_special_token_71|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128077": { + "content": "<|reserved_special_token_72|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128078": { + "content": "<|reserved_special_token_73|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128079": { + "content": "<|reserved_special_token_74|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128080": { + "content": "<|reserved_special_token_75|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128081": { + "content": "<|reserved_special_token_76|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128082": { + "content": "<|reserved_special_token_77|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128083": { + "content": "<|reserved_special_token_78|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + 
"128084": { + "content": "<|reserved_special_token_79|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128085": { + "content": "<|reserved_special_token_80|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128086": { + "content": "<|reserved_special_token_81|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128087": { + "content": "<|reserved_special_token_82|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128088": { + "content": "<|reserved_special_token_83|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128089": { + "content": "<|reserved_special_token_84|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128090": { + "content": "<|reserved_special_token_85|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128091": { + "content": "<|reserved_special_token_86|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128092": { + "content": "<|reserved_special_token_87|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128093": { + "content": "<|reserved_special_token_88|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128094": { + "content": "<|reserved_special_token_89|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128095": { + "content": "<|reserved_special_token_90|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + 
"special": true + }, + "128096": { + "content": "<|reserved_special_token_91|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128097": { + "content": "<|reserved_special_token_92|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128098": { + "content": "<|reserved_special_token_93|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128099": { + "content": "<|reserved_special_token_94|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128100": { + "content": "<|reserved_special_token_95|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128101": { + "content": "<|reserved_special_token_96|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128102": { + "content": "<|reserved_special_token_97|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128103": { + "content": "<|reserved_special_token_98|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128104": { + "content": "<|reserved_special_token_99|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128105": { + "content": "<|reserved_special_token_100|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128106": { + "content": "<|reserved_special_token_101|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128107": { + "content": "<|reserved_special_token_102|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + 
"single_word": false, + "special": true + }, + "128108": { + "content": "<|reserved_special_token_103|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128109": { + "content": "<|reserved_special_token_104|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128110": { + "content": "<|reserved_special_token_105|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128111": { + "content": "<|reserved_special_token_106|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128112": { + "content": "<|reserved_special_token_107|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128113": { + "content": "<|reserved_special_token_108|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128114": { + "content": "<|reserved_special_token_109|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128115": { + "content": "<|reserved_special_token_110|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128116": { + "content": "<|reserved_special_token_111|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128117": { + "content": "<|reserved_special_token_112|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128118": { + "content": "<|reserved_special_token_113|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128119": { + "content": "<|reserved_special_token_114|>", + "lstrip": false, + "normalized": 
false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128120": { + "content": "<|reserved_special_token_115|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128121": { + "content": "<|reserved_special_token_116|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128122": { + "content": "<|reserved_special_token_117|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128123": { + "content": "<|reserved_special_token_118|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128124": { + "content": "<|reserved_special_token_119|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128125": { + "content": "<|reserved_special_token_120|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128126": { + "content": "<|reserved_special_token_121|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128127": { + "content": "<|reserved_special_token_122|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128128": { + "content": "<|reserved_special_token_123|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128129": { + "content": "<|reserved_special_token_124|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128130": { + "content": "<|reserved_special_token_125|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128131": { + "content": "<|reserved_special_token_126|>", + 
"lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128132": { + "content": "<|reserved_special_token_127|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128133": { + "content": "<|reserved_special_token_128|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128134": { + "content": "<|reserved_special_token_129|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128135": { + "content": "<|reserved_special_token_130|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128136": { + "content": "<|reserved_special_token_131|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128137": { + "content": "<|reserved_special_token_132|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128138": { + "content": "<|reserved_special_token_133|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128139": { + "content": "<|reserved_special_token_134|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128140": { + "content": "<|reserved_special_token_135|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128141": { + "content": "<|reserved_special_token_136|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128142": { + "content": "<|reserved_special_token_137|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128143": { + "content": 
"<|reserved_special_token_138|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128144": { + "content": "<|reserved_special_token_139|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128145": { + "content": "<|reserved_special_token_140|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128146": { + "content": "<|reserved_special_token_141|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128147": { + "content": "<|reserved_special_token_142|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128148": { + "content": "<|reserved_special_token_143|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128149": { + "content": "<|reserved_special_token_144|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128150": { + "content": "<|reserved_special_token_145|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128151": { + "content": "<|reserved_special_token_146|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128152": { + "content": "<|reserved_special_token_147|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128153": { + "content": "<|reserved_special_token_148|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128154": { + "content": "<|reserved_special_token_149|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + 
}, + "128155": { + "content": "<|reserved_special_token_150|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128156": { + "content": "<|reserved_special_token_151|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128157": { + "content": "<|reserved_special_token_152|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128158": { + "content": "<|reserved_special_token_153|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128159": { + "content": "<|reserved_special_token_154|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128160": { + "content": "<|reserved_special_token_155|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128161": { + "content": "<|reserved_special_token_156|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128162": { + "content": "<|reserved_special_token_157|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128163": { + "content": "<|reserved_special_token_158|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128164": { + "content": "<|reserved_special_token_159|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128165": { + "content": "<|reserved_special_token_160|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128166": { + "content": "<|reserved_special_token_161|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + 
"single_word": false, + "special": true + }, + "128167": { + "content": "<|reserved_special_token_162|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128168": { + "content": "<|reserved_special_token_163|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128169": { + "content": "<|reserved_special_token_164|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128170": { + "content": "<|reserved_special_token_165|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128171": { + "content": "<|reserved_special_token_166|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128172": { + "content": "<|reserved_special_token_167|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128173": { + "content": "<|reserved_special_token_168|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128174": { + "content": "<|reserved_special_token_169|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128175": { + "content": "<|reserved_special_token_170|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128176": { + "content": "<|reserved_special_token_171|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128177": { + "content": "<|reserved_special_token_172|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128178": { + "content": "<|reserved_special_token_173|>", + "lstrip": false, + "normalized": 
false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128179": { + "content": "<|reserved_special_token_174|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128180": { + "content": "<|reserved_special_token_175|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128181": { + "content": "<|reserved_special_token_176|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128182": { + "content": "<|reserved_special_token_177|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128183": { + "content": "<|reserved_special_token_178|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128184": { + "content": "<|reserved_special_token_179|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128185": { + "content": "<|reserved_special_token_180|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128186": { + "content": "<|reserved_special_token_181|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128187": { + "content": "<|reserved_special_token_182|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128188": { + "content": "<|reserved_special_token_183|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128189": { + "content": "<|reserved_special_token_184|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128190": { + "content": "<|reserved_special_token_185|>", + 
"lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128191": { + "content": "<|reserved_special_token_186|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128192": { + "content": "<|reserved_special_token_187|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128193": { + "content": "<|reserved_special_token_188|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128194": { + "content": "<|reserved_special_token_189|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128195": { + "content": "<|reserved_special_token_190|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128196": { + "content": "<|reserved_special_token_191|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128197": { + "content": "<|reserved_special_token_192|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128198": { + "content": "<|reserved_special_token_193|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128199": { + "content": "<|reserved_special_token_194|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128200": { + "content": "<|reserved_special_token_195|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128201": { + "content": "<|reserved_special_token_196|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128202": { + "content": 
"<|reserved_special_token_197|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128203": { + "content": "<|reserved_special_token_198|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128204": { + "content": "<|reserved_special_token_199|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128205": { + "content": "<|reserved_special_token_200|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128206": { + "content": "<|reserved_special_token_201|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128207": { + "content": "<|reserved_special_token_202|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128208": { + "content": "<|reserved_special_token_203|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128209": { + "content": "<|reserved_special_token_204|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128210": { + "content": "<|reserved_special_token_205|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128211": { + "content": "<|reserved_special_token_206|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128212": { + "content": "<|reserved_special_token_207|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128213": { + "content": "<|reserved_special_token_208|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + 
}, + "128214": { + "content": "<|reserved_special_token_209|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128215": { + "content": "<|reserved_special_token_210|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128216": { + "content": "<|reserved_special_token_211|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128217": { + "content": "<|reserved_special_token_212|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128218": { + "content": "<|reserved_special_token_213|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128219": { + "content": "<|reserved_special_token_214|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128220": { + "content": "<|reserved_special_token_215|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128221": { + "content": "<|reserved_special_token_216|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128222": { + "content": "<|reserved_special_token_217|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128223": { + "content": "<|reserved_special_token_218|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128224": { + "content": "<|reserved_special_token_219|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128225": { + "content": "<|reserved_special_token_220|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + 
"single_word": false, + "special": true + }, + "128226": { + "content": "<|reserved_special_token_221|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128227": { + "content": "<|reserved_special_token_222|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128228": { + "content": "<|reserved_special_token_223|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128229": { + "content": "<|reserved_special_token_224|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128230": { + "content": "<|reserved_special_token_225|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128231": { + "content": "<|reserved_special_token_226|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128232": { + "content": "<|reserved_special_token_227|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128233": { + "content": "<|reserved_special_token_228|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128234": { + "content": "<|reserved_special_token_229|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128235": { + "content": "<|reserved_special_token_230|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128236": { + "content": "<|reserved_special_token_231|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128237": { + "content": "<|reserved_special_token_232|>", + "lstrip": false, + "normalized": 
false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128238": { + "content": "<|reserved_special_token_233|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128239": { + "content": "<|reserved_special_token_234|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128240": { + "content": "<|reserved_special_token_235|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128241": { + "content": "<|reserved_special_token_236|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128242": { + "content": "<|reserved_special_token_237|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128243": { + "content": "<|reserved_special_token_238|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128244": { + "content": "<|reserved_special_token_239|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128245": { + "content": "<|reserved_special_token_240|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128246": { + "content": "<|reserved_special_token_241|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128247": { + "content": "<|reserved_special_token_242|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128248": { + "content": "<|reserved_special_token_243|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128249": { + "content": "<|reserved_special_token_244|>", + 
"lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128250": { + "content": "<|reserved_special_token_245|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128251": { + "content": "<|reserved_special_token_246|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128252": { + "content": "<|reserved_special_token_247|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128253": { + "content": "<|reserved_special_token_248|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128254": { + "content": "<|reserved_special_token_249|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128255": { + "content": "<|reserved_special_token_250|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + } + }, + "bos_token": "<|begin_of_text|>", + "clean_up_tokenization_spaces": true, + "eos_token": "<|eot_id|>", + "extra_special_tokens": {}, + "model_input_names": [ + "input_ids", + "attention_mask" + ], + "model_max_length": 1000000000000000019884624838656, + "pad_token": "<|reserved_special_token_250|>", + "padding_side": "left", + "tokenizer_class": "PreTrainedTokenizerFast" +} diff --git a/train/log.json b/train/log.json new file mode 100644 index 0000000..106ba2e --- /dev/null +++ b/train/log.json @@ -0,0 +1,1190 @@ +[ + { + "loss": 1.8996, + "grad_norm": 5.035277843475342, + "learning_rate": 2.278481012658228e-05, + "epoch": 0.012690355329949238, + "step": 10 + }, + { + "loss": 0.5315, + "grad_norm": 1.102072834968567, + "learning_rate": 4.810126582278481e-05, + "epoch": 0.025380710659898477, + "step": 20 + }, + { + "loss": 0.3353, + "grad_norm": 
0.7798988819122314, + "learning_rate": 7.341772151898734e-05, + "epoch": 0.03807106598984772, + "step": 30 + }, + { + "loss": 0.2226, + "grad_norm": 0.8653473854064941, + "learning_rate": 9.873417721518988e-05, + "epoch": 0.050761421319796954, + "step": 40 + }, + { + "loss": 0.164, + "grad_norm": 0.7569780349731445, + "learning_rate": 0.0001240506329113924, + "epoch": 0.06345177664974619, + "step": 50 + }, + { + "loss": 0.1394, + "grad_norm": 1.0211968421936035, + "learning_rate": 0.00014936708860759494, + "epoch": 0.07614213197969544, + "step": 60 + }, + { + "loss": 0.1201, + "grad_norm": 0.5370887517929077, + "learning_rate": 0.00017468354430379748, + "epoch": 0.08883248730964467, + "step": 70 + }, + { + "loss": 0.122, + "grad_norm": 0.49917498230934143, + "learning_rate": 0.0002, + "epoch": 0.10152284263959391, + "step": 80 + }, + { + "loss": 0.1217, + "grad_norm": 0.4577413499355316, + "learning_rate": 0.0001999779803602204, + "epoch": 0.11421319796954314, + "step": 90 + }, + { + "loss": 0.0965, + "grad_norm": 0.48522070050239563, + "learning_rate": 0.00019991193113817244, + "epoch": 0.12690355329949238, + "step": 100 + }, + { + "loss": 0.11, + "grad_norm": 0.41902250051498413, + "learning_rate": 0.00019980188142145754, + "epoch": 0.13959390862944163, + "step": 110 + }, + { + "loss": 0.0823, + "grad_norm": 0.5561641454696655, + "learning_rate": 0.00019964787967517817, + "epoch": 0.15228426395939088, + "step": 120 + }, + { + "loss": 0.0856, + "grad_norm": 0.3316971957683563, + "learning_rate": 0.00019944999372059388, + "epoch": 0.1649746192893401, + "step": 130 + }, + { + "loss": 0.0849, + "grad_norm": 0.372153639793396, + "learning_rate": 0.00019920831070525342, + "epoch": 0.17766497461928935, + "step": 140 + }, + { + "loss": 0.0929, + "grad_norm": 0.33250877261161804, + "learning_rate": 0.00019892293706461555, + "epoch": 0.19035532994923857, + "step": 150 + }, + { + "eval_loss": 0.08791538327932358, + "eval_runtime": 29.62, + "eval_samples_per_second": 44.801, 
+ "eval_steps_per_second": 11.209, + "epoch": 0.19923857868020303, + "step": 157 + }, + { + "loss": 0.0824, + "grad_norm": 0.4130192995071411, + "learning_rate": 0.00019859399847517567, + "epoch": 0.20304568527918782, + "step": 160 + }, + { + "loss": 0.0902, + "grad_norm": 0.3217241168022156, + "learning_rate": 0.0001982216397991188, + "epoch": 0.21573604060913706, + "step": 170 + }, + { + "loss": 0.0766, + "grad_norm": 0.4728490710258484, + "learning_rate": 0.0001978060250205232, + "epoch": 0.22842639593908629, + "step": 180 + }, + { + "loss": 0.0844, + "grad_norm": 0.5730077028274536, + "learning_rate": 0.0001973473371731431, + "epoch": 0.24111675126903553, + "step": 190 + }, + { + "loss": 0.0841, + "grad_norm": 0.5745298862457275, + "learning_rate": 0.00019684577825980192, + "epoch": 0.25380710659898476, + "step": 200 + }, + { + "loss": 0.0797, + "grad_norm": 0.3141058683395386, + "learning_rate": 0.0001963015691634317, + "epoch": 0.26649746192893403, + "step": 210 + }, + { + "loss": 0.0822, + "grad_norm": 0.3730680048465729, + "learning_rate": 0.00019571494954979775, + "epoch": 0.27918781725888325, + "step": 220 + }, + { + "loss": 0.0677, + "grad_norm": 0.3915182650089264, + "learning_rate": 0.00019508617776195167, + "epoch": 0.2918781725888325, + "step": 230 + }, + { + "loss": 0.08, + "grad_norm": 0.3052193820476532, + "learning_rate": 0.00019441553070645887, + "epoch": 0.30456852791878175, + "step": 240 + }, + { + "loss": 0.0744, + "grad_norm": 0.3673352003097534, + "learning_rate": 0.000193703303731451, + "epoch": 0.31725888324873097, + "step": 250 + }, + { + "loss": 0.0821, + "grad_norm": 0.39443644881248474, + "learning_rate": 0.00019294981049655668, + "epoch": 0.3299492385786802, + "step": 260 + }, + { + "loss": 0.073, + "grad_norm": 0.44178199768066406, + "learning_rate": 0.0001921553828347681, + "epoch": 0.3426395939086294, + "step": 270 + }, + { + "loss": 0.0784, + "grad_norm": 0.4202715754508972, + "learning_rate": 0.00019132037060630409, + "epoch": 
0.3553299492385787, + "step": 280 + }, + { + "loss": 0.0646, + "grad_norm": 0.23640507459640503, + "learning_rate": 0.00019044514154453434, + "epoch": 0.3680203045685279, + "step": 290 + }, + { + "loss": 0.0785, + "grad_norm": 0.4354120194911957, + "learning_rate": 0.0001895300810940321, + "epoch": 0.38071065989847713, + "step": 300 + }, + { + "loss": 0.0656, + "grad_norm": 0.2467317283153534, + "learning_rate": 0.00018857559224082736, + "epoch": 0.3934010152284264, + "step": 310 + }, + { + "eval_loss": 0.0728072002530098, + "eval_runtime": 19.9827, + "eval_samples_per_second": 66.407, + "eval_steps_per_second": 16.614, + "epoch": 0.39847715736040606, + "step": 314 + }, + { + "loss": 0.0738, + "grad_norm": 0.2969267666339874, + "learning_rate": 0.00018758209533493444, + "epoch": 0.40609137055837563, + "step": 320 + }, + { + "loss": 0.067, + "grad_norm": 0.3527528643608093, + "learning_rate": 0.00018655002790523328, + "epoch": 0.41878172588832485, + "step": 330 + }, + { + "loss": 0.0714, + "grad_norm": 0.2732889950275421, + "learning_rate": 0.00018547984446678437, + "epoch": 0.43147208121827413, + "step": 340 + }, + { + "loss": 0.0602, + "grad_norm": 0.25770312547683716, + "learning_rate": 0.000184372016320664, + "epoch": 0.44416243654822335, + "step": 350 + }, + { + "loss": 0.0624, + "grad_norm": 0.22473905980587006, + "learning_rate": 0.00018322703134640654, + "epoch": 0.45685279187817257, + "step": 360 + }, + { + "loss": 0.0709, + "grad_norm": 0.3180300295352936, + "learning_rate": 0.00018204539378714561, + "epoch": 0.46954314720812185, + "step": 370 + }, + { + "loss": 0.0698, + "grad_norm": 0.2796868085861206, + "learning_rate": 0.00018082762402754936, + "epoch": 0.48223350253807107, + "step": 380 + }, + { + "loss": 0.0658, + "grad_norm": 0.3655967712402344, + "learning_rate": 0.0001795742583646466, + "epoch": 0.4949238578680203, + "step": 390 + }, + { + "loss": 0.0682, + "grad_norm": 0.2886195182800293, + "learning_rate": 0.0001782858487716455, + "epoch": 
0.5076142131979695, + "step": 400 + }, + { + "loss": 0.071, + "grad_norm": 0.27021610736846924, + "learning_rate": 0.00017696296265484862, + "epoch": 0.5203045685279187, + "step": 410 + }, + { + "loss": 0.0636, + "grad_norm": 0.28307008743286133, + "learning_rate": 0.00017560618260377116, + "epoch": 0.5329949238578681, + "step": 420 + }, + { + "loss": 0.0546, + "grad_norm": 0.28294482827186584, + "learning_rate": 0.00017421610613457282, + "epoch": 0.5456852791878173, + "step": 430 + }, + { + "loss": 0.0612, + "grad_norm": 0.2255251258611679, + "learning_rate": 0.00017279334542691596, + "epoch": 0.5583756345177665, + "step": 440 + }, + { + "loss": 0.0629, + "grad_norm": 0.22404751181602478, + "learning_rate": 0.0001713385270543661, + "epoch": 0.5710659898477157, + "step": 450 + }, + { + "loss": 0.0596, + "grad_norm": 0.2632795572280884, + "learning_rate": 0.00016985229170845339, + "epoch": 0.583756345177665, + "step": 460 + }, + { + "loss": 0.0717, + "grad_norm": 0.3002878427505493, + "learning_rate": 0.0001683352939165167, + "epoch": 0.5964467005076142, + "step": 470 + }, + { + "eval_loss": 0.06722872704267502, + "eval_runtime": 20.1214, + "eval_samples_per_second": 65.95, + "eval_steps_per_second": 16.5, + "epoch": 0.5977157360406091, + "step": 471 + }, + { + "loss": 0.0618, + "grad_norm": 0.15326248109340668, + "learning_rate": 0.00016678820175345454, + "epoch": 0.6091370558375635, + "step": 480 + }, + { + "loss": 0.0718, + "grad_norm": 0.27122628688812256, + "learning_rate": 0.00016521169654750968, + "epoch": 0.6218274111675127, + "step": 490 + }, + { + "loss": 0.0636, + "grad_norm": 0.29509711265563965, + "learning_rate": 0.00016360647258021696, + "epoch": 0.6345177664974619, + "step": 500 + }, + { + "loss": 0.0655, + "grad_norm": 0.4090014100074768, + "learning_rate": 0.00016197323678064697, + "epoch": 0.6472081218274112, + "step": 510 + }, + { + "loss": 0.0606, + "grad_norm": 0.2687474191188812, + "learning_rate": 0.00016031270841407926, + "epoch": 
0.6598984771573604, + "step": 520 + }, + { + "loss": 0.0519, + "grad_norm": 0.25125357508659363, + "learning_rate": 0.00015862561876524338, + "epoch": 0.6725888324873096, + "step": 530 + }, + { + "loss": 0.0623, + "grad_norm": 0.21579739451408386, + "learning_rate": 0.0001569127108162662, + "epoch": 0.6852791878172588, + "step": 540 + }, + { + "loss": 0.0612, + "grad_norm": 0.24012021720409393, + "learning_rate": 0.000155174738919468, + "epoch": 0.6979695431472082, + "step": 550 + }, + { + "loss": 0.0617, + "grad_norm": 0.22273781895637512, + "learning_rate": 0.00015341246846515096, + "epoch": 0.7106598984771574, + "step": 560 + }, + { + "loss": 0.0627, + "grad_norm": 0.29965269565582275, + "learning_rate": 0.0001516266755445271, + "epoch": 0.7233502538071066, + "step": 570 + }, + { + "loss": 0.0649, + "grad_norm": 0.2375640720129013, + "learning_rate": 0.00014981814660793314, + "epoch": 0.7360406091370558, + "step": 580 + }, + { + "loss": 0.0653, + "grad_norm": 0.2595769166946411, + "learning_rate": 0.0001479876781184833, + "epoch": 0.748730964467005, + "step": 590 + }, + { + "loss": 0.0634, + "grad_norm": 0.28185659646987915, + "learning_rate": 0.00014613607620131294, + "epoch": 0.7614213197969543, + "step": 600 + }, + { + "loss": 0.0601, + "grad_norm": 0.20655085146427155, + "learning_rate": 0.00014426415628856663, + "epoch": 0.7741116751269036, + "step": 610 + }, + { + "loss": 0.0632, + "grad_norm": 0.4992614686489105, + "learning_rate": 0.0001423727427602879, + "epoch": 0.7868020304568528, + "step": 620 + }, + { + "eval_loss": 0.05841095373034477, + "eval_runtime": 20.0018, + "eval_samples_per_second": 66.344, + "eval_steps_per_second": 16.599, + "epoch": 0.7969543147208121, + "step": 628 + }, + { + "loss": 0.0522, + "grad_norm": 0.2023015171289444, + "learning_rate": 0.0001404626685813681, + "epoch": 0.799492385786802, + "step": 630 + }, + { + "loss": 0.0567, + "grad_norm": 0.20891991257667542, + "learning_rate": 0.00013853477493471468, + "epoch": 
0.8121827411167513, + "step": 640 + }, + { + "loss": 0.0555, + "grad_norm": 0.27132412791252136, + "learning_rate": 0.00013658991085080025, + "epoch": 0.8248730964467005, + "step": 650 + }, + { + "loss": 0.0594, + "grad_norm": 0.22256866097450256, + "learning_rate": 0.0001346289328337558, + "epoch": 0.8375634517766497, + "step": 660 + }, + { + "loss": 0.0556, + "grad_norm": 0.20859505236148834, + "learning_rate": 0.00013265270448417234, + "epoch": 0.850253807106599, + "step": 670 + }, + { + "loss": 0.0557, + "grad_norm": 0.2204328030347824, + "learning_rate": 0.00013066209611877746, + "epoch": 0.8629441624365483, + "step": 680 + }, + { + "loss": 0.059, + "grad_norm": 0.2515346109867096, + "learning_rate": 0.00012865798438715413, + "epoch": 0.8756345177664975, + "step": 690 + }, + { + "loss": 0.0546, + "grad_norm": 0.3130325376987457, + "learning_rate": 0.00012664125188567056, + "epoch": 0.8883248730964467, + "step": 700 + }, + { + "loss": 0.0475, + "grad_norm": 0.2509436011314392, + "learning_rate": 0.00012461278676879098, + "epoch": 0.9010152284263959, + "step": 710 + }, + { + "loss": 0.0561, + "grad_norm": 0.23676852881908417, + "learning_rate": 0.00012257348235793897, + "epoch": 0.9137055837563451, + "step": 720 + }, + { + "loss": 0.0536, + "grad_norm": 0.20894668996334076, + "learning_rate": 0.00012052423674808513, + "epoch": 0.9263959390862944, + "step": 730 + }, + { + "loss": 0.0517, + "grad_norm": 0.18107716739177704, + "learning_rate": 0.00011846595241223247, + "epoch": 0.9390862944162437, + "step": 740 + }, + { + "loss": 0.0623, + "grad_norm": 0.3013327717781067, + "learning_rate": 0.00011639953580397367, + "epoch": 0.9517766497461929, + "step": 750 + }, + { + "loss": 0.0579, + "grad_norm": 0.19317802786827087, + "learning_rate": 0.00011432589695829576, + "epoch": 0.9644670050761421, + "step": 760 + }, + { + "loss": 0.0559, + "grad_norm": 0.26291170716285706, + "learning_rate": 0.00011224594909080704, + "epoch": 0.9771573604060914, + "step": 770 + }, + { + 
"loss": 0.0537, + "grad_norm": 0.28403881192207336, + "learning_rate": 0.00011016060819556353, + "epoch": 0.9898477157360406, + "step": 780 + }, + { + "eval_loss": 0.05360769107937813, + "eval_runtime": 20.0465, + "eval_samples_per_second": 66.196, + "eval_steps_per_second": 16.562, + "epoch": 0.9961928934010152, + "step": 785 + }, + { + "loss": 0.0502, + "grad_norm": 0.1471383273601532, + "learning_rate": 0.0001080707926416719, + "epoch": 1.00253807106599, + "step": 790 + }, + { + "loss": 0.038, + "grad_norm": 0.17716127634048462, + "learning_rate": 0.00010597742276884614, + "epoch": 1.015228426395939, + "step": 800 + }, + { + "loss": 0.0351, + "grad_norm": 0.2006382942199707, + "learning_rate": 0.00010388142048209676, + "epoch": 1.0279187817258884, + "step": 810 + }, + { + "loss": 0.0375, + "grad_norm": 0.2539692521095276, + "learning_rate": 0.00010178370884573046, + "epoch": 1.0406091370558375, + "step": 820 + }, + { + "loss": 0.0422, + "grad_norm": 0.2615308165550232, + "learning_rate": 9.968521167683905e-05, + "epoch": 1.0532994923857868, + "step": 830 + }, + { + "loss": 0.0406, + "grad_norm": 0.23757147789001465, + "learning_rate": 9.758685313845727e-05, + "epoch": 1.0659898477157361, + "step": 840 + }, + { + "loss": 0.0387, + "grad_norm": 0.16979315876960754, + "learning_rate": 9.548955733256803e-05, + "epoch": 1.0786802030456852, + "step": 850 + }, + { + "loss": 0.0352, + "grad_norm": 0.1853126734495163, + "learning_rate": 9.339424789313445e-05, + "epoch": 1.0913705583756346, + "step": 860 + }, + { + "loss": 0.0356, + "grad_norm": 0.15106192231178284, + "learning_rate": 9.13018475793382e-05, + "epoch": 1.1040609137055837, + "step": 870 + }, + { + "loss": 0.037, + "grad_norm": 0.20427311956882477, + "learning_rate": 8.921327786920294e-05, + "epoch": 1.116751269035533, + "step": 880 + }, + { + "loss": 0.0324, + "grad_norm": 0.1580514758825302, + "learning_rate": 8.712945855378218e-05, + "epoch": 1.1294416243654823, + "step": 890 + }, + { + "loss": 0.0301, + 
"grad_norm": 0.2191898375749588, + "learning_rate": 8.505130733208968e-05, + "epoch": 1.1421319796954315, + "step": 900 + }, + { + "loss": 0.0355, + "grad_norm": 0.16614247858524323, + "learning_rate": 8.297973940695163e-05, + "epoch": 1.1548223350253808, + "step": 910 + }, + { + "loss": 0.0349, + "grad_norm": 0.18907427787780762, + "learning_rate": 8.091566708195786e-05, + "epoch": 1.16751269035533, + "step": 920 + }, + { + "loss": 0.0336, + "grad_norm": 0.24296258389949799, + "learning_rate": 7.885999935968982e-05, + "epoch": 1.1802030456852792, + "step": 930 + }, + { + "loss": 0.0372, + "grad_norm": 0.1817648708820343, + "learning_rate": 7.681364154140264e-05, + "epoch": 1.1928934010152283, + "step": 940 + }, + { + "eval_loss": 0.057017017155885696, + "eval_runtime": 19.9628, + "eval_samples_per_second": 66.474, + "eval_steps_per_second": 16.631, + "epoch": 1.1954314720812182, + "step": 942 + }, + { + "loss": 0.03, + "grad_norm": 0.19095705449581146, + "learning_rate": 7.47774948283366e-05, + "epoch": 1.2055837563451777, + "step": 950 + }, + { + "loss": 0.035, + "grad_norm": 0.33682745695114136, + "learning_rate": 7.275245592483492e-05, + "epoch": 1.218274111675127, + "step": 960 + }, + { + "loss": 0.0384, + "grad_norm": 0.2646084427833557, + "learning_rate": 7.073941664344152e-05, + "epoch": 1.2309644670050761, + "step": 970 + }, + { + "loss": 0.0287, + "grad_norm": 0.1980791836977005, + "learning_rate": 6.873926351215312e-05, + "epoch": 1.2436548223350254, + "step": 980 + }, + { + "loss": 0.0342, + "grad_norm": 0.18797655403614044, + "learning_rate": 6.67528773839989e-05, + "epoch": 1.2563451776649746, + "step": 990 + }, + { + "loss": 0.0337, + "grad_norm": 0.24009937047958374, + "learning_rate": 6.478113304911886e-05, + "epoch": 1.2690355329949239, + "step": 1000 + }, + { + "loss": 0.0272, + "grad_norm": 0.29159170389175415, + "learning_rate": 6.282489884951295e-05, + "epoch": 1.281725888324873, + "step": 1010 + }, + { + "loss": 0.036, + "grad_norm": 
0.16352516412734985, + "learning_rate": 6.0885036296629064e-05, + "epoch": 1.2944162436548223, + "step": 1020 + }, + { + "loss": 0.0292, + "grad_norm": 0.17807820439338684, + "learning_rate": 5.896239969195994e-05, + "epoch": 1.3071065989847717, + "step": 1030 + }, + { + "loss": 0.0332, + "grad_norm": 0.2500491738319397, + "learning_rate": 5.7057835750814867e-05, + "epoch": 1.3197969543147208, + "step": 1040 + }, + { + "loss": 0.0294, + "grad_norm": 0.2208271473646164, + "learning_rate": 5.517218322943224e-05, + "epoch": 1.33248730964467, + "step": 1050 + }, + { + "loss": 0.0342, + "grad_norm": 0.23927471041679382, + "learning_rate": 5.3306272555597504e-05, + "epoch": 1.3451776649746192, + "step": 1060 + }, + { + "loss": 0.0307, + "grad_norm": 0.20309758186340332, + "learning_rate": 5.1460925462928546e-05, + "epoch": 1.3578680203045685, + "step": 1070 + }, + { + "loss": 0.0314, + "grad_norm": 0.23275193572044373, + "learning_rate": 4.96369546289904e-05, + "epoch": 1.3705583756345177, + "step": 1080 + }, + { + "loss": 0.0333, + "grad_norm": 0.2078331708908081, + "learning_rate": 4.783516331739769e-05, + "epoch": 1.383248730964467, + "step": 1090 + }, + { + "eval_loss": 0.05335332825779915, + "eval_runtime": 19.9859, + "eval_samples_per_second": 66.397, + "eval_steps_per_second": 16.612, + "epoch": 1.3946700507614214, + "step": 1099 + }, + { + "loss": 0.0309, + "grad_norm": 0.18032079935073853, + "learning_rate": 4.605634502406321e-05, + "epoch": 1.3959390862944163, + "step": 1100 + }, + { + "loss": 0.0328, + "grad_norm": 0.20803005993366241, + "learning_rate": 4.430128312774804e-05, + "epoch": 1.4086294416243654, + "step": 1110 + }, + { + "loss": 0.027, + "grad_norm": 0.1680465191602707, + "learning_rate": 4.2570750545067076e-05, + "epoch": 1.4213197969543148, + "step": 1120 + }, + { + "loss": 0.0317, + "grad_norm": 0.2528463900089264, + "learning_rate": 4.086550939010227e-05, + "epoch": 1.434010152284264, + "step": 1130 + }, + { + "loss": 0.0313, + "grad_norm": 
0.19024434685707092, + "learning_rate": 3.9186310638773047e-05, + "epoch": 1.4467005076142132, + "step": 1140 + }, + { + "loss": 0.0287, + "grad_norm": 0.20934472978115082, + "learning_rate": 3.753389379811185e-05, + "epoch": 1.4593908629441623, + "step": 1150 + }, + { + "loss": 0.0265, + "grad_norm": 0.29412180185317993, + "learning_rate": 3.590898658059062e-05, + "epoch": 1.4720812182741116, + "step": 1160 + }, + { + "loss": 0.0298, + "grad_norm": 0.3268195390701294, + "learning_rate": 3.4312304583641484e-05, + "epoch": 1.484771573604061, + "step": 1170 + }, + { + "loss": 0.0251, + "grad_norm": 0.17332251369953156, + "learning_rate": 3.274455097451269e-05, + "epoch": 1.49746192893401, + "step": 1180 + }, + { + "loss": 0.0318, + "grad_norm": 0.3481772541999817, + "learning_rate": 3.1206416180598995e-05, + "epoch": 1.5101522842639594, + "step": 1190 + }, + { + "loss": 0.0335, + "grad_norm": 0.24047453701496124, + "learning_rate": 2.9698577585382282e-05, + "epoch": 1.5228426395939088, + "step": 1200 + }, + { + "loss": 0.0339, + "grad_norm": 0.21146714687347412, + "learning_rate": 2.8221699230116793e-05, + "epoch": 1.5355329949238579, + "step": 1210 + }, + { + "loss": 0.0308, + "grad_norm": 0.140832781791687, + "learning_rate": 2.67764315213902e-05, + "epoch": 1.548223350253807, + "step": 1220 + }, + { + "loss": 0.026, + "grad_norm": 0.1721792370080948, + "learning_rate": 2.536341094468906e-05, + "epoch": 1.5609137055837563, + "step": 1230 + }, + { + "loss": 0.0277, + "grad_norm": 0.14980490505695343, + "learning_rate": 2.398325978409539e-05, + "epoch": 1.5736040609137056, + "step": 1240 + }, + { + "loss": 0.028, + "grad_norm": 0.18908673524856567, + "learning_rate": 2.263658584823717e-05, + "epoch": 1.5862944162436547, + "step": 1250 + }, + { + "eval_loss": 0.052472274750471115, + "eval_runtime": 19.9786, + "eval_samples_per_second": 66.421, + "eval_steps_per_second": 16.618, + "epoch": 1.5939086294416245, + "step": 1256 + }, + { + "loss": 0.0272, + "grad_norm": 
0.12164825201034546, + "learning_rate": 2.1323982202613735e-05, + "epoch": 1.598984771573604, + "step": 1260 + }, + { + "loss": 0.0245, + "grad_norm": 0.2658851146697998, + "learning_rate": 2.004602690841414e-05, + "epoch": 1.6116751269035534, + "step": 1270 + }, + { + "loss": 0.0304, + "grad_norm": 0.2891974151134491, + "learning_rate": 1.8803282767942954e-05, + "epoch": 1.6243654822335025, + "step": 1280 + }, + { + "loss": 0.0292, + "grad_norm": 0.2979351580142975, + "learning_rate": 1.7596297076766455e-05, + "epoch": 1.6370558375634516, + "step": 1290 + }, + { + "loss": 0.0284, + "grad_norm": 0.20141719281673431, + "learning_rate": 1.6425601382687405e-05, + "epoch": 1.649746192893401, + "step": 1300 + }, + { + "loss": 0.0254, + "grad_norm": 0.1950131356716156, + "learning_rate": 1.5291711251655316e-05, + "epoch": 1.6624365482233503, + "step": 1310 + }, + { + "loss": 0.0282, + "grad_norm": 0.21205022931098938, + "learning_rate": 1.41951260407149e-05, + "epoch": 1.6751269035532994, + "step": 1320 + }, + { + "loss": 0.0247, + "grad_norm": 0.2470894753932953, + "learning_rate": 1.3136328678092746e-05, + "epoch": 1.6878172588832487, + "step": 1330 + }, + { + "loss": 0.0257, + "grad_norm": 0.26378998160362244, + "learning_rate": 1.2115785450519434e-05, + "epoch": 1.700507614213198, + "step": 1340 + }, + { + "loss": 0.0282, + "grad_norm": 0.12680888175964355, + "learning_rate": 1.1133945797879908e-05, + "epoch": 1.7131979695431472, + "step": 1350 + }, + { + "loss": 0.0251, + "grad_norm": 0.19744935631752014, + "learning_rate": 1.019124211528365e-05, + "epoch": 1.7258883248730963, + "step": 1360 + }, + { + "loss": 0.0327, + "grad_norm": 0.18419434130191803, + "learning_rate": 9.288089562640844e-06, + "epoch": 1.7385786802030458, + "step": 1370 + }, + { + "loss": 0.0282, + "grad_norm": 0.19115136563777924, + "learning_rate": 8.42488588182897e-06, + "epoch": 1.751269035532995, + "step": 1380 + }, + { + "loss": 0.0245, + "grad_norm": 0.17252641916275024, + "learning_rate": 
7.602011221530236e-06, + "epoch": 1.763959390862944, + "step": 1390 + }, + { + "loss": 0.029, + "grad_norm": 0.22253695130348206, + "learning_rate": 6.819827969816661e-06, + "epoch": 1.7766497461928934, + "step": 1400 + }, + { + "loss": 0.0269, + "grad_norm": 0.21938475966453552, + "learning_rate": 6.078680594557163e-06, + "epoch": 1.7893401015228427, + "step": 1410 + }, + { + "eval_loss": 0.05091211572289467, + "eval_runtime": 20.0145, + "eval_samples_per_second": 66.302, + "eval_steps_per_second": 16.588, + "epoch": 1.7931472081218274, + "step": 1413 + }, + { + "loss": 0.0305, + "grad_norm": 0.2024271935224533, + "learning_rate": 5.378895491716285e-06, + "epoch": 1.8020304568527918, + "step": 1420 + }, + { + "loss": 0.029, + "grad_norm": 0.22723488509655, + "learning_rate": 4.720780841611738e-06, + "epoch": 1.8147208121827412, + "step": 1430 + }, + { + "loss": 0.0266, + "grad_norm": 0.2747625410556793, + "learning_rate": 4.104626473194151e-06, + "epoch": 1.8274111675126905, + "step": 1440 + }, + { + "loss": 0.0262, + "grad_norm": 0.18593831360340118, + "learning_rate": 3.5307037364083253e-06, + "epoch": 1.8401015228426396, + "step": 1450 + }, + { + "loss": 0.0291, + "grad_norm": 0.2651998996734619, + "learning_rate": 2.9992653826927508e-06, + "epoch": 1.8527918781725887, + "step": 1460 + }, + { + "loss": 0.026, + "grad_norm": 0.19439752399921417, + "learning_rate": 2.510545453669744e-06, + "epoch": 1.865482233502538, + "step": 1470 + }, + { + "loss": 0.03, + "grad_norm": 0.17483021318912506, + "learning_rate": 2.06475917807506e-06, + "epoch": 1.8781725888324874, + "step": 1480 + }, + { + "loss": 0.029, + "grad_norm": 0.22444817423820496, + "learning_rate": 1.662102876972882e-06, + "epoch": 1.8908629441624365, + "step": 1490 + }, + { + "loss": 0.0243, + "grad_norm": 0.17885605990886688, + "learning_rate": 1.3027538772973026e-06, + "epoch": 1.9035532994923858, + "step": 1500 + }, + { + "loss": 0.0272, + "grad_norm": 0.19312232732772827, + "learning_rate": 
9.868704337588797e-07, + "epoch": 1.9162436548223352, + "step": 1510 + }, + { + "loss": 0.0254, + "grad_norm": 0.1709776520729065, + "learning_rate": 7.145916591504098e-07, + "epoch": 1.9289340101522843, + "step": 1520 + }, + { + "loss": 0.0252, + "grad_norm": 0.18656505644321442, + "learning_rate": 4.860374630826004e-07, + "epoch": 1.9416243654822334, + "step": 1530 + }, + { + "loss": 0.0267, + "grad_norm": 0.11956395953893661, + "learning_rate": 3.0130849917681114e-07, + "epoch": 1.9543147208121827, + "step": 1540 + }, + { + "loss": 0.0305, + "grad_norm": 0.25038954615592957, + "learning_rate": 1.604861207378794e-07, + "epoch": 1.967005076142132, + "step": 1550 + }, + { + "loss": 0.025, + "grad_norm": 0.19318363070487976, + "learning_rate": 6.363234492674507e-08, + "epoch": 1.9796954314720812, + "step": 1560 + }, + { + "loss": 0.0276, + "grad_norm": 0.26012641191482544, + "learning_rate": 1.0789825448476177e-08, + "epoch": 1.9923857868020305, + "step": 1570 + }, + { + "eval_loss": 0.0504293330013752, + "eval_runtime": 19.8337, + "eval_samples_per_second": 66.906, + "eval_steps_per_second": 16.739, + "epoch": 1.9923857868020305, + "step": 1570 + }, + { + "train_runtime": 2681.8444, + "train_samples_per_second": 18.795, + "train_steps_per_second": 0.588, + "total_flos": 6.800278675429786e+17, + "train_loss": 0.06838441643920647, + "epoch": 2.0, + "step": 1576 + } +] \ No newline at end of file diff --git a/train/training_loss.png b/train/training_loss.png new file mode 100644 index 0000000..0855eb3 Binary files /dev/null and b/train/training_loss.png differ diff --git a/train/validation_loss.png b/train/validation_loss.png new file mode 100644 index 0000000..5ca1ed5 Binary files /dev/null and b/train/validation_loss.png differ