commit dea3b47681495dc02fc9963dcabcc50ad1cac784 Author: ModelHub XC Date: Fri Jun 19 07:55:17 2026 +0800 初始化项目,由ModelHub XC社区提供模型 Model: KnutJaegersberg/LLongMA-3b-LIMA Source: Original Platform diff --git a/.gitattributes b/.gitattributes new file mode 100644 index 0000000..a6344aa --- /dev/null +++ b/.gitattributes @@ -0,0 +1,35 @@ +*.7z filter=lfs diff=lfs merge=lfs -text +*.arrow filter=lfs diff=lfs merge=lfs -text +*.bin filter=lfs diff=lfs merge=lfs -text +*.bz2 filter=lfs diff=lfs merge=lfs -text +*.ckpt filter=lfs diff=lfs merge=lfs -text +*.ftz filter=lfs diff=lfs merge=lfs -text +*.gz filter=lfs diff=lfs merge=lfs -text +*.h5 filter=lfs diff=lfs merge=lfs -text +*.joblib filter=lfs diff=lfs merge=lfs -text +*.lfs.* filter=lfs diff=lfs merge=lfs -text +*.mlmodel filter=lfs diff=lfs merge=lfs -text +*.model filter=lfs diff=lfs merge=lfs -text +*.msgpack filter=lfs diff=lfs merge=lfs -text +*.npy filter=lfs diff=lfs merge=lfs -text +*.npz filter=lfs diff=lfs merge=lfs -text +*.onnx filter=lfs diff=lfs merge=lfs -text +*.ot filter=lfs diff=lfs merge=lfs -text +*.parquet filter=lfs diff=lfs merge=lfs -text +*.pb filter=lfs diff=lfs merge=lfs -text +*.pickle filter=lfs diff=lfs merge=lfs -text +*.pkl filter=lfs diff=lfs merge=lfs -text +*.pt filter=lfs diff=lfs merge=lfs -text +*.pth filter=lfs diff=lfs merge=lfs -text +*.rar filter=lfs diff=lfs merge=lfs -text +*.safetensors filter=lfs diff=lfs merge=lfs -text +saved_model/**/* filter=lfs diff=lfs merge=lfs -text +*.tar.* filter=lfs diff=lfs merge=lfs -text +*.tar filter=lfs diff=lfs merge=lfs -text +*.tflite filter=lfs diff=lfs merge=lfs -text +*.tgz filter=lfs diff=lfs merge=lfs -text +*.wasm filter=lfs diff=lfs merge=lfs -text +*.xz filter=lfs diff=lfs merge=lfs -text +*.zip filter=lfs diff=lfs merge=lfs -text +*.zst filter=lfs diff=lfs merge=lfs -text +*tfevents* filter=lfs diff=lfs merge=lfs -text diff --git a/README.md b/README.md new file mode 100644 index 0000000..20d49d0 --- /dev/null +++ b/README.md @@ -0,0 +1,22 @@ +--- +license: cc-by-nc-4.0 +--- +Prompt example: +``` +### Instruction: +How do you fine tune a large language model? +### Response: +``` +# [Open LLM Leaderboard Evaluation Results](https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard) +Detailed results can be found [here](https://huggingface.co/datasets/open-llm-leaderboard/details_KnutJaegersberg__LLongMA-3b-LIMA) + +| Metric | Value | +|-----------------------|---------------------------| +| Avg. | 33.66 | +| ARC (25-shot) | 39.08 | +| HellaSwag (10-shot) | 67.15 | +| MMLU (5-shot) | 26.43 | +| TruthfulQA (0-shot) | 34.71 | +| Winogrande (5-shot) | 63.38 | +| GSM8K (5-shot) | 0.3 | +| DROP (3-shot) | 4.57 | diff --git a/config.json b/config.json new file mode 100644 index 0000000..cabf06e --- /dev/null +++ b/config.json @@ -0,0 +1,35 @@ +{ + "_name_or_path": "/run/media/knut/HD/huggingface models/language models/llama-alternatives/LLongMA-3b/", + "architectures": [ + "LlamaForCausalLM" + ], + "auto_map": { + "AutoConfig": "configuration_llama.LlamaConfig", + "AutoModel": "modeling_llama.LlamaModel", + "AutoModelForCausalLM": "modeling_llama.LlamaForCausalLM", + "AutoModelForSequenceClassification": "modeling_llama.LlamaForSequenceClassification" + }, + "bos_token_id": 1, + "eos_token_id": 2, + "hidden_act": "silu", + "hidden_size": 3200, + "initializer_range": 0.02, + "intermediate_size": 8640, + "max_position_embeddings": 8192, + "model_type": "llama", + "num_attention_heads": 32, + "num_hidden_layers": 26, + "num_key_value_heads": 32, + "pad_token_id": 0, + "pretraining_tp": 1, + "rms_norm_eps": 1e-06, + "rope_scaling": { + "factor": 4.0, + "type": "linear" + }, + "tie_word_embeddings": false, + "torch_dtype": "float32", + "transformers_version": "4.32.0.dev0", + "use_cache": true, + "vocab_size": 32000 +} diff --git a/configuration_llama.py b/configuration_llama.py new file mode 100644 index 0000000..f326ffd --- /dev/null +++ b/configuration_llama.py @@ -0,0 +1,152 @@ +# coding=utf-8 +# Copyright 2022 EleutherAI and the HuggingFace Inc. team. All rights reserved. +# +# This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX +# and OPT implementations in this library. It has been modified from its +# original forms to accommodate minor architectural differences compared +# to GPT-NeoX and OPT used by the Meta AI team that trained the model. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" LLaMA model configuration""" + +from transformers.configuration_utils import PretrainedConfig +from transformers.utils import logging + + +logger = logging.get_logger(__name__) + +LLAMA_PRETRAINED_CONFIG_ARCHIVE_MAP = {} + + +class LlamaConfig(PretrainedConfig): + r""" + This is the configuration class to store the configuration of a [`LlamaModel`]. It is used to instantiate an LLaMA + model according to the specified arguments, defining the model architecture. Instantiating a configuration with the + defaults will yield a similar configuration to that of the LLaMA-7B. + + Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the + documentation from [`PretrainedConfig`] for more information. + + + Args: + vocab_size (`int`, *optional*, defaults to 32000): + Vocabulary size of the LLaMA model. Defines the number of different tokens that can be represented by the + `inputs_ids` passed when calling [`LlamaModel`] + hidden_size (`int`, *optional*, defaults to 4096): + Dimension of the hidden representations. + intermediate_size (`int`, *optional*, defaults to 11008): + Dimension of the MLP representations. + num_hidden_layers (`int`, *optional*, defaults to 32): + Number of hidden layers in the Transformer encoder. + num_attention_heads (`int`, *optional*, defaults to 32): + Number of attention heads for each attention layer in the Transformer encoder. + hidden_act (`str` or `function`, *optional*, defaults to `"silu"`): + The non-linear activation function (function or string) in the decoder. + max_position_embeddings (`int`, *optional*, defaults to 2048): + The maximum sequence length that this model might ever be used with. Typically set this to something large + just in case (e.g., 512 or 1024 or 2048). + initializer_range (`float`, *optional*, defaults to 0.02): + The standard deviation of the truncated_normal_initializer for initializing all weight matrices. + rms_norm_eps (`float`, *optional*, defaults to 1e-12): + The epsilon used by the rms normalization layers. + use_cache (`bool`, *optional*, defaults to `True`): + Whether or not the model should return the last key/values attentions (not used by all models). Only + relevant if `config.is_decoder=True`. + tie_word_embeddings(`bool`, *optional*, defaults to `False`): + Whether to tie weight embeddings + rope_scaling (`Dict`, *optional*): + Dictionary containing the scaling configuration for the RoPE embeddings. Currently supports three scaling + strategies: linear and dynamic. Their scaling factor must be an float greater than 1. The expected format + is `{"type": strategy name, "factor": scaling factor}`. When using this flag, don't update + `max_position_embeddings` to the expected new maximum. See the following thread for more information on how + these scaling strategies behave: + https://www.reddit.com/r/LocalLLaMA/comments/14mrgpr/dynamically_scaled_rope_further_increases/. This is an + experimental feature, subject to breaking API changes in future versions. + + Example: + + ```python + >>> from transformers import LlamaModel, LlamaConfig + + >>> # Initializing a LLaMA llama-7b style configuration + >>> configuration = LlamaConfig() + + >>> # Initializing a model from the llama-7b style configuration + >>> model = LlamaModel(configuration) + + >>> # Accessing the model configuration + >>> configuration = model.config + ```""" + model_type = "llama" + keys_to_ignore_at_inference = ["past_key_values"] + + def __init__( + self, + vocab_size=32000, + hidden_size=4096, + intermediate_size=11008, + num_hidden_layers=32, + num_attention_heads=32, + hidden_act="silu", + max_position_embeddings=2048, + initializer_range=0.02, + rms_norm_eps=1e-6, + use_cache=True, + pad_token_id=0, + bos_token_id=1, + eos_token_id=2, + tie_word_embeddings=False, + rope_scaling=None, + **kwargs, + ): + self.vocab_size = vocab_size + self.max_position_embeddings = max_position_embeddings + self.hidden_size = hidden_size + self.intermediate_size = intermediate_size + self.num_hidden_layers = num_hidden_layers + self.num_attention_heads = num_attention_heads + self.hidden_act = hidden_act + self.initializer_range = initializer_range + self.rms_norm_eps = rms_norm_eps + self.use_cache = use_cache + self.rope_scaling = rope_scaling + self._rope_scaling_validation() + + super().__init__( + pad_token_id=pad_token_id, + bos_token_id=bos_token_id, + eos_token_id=eos_token_id, + tie_word_embeddings=tie_word_embeddings, + **kwargs, + ) + + def _rope_scaling_validation(self): + """ + Validate the `rope_scaling` configuration. + """ + if self.rope_scaling is None: + return + + if not isinstance(self.rope_scaling, dict) or len(self.rope_scaling) != 2: + raise ValueError( + "`rope_scaling` must be a dictionary with with two fields, `name` and `factor`, " + f"got {self.rope_scaling}" + ) + rope_scaling_type = self.rope_scaling.get("type", None) + rope_scaling_factor = self.rope_scaling.get("factor", None) + if rope_scaling_type is None or rope_scaling_type not in ["linear", "dynamic", "ntk"]: + raise ValueError( + f"`rope_scaling`'s name field must be one of ['linear', 'dynamic', 'ntk'], got {rope_scaling_type}" + ) + if rope_scaling_factor is None or not isinstance(rope_scaling_factor, float) or rope_scaling_factor <= 1.0: + raise ValueError(f"`rope_scaling`'s factor field must be an float > 1, got {rope_scaling_factor}") \ No newline at end of file diff --git a/generation_config.json b/generation_config.json new file mode 100644 index 0000000..8c88a8f --- /dev/null +++ b/generation_config.json @@ -0,0 +1,7 @@ +{ + "_from_model_config": true, + "bos_token_id": 1, + "eos_token_id": 2, + "pad_token_id": 0, + "transformers_version": "4.32.0.dev0" +} diff --git a/modeling_llama.py b/modeling_llama.py new file mode 100644 index 0000000..c82ae5e --- /dev/null +++ b/modeling_llama.py @@ -0,0 +1,1016 @@ +# coding=utf-8 +# Copyright 2022 EleutherAI and the HuggingFace Inc. team. All rights reserved. +# +# This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX +# and OPT implementations in this library. It has been modified from its +# original forms to accommodate minor architectural differences compared +# to GPT-NeoX and OPT used by the Meta AI team that trained the model. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" PyTorch LLaMA model.""" +import math +from typing import List, Optional, Tuple, Union + +import torch +import torch.utils.checkpoint +from torch import nn +from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss + +from transformers.activations import ACT2FN +from transformers.modeling_outputs import BaseModelOutputWithPast, CausalLMOutputWithPast, SequenceClassifierOutputWithPast +from transformers.modeling_utils import PreTrainedModel +from transformers.utils import add_start_docstrings, add_start_docstrings_to_model_forward, logging, replace_return_docstrings +from .configuration_llama import LlamaConfig + + +logger = logging.get_logger(__name__) + +_CONFIG_FOR_DOC = "LlamaConfig" + + +# Copied from transformers.models.bart.modeling_bart._make_causal_mask +def _make_causal_mask( + input_ids_shape: torch.Size, dtype: torch.dtype, device: torch.device, past_key_values_length: int = 0 +): + """ + Make causal mask used for bi-directional self-attention. + """ + bsz, tgt_len = input_ids_shape + mask = torch.full((tgt_len, tgt_len), torch.finfo(dtype).min, device=device) + mask_cond = torch.arange(mask.size(-1), device=device) + mask.masked_fill_(mask_cond < (mask_cond + 1).view(mask.size(-1), 1), 0) + mask = mask.to(dtype) + + if past_key_values_length > 0: + mask = torch.cat([torch.zeros(tgt_len, past_key_values_length, dtype=dtype, device=device), mask], dim=-1) + return mask[None, None, :, :].expand(bsz, 1, tgt_len, tgt_len + past_key_values_length) + + +# Copied from transformers.models.bart.modeling_bart._expand_mask +def _expand_mask(mask: torch.Tensor, dtype: torch.dtype, tgt_len: Optional[int] = None): + """ + Expands attention_mask from `[bsz, seq_len]` to `[bsz, 1, tgt_seq_len, src_seq_len]`. + """ + bsz, src_len = mask.size() + tgt_len = tgt_len if tgt_len is not None else src_len + + expanded_mask = mask[:, None, None, :].expand(bsz, 1, tgt_len, src_len).to(dtype) + + inverted_mask = 1.0 - expanded_mask + + return inverted_mask.masked_fill(inverted_mask.to(torch.bool), torch.finfo(dtype).min) + +def _find_correction_factor(num_rotations, dim, base=10000, max_position_embeddings=2048): + return (dim * math.log(max_position_embeddings/(num_rotations * 2 * math.pi)))/(2 * math.log(base)) #Inverse dim formula to find number of rotations + +def _find_correction_range(low_rot, high_rot, dim, base=10000, max_position_embeddings=2048): + low = math.floor(_find_correction_factor(low_rot, dim, base, max_position_embeddings)) + high = math.ceil(_find_correction_factor(high_rot, dim, base, max_position_embeddings)) + return max(low, 0), min(high, dim-1) #Clamp values just in case + +def _linear_ramp_mask(min, max, dim): + if min == max: + max += 0.001 #Prevent singularity + + linear_func = (torch.arange(dim, dtype=torch.float32) - min) / (max - min) + ramp_func = torch.clamp(linear_func, 0, 1) + return ramp_func + +def _find_newbase_ntk(dim, base=10000, scale=1): + return base * scale ** (dim / (dim-2)) + +def _ntk_build_inv_freq(dim, base, scaling_factor, ntk_factor, extrapolation_factor, original_max_position_embeddings, device): + #Interpolation constants found experimentally for LLaMA (might not be totally optimal though) + #Do not change unless there is a good reason for doing so! + beta_0 = 1.25 + beta_1 = 0.75 + gamma_0 = 16 + gamma_1 = 2 + + #Three RoPE extrapolation/interpolation methods + inv_freq_base = 1.0 / (base ** (torch.arange(0, dim, 2).float().to(device) / dim)) + inv_freq_linear = 1.0 / (scaling_factor * (base ** (torch.arange(0, dim, 2).float().to(device) / dim))) + inv_freq_ntk = 1.0 / (_find_newbase_ntk(dim, base, scaling_factor) ** (torch.arange(0, dim, 2).float().to(device) / dim)) + + current_dtype = inv_freq_ntk.dtype + current_device = inv_freq_ntk.device + + #Combine NTK and Linear + low, high = _find_correction_range(beta_0, beta_1, dim, base, original_max_position_embeddings) + inv_freq_mask = (1 - _linear_ramp_mask(low, high, dim // 2).type(current_dtype).to(current_device)) * ntk_factor + inv_freq = inv_freq_linear * (1 - inv_freq_mask) + inv_freq_ntk * inv_freq_mask + + #Combine Extrapolation and NTK and Linear + low, high = _find_correction_range(gamma_0, gamma_1, dim, base, original_max_position_embeddings) + inv_freq_mask = (1 - _linear_ramp_mask(low, high, dim // 2).type(current_dtype).to(current_device)) * extrapolation_factor + inv_freq = inv_freq * (1 - inv_freq_mask) + inv_freq_base * inv_freq_mask + +class LlamaRMSNorm(nn.Module): + def __init__(self, hidden_size, eps=1e-6): + """ + LlamaRMSNorm is equivalent to T5LayerNorm + """ + super().__init__() + self.weight = nn.Parameter(torch.ones(hidden_size)) + self.variance_epsilon = eps + + def forward(self, hidden_states): + input_dtype = hidden_states.dtype + variance = hidden_states.to(torch.float32).pow(2).mean(-1, keepdim=True) + hidden_states = hidden_states * torch.rsqrt(variance + self.variance_epsilon) + + return (self.weight * hidden_states).to(input_dtype) + + +class LlamaRotaryEmbedding(torch.nn.Module): + def __init__(self, dim, max_position_embeddings=2048, base=10000, device=None): + super().__init__() + + self.dim = dim + self.max_position_embeddings = max_position_embeddings + self.base = base + inv_freq = 1.0 / (self.base ** (torch.arange(0, self.dim, 2).float().to(device) / self.dim)) + self.register_buffer("inv_freq", inv_freq) + + # Build here to make `torch.jit.trace` work. + self._set_cos_sin_cache( + seq_len=max_position_embeddings, device=self.inv_freq.device, dtype=torch.get_default_dtype() + ) + + def _set_cos_sin_cache(self, seq_len, device, dtype): + self.max_seq_len_cached = seq_len + t = torch.arange(self.max_seq_len_cached, device=device, dtype=self.inv_freq.dtype) + + freqs = torch.einsum("i,j->ij", t, self.inv_freq) + # Different from paper, but it uses a different permutation in order to obtain the same calculation + emb = torch.cat((freqs, freqs), dim=-1) + self.register_buffer("cos_cached", emb.cos()[None, None, :, :].to(dtype), persistent=False) + self.register_buffer("sin_cached", emb.sin()[None, None, :, :].to(dtype), persistent=False) + + def forward(self, x, seq_len=None): + # x: [bs, num_attention_heads, seq_len, head_size] + if seq_len > self.max_seq_len_cached: + self._set_cos_sin_cache(seq_len=seq_len, device=x.device, dtype=x.dtype) + + return ( + self.cos_cached[:, :, :seq_len, ...].to(dtype=x.dtype), + self.sin_cached[:, :, :seq_len, ...].to(dtype=x.dtype), + ) + + +class LlamaLinearScalingRotaryEmbedding(LlamaRotaryEmbedding): + """LlamaRotaryEmbedding extended with linear scaling. Credits to the Reddit user /u/kaiokendev""" + + def __init__(self, dim, max_position_embeddings=2048, base=10000, device=None, scaling_factor=1.0): + self.scaling_factor = scaling_factor + super().__init__(dim, max_position_embeddings, base, device) + + def _set_cos_sin_cache(self, seq_len, device, dtype): + self.max_seq_len_cached = seq_len + t = torch.arange(self.max_seq_len_cached, device=device, dtype=self.inv_freq.dtype) + t = t / self.scaling_factor + + freqs = torch.einsum("i,j->ij", t, self.inv_freq) + # Different from paper, but it uses a different permutation in order to obtain the same calculation + emb = torch.cat((freqs, freqs), dim=-1) + self.register_buffer("cos_cached", emb.cos()[None, None, :, :].to(dtype), persistent=False) + self.register_buffer("sin_cached", emb.sin()[None, None, :, :].to(dtype), persistent=False) + + +class LlamaDynamicNTKScalingRotaryEmbedding(LlamaRotaryEmbedding): + """LlamaRotaryEmbedding extended with Dynamic NTK scaling. Credits to the Reddit users /u/bloc97 and /u/emozilla""" + + def __init__(self, dim, max_position_embeddings=2048, base=10000, device=None, scaling_factor=1.0): + self.scaling_factor = scaling_factor + super().__init__(dim, max_position_embeddings, base, device) + + def _set_cos_sin_cache(self, seq_len, device, dtype): + self.max_seq_len_cached = seq_len + + if seq_len > self.max_position_embeddings: + base = self.base * ( + (self.scaling_factor * seq_len / self.max_position_embeddings) - (self.scaling_factor - 1) + ) ** (self.dim / (self.dim - 2)) + inv_freq = 1.0 / (base ** (torch.arange(0, self.dim, 2).float().to(device) / self.dim)) + self.register_buffer("inv_freq", inv_freq) + + t = torch.arange(self.max_seq_len_cached, device=device, dtype=self.inv_freq.dtype) + + freqs = torch.einsum("i,j->ij", t, self.inv_freq) + # Different from paper, but it uses a different permutation in order to obtain the same calculation + emb = torch.cat((freqs, freqs), dim=-1) + self.register_buffer("cos_cached", emb.cos()[None, None, :, :].to(dtype), persistent=False) + self.register_buffer("sin_cached", emb.sin()[None, None, :, :].to(dtype), persistent=False) + +class LlamaNTKRotaryEmbedding(LlamaRotaryEmbedding): + def __init__(self, dim, max_position_embeddings=2048, base=10000, device=None, scaling_factor=1.0, ntk_factor=1.0, extrapolation_factor=1.0, original_max_position_embeddings=2048): + super().__init__(dim, max_position_embeddings, base, device) + + self.dim = dim + self.max_position_embeddings = max_position_embeddings + self.base = base + + inv_freq = _ntk_build_inv_freq(dim, base, scaling_factor, ntk_factor, extrapolation_factor, original_max_position_embeddings, device) + self.register_buffer("inv_freq", inv_freq) + + # Build here to make `torch.jit.trace` work. + self._set_cos_sin_cache( + seq_len=max_position_embeddings, device=self.inv_freq.device, dtype=torch.get_default_dtype() + ) + +def rotate_half(x): + """Rotates half the hidden dims of the input.""" + x1 = x[..., : x.shape[-1] // 2] + x2 = x[..., x.shape[-1] // 2 :] + return torch.cat((-x2, x1), dim=-1) + + +def apply_rotary_pos_emb(q, k, cos, sin, position_ids): + # The first two dimensions of cos and sin are always 1, so we can `squeeze` them. + cos = cos.squeeze(1).squeeze(0) # [seq_len, dim] + sin = sin.squeeze(1).squeeze(0) # [seq_len, dim] + cos = cos[position_ids].unsqueeze(1) # [bs, 1, seq_len, dim] + sin = sin[position_ids].unsqueeze(1) # [bs, 1, seq_len, dim] + q_embed = (q * cos) + (rotate_half(q) * sin) + k_embed = (k * cos) + (rotate_half(k) * sin) + return q_embed, k_embed + + +class LlamaMLP(nn.Module): + def __init__( + self, + hidden_size: int, + intermediate_size: int, + hidden_act: str, + ): + super().__init__() + self.gate_proj = nn.Linear(hidden_size, intermediate_size, bias=False) + self.down_proj = nn.Linear(intermediate_size, hidden_size, bias=False) + self.up_proj = nn.Linear(hidden_size, intermediate_size, bias=False) + self.act_fn = ACT2FN[hidden_act] + + def forward(self, x): + return self.down_proj(self.act_fn(self.gate_proj(x)) * self.up_proj(x)) + + +class LlamaAttention(nn.Module): + """Multi-headed attention from 'Attention Is All You Need' paper""" + + def __init__(self, config: LlamaConfig): + super().__init__() + self.config = config + self.hidden_size = config.hidden_size + self.num_heads = config.num_attention_heads + self.head_dim = self.hidden_size // self.num_heads + self.max_position_embeddings = config.max_position_embeddings + + if (self.head_dim * self.num_heads) != self.hidden_size: + raise ValueError( + f"hidden_size must be divisible by num_heads (got `hidden_size`: {self.hidden_size}" + f" and `num_heads`: {self.num_heads})." + ) + self.q_proj = nn.Linear(self.hidden_size, self.num_heads * self.head_dim, bias=False) + self.k_proj = nn.Linear(self.hidden_size, self.num_heads * self.head_dim, bias=False) + self.v_proj = nn.Linear(self.hidden_size, self.num_heads * self.head_dim, bias=False) + self.o_proj = nn.Linear(self.num_heads * self.head_dim, self.hidden_size, bias=False) + self._init_rope() + + def _init_rope(self): + if self.config.rope_scaling is None: + self.rotary_emb = LlamaRotaryEmbedding(self.head_dim, max_position_embeddings=self.max_position_embeddings) + else: + scaling_type = self.config.rope_scaling["type"] + scaling_factor = self.config.rope_scaling["factor"] + if scaling_type == "linear": + self.rotary_emb = LlamaLinearScalingRotaryEmbedding( + self.head_dim, max_position_embeddings=self.max_position_embeddings, scaling_factor=scaling_factor + ) + elif scaling_type == "dynamic": + self.rotary_emb = LlamaDynamicNTKScalingRotaryEmbedding( + self.head_dim, max_position_embeddings=self.max_position_embeddings, scaling_factor=scaling_factor + ) + elif scaling_type == "ntk": + self.rotary_emb = LlamaNTKRotaryEmbedding( + self.head_dim, max_position_embeddings=self.max_position_embeddings, scaling_factor=scaling_factor + ) + else: + raise ValueError(f"Unknown RoPE scaling type {scaling_type}") + + def _shape(self, tensor: torch.Tensor, seq_len: int, bsz: int): + return tensor.view(bsz, seq_len, self.num_heads, self.head_dim).transpose(1, 2).contiguous() + + def forward( + self, + hidden_states: torch.Tensor, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_value: Optional[Tuple[torch.Tensor]] = None, + output_attentions: bool = False, + use_cache: bool = False, + ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]: + bsz, q_len, _ = hidden_states.size() + + query_states = self.q_proj(hidden_states).view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2) + key_states = self.k_proj(hidden_states).view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2) + value_states = self.v_proj(hidden_states).view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2) + + kv_seq_len = key_states.shape[-2] + if past_key_value is not None: + kv_seq_len += past_key_value[0].shape[-2] + cos, sin = self.rotary_emb(value_states, seq_len=kv_seq_len) + query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin, position_ids) + # [bsz, nh, t, hd] + + if past_key_value is not None: + # reuse k, v, self_attention + key_states = torch.cat([past_key_value[0], key_states], dim=2) + value_states = torch.cat([past_key_value[1], value_states], dim=2) + + past_key_value = (key_states, value_states) if use_cache else None + + attn_weights = torch.matmul(query_states, key_states.transpose(2, 3)) / math.sqrt(self.head_dim) + + if attn_weights.size() != (bsz, self.num_heads, q_len, kv_seq_len): + raise ValueError( + f"Attention weights should be of size {(bsz, self.num_heads, q_len, kv_seq_len)}, but is" + f" {attn_weights.size()}" + ) + + if attention_mask is not None: + if attention_mask.size() != (bsz, 1, q_len, kv_seq_len): + raise ValueError( + f"Attention mask should be of size {(bsz, 1, q_len, kv_seq_len)}, but is {attention_mask.size()}" + ) + attn_weights = attn_weights + attention_mask + dtype_min = torch.tensor( + torch.finfo(attn_weights.dtype).min, device=attn_weights.device, dtype=attn_weights.dtype + ) + attn_weights = torch.max(attn_weights, dtype_min) + + # upcast attention to fp32 + attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to(query_states.dtype) + attn_output = torch.matmul(attn_weights, value_states) + + if attn_output.size() != (bsz, self.num_heads, q_len, self.head_dim): + raise ValueError( + f"`attn_output` should be of size {(bsz, self.num_heads, q_len, self.head_dim)}, but is" + f" {attn_output.size()}" + ) + + attn_output = attn_output.transpose(1, 2) + attn_output = attn_output.reshape(bsz, q_len, self.hidden_size) + + attn_output = self.o_proj(attn_output) + + if not output_attentions: + attn_weights = None + + return attn_output, attn_weights, past_key_value + + +class LlamaDecoderLayer(nn.Module): + def __init__(self, config: LlamaConfig): + super().__init__() + self.hidden_size = config.hidden_size + self.self_attn = LlamaAttention(config=config) + self.mlp = LlamaMLP( + hidden_size=self.hidden_size, + intermediate_size=config.intermediate_size, + hidden_act=config.hidden_act, + ) + self.input_layernorm = LlamaRMSNorm(config.hidden_size, eps=config.rms_norm_eps) + self.post_attention_layernorm = LlamaRMSNorm(config.hidden_size, eps=config.rms_norm_eps) + + def forward( + self, + hidden_states: torch.Tensor, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_value: Optional[Tuple[torch.Tensor]] = None, + output_attentions: Optional[bool] = False, + use_cache: Optional[bool] = False, + ) -> Tuple[torch.FloatTensor, Optional[Tuple[torch.FloatTensor, torch.FloatTensor]]]: + """ + Args: + hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)` + attention_mask (`torch.FloatTensor`, *optional*): attention mask of size + `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values. + output_attentions (`bool`, *optional*): + Whether or not to return the attentions tensors of all attention layers. See `attentions` under + returned tensors for more detail. + use_cache (`bool`, *optional*): + If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding + (see `past_key_values`). + past_key_value (`Tuple(torch.FloatTensor)`, *optional*): cached past key and value projection states + """ + + residual = hidden_states + + hidden_states = self.input_layernorm(hidden_states) + + # Self Attention + hidden_states, self_attn_weights, present_key_value = self.self_attn( + hidden_states=hidden_states, + attention_mask=attention_mask, + position_ids=position_ids, + past_key_value=past_key_value, + output_attentions=output_attentions, + use_cache=use_cache, + ) + hidden_states = residual + hidden_states + + # Fully Connected + residual = hidden_states + hidden_states = self.post_attention_layernorm(hidden_states) + hidden_states = self.mlp(hidden_states) + hidden_states = residual + hidden_states + + outputs = (hidden_states,) + + if output_attentions: + outputs += (self_attn_weights,) + + if use_cache: + outputs += (present_key_value,) + + return outputs + + +LLAMA_START_DOCSTRING = r""" + This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the + library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads + etc.) + + This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass. + Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage + and behavior. + + Parameters: + config ([`LlamaConfig`]): + Model configuration class with all the parameters of the model. Initializing with a config file does not + load the weights associated with the model, only the configuration. Check out the + [`~PreTrainedModel.from_pretrained`] method to load the model weights. +""" + + +@add_start_docstrings( + "The bare LLaMA Model outputting raw hidden-states without any specific head on top.", + LLAMA_START_DOCSTRING, +) +class LlamaPreTrainedModel(PreTrainedModel): + config_class = LlamaConfig + base_model_prefix = "model" + supports_gradient_checkpointing = True + _no_split_modules = ["LlamaDecoderLayer"] + _skip_keys_device_placement = "past_key_values" + + def _init_weights(self, module): + std = self.config.initializer_range + if isinstance(module, nn.Linear): + module.weight.data.normal_(mean=0.0, std=std) + if module.bias is not None: + module.bias.data.zero_() + elif isinstance(module, nn.Embedding): + module.weight.data.normal_(mean=0.0, std=std) + if module.padding_idx is not None: + module.weight.data[module.padding_idx].zero_() + + def _set_gradient_checkpointing(self, module, value=False): + if isinstance(module, LlamaModel): + module.gradient_checkpointing = value + + +LLAMA_INPUTS_DOCSTRING = r""" + Args: + input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`): + Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you provide + it. + + Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and + [`PreTrainedTokenizer.__call__`] for details. + + [What are input IDs?](../glossary#input-ids) + attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*): + Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`: + + - 1 for tokens that are **not masked**, + - 0 for tokens that are **masked**. + + [What are attention masks?](../glossary#attention-mask) + + Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and + [`PreTrainedTokenizer.__call__`] for details. + + If `past_key_values` is used, optionally only the last `decoder_input_ids` have to be input (see + `past_key_values`). + + If you want to change padding behavior, you should read [`modeling_opt._prepare_decoder_attention_mask`] + and modify to your needs. See diagram 1 in [the paper](https://arxiv.org/abs/1910.13461) for more + information on the default strategy. + + - 1 indicates the head is **not masked**, + - 0 indicates the head is **masked**. + position_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): + Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0, + config.n_positions - 1]`. + + [What are position IDs?](../glossary#position-ids) + past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`): + Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of shape + `(batch_size, num_heads, sequence_length, embed_size_per_head)`) and 2 additional tensors of shape + `(batch_size, num_heads, encoder_sequence_length, embed_size_per_head)`. + + Contains pre-computed hidden-states (key and values in the self-attention blocks and in the cross-attention + blocks) that can be used (see `past_key_values` input) to speed up sequential decoding. + + If `past_key_values` are used, the user can optionally input only the last `decoder_input_ids` (those that + don't have their past key value states given to this model) of shape `(batch_size, 1)` instead of all + `decoder_input_ids` of shape `(batch_size, sequence_length)`. + inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*): + Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This + is useful if you want more control over how to convert `input_ids` indices into associated vectors than the + model's internal embedding lookup matrix. + use_cache (`bool`, *optional*): + If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding (see + `past_key_values`). + output_attentions (`bool`, *optional*): + Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned + tensors for more detail. + output_hidden_states (`bool`, *optional*): + Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for + more detail. + return_dict (`bool`, *optional*): + Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. +""" + + +@add_start_docstrings( + "The bare LLaMA Model outputting raw hidden-states without any specific head on top.", + LLAMA_START_DOCSTRING, +) +class LlamaModel(LlamaPreTrainedModel): + """ + Transformer decoder consisting of *config.num_hidden_layers* layers. Each layer is a [`LlamaDecoderLayer`] + + Args: + config: LlamaConfig + """ + + def __init__(self, config: LlamaConfig): + super().__init__(config) + self.padding_idx = config.pad_token_id + self.vocab_size = config.vocab_size + + self.embed_tokens = nn.Embedding(config.vocab_size, config.hidden_size, self.padding_idx) + self.layers = nn.ModuleList([LlamaDecoderLayer(config) for _ in range(config.num_hidden_layers)]) + self.norm = LlamaRMSNorm(config.hidden_size, eps=config.rms_norm_eps) + + self.gradient_checkpointing = False + # Initialize weights and apply final processing + self.post_init() + + def get_input_embeddings(self): + return self.embed_tokens + + def set_input_embeddings(self, value): + self.embed_tokens = value + + # Copied from transformers.models.bart.modeling_bart.BartDecoder._prepare_decoder_attention_mask + def _prepare_decoder_attention_mask(self, attention_mask, input_shape, inputs_embeds, past_key_values_length): + # create causal mask + # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len] + combined_attention_mask = None + if input_shape[-1] > 1: + combined_attention_mask = _make_causal_mask( + input_shape, + inputs_embeds.dtype, + device=inputs_embeds.device, + past_key_values_length=past_key_values_length, + ) + + if attention_mask is not None: + # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len] + expanded_attn_mask = _expand_mask(attention_mask, inputs_embeds.dtype, tgt_len=input_shape[-1]).to( + inputs_embeds.device + ) + combined_attention_mask = ( + expanded_attn_mask if combined_attention_mask is None else expanded_attn_mask + combined_attention_mask + ) + + return combined_attention_mask + + @add_start_docstrings_to_model_forward(LLAMA_INPUTS_DOCSTRING) + def forward( + self, + input_ids: torch.LongTensor = None, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_values: Optional[List[torch.FloatTensor]] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[Tuple, BaseModelOutputWithPast]: + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + use_cache = use_cache if use_cache is not None else self.config.use_cache + + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + # retrieve input_ids and inputs_embeds + if input_ids is not None and inputs_embeds is not None: + raise ValueError("You cannot specify both decoder_input_ids and decoder_inputs_embeds at the same time") + elif input_ids is not None: + batch_size, seq_length = input_ids.shape + elif inputs_embeds is not None: + batch_size, seq_length, _ = inputs_embeds.shape + else: + raise ValueError("You have to specify either decoder_input_ids or decoder_inputs_embeds") + + seq_length_with_past = seq_length + past_key_values_length = 0 + + if past_key_values is not None: + past_key_values_length = past_key_values[0][0].shape[2] + seq_length_with_past = seq_length_with_past + past_key_values_length + + if position_ids is None: + device = input_ids.device if input_ids is not None else inputs_embeds.device + position_ids = torch.arange( + past_key_values_length, seq_length + past_key_values_length, dtype=torch.long, device=device + ) + position_ids = position_ids.unsqueeze(0).view(-1, seq_length) + else: + position_ids = position_ids.view(-1, seq_length).long() + + if inputs_embeds is None: + inputs_embeds = self.embed_tokens(input_ids) + # embed positions + if attention_mask is None: + attention_mask = torch.ones( + (batch_size, seq_length_with_past), dtype=torch.bool, device=inputs_embeds.device + ) + attention_mask = self._prepare_decoder_attention_mask( + attention_mask, (batch_size, seq_length), inputs_embeds, past_key_values_length + ) + + hidden_states = inputs_embeds + + if self.gradient_checkpointing and self.training: + if use_cache: + logger.warning_once( + "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..." + ) + use_cache = False + + # decoder layers + all_hidden_states = () if output_hidden_states else None + all_self_attns = () if output_attentions else None + next_decoder_cache = () if use_cache else None + + for idx, decoder_layer in enumerate(self.layers): + if output_hidden_states: + all_hidden_states += (hidden_states,) + + past_key_value = past_key_values[idx] if past_key_values is not None else None + + if self.gradient_checkpointing and self.training: + + def create_custom_forward(module): + def custom_forward(*inputs): + # None for past_key_value + return module(*inputs, output_attentions, None) + + return custom_forward + + layer_outputs = torch.utils.checkpoint.checkpoint( + create_custom_forward(decoder_layer), + hidden_states, + attention_mask, + position_ids, + None, + ) + else: + layer_outputs = decoder_layer( + hidden_states, + attention_mask=attention_mask, + position_ids=position_ids, + past_key_value=past_key_value, + output_attentions=output_attentions, + use_cache=use_cache, + ) + + hidden_states = layer_outputs[0] + + if use_cache: + next_decoder_cache += (layer_outputs[2 if output_attentions else 1],) + + if output_attentions: + all_self_attns += (layer_outputs[1],) + + hidden_states = self.norm(hidden_states) + + # add hidden states from the last decoder layer + if output_hidden_states: + all_hidden_states += (hidden_states,) + + next_cache = next_decoder_cache if use_cache else None + if not return_dict: + return tuple(v for v in [hidden_states, next_cache, all_hidden_states, all_self_attns] if v is not None) + return BaseModelOutputWithPast( + last_hidden_state=hidden_states, + past_key_values=next_cache, + hidden_states=all_hidden_states, + attentions=all_self_attns, + ) + + +class LlamaForCausalLM(LlamaPreTrainedModel): + _tied_weights_keys = ["lm_head.weight"] + + def __init__(self, config): + super().__init__(config) + self.model = LlamaModel(config) + + self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False) + + # Initialize weights and apply final processing + self.post_init() + + def get_input_embeddings(self): + return self.model.embed_tokens + + def set_input_embeddings(self, value): + self.model.embed_tokens = value + + def get_output_embeddings(self): + return self.lm_head + + def set_output_embeddings(self, new_embeddings): + self.lm_head = new_embeddings + + def set_decoder(self, decoder): + self.model = decoder + + def get_decoder(self): + return self.model + + @add_start_docstrings_to_model_forward(LLAMA_INPUTS_DOCSTRING) + @replace_return_docstrings(output_type=CausalLMOutputWithPast, config_class=_CONFIG_FOR_DOC) + def forward( + self, + input_ids: torch.LongTensor = None, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_values: Optional[List[torch.FloatTensor]] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + labels: Optional[torch.LongTensor] = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[Tuple, CausalLMOutputWithPast]: + r""" + Args: + labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): + Labels for computing the masked language modeling loss. Indices should either be in `[0, ..., + config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored + (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`. + + Returns: + + Example: + + ```python + >>> from transformers import AutoTokenizer, LlamaForCausalLM + + >>> model = LlamaForCausalLM.from_pretrained(PATH_TO_CONVERTED_WEIGHTS) + >>> tokenizer = AutoTokenizer.from_pretrained(PATH_TO_CONVERTED_TOKENIZER) + + >>> prompt = "Hey, are you conscious? Can you talk to me?" + >>> inputs = tokenizer(prompt, return_tensors="pt") + + >>> # Generate + >>> generate_ids = model.generate(inputs.input_ids, max_length=30) + >>> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0] + "Hey, are you conscious? Can you talk to me?\nI'm not conscious, but I can talk to you." + ```""" + + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + # decoder outputs consists of (dec_features, layer_state, dec_hidden, dec_attn) + outputs = self.model( + input_ids=input_ids, + attention_mask=attention_mask, + position_ids=position_ids, + past_key_values=past_key_values, + inputs_embeds=inputs_embeds, + use_cache=use_cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + hidden_states = outputs[0] + logits = self.lm_head(hidden_states) + + loss = None + if labels is not None: + # Shift so that tokens < n predict n + shift_logits = logits[..., :-1, :].contiguous() + shift_labels = labels[..., 1:].contiguous() + # Flatten the tokens + loss_fct = CrossEntropyLoss() + shift_logits = shift_logits.view(-1, self.config.vocab_size) + shift_labels = shift_labels.view(-1) + # Enable model parallelism + shift_labels = shift_labels.to(shift_logits.device) + loss = loss_fct(shift_logits, shift_labels) + + if not return_dict: + output = (logits,) + outputs[1:] + return (loss,) + output if loss is not None else output + + return CausalLMOutputWithPast( + loss=loss, + logits=logits, + past_key_values=outputs.past_key_values, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) + + def prepare_inputs_for_generation( + self, input_ids, past_key_values=None, attention_mask=None, inputs_embeds=None, **kwargs + ): + if past_key_values: + input_ids = input_ids[:, -1:] + + position_ids = kwargs.get("position_ids", None) + if attention_mask is not None and position_ids is None: + # create position_ids on the fly for batch generation + position_ids = attention_mask.long().cumsum(-1) - 1 + position_ids.masked_fill_(attention_mask == 0, 1) + if past_key_values: + position_ids = position_ids[:, -1].unsqueeze(-1) + + # if `inputs_embeds` are passed, we only want to use them in the 1st generation step + if inputs_embeds is not None and past_key_values is None: + model_inputs = {"inputs_embeds": inputs_embeds} + else: + model_inputs = {"input_ids": input_ids} + + model_inputs.update( + { + "position_ids": position_ids, + "past_key_values": past_key_values, + "use_cache": kwargs.get("use_cache"), + "attention_mask": attention_mask, + } + ) + return model_inputs + + @staticmethod + def _reorder_cache(past_key_values, beam_idx): + reordered_past = () + for layer_past in past_key_values: + reordered_past += ( + tuple(past_state.index_select(0, beam_idx.to(past_state.device)) for past_state in layer_past), + ) + return reordered_past + + +@add_start_docstrings( + """ + The LLaMa Model transformer with a sequence classification head on top (linear layer). + + [`LlamaForSequenceClassification`] uses the last token in order to do the classification, as other causal models + (e.g. GPT-2) do. + + Since it does classification on the last token, it requires to know the position of the last token. If a + `pad_token_id` is defined in the configuration, it finds the last token that is not a padding token in each row. If + no `pad_token_id` is defined, it simply takes the last value in each row of the batch. Since it cannot guess the + padding tokens when `inputs_embeds` are passed instead of `input_ids`, it does the same (take the last value in + each row of the batch). + """, + LLAMA_START_DOCSTRING, +) +class LlamaForSequenceClassification(LlamaPreTrainedModel): + def __init__(self, config): + super().__init__(config) + self.num_labels = config.num_labels + self.model = LlamaModel(config) + self.score = nn.Linear(config.hidden_size, self.num_labels, bias=False) + + # Initialize weights and apply final processing + self.post_init() + + def get_input_embeddings(self): + return self.model.embed_tokens + + def set_input_embeddings(self, value): + self.model.embed_tokens = value + + @add_start_docstrings_to_model_forward(LLAMA_INPUTS_DOCSTRING) + def forward( + self, + input_ids: torch.LongTensor = None, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_values: Optional[List[torch.FloatTensor]] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + labels: Optional[torch.LongTensor] = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[Tuple, SequenceClassifierOutputWithPast]: + r""" + labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*): + Labels for computing the sequence classification/regression loss. Indices should be in `[0, ..., + config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If + `config.num_labels > 1` a classification loss is computed (Cross-Entropy). + """ + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + transformer_outputs = self.model( + input_ids, + attention_mask=attention_mask, + position_ids=position_ids, + past_key_values=past_key_values, + inputs_embeds=inputs_embeds, + use_cache=use_cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + hidden_states = transformer_outputs[0] + logits = self.score(hidden_states) + + if input_ids is not None: + batch_size = input_ids.shape[0] + else: + batch_size = inputs_embeds.shape[0] + + if self.config.pad_token_id is None and batch_size != 1: + raise ValueError("Cannot handle batch sizes > 1 if no padding token is defined.") + if self.config.pad_token_id is None: + sequence_lengths = -1 + else: + if input_ids is not None: + sequence_lengths = (torch.ne(input_ids, self.config.pad_token_id).sum(-1) - 1).to(logits.device) + else: + sequence_lengths = -1 + + pooled_logits = logits[torch.arange(batch_size, device=logits.device), sequence_lengths] + + loss = None + if labels is not None: + labels = labels.to(logits.device) + if self.config.problem_type is None: + if self.num_labels == 1: + self.config.problem_type = "regression" + elif self.num_labels > 1 and (labels.dtype == torch.long or labels.dtype == torch.int): + self.config.problem_type = "single_label_classification" + else: + self.config.problem_type = "multi_label_classification" + + if self.config.problem_type == "regression": + loss_fct = MSELoss() + if self.num_labels == 1: + loss = loss_fct(pooled_logits.squeeze(), labels.squeeze()) + else: + loss = loss_fct(pooled_logits, labels) + elif self.config.problem_type == "single_label_classification": + loss_fct = CrossEntropyLoss() + loss = loss_fct(pooled_logits.view(-1, self.num_labels), labels.view(-1)) + elif self.config.problem_type == "multi_label_classification": + loss_fct = BCEWithLogitsLoss() + loss = loss_fct(pooled_logits, labels) + if not return_dict: + output = (pooled_logits,) + transformer_outputs[1:] + return ((loss,) + output) if loss is not None else output + + return SequenceClassifierOutputWithPast( + loss=loss, + logits=pooled_logits, + past_key_values=transformer_outputs.past_key_values, + hidden_states=transformer_outputs.hidden_states, + attentions=transformer_outputs.attentions, + ) \ No newline at end of file diff --git a/pytorch_model-00001-of-00002.bin b/pytorch_model-00001-of-00002.bin new file mode 100644 index 0000000..0c1c96f --- /dev/null +++ b/pytorch_model-00001-of-00002.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:31a8a95a555a96ad43f780e0780313432c693859fad875dab0e8b9e2288e36ca +size 9990691091 diff --git a/pytorch_model-00002-of-00002.bin b/pytorch_model-00002-of-00002.bin new file mode 100644 index 0000000..eb1fb65 --- /dev/null +++ b/pytorch_model-00002-of-00002.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8d2a8c8d9319e9d6d34069bbb45881852804059e60abdea94e756835700698b9 +size 3715284377 diff --git a/pytorch_model.bin.index.json b/pytorch_model.bin.index.json new file mode 100644 index 0000000..6b31863 --- /dev/null +++ b/pytorch_model.bin.index.json @@ -0,0 +1,244 @@ +{ + "metadata": { + "total_size": 13705894400 + }, + "weight_map": { + "lm_head.weight": "pytorch_model-00002-of-00002.bin", + "model.embed_tokens.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.0.input_layernorm.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.0.mlp.down_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.0.mlp.gate_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.0.mlp.up_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.0.post_attention_layernorm.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.0.self_attn.k_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.0.self_attn.o_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.0.self_attn.q_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.0.self_attn.v_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.1.input_layernorm.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.1.mlp.down_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.1.mlp.gate_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.1.mlp.up_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.1.post_attention_layernorm.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.1.self_attn.k_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.1.self_attn.o_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.1.self_attn.q_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.1.self_attn.v_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.10.input_layernorm.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.10.mlp.down_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.10.mlp.gate_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.10.mlp.up_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.10.post_attention_layernorm.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.10.self_attn.k_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.10.self_attn.o_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.10.self_attn.q_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.10.self_attn.v_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.11.input_layernorm.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.11.mlp.down_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.11.mlp.gate_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.11.mlp.up_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.11.post_attention_layernorm.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.11.self_attn.k_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.11.self_attn.o_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.11.self_attn.q_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.11.self_attn.v_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.12.input_layernorm.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.12.mlp.down_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.12.mlp.gate_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.12.mlp.up_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.12.post_attention_layernorm.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.12.self_attn.k_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.12.self_attn.o_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.12.self_attn.q_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.12.self_attn.v_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.13.input_layernorm.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.13.mlp.down_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.13.mlp.gate_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.13.mlp.up_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.13.post_attention_layernorm.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.13.self_attn.k_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.13.self_attn.o_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.13.self_attn.q_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.13.self_attn.v_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.14.input_layernorm.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.14.mlp.down_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.14.mlp.gate_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.14.mlp.up_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.14.post_attention_layernorm.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.14.self_attn.k_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.14.self_attn.o_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.14.self_attn.q_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.14.self_attn.v_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.15.input_layernorm.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.15.mlp.down_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.15.mlp.gate_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.15.mlp.up_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.15.post_attention_layernorm.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.15.self_attn.k_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.15.self_attn.o_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.15.self_attn.q_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.15.self_attn.v_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.16.input_layernorm.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.16.mlp.down_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.16.mlp.gate_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.16.mlp.up_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.16.post_attention_layernorm.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.16.self_attn.k_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.16.self_attn.o_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.16.self_attn.q_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.16.self_attn.v_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.17.input_layernorm.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.17.mlp.down_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.17.mlp.gate_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.17.mlp.up_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.17.post_attention_layernorm.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.17.self_attn.k_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.17.self_attn.o_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.17.self_attn.q_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.17.self_attn.v_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.18.input_layernorm.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.18.mlp.down_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.18.mlp.gate_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.18.mlp.up_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.18.post_attention_layernorm.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.18.self_attn.k_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.18.self_attn.o_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.18.self_attn.q_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.18.self_attn.v_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.19.input_layernorm.weight": "pytorch_model-00002-of-00002.bin", + "model.layers.19.mlp.down_proj.weight": "pytorch_model-00002-of-00002.bin", + "model.layers.19.mlp.gate_proj.weight": "pytorch_model-00002-of-00002.bin", + "model.layers.19.mlp.up_proj.weight": "pytorch_model-00002-of-00002.bin", + "model.layers.19.post_attention_layernorm.weight": "pytorch_model-00002-of-00002.bin", + "model.layers.19.self_attn.k_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.19.self_attn.o_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.19.self_attn.q_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.19.self_attn.v_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.2.input_layernorm.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.2.mlp.down_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.2.mlp.gate_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.2.mlp.up_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.2.post_attention_layernorm.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.2.self_attn.k_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.2.self_attn.o_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.2.self_attn.q_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.2.self_attn.v_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.20.input_layernorm.weight": "pytorch_model-00002-of-00002.bin", + "model.layers.20.mlp.down_proj.weight": "pytorch_model-00002-of-00002.bin", + "model.layers.20.mlp.gate_proj.weight": "pytorch_model-00002-of-00002.bin", + "model.layers.20.mlp.up_proj.weight": "pytorch_model-00002-of-00002.bin", + "model.layers.20.post_attention_layernorm.weight": "pytorch_model-00002-of-00002.bin", + "model.layers.20.self_attn.k_proj.weight": "pytorch_model-00002-of-00002.bin", + "model.layers.20.self_attn.o_proj.weight": "pytorch_model-00002-of-00002.bin", + "model.layers.20.self_attn.q_proj.weight": "pytorch_model-00002-of-00002.bin", + "model.layers.20.self_attn.v_proj.weight": "pytorch_model-00002-of-00002.bin", + "model.layers.21.input_layernorm.weight": "pytorch_model-00002-of-00002.bin", + "model.layers.21.mlp.down_proj.weight": "pytorch_model-00002-of-00002.bin", + "model.layers.21.mlp.gate_proj.weight": "pytorch_model-00002-of-00002.bin", + "model.layers.21.mlp.up_proj.weight": "pytorch_model-00002-of-00002.bin", + "model.layers.21.post_attention_layernorm.weight": "pytorch_model-00002-of-00002.bin", + "model.layers.21.self_attn.k_proj.weight": "pytorch_model-00002-of-00002.bin", + "model.layers.21.self_attn.o_proj.weight": "pytorch_model-00002-of-00002.bin", + "model.layers.21.self_attn.q_proj.weight": "pytorch_model-00002-of-00002.bin", + "model.layers.21.self_attn.v_proj.weight": "pytorch_model-00002-of-00002.bin", + "model.layers.22.input_layernorm.weight": "pytorch_model-00002-of-00002.bin", + "model.layers.22.mlp.down_proj.weight": "pytorch_model-00002-of-00002.bin", + "model.layers.22.mlp.gate_proj.weight": "pytorch_model-00002-of-00002.bin", + "model.layers.22.mlp.up_proj.weight": "pytorch_model-00002-of-00002.bin", + "model.layers.22.post_attention_layernorm.weight": "pytorch_model-00002-of-00002.bin", + "model.layers.22.self_attn.k_proj.weight": "pytorch_model-00002-of-00002.bin", + "model.layers.22.self_attn.o_proj.weight": "pytorch_model-00002-of-00002.bin", + "model.layers.22.self_attn.q_proj.weight": "pytorch_model-00002-of-00002.bin", + "model.layers.22.self_attn.v_proj.weight": "pytorch_model-00002-of-00002.bin", + "model.layers.23.input_layernorm.weight": "pytorch_model-00002-of-00002.bin", + "model.layers.23.mlp.down_proj.weight": "pytorch_model-00002-of-00002.bin", + "model.layers.23.mlp.gate_proj.weight": "pytorch_model-00002-of-00002.bin", + "model.layers.23.mlp.up_proj.weight": "pytorch_model-00002-of-00002.bin", + "model.layers.23.post_attention_layernorm.weight": "pytorch_model-00002-of-00002.bin", + "model.layers.23.self_attn.k_proj.weight": "pytorch_model-00002-of-00002.bin", + "model.layers.23.self_attn.o_proj.weight": "pytorch_model-00002-of-00002.bin", + "model.layers.23.self_attn.q_proj.weight": "pytorch_model-00002-of-00002.bin", + "model.layers.23.self_attn.v_proj.weight": "pytorch_model-00002-of-00002.bin", + "model.layers.24.input_layernorm.weight": "pytorch_model-00002-of-00002.bin", + "model.layers.24.mlp.down_proj.weight": "pytorch_model-00002-of-00002.bin", + "model.layers.24.mlp.gate_proj.weight": "pytorch_model-00002-of-00002.bin", + "model.layers.24.mlp.up_proj.weight": "pytorch_model-00002-of-00002.bin", + "model.layers.24.post_attention_layernorm.weight": "pytorch_model-00002-of-00002.bin", + "model.layers.24.self_attn.k_proj.weight": "pytorch_model-00002-of-00002.bin", + "model.layers.24.self_attn.o_proj.weight": "pytorch_model-00002-of-00002.bin", + "model.layers.24.self_attn.q_proj.weight": "pytorch_model-00002-of-00002.bin", + "model.layers.24.self_attn.v_proj.weight": "pytorch_model-00002-of-00002.bin", + "model.layers.25.input_layernorm.weight": "pytorch_model-00002-of-00002.bin", + "model.layers.25.mlp.down_proj.weight": "pytorch_model-00002-of-00002.bin", + "model.layers.25.mlp.gate_proj.weight": "pytorch_model-00002-of-00002.bin", + "model.layers.25.mlp.up_proj.weight": "pytorch_model-00002-of-00002.bin", + "model.layers.25.post_attention_layernorm.weight": "pytorch_model-00002-of-00002.bin", + "model.layers.25.self_attn.k_proj.weight": "pytorch_model-00002-of-00002.bin", + "model.layers.25.self_attn.o_proj.weight": "pytorch_model-00002-of-00002.bin", + "model.layers.25.self_attn.q_proj.weight": "pytorch_model-00002-of-00002.bin", + "model.layers.25.self_attn.v_proj.weight": "pytorch_model-00002-of-00002.bin", + "model.layers.3.input_layernorm.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.3.mlp.down_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.3.mlp.gate_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.3.mlp.up_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.3.post_attention_layernorm.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.3.self_attn.k_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.3.self_attn.o_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.3.self_attn.q_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.3.self_attn.v_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.4.input_layernorm.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.4.mlp.down_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.4.mlp.gate_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.4.mlp.up_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.4.post_attention_layernorm.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.4.self_attn.k_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.4.self_attn.o_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.4.self_attn.q_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.4.self_attn.v_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.5.input_layernorm.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.5.mlp.down_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.5.mlp.gate_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.5.mlp.up_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.5.post_attention_layernorm.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.5.self_attn.k_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.5.self_attn.o_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.5.self_attn.q_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.5.self_attn.v_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.6.input_layernorm.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.6.mlp.down_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.6.mlp.gate_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.6.mlp.up_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.6.post_attention_layernorm.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.6.self_attn.k_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.6.self_attn.o_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.6.self_attn.q_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.6.self_attn.v_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.7.input_layernorm.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.7.mlp.down_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.7.mlp.gate_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.7.mlp.up_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.7.post_attention_layernorm.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.7.self_attn.k_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.7.self_attn.o_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.7.self_attn.q_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.7.self_attn.v_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.8.input_layernorm.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.8.mlp.down_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.8.mlp.gate_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.8.mlp.up_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.8.post_attention_layernorm.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.8.self_attn.k_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.8.self_attn.o_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.8.self_attn.q_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.8.self_attn.v_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.9.input_layernorm.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.9.mlp.down_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.9.mlp.gate_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.9.mlp.up_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.9.post_attention_layernorm.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.9.self_attn.k_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.9.self_attn.o_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.9.self_attn.q_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.9.self_attn.v_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.norm.weight": "pytorch_model-00002-of-00002.bin" + } +} diff --git a/special_tokens_map.json b/special_tokens_map.json new file mode 100644 index 0000000..d85ba6c --- /dev/null +++ b/special_tokens_map.json @@ -0,0 +1,23 @@ +{ + "bos_token": { + "content": "", + "lstrip": false, + "normalized": true, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "", + "lstrip": false, + "normalized": true, + "rstrip": false, + "single_word": false + }, + "unk_token": { + "content": "", + "lstrip": false, + "normalized": true, + "rstrip": false, + "single_word": false + } +} diff --git a/tokenizer.model b/tokenizer.model new file mode 100644 index 0000000..8d6be7d --- /dev/null +++ b/tokenizer.model @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ab1b681ec7fc02fed5edd3026687d7a692a918c4dd8e150ca2e3994a6229843b +size 534194 diff --git a/tokenizer_config.json b/tokenizer_config.json new file mode 100644 index 0000000..8eb8ebb --- /dev/null +++ b/tokenizer_config.json @@ -0,0 +1,33 @@ +{ + "add_bos_token": true, + "add_eos_token": false, + "bos_token": { + "__type": "AddedToken", + "content": "", + "lstrip": false, + "normalized": true, + "rstrip": false, + "single_word": false + }, + "clean_up_tokenization_spaces": false, + "eos_token": { + "__type": "AddedToken", + "content": "", + "lstrip": false, + "normalized": true, + "rstrip": false, + "single_word": false + }, + "model_max_length": 8192, + "pad_token": null, + "sp_model_kwargs": {}, + "tokenizer_class": "LlamaTokenizer", + "unk_token": { + "__type": "AddedToken", + "content": "", + "lstrip": false, + "normalized": true, + "rstrip": false, + "single_word": false + } +}