update modeling file to newest

x54-729
2024-08-20 06:49:50 +00:00
parent 5729cef948
commit ece3aef931
2 changed files with 814 additions and 368 deletions


@@ -44,9 +44,9 @@ class InternLM2Config(PretrainedConfig):
         intermediate_size (`int`, *optional*, defaults to 11008):
             Dimension of the MLP representations.
         num_hidden_layers (`int`, *optional*, defaults to 32):
-            Number of hidden layers in the Transformer encoder.
+            Number of hidden layers in the Transformer decoder.
         num_attention_heads (`int`, *optional*, defaults to 32):
-            Number of attention heads for each attention layer in the Transformer encoder.
+            Number of attention heads for each attention layer in the Transformer decoder.
         num_key_value_heads (`int`, *optional*):
             This is the number of key_value heads that should be used to implement Grouped Query Attention. If
             `num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if
@@ -58,22 +58,42 @@ class InternLM2Config(PretrainedConfig):
         hidden_act (`str` or `function`, *optional*, defaults to `"silu"`):
             The non-linear activation function (function or string) in the decoder.
         max_position_embeddings (`int`, *optional*, defaults to 2048):
-            The maximum sequence length that this model might ever be used with. Typically set this to something large
-            just in case (e.g., 512 or 1024 or 2048).
+            The maximum sequence length that this model might ever be used with. InternLM2 supports up to 32768 tokens.
         initializer_range (`float`, *optional*, defaults to 0.02):
             The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
-        rms_norm_eps (`float`, *optional*, defaults to 1e-12):
+        rms_norm_eps (`float`, *optional*, defaults to 1e-06):
             The epsilon used by the rms normalization layers.
         use_cache (`bool`, *optional*, defaults to `True`):
             Whether or not the model should return the last key/values attentions (not used by all models). Only
             relevant if `config.is_decoder=True`.
-        tie_word_embeddings(`bool`, *optional*, defaults to `False`):
+        pad_token_id (`int`, *optional*):
+            Padding token id.
+        bos_token_id (`int`, *optional*, defaults to 1):
+            Beginning of stream token id.
+        eos_token_id (`int`, *optional*, defaults to 2):
+            End of stream token id.
+        pretraining_tp (`int`, *optional*, defaults to 1):
+            Experimental feature. Tensor parallelism rank used during pretraining. Please refer to [this
+            document](https://huggingface.co/docs/transformers/main/perf_train_gpu_many#tensor-parallelism)
+            to understand more about it. This value is necessary to ensure exact reproducibility
+            of the pretraining results. Please refer to [this
+            issue](https://github.com/pytorch/pytorch/issues/76232).
+        tie_word_embeddings (`bool`, *optional*, defaults to `False`):
             Whether to tie weight embeddings
-    Example:
-
+        rope_theta (`float`, *optional*, defaults to 10000.0):
+            The base period of the RoPE embeddings.
+        rope_scaling (`Dict`, *optional*):
+            Dictionary containing the scaling configuration for the RoPE embeddings. Currently supports two scaling
+            strategies: linear and dynamic. Their scaling factor must be a float greater than 1. The expected format is
+            `{"type": strategy name, "factor": scaling factor}`. When using this flag, don't update
+            `max_position_embeddings` to the expected new maximum. See the following thread for more information on how
+            these scaling strategies behave:
+            https://www.reddit.com/r/LocalLLaMA/comments/14mrgpr/dynamically_scaled_rope_further_increases/. This is an
+            experimental feature, subject to breaking API changes in future versions.
     """
-    model_type = "internlm2"
     _auto_class = "AutoConfig"
+    model_type = "internlm2"
+    keys_to_ignore_at_inference = ["past_key_values"]
 
     def __init__(  # pylint: disable=W0102
         self,
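
As a usage note for the newly documented `rope_scaling` format, here is a minimal sketch. It assumes the configuration file being diffed is importable locally as `configuration_internlm2`; the values shown are illustrative, not the defaults of any released checkpoint.

```python
# Minimal sketch: building a config with the documented rope_scaling format.
# Assumes configuration_internlm2.py (the file diffed above) is importable locally.
from configuration_internlm2 import InternLM2Config

config = InternLM2Config(
    max_position_embeddings=32768,   # illustrative; the documented default stays 2048
    rms_norm_eps=1e-6,
    rope_theta=10000.0,
    # Expected format per the docstring: {"type": "linear" | "dynamic", "factor": >= 1}
    rope_scaling={"type": "dynamic", "factor": 2.0},
)
print(config.rope_scaling)  # {'type': 'dynamic', 'factor': 2.0}
```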
@@ -91,11 +111,12 @@ class InternLM2Config(PretrainedConfig):
         pad_token_id=0,
         bos_token_id=1,
         eos_token_id=2,
+        pretraining_tp=1,
         tie_word_embeddings=False,
         bias=True,
         rope_theta=10000,
         rope_scaling=None,
-        attn_implementation="eager",
+        attn_implementation=None,
         **kwargs,
     ):
         self.vocab_size = vocab_size
@@ -113,14 +134,15 @@ class InternLM2Config(PretrainedConfig):
         self.hidden_act = hidden_act
         self.initializer_range = initializer_range
         self.rms_norm_eps = rms_norm_eps
+        self.pretraining_tp = pretraining_tp
         self.use_cache = use_cache
         self.rope_theta = rope_theta
         self.rope_scaling = rope_scaling
         self._rope_scaling_validation()
         self.attn_implementation = attn_implementation
         if self.attn_implementation is None:
             self.attn_implementation = "eager"
 
         super().__init__(
             pad_token_id=pad_token_id,
             bos_token_id=bos_token_id,
@@ -147,5 +169,12 @@ class InternLM2Config(PretrainedConfig):
             raise ValueError(
                 f"`rope_scaling`'s type field must be one of ['linear', 'dynamic'], got {rope_scaling_type}"
             )
-        if rope_scaling_factor is None or not isinstance(rope_scaling_factor, float) or rope_scaling_factor < 1.0:
-            raise ValueError(f"`rope_scaling`'s factor field must be a float >= 1, got {rope_scaling_factor}")
+        if (
+            rope_scaling_factor is None
+            or not isinstance(rope_scaling_factor, (float, int))
+            or rope_scaling_factor < 1.0
+        ):
+            raise ValueError(
+                f"`rope_scaling`'s factor field must be a number >= 1, got {rope_scaling_factor} "
+                f"of type {type(rope_scaling_factor)}"
+            )
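
The relaxed check above now accepts integer scaling factors, which previously failed the `isinstance(..., float)` test; a missing factor or one below 1 still raises, now with the more detailed message. A minimal sketch under the same local-import assumption:

```python
from configuration_internlm2 import InternLM2Config  # assumed local import

# An integer factor now passes validation (previously a float was required).
cfg = InternLM2Config(rope_scaling={"type": "linear", "factor": 2})
print(cfg.rope_scaling)  # {'type': 'linear', 'factor': 2}

# A factor below 1 is still rejected.
try:
    InternLM2Config(rope_scaling={"type": "dynamic", "factor": 0.5})
except ValueError as err:
    print(err)
```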

File diff suppressed because it is too large