Update comment in qwen2.py (#4447)

This commit is contained in:
Michael Feil
2025-03-15 21:14:29 -07:00
committed by GitHub
parent bf63ee54ed
commit 1fd0cf8a7b

View File

@@ -170,6 +170,7 @@ class Qwen2MoeAttention(nn.Module):
rope_theta: float = 10000,
rope_scaling: Optional[Dict[str, Any]] = None,
max_position_embeddings: int = 8192,
qkv_bias: bool = True,
quant_config: Optional[QuantizationConfig] = None,
prefix: str = "",
) -> None:
@@ -201,7 +202,7 @@ class Qwen2MoeAttention(nn.Module):
self.head_dim,
self.total_num_heads,
self.total_num_kv_heads,
bias=True,
bias=qkv_bias,
quant_config=quant_config,
prefix=add_prefix("qkv_proj", prefix),
)
@@ -257,6 +258,8 @@ class Qwen2MoeDecoderLayer(nn.Module):
rope_theta = getattr(config, "rope_theta", 10000)
rope_scaling = getattr(config, "rope_scaling", None)
max_position_embeddings = getattr(config, "max_position_embeddings", 8192)
# note: replace config.num_hidden_layers < 80 with True once it's available in transformers 4.50.0
qkv_bias = getattr(config, "qkv_bias", config.num_hidden_layers < 80)
self.self_attn = Qwen2MoeAttention(
hidden_size=self.hidden_size,
num_heads=config.num_attention_heads,
@@ -266,6 +269,7 @@ class Qwen2MoeDecoderLayer(nn.Module):
rope_scaling=rope_scaling,
max_position_embeddings=max_position_embeddings,
quant_config=quant_config,
qkv_bias=qkv_bias,
prefix=add_prefix("self_attn", prefix),
)