diff --git a/python/sglang/srt/models/qwen2_moe.py b/python/sglang/srt/models/qwen2_moe.py
index b5ef472ce..5c3c8d612 100644
--- a/python/sglang/srt/models/qwen2_moe.py
+++ b/python/sglang/srt/models/qwen2_moe.py
@@ -170,6 +170,7 @@ class Qwen2MoeAttention(nn.Module):
         rope_theta: float = 10000,
         rope_scaling: Optional[Dict[str, Any]] = None,
         max_position_embeddings: int = 8192,
+        qkv_bias: bool = True,
         quant_config: Optional[QuantizationConfig] = None,
         prefix: str = "",
     ) -> None:
@@ -201,7 +202,7 @@
             self.head_dim,
             self.total_num_heads,
             self.total_num_kv_heads,
-            bias=True,
+            bias=qkv_bias,
             quant_config=quant_config,
             prefix=add_prefix("qkv_proj", prefix),
         )
@@ -257,6 +258,8 @@ class Qwen2MoeDecoderLayer(nn.Module):
         rope_theta = getattr(config, "rope_theta", 10000)
         rope_scaling = getattr(config, "rope_scaling", None)
         max_position_embeddings = getattr(config, "max_position_embeddings", 8192)
+        # note: replace config.num_hidden_layers < 80 with True once it's available in transformers 4.50.0
+        qkv_bias = getattr(config, "qkv_bias", config.num_hidden_layers < 80)
         self.self_attn = Qwen2MoeAttention(
             hidden_size=self.hidden_size,
             num_heads=config.num_attention_heads,
@@ -266,6 +269,7 @@
             rope_scaling=rope_scaling,
             max_position_embeddings=max_position_embeddings,
             quant_config=quant_config,
+            qkv_bias=qkv_bias,
             prefix=add_prefix("self_attn", prefix),
         )
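
Not part of the patch: a minimal standalone sketch of how the new `qkv_bias` default resolves for a few hypothetical config shapes. The `SimpleNamespace` configs and layer counts below are illustrative only; the attribute names mirror the diff.

```python
from types import SimpleNamespace

# Hypothetical configs, for illustration only; attribute names mirror the diff.
configs = {
    "no qkv_bias, shallow model": SimpleNamespace(num_hidden_layers=28),
    "no qkv_bias, 80-layer model": SimpleNamespace(num_hidden_layers=80),
    "explicit qkv_bias in config": SimpleNamespace(num_hidden_layers=80, qkv_bias=True),
}

for name, cfg in configs.items():
    # Same fallback the patch adds in Qwen2MoeDecoderLayer.__init__:
    # prefer config.qkv_bias if present, otherwise infer it from the
    # depth heuristic (num_hidden_layers < 80).
    qkv_bias = getattr(cfg, "qkv_bias", cfg.num_hidden_layers < 80)
    print(f"{name}: qkv_bias={qkv_bias}")
```

With these assumed configs, the shallow model keeps the old behavior (`qkv_bias=True`), the 80-layer model without the attribute falls back to `False` via the heuristic, and a config that already carries `qkv_bias` is used as-is.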