Upgrade to vllm 0.17.0 corex v4.1 overlay

Commit date: 2026-04-29 19:38:22 +08:00
parent 8fac6062e4
commit 938d0854a5
430 changed files with 35969 additions and 14511 deletions

View File

@@ -51,7 +51,7 @@ from vllm.v1.attention.backend import AttentionType
from .interfaces import SupportsEagle3, SupportsLoRA, SupportsPP
from .qwen2 import Qwen2MLP as Qwen3MLP
from .qwen2 import Qwen2Model
from .utils import AutoWeightsLoader, PPMissingLayer, extract_layer_index, maybe_prefix, reparse_quant_config
from .utils import AutoWeightsLoader, PPMissingLayer, extract_layer_index, maybe_prefix
logger = init_logger(__name__)
@@ -142,7 +142,6 @@ class Qwen3Attention(nn.Module):
self.q_norm = RMSNorm(self.head_dim, eps=rms_norm_eps)
self.k_norm = RMSNorm(self.head_dim, eps=rms_norm_eps)
self.qk_norm = RMSNormQK(self.head_dim, self.head_dim, eps=rms_norm_eps)
def forward(
self,
positions: torch.Tensor,
@@ -150,17 +149,19 @@ class Qwen3Attention(nn.Module):
) -> torch.Tensor:
qkv, _ = self.qkv_proj(hidden_states)
q, k, v = qkv.split([self.q_size, self.kv_size, self.kv_size], dim=-1)
# # Add qk-norm
# Add qk-norm
# q_by_head = q.view(*q.shape[:-1], q.shape[-1] // self.head_dim, self.head_dim)
# q_by_head = self.q_norm(q_by_head)
# q_by_head = self.q_norm.forward_native(q_by_head) # TODO(gyf) check why
# q = q_by_head.view(q.shape)
# k_by_head = k.view(*k.shape[:-1], k.shape[-1] // self.head_dim, self.head_dim)
# k_by_head = self.k_norm(k_by_head)
# k_by_head = self.k_norm.forward_native(k_by_head)
# k = k_by_head.view(k.shape)
q_by_head = q.view(*q.shape[:-1], q.shape[-1]//self.head_dim, self.head_dim)
k_by_head = k.view(*k.shape[:-1], k.shape[-1] // self.head_dim, self.head_dim)
q_by_head = q.view(*q.shape[:-1], q.shape[-1] // self.head_dim,
self.head_dim)
k_by_head = k.view(*k.shape[:-1],
k.shape[-1] // self.head_dim,
self.head_dim)
out_q, out_k = self.qk_norm(
q_by_head,
k_by_head,
@@ -170,7 +171,6 @@ class Qwen3Attention(nn.Module):
q = out_q.view(q.shape)
k = out_k.view(k.shape)
q, k = self.rotary_emb(positions, q, k)
attn_output = self.attn(q, k, v)
output, _ = self.o_proj(attn_output)
@@ -201,8 +201,6 @@ class Qwen3DecoderLayer(nn.Module):
else:
attn_type = AttentionType.ENCODER_ONLY
quant_config = reparse_quant_config(prefix, quant_config)
self.self_attn = Qwen3Attention(
hidden_size=self.hidden_size,
num_heads=config.num_attention_heads,
@@ -236,23 +234,25 @@ class Qwen3DecoderLayer(nn.Module):
hidden_states: torch.Tensor,
residual: torch.Tensor | None,
) -> tuple[torch.Tensor, torch.Tensor]:
# Self Attention
# Self Attention
if residual is None:
residual = hidden_states
hidden_states = self.input_layernorm(hidden_states)
else:
hidden_states, residual = self.input_layernorm(hidden_states, residual)
hidden_states, residual = self.input_layernorm(
hidden_states, residual)
hidden_states = self.self_attn(
positions=positions,
hidden_states=hidden_states,
)
# Fully Connected
hidden_states, residual = self.post_attention_layernorm(hidden_states, residual)
hidden_states, residual = self.post_attention_layernorm(
hidden_states, residual)
hidden_states = self.mlp(hidden_states)
return hidden_states, residual
ALL_DECODER_LAYER_TYPES = {
"attention": Qwen3DecoderLayer,
}