Upgrade to vllm 0.17.0 corex v4.1 overlay

2026-04-29 19:38:22 +08:00
parent 8fac6062e4
commit 938d0854a5
430 changed files with 35969 additions and 14511 deletions
--- a/vllm/model_executor/models/qwen3_moe.py
+++ b/vllm/model_executor/models/qwen3_moe.py
@@ -77,7 +77,7 @@ from .utils import (
 )

 logger = init_logger(__name__)
-
+import ixformer.inference.functions as ixf_ops

 class Qwen3MoeMLP(nn.Module):
    def __init__(
@@ -170,7 +170,7 @@ class Qwen3MoeSparseMoeBlock(nn.Module):
            config.hidden_size,
            config.num_experts,
            bias=False,
-            quant_config=quant_config,
+            quant_config=None,
            prefix=f"{prefix}.gate",
        )

@@ -338,13 +338,14 @@ class Qwen3MoeAttention(nn.Module):
        qkv, _ = self.qkv_proj(hidden_states)
        q, k, v = qkv.split([self.q_size, self.kv_size, self.kv_size], dim=-1)
        # Add qk-norm
-        q_by_head = q.view(*q.shape[:-1], q.shape[-1] // self.head_dim, self.head_dim)
-        q_by_head = self.q_norm(q_by_head)
-        q = q_by_head.view(q.shape)
+        q_by_head = q.view(*q.shape[:-1], q.shape[-1] // self.head_dim,
+                           self.head_dim)
+        k_by_head = k.view(*k.shape[:-1], k.shape[-1] // self.head_dim,
+                           self.head_dim)
+        out_q, out_k = ixf_ops.rms_norm_qk(q_by_head, k_by_head, self.q_norm.weight.data, self.k_norm.weight.data, self.q_norm.variance_epsilon)

-        k_by_head = k.view(*k.shape[:-1], k.shape[-1] // self.head_dim, self.head_dim)
-        k_by_head = self.k_norm(k_by_head)
-        k = k_by_head.view(k.shape)
+        q = out_q.view(q.shape)
+        k = out_k.view(k.shape)
        q, k = self.rotary_emb(positions, q, k)
        attn_output = self.attn(q, k, v)
        output, _ = self.o_proj(attn_output)
@@ -379,6 +380,12 @@ class Qwen3MoeDecoderLayer(nn.Module):
            dual_chunk_attention_config=dual_chunk_attention_config,
        )

+        from vllm.model_executor.layers.quantization.compressed_tensors.compressed_tensors import CompressedTensorsW8A8Int8
+        if hasattr(self.self_attn.qkv_proj, "scheme") and isinstance(self.self_attn.qkv_proj.scheme, CompressedTensorsW8A8Int8):
+            self.fused_norm_quant = True
+        else:
+            self.fused_norm_quant = False
+
        # `mlp_only_layers` in the config.
        layer_idx = extract_layer_index(prefix)
        mlp_only_layers = (
@@ -409,12 +416,23 @@ class Qwen3MoeDecoderLayer(nn.Module):
        hidden_states: torch.Tensor,
        residual: torch.Tensor | None,
    ) -> tuple[torch.Tensor, torch.Tensor]:
-        # Self Attention
-        if residual is None:
-            residual = hidden_states
-            hidden_states = self.input_layernorm(hidden_states)
+        if self.fused_norm_quant:
+            origin_input = hidden_states
+            hidden_states_i8, residual, scale = ixf_ops.residual_rms_norm_dynamic_int8(
+                hidden_states, self.input_layernorm.weight.data, residual,
+                eps=self.input_layernorm.variance_epsilon,
+            )
+            hidden_states = (hidden_states_i8, scale, hidden_states.dtype)
+            if residual is None:
+                residual = origin_input
        else:
-            hidden_states, residual = self.input_layernorm(hidden_states, residual)
+            # Self Attention
+            if residual is None:
+                residual = hidden_states
+                hidden_states = self.input_layernorm(hidden_states)
+            else:
+                hidden_states, residual = self.input_layernorm(
+                    hidden_states, residual)
        hidden_states = self.self_attn(
            positions=positions,
            hidden_states=hidden_states,