Upgrade to vllm 0.17.0 corex v4.1 overlay

2026-04-29 19:38:22 +08:00
parent 8fac6062e4
commit 938d0854a5
430 changed files with 35969 additions and 14511 deletions

@@ -77,7 +77,7 @@ from .utils import (
 )
 logger = init_logger(__name__)
+import ixformer.inference.functions as ixf_ops
 class Qwen3MoeMLP(nn.Module):
     def __init__(
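
Note: the new module-level `ixf_ops` import brings in ixformer's fused inference kernels (Iluvatar CoreX); the hunks below call two of them, `rms_norm_qk` and `residual_rms_norm_dynamic_int8`.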
@@ -170,7 +170,7 @@ class Qwen3MoeSparseMoeBlock(nn.Module):
             config.hidden_size,
             config.num_experts,
             bias=False,
-            quant_config=quant_config,
+            quant_config=None,
             prefix=f"{prefix}.gate",
         )
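
Note on `quant_config=None`: the MoE router gate is now always built unquantized, so the routing logits that pick experts are computed in the model dtype even when the rest of the checkpoint is int8. A minimal sketch of the routing step this protects, in plain PyTorch with made-up sizes (not the overlay's code):

import torch

hidden = torch.randn(4, 2048)   # [num_tokens, hidden_size], model dtype
gate_w = torch.randn(128, 2048) # [num_experts, hidden_size], unquantized gate

# Small rounding errors in these logits can flip the top-k winners and
# route tokens to different experts, so the gate stays full precision.
logits = hidden @ gate_w.t()
weights, expert_ids = torch.topk(logits.softmax(dim=-1), k=8)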
@@ -338,13 +338,14 @@ class Qwen3MoeAttention(nn.Module):
         qkv, _ = self.qkv_proj(hidden_states)
         q, k, v = qkv.split([self.q_size, self.kv_size, self.kv_size], dim=-1)
         # Add qk-norm
-        q_by_head = q.view(*q.shape[:-1], q.shape[-1] // self.head_dim, self.head_dim)
-        q_by_head = self.q_norm(q_by_head)
-        q = q_by_head.view(q.shape)
-        k_by_head = k.view(*k.shape[:-1], k.shape[-1] // self.head_dim, self.head_dim)
-        k_by_head = self.k_norm(k_by_head)
-        k = k_by_head.view(k.shape)
+        q_by_head = q.view(*q.shape[:-1], q.shape[-1] // self.head_dim,
+                           self.head_dim)
+        k_by_head = k.view(*k.shape[:-1], k.shape[-1] // self.head_dim,
+                           self.head_dim)
+        out_q, out_k = ixf_ops.rms_norm_qk(q_by_head, k_by_head, self.q_norm.weight.data, self.k_norm.weight.data, self.q_norm.variance_epsilon)
+        q = out_q.view(q.shape)
+        k = out_k.view(k.shape)
         q, k = self.rotary_emb(positions, q, k)
         attn_output = self.attn(q, k, v)
         output, _ = self.o_proj(attn_output)
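
For reference without ixformer installed: a pure-PyTorch sketch of what `rms_norm_qk` is presumed to compute, i.e. per-head RMSNorm over q and k in a single fused call, matching vLLM's RMSNorm numerics. This is an assumption about the kernel's semantics, useful for checking outputs, not the vendor implementation:

import torch

def rms_norm(x: torch.Tensor, weight: torch.Tensor, eps: float) -> torch.Tensor:
    # RMSNorm over the last (head_dim) axis, accumulated in fp32.
    var = x.float().pow(2).mean(dim=-1, keepdim=True)
    return (x.float() * torch.rsqrt(var + eps)).to(x.dtype) * weight

def rms_norm_qk_ref(q_by_head, k_by_head, q_weight, k_weight, eps):
    # Presumed equivalent of ixf_ops.rms_norm_qk: both norms in one launch
    # instead of two separate self.q_norm / self.k_norm module calls.
    return rms_norm(q_by_head, q_weight, eps), rms_norm(k_by_head, k_weight, eps)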
@@ -379,6 +380,12 @@ class Qwen3MoeDecoderLayer(nn.Module):
             dual_chunk_attention_config=dual_chunk_attention_config,
         )
+        from vllm.model_executor.layers.quantization.compressed_tensors.compressed_tensors import CompressedTensorsW8A8Int8
+        if hasattr(self.self_attn.qkv_proj, "scheme") and isinstance(self.self_attn.qkv_proj.scheme, CompressedTensorsW8A8Int8):
+            self.fused_norm_quant = True
+        else:
+            self.fused_norm_quant = False
         # `mlp_only_layers` in the config.
         layer_idx = extract_layer_index(prefix)
         mlp_only_layers = (
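
The `fused_norm_quant` flag is resolved once in `__init__` by inspecting the attention input projection's quantization scheme, so `forward()` pays no per-step cost; only compressed-tensors W8A8 int8 checkpoints take the fused path, and everything else falls back to the stock layernorm code. A hypothetical helper showing the same check in one expression (`wants_fused_norm_quant` is not in the diff):

from vllm.model_executor.layers.quantization.compressed_tensors.compressed_tensors import CompressedTensorsW8A8Int8

def wants_fused_norm_quant(qkv_proj) -> bool:
    # isinstance(None, X) is False, so getattr's default also covers the
    # hasattr() test from the hunk above.
    return isinstance(getattr(qkv_proj, "scheme", None), CompressedTensorsW8A8Int8)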
@@ -409,12 +416,23 @@ class Qwen3MoeDecoderLayer(nn.Module):
         hidden_states: torch.Tensor,
         residual: torch.Tensor | None,
     ) -> tuple[torch.Tensor, torch.Tensor]:
-        # Self Attention
-        if residual is None:
-            residual = hidden_states
-            hidden_states = self.input_layernorm(hidden_states)
-        else:
-            hidden_states, residual = self.input_layernorm(hidden_states, residual)
+        if self.fused_norm_quant:
+            origin_input = hidden_states
+            hidden_states_i8, residual, scale = ixf_ops.residual_rms_norm_dynamic_int8(
+                hidden_states, self.input_layernorm.weight.data, residual,
+                eps=self.input_layernorm.variance_epsilon,
+            )
+            hidden_states = (hidden_states_i8, scale, hidden_states.dtype)
+            if residual is None:
+                residual = origin_input
+        else:
+            # Self Attention
+            if residual is None:
+                residual = hidden_states
+                hidden_states = self.input_layernorm(hidden_states)
+            else:
+                hidden_states, residual = self.input_layernorm(
+                    hidden_states, residual)
         hidden_states = self.self_attn(
             positions=positions,
             hidden_states=hidden_states,
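
On the fused path, `hidden_states` leaves this block as an `(int8_tensor, scale, dtype)` tuple rather than a plain tensor; the tuple's right-hand side is evaluated before the rebinding, so the stored dtype is the original activation dtype, and the int8 `qkv_proj` is presumably able to consume this tuple directly. A pure-PyTorch sketch of the semantics `residual_rms_norm_dynamic_int8` is assumed to have (fused residual add + RMSNorm + dynamic per-token symmetric int8 quantization); the kernel's real contract may differ:

import torch

def residual_rms_norm_dynamic_int8_ref(x, weight, residual, eps):
    # Mirror the call site: when `residual` is None the kernel leaves it
    # None and the caller substitutes the pre-norm input afterwards.
    if residual is not None:
        x = x + residual
    new_residual = x if residual is not None else None
    # RMSNorm in fp32, then per-token symmetric int8 quantization.
    var = x.float().pow(2).mean(dim=-1, keepdim=True)
    normed = x.float() * torch.rsqrt(var + eps) * weight.float()
    scale = (normed.abs().amax(dim=-1, keepdim=True) / 127.0).clamp(min=1e-8)
    x_i8 = torch.clamp(torch.round(normed / scale), -128, 127).to(torch.int8)
    return x_i8, new_residual, scale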