Upgrade to vllm 0.17.0 corex v4.1 overlay
This commit is contained in:
@@ -77,7 +77,7 @@ from .utils import (
|
||||
)
|
||||
|
||||
logger = init_logger(__name__)
|
||||
|
||||
import ixformer.inference.functions as ixf_ops
|
||||
|
||||
class Qwen3MoeMLP(nn.Module):
|
||||
def __init__(
|
||||
@@ -170,7 +170,7 @@ class Qwen3MoeSparseMoeBlock(nn.Module):
|
||||
config.hidden_size,
|
||||
config.num_experts,
|
||||
bias=False,
|
||||
quant_config=quant_config,
|
||||
quant_config=None,
|
||||
prefix=f"{prefix}.gate",
|
||||
)
|
||||
|
||||
@@ -338,13 +338,14 @@ class Qwen3MoeAttention(nn.Module):
|
||||
qkv, _ = self.qkv_proj(hidden_states)
|
||||
q, k, v = qkv.split([self.q_size, self.kv_size, self.kv_size], dim=-1)
|
||||
# Add qk-norm
|
||||
q_by_head = q.view(*q.shape[:-1], q.shape[-1] // self.head_dim, self.head_dim)
|
||||
q_by_head = self.q_norm(q_by_head)
|
||||
q = q_by_head.view(q.shape)
|
||||
q_by_head = q.view(*q.shape[:-1], q.shape[-1] // self.head_dim,
|
||||
self.head_dim)
|
||||
k_by_head = k.view(*k.shape[:-1], k.shape[-1] // self.head_dim,
|
||||
self.head_dim)
|
||||
out_q, out_k = ixf_ops.rms_norm_qk(q_by_head, k_by_head, self.q_norm.weight.data, self.k_norm.weight.data, self.q_norm.variance_epsilon)
|
||||
|
||||
k_by_head = k.view(*k.shape[:-1], k.shape[-1] // self.head_dim, self.head_dim)
|
||||
k_by_head = self.k_norm(k_by_head)
|
||||
k = k_by_head.view(k.shape)
|
||||
q = out_q.view(q.shape)
|
||||
k = out_k.view(k.shape)
|
||||
q, k = self.rotary_emb(positions, q, k)
|
||||
attn_output = self.attn(q, k, v)
|
||||
output, _ = self.o_proj(attn_output)
|
||||
@@ -379,6 +380,12 @@ class Qwen3MoeDecoderLayer(nn.Module):
|
||||
dual_chunk_attention_config=dual_chunk_attention_config,
|
||||
)
|
||||
|
||||
from vllm.model_executor.layers.quantization.compressed_tensors.compressed_tensors import CompressedTensorsW8A8Int8
|
||||
if hasattr(self.self_attn.qkv_proj, "scheme") and isinstance(self.self_attn.qkv_proj.scheme, CompressedTensorsW8A8Int8):
|
||||
self.fused_norm_quant = True
|
||||
else:
|
||||
self.fused_norm_quant = False
|
||||
|
||||
# `mlp_only_layers` in the config.
|
||||
layer_idx = extract_layer_index(prefix)
|
||||
mlp_only_layers = (
|
||||
@@ -409,12 +416,23 @@ class Qwen3MoeDecoderLayer(nn.Module):
|
||||
hidden_states: torch.Tensor,
|
||||
residual: torch.Tensor | None,
|
||||
) -> tuple[torch.Tensor, torch.Tensor]:
|
||||
# Self Attention
|
||||
if residual is None:
|
||||
residual = hidden_states
|
||||
hidden_states = self.input_layernorm(hidden_states)
|
||||
if self.fused_norm_quant:
|
||||
origin_input = hidden_states
|
||||
hidden_states_i8, residual, scale = ixf_ops.residual_rms_norm_dynamic_int8(
|
||||
hidden_states, self.input_layernorm.weight.data, residual,
|
||||
eps=self.input_layernorm.variance_epsilon,
|
||||
)
|
||||
hidden_states = (hidden_states_i8, scale, hidden_states.dtype)
|
||||
if residual is None:
|
||||
residual = origin_input
|
||||
else:
|
||||
hidden_states, residual = self.input_layernorm(hidden_states, residual)
|
||||
# Self Attention
|
||||
if residual is None:
|
||||
residual = hidden_states
|
||||
hidden_states = self.input_layernorm(hidden_states)
|
||||
else:
|
||||
hidden_states, residual = self.input_layernorm(
|
||||
hidden_states, residual)
|
||||
hidden_states = self.self_attn(
|
||||
positions=positions,
|
||||
hidden_states=hidden_states,
|
||||
|
||||
Reference in New Issue
Block a user