Upgrade to vllm 0.17.0 (corex v4.1 overlay)
@@ -51,7 +51,7 @@ from vllm.v1.attention.backend import AttentionType
 from .interfaces import SupportsEagle3, SupportsLoRA, SupportsPP
 from .qwen2 import Qwen2MLP as Qwen3MLP
 from .qwen2 import Qwen2Model
-from .utils import AutoWeightsLoader, PPMissingLayer, extract_layer_index, maybe_prefix, reparse_quant_config
+from .utils import AutoWeightsLoader, PPMissingLayer, extract_layer_index, maybe_prefix
 
 logger = init_logger(__name__)
 
@@ -142,7 +142,6 @@ class Qwen3Attention(nn.Module):
         self.q_norm = RMSNorm(self.head_dim, eps=rms_norm_eps)
         self.k_norm = RMSNorm(self.head_dim, eps=rms_norm_eps)
         self.qk_norm = RMSNormQK(self.head_dim, self.head_dim, eps=rms_norm_eps)
-
     def forward(
         self,
         positions: torch.Tensor,
@@ -150,17 +149,19 @@ class Qwen3Attention(nn.Module):
     ) -> torch.Tensor:
         qkv, _ = self.qkv_proj(hidden_states)
         q, k, v = qkv.split([self.q_size, self.kv_size, self.kv_size], dim=-1)
-        # # Add qk-norm
+        # Add qk-norm
         # q_by_head = q.view(*q.shape[:-1], q.shape[-1] // self.head_dim, self.head_dim)
-        # q_by_head = self.q_norm(q_by_head)
+        # q_by_head = self.q_norm.forward_native(q_by_head) # TODO(gyf) check why
         # q = q_by_head.view(q.shape)
         # k_by_head = k.view(*k.shape[:-1], k.shape[-1] // self.head_dim, self.head_dim)
-        # k_by_head = self.k_norm(k_by_head)
+        # k_by_head = self.k_norm.forward_native(k_by_head)
         # k = k_by_head.view(k.shape)
-
-        q_by_head = q.view(*q.shape[:-1], q.shape[-1]//self.head_dim, self.head_dim)
-        k_by_head = k.view(*k.shape[:-1], k.shape[-1] // self.head_dim, self.head_dim)
-
+        q_by_head = q.view(*q.shape[:-1], q.shape[-1] // self.head_dim,
+                           self.head_dim)
+        k_by_head = k.view(*k.shape[:-1],
+                           k.shape[-1] // self.head_dim,
+                           self.head_dim)
+
         out_q, out_k = self.qk_norm(
             q_by_head,
             k_by_head,
@@ -170,7 +171,6 @@ class Qwen3Attention(nn.Module):
 
         q = out_q.view(q.shape)
         k = out_k.view(k.shape)
-
         q, k = self.rotary_emb(positions, q, k)
         attn_output = self.attn(q, k, v)
         output, _ = self.o_proj(attn_output)
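
For context on the two hunks above: RMSNormQK is a corex-specific fused op, and this diff only shows its call sites. Judging from those call sites, it RMS-normalizes q and k per attention head (each head_dim-sized slice) in one kernel, replacing the per-tensor q_norm/k_norm path that remains commented out. Below is a reference sketch of that computation in plain PyTorch; the function name and explicit weight arguments are illustrative assumptions, not the corex implementation.

    import torch

    def qk_norm_reference(q: torch.Tensor, k: torch.Tensor, head_dim: int,
                          q_weight: torch.Tensor, k_weight: torch.Tensor,
                          eps: float = 1e-6) -> tuple[torch.Tensor, torch.Tensor]:
        # Assumed semantics of RMSNormQK: RMSNorm over each head_dim-sized
        # slice of q and k, with a learned scale on the normalized dimension.
        def rms_norm(x: torch.Tensor, weight: torch.Tensor) -> torch.Tensor:
            variance = x.pow(2).mean(dim=-1, keepdim=True)
            return x * torch.rsqrt(variance + eps) * weight

        # [num_tokens, num_heads * head_dim] -> [num_tokens, num_heads, head_dim]
        q_by_head = q.view(*q.shape[:-1], q.shape[-1] // head_dim, head_dim)
        k_by_head = k.view(*k.shape[:-1], k.shape[-1] // head_dim, head_dim)
        out_q = rms_norm(q_by_head, q_weight).view(q.shape)
        out_k = rms_norm(k_by_head, k_weight).view(k.shape)
        return out_q, out_k

    # 4 tokens, 2 query heads, 1 KV head (GQA), head_dim = 8
    q, k = torch.randn(4, 16), torch.randn(4, 8)
    out_q, out_k = qk_norm_reference(q, k, 8, torch.ones(8), torch.ones(8))
    assert out_q.shape == q.shape and out_k.shape == k.shape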
@@ -201,8 +201,6 @@ class Qwen3DecoderLayer(nn.Module):
         else:
             attn_type = AttentionType.ENCODER_ONLY
 
-        quant_config = reparse_quant_config(prefix, quant_config)
-
         self.self_attn = Qwen3Attention(
             hidden_size=self.hidden_size,
             num_heads=config.num_attention_heads,
@@ -236,23 +234,25 @@ class Qwen3DecoderLayer(nn.Module):
         hidden_states: torch.Tensor,
         residual: torch.Tensor | None,
     ) -> tuple[torch.Tensor, torch.Tensor]:
-        # Self Attention
+        # Self Attention
         if residual is None:
             residual = hidden_states
             hidden_states = self.input_layernorm(hidden_states)
         else:
-            hidden_states, residual = self.input_layernorm(hidden_states, residual)
+            hidden_states, residual = self.input_layernorm(
+                hidden_states, residual)
         hidden_states = self.self_attn(
             positions=positions,
             hidden_states=hidden_states,
         )
 
         # Fully Connected
-        hidden_states, residual = self.post_attention_layernorm(hidden_states, residual)
+        hidden_states, residual = self.post_attention_layernorm(
+            hidden_states, residual)
         hidden_states = self.mlp(hidden_states)
         return hidden_states, residual
 
 
 ALL_DECODER_LAYER_TYPES = {
     "attention": Qwen3DecoderLayer,
 }
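
For readers puzzled by the two-argument layernorm calls in the last hunk: in vllm, RMSNorm.forward accepts an optional residual; when one is passed, the kernel fuses the residual add with the normalization and returns both the normed activations and the updated residual, which is why Qwen3DecoderLayer.forward threads (hidden_states, residual) through each sublayer. A minimal stand-in with the same calling convention follows (an unfused sketch; AddRMSNorm is a hypothetical name, not vllm's class).

    import torch
    from torch import nn

    class AddRMSNorm(nn.Module):
        # Same calling convention as vllm's RMSNorm: forward(x) returns the
        # normed tensor; forward(x, residual) returns (normed, x + residual).
        def __init__(self, hidden_size: int, eps: float = 1e-6) -> None:
            super().__init__()
            self.weight = nn.Parameter(torch.ones(hidden_size))
            self.eps = eps

        def forward(self, x: torch.Tensor,
                    residual: torch.Tensor | None = None):
            if residual is not None:
                x = x + residual      # the real kernel fuses this add
                residual = x          # updated residual for the next sublayer
            normed = x * torch.rsqrt(x.pow(2).mean(-1, keepdim=True) + self.eps)
            normed = normed * self.weight
            return normed if residual is None else (normed, residual)

    # Usage mirroring Qwen3DecoderLayer.forward above:
    norm1, norm2 = AddRMSNorm(16), AddRMSNorm(16)
    hidden_states = torch.randn(4, 16)
    residual = hidden_states
    hidden_states = norm1(hidden_states)                      # first sublayer
    hidden_states, residual = norm2(hidden_states, residual)  # fused add + norm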