upgrade to vllm 0.11.2 (#4400)

Bump vLLM version to v0.11.2

What's broken and changed by vLLM:
1. structured_output is broken by
https://github.com/vllm-project/vllm/pull/26866
2. get_mrope_input_positions is broken by
https://github.com/vllm-project/vllm/pull/28399
3. graph mode is broken by
https://github.com/vllm-project/vllm/pull/25110 we'll upgrade torch to
2.8 to fix the problem later
4. embedding is broken by
https://github.com/vllm-project/vllm/pull/27583
5. `get_attn_backend_cls` and the attention backend are broken by
https://github.com/vllm-project/vllm/pull/28534
6. spec decode is broken by
https://github.com/vllm-project/vllm/pull/28771
7. sp feature is broken by
https://github.com/vllm-project/vllm/pull/27126
8. mtp is broken by https://github.com/vllm-project/vllm/pull/27922
9. lora is broken by https://github.com/vllm-project/vllm/pull/21068
10. execute_model is broken by
https://github.com/vllm-project/vllm/pull/26866
11. `VLLM_DISABLE_SHARED_EXPERTS_STREAM` env is broken by
https://github.com/vllm-project/vllm/pull/28159
12. kv cache is broken by https://github.com/vllm-project/vllm/pull/27753
13. dp is broken by https://github.com/vllm-project/vllm/pull/25110

 
What's broken and changed by ourselves:
1. qwen vl is broken by https://github.com/vllm-project/vllm/pull/28455
We'll remove model files in the future to avoid this kind of error
2. Engine core is broken by
https://github.com/vllm-project/vllm/pull/23691 We'll remove the patch
file in the future.
3. Ascend scheduler is broken by
https://github.com/vllm-project/vllm/pull/28733 We'll remove ascend
scheduler later.
4. qwen3-next is broken by
https://github.com/vllm-project/vllm/pull/28083 We'll remove model files
in the future to avoid this kind of error
5. qwen vl is broken by https://github.com/vllm-project/vllm/pull/27764.
We'll remove model files in the future

Known issue:
1. ray doesn't work 
2. the accuracy of qwen3-next is not correct
3. qwen3-vl is broken
4. prefix cache + ascend scheduler + deepseek v2 lite is broken.

Co-authored-by: MengqingCao <cmq0113@163.com>
Co-authored-by: hfadzxy <starmoon_zhang@163.com>
Co-authored-by: leo-pony <nengjunma@outlook.com>
Co-authored-by: 22dimensions <waitingwind@foxmail.com>
Co-authored-by: shen-shanshan <467638484@qq.com>


- vLLM version: v0.11.2

---------

Signed-off-by: wangxiyuan <wangxiyuan1007@gmail.com>
Signed-off-by: MengqingCao <cmq0113@163.com>
Signed-off-by: hfadzxy <starmoon_zhang@163.com>
Signed-off-by: leo-pony <nengjunma@outlook.com>
Co-authored-by: MengqingCao <cmq0113@163.com>
Co-authored-by: hfadzxy <starmoon_zhang@163.com>
Co-authored-by: leo-pony <nengjunma@outlook.com>
This commit is contained in:
wangxiyuan
2025-11-26 11:48:58 +08:00
committed by GitHub
parent d5f77f14d0
commit bc69d7cfe1
54 changed files with 744 additions and 437 deletions

View File

@@ -40,7 +40,6 @@ from vllm.model_executor.models.qwen2_5_vl import (
Qwen2_5_VLDummyInputsBuilder, Qwen2_5_VLForConditionalGeneration,
Qwen2_5_VLMultiModalProcessor, Qwen2_5_VLProcessingInfo)
from vllm.model_executor.models.utils import maybe_prefix
from vllm.model_executor.models.vision import conv3d_to_linear_weight
from vllm.multimodal import MULTIMODAL_REGISTRY
from vllm_ascend.ascend_forward_context import set_ascend_forward_context
@@ -144,8 +143,14 @@ class AscendQwen2_5_VisionBlock(Qwen2_5_VisionBlock):
quant_config: Optional[QuantizationConfig] = None,
prefix: str = "",
) -> None:
super().__init__(dim, num_heads, mlp_hidden_dim, act_fn, norm_layer,
quant_config, prefix)
super().__init__(dim=dim,
num_heads=num_heads,
mlp_hidden_dim=mlp_hidden_dim,
act_fn=act_fn,
norm_layer=norm_layer,
quant_config=quant_config,
prefix=prefix)
self.attn = AscendQwen2_5_VisionAttention(embed_dim=dim,
num_heads=num_heads,
projection_size=dim,
@@ -161,14 +166,6 @@ class AscendQwen2_5_VisionBlock(Qwen2_5_VisionBlock):
return x
class AscendQwen2_5_VisionPatchEmbed(Qwen2_5_VisionPatchEmbed):
# Ascend override of the vision patch embedding: projects the flattened
# patch input with a plain matmul against the reshaped Conv3d weight
# (weight viewed as (hidden_size, -1) and transposed), rather than
# invoking the conv module itself — presumably an NPU-friendly
# formulation; confirm against the parent Qwen2_5_VisionPatchEmbed.

def forward(self, x: torch.Tensor) -> torch.Tensor:
# assumes x is already flattened to (num_patches, prod(patch dims) *
# in_channels) so it matches the viewed weight — TODO confirm shape
# contract with the caller.
x = x.matmul(
self.proj.weight.data.view(self.hidden_size, -1).transpose(0, 1))
return x
class AscendQwen2_5_VisionRotaryEmbedding(Qwen2_5_VisionRotaryEmbedding):
def __init__(self, dim: int, theta: float = 10000.0) -> None:
@@ -195,7 +192,7 @@ class AscendQwen2_5_VisionTransformer(Qwen2_5_VisionTransformer):
head_dim = self.hidden_size // self.num_heads
self.rotary_pos_emb = AscendQwen2_5_VisionRotaryEmbedding(head_dim //
2)
self.patch_embed = AscendQwen2_5_VisionPatchEmbed(
self.patch_embed = Qwen2_5_VisionPatchEmbed(
patch_size=vision_config.patch_size,
temporal_patch_size=vision_config.temporal_patch_size,
in_channels=vision_config.in_channels,
@@ -357,8 +354,6 @@ class AscendQwen2_5_VisionTransformer(Qwen2_5_VisionTransformer):
params_dict = dict(self.named_parameters(remove_duplicate=False))
loaded_params: Set[str] = set()
for name, loaded_weight in weights:
if name.endswith("patch_embed.proj.weight"):
loaded_weight = conv3d_to_linear_weight(loaded_weight)
for (param_name, weight_name, shard_id) in stacked_params_mapping:
if weight_name not in name:
continue

View File

@@ -146,8 +146,13 @@ class AscendQwen2_5_VisionBlock_Without_Padding(Qwen2_5_VisionBlock):
norm_layer: Optional[Callable[[int], nn.Module]] = None,
quant_config: Optional[QuantizationConfig] = None,
prefix: str = "") -> None:
super().__init__(dim, num_heads, mlp_hidden_dim, act_fn, norm_layer,
quant_config, prefix)
super().__init__(dim=dim,
num_heads=num_heads,
mlp_hidden_dim=mlp_hidden_dim,
act_fn=act_fn,
norm_layer=norm_layer,
quant_config=quant_config,
prefix=prefix)
self.attn = AscendQwen2_5_VisionAttention_Without_Padding(
embed_dim=dim,
num_heads=num_heads,
@@ -367,8 +372,15 @@ class AscendQwen3_VisionBlock(Qwen3_VisionBlock):
prefix: str = "",
use_data_parallel: bool = False,
) -> None:
super().__init__(dim, num_heads, mlp_hidden_dim, act_fn, norm_layer,
quant_config, prefix, use_data_parallel)
super().__init__(dim=dim,
num_heads=num_heads,
mlp_hidden_dim=mlp_hidden_dim,
act_fn=act_fn,
norm_layer=norm_layer,
quant_config=quant_config,
prefix=prefix,
use_data_parallel=use_data_parallel)
self.attn = AscendQwen2_5_VisionAttention_Without_Padding(
embed_dim=dim,
num_heads=num_heads,

View File

@@ -16,7 +16,7 @@ from vllm.config import (CacheConfig, ModelConfig, SpeculativeConfig,
from vllm.distributed import (divide, get_tensor_model_parallel_rank,
get_tensor_model_parallel_world_size)
from vllm.forward_context import get_forward_context
from vllm.model_executor.layers.fla.ops import RMSNormGated, chunk
from vllm.model_executor.layers.fla.ops import chunk
from vllm.model_executor.layers.fla.ops.fused_recurrent import \
fused_recurrent_gated_delta_rule
from vllm.model_executor.layers.fused_moe import FusedMoE
@@ -24,6 +24,7 @@ from vllm.model_executor.layers.fused_moe import FusedMoE
# yapf: disable
from vllm.model_executor.layers.layernorm import \
GemmaRMSNorm as Qwen3NextRMSNorm
from vllm.model_executor.layers.layernorm import RMSNormGated
# yapf: enable
from vllm.model_executor.layers.linear import (ColumnParallelLinear,
MergedColumnParallelLinear,
@@ -179,6 +180,83 @@ class CustomQwen3NextGatedDeltaNet(Qwen3NextGatedDeltaNet, MambaBase):
raise ValueError(f"Duplicate layer name: {prefix}")
compilation_config.static_forward_context[prefix] = self
def forward(
self,
hidden_states: torch.Tensor,
output: torch.Tensor,
):
"""
Forward pass with three parts:
1. Input projection
2. Core attention (custom op)
3. Output projection

Writes the result in place into ``output`` (first ``num_tokens`` rows)
and returns nothing. Returns early — leaving ``output`` untouched —
during the V1 profile run, when no attention metadata is available.
"""
num_tokens = hidden_states.size(0)
# ============================================================
# Part 1: Input Projection
# ============================================================
forward_context = get_forward_context()
attn_metadata: AttentionMetadata = forward_context.attn_metadata
if attn_metadata is None:
# V1 profile run
return
assert isinstance(attn_metadata, dict)
# Per-layer metadata is keyed by this layer's prefix.
attn_metadata = attn_metadata[self.prefix]
assert isinstance(attn_metadata, GDNAttentionMetadata)
# Tokens actually scheduled this step: prefill + decode + spec-decode.
num_actual_tokens = (attn_metadata.num_prefill_tokens +
attn_metadata.num_decode_tokens +
attn_metadata.num_spec_decode_tokens)
# 1. Set up dimensions for reshapes later
projected_states, _ = self.in_proj(hidden_states[:num_actual_tokens])
# Split the fused projection into the qkvz part and the b/a gate part,
# both sharded across tensor parallelism.
projected_states_qkvz, projected_states_ba = torch.split(
projected_states,
[
self.projection_size_qkvz // self.tp_size,
self.projection_size_ba // self.tp_size
],
dim=-1,
)
query, key, value, z, b, a = self.fix_query_key_value_ordering(
projected_states_qkvz, projected_states_ba)
# Collapse the per-head dimension: (l, p, d) -> (l, p*d) for q/k/v.
query, key, value = map(lambda x: rearrange(x, 'l p d -> l (p d)'),
(query, key, value))
mixed_qkv = torch.cat((query, key, value), dim=-1)
# ============================================================
# Part 2: Core Attention (Custom Op)
# ============================================================
# Pre-allocated result buffer (padded to num_tokens) that the custom op
# fills in place.
core_attn_out = torch.zeros(
(num_tokens, self.num_v_heads // self.tp_size, self.head_v_dim),
dtype=hidden_states.dtype,
device=hidden_states.device,
)
# NOTE(review): presumably registered to dispatch to _forward_core
# below — confirm the vllm custom-op registration for this name.
torch.ops.vllm.gdn_attention_core(
mixed_qkv,
b,
a,
core_attn_out,
self.prefix,
)
# ============================================================
# Part 3: Output Projection
# ============================================================
z_shape_og = z.shape
# Reshape input data into 2D tensor
core_attn_out = core_attn_out.reshape(-1, core_attn_out.shape[-1])
z = z.reshape(-1, z.shape[-1])
# Gated RMSNorm over the attention output, gated by z.
core_attn_out = self.norm(core_attn_out, z)
core_attn_out = core_attn_out.reshape(z_shape_og)
core_attn_out = rearrange(core_attn_out, "... h d -> ... (h d)")
output[:num_tokens], _ = self.out_proj(core_attn_out)
def _forward(
self,
hidden_states: torch.Tensor,
@@ -437,6 +515,248 @@ class CustomQwen3NextGatedDeltaNet(Qwen3NextGatedDeltaNet, MambaBase):
output[:num_actual_tokens], _ = self.out_proj(core_attn_out)
def _forward_core(
self,
mixed_qkv: torch.Tensor,
b: torch.Tensor,
a: torch.Tensor,
core_attn_out: torch.Tensor,
):
"""
Core attention computation (called by custom op).

Runs the causal-conv1d sequence transform followed by the gated
delta-rule recurrence, handling three scheduling cases separately:
speculative-decode ("spec") sequences, prefills, and regular decodes.
Results are merged back into ``core_attn_out`` in place; conv and SSM
states are read from / written to this layer's kv-cache slot. Returns
early during the V1 profile run (no attention metadata).
"""
forward_context = get_forward_context()
attn_metadata: AttentionMetadata = forward_context.attn_metadata
if attn_metadata is None:
# V1 profile run
return
assert isinstance(attn_metadata, dict)
attn_metadata = attn_metadata[self.prefix]
assert isinstance(attn_metadata, GDNAttentionMetadata)
# Unpack per-step scheduling metadata: query start offsets, masks and
# index tensors separating speculative-decode tokens from the rest.
has_initial_state = attn_metadata.has_initial_state
spec_query_start_loc = attn_metadata.spec_query_start_loc
non_spec_query_start_loc = attn_metadata.non_spec_query_start_loc
spec_sequence_masks = attn_metadata.spec_sequence_masks
spec_token_indx = attn_metadata.spec_token_indx
non_spec_token_indx = attn_metadata.non_spec_token_indx
spec_state_indices_tensor = attn_metadata.spec_state_indices_tensor # noqa: E501
non_spec_state_indices_tensor = attn_metadata.non_spec_state_indices_tensor # noqa: E501
# kv cache for this layer: slot 0 holds the conv state (stored
# transposed), slot 1 the SSM recurrent state.
self_kv_cache = self.kv_cache[forward_context.virtual_engine]
conv_state = self_kv_cache[0].transpose(-1, -2)
ssm_state = self_kv_cache[1]
num_actual_tokens = (attn_metadata.num_prefill_tokens +
attn_metadata.num_decode_tokens +
attn_metadata.num_spec_decode_tokens)
num_accepted_tokens = attn_metadata.num_accepted_tokens
# Drop padding tokens beyond the scheduled count.
mixed_qkv = mixed_qkv[:num_actual_tokens]
b = b[:num_actual_tokens]
a = a[:num_actual_tokens]
# 1. Convolution sequence transformation
conv_weights = self.conv1d.weight.view(self.conv1d.weight.size(0),
self.conv1d.weight.size(2))
# Partition tokens into spec-decode vs non-spec groups (or pass all
# through one side when the other is empty).
if spec_sequence_masks is not None:
if (attn_metadata.num_prefills == 0
and attn_metadata.num_decodes == 0):
mixed_qkv_spec = mixed_qkv
mixed_qkv_non_spec = None
else:
mixed_qkv_spec = mixed_qkv.index_select(0, spec_token_indx)
mixed_qkv_non_spec = mixed_qkv.index_select(
0, non_spec_token_indx)
else:
mixed_qkv_spec = None
mixed_qkv_non_spec = mixed_qkv
# 1.1: Process the multi-query part
if spec_sequence_masks is not None:
mixed_qkv_spec = mixed_qkv_spec.view(
attn_metadata.num_spec_decodes, -1, mixed_qkv_spec.size(-1))
mixed_qkv_spec = rearrange(mixed_qkv_spec, 'b l d -> b d l')
mixed_qkv_spec = causal_conv1d.causal_conv1d_update(
mixed_qkv_spec,
conv_state,
conv_weights,
self.conv1d.bias,
self.activation,
conv_state_indices=spec_state_indices_tensor[:, 0]
[:attn_metadata.num_spec_decodes],
num_accepted_tokens=num_accepted_tokens,
validate_data=False,
)
mixed_qkv_spec = rearrange(mixed_qkv_spec, 'b d l -> (b l) d')
# 1.2: Process the remaining part
if attn_metadata.num_prefills > 0:
# - "cache_indices" updates the conv_state cache in positions
# pointed to by "mamba_cache_params.state_indices_tensor"
mixed_qkv_non_spec = causal_conv1d.causal_conv1d_fn(
mixed_qkv_non_spec.transpose(0, 1),
conv_weights,
self.conv1d.bias,
activation=self.activation,
conv_states=conv_state,
has_initial_state=has_initial_state,
cache_indices=non_spec_state_indices_tensor,
query_start_loc=non_spec_query_start_loc,
).transpose(0, 1)
elif attn_metadata.num_decodes > 0:
mixed_qkv_non_spec = causal_conv1d.causal_conv1d_update(
mixed_qkv_non_spec,
conv_state,
conv_weights,
self.conv1d.bias,
self.activation,
conv_state_indices=non_spec_state_indices_tensor[:attn_metadata
.num_decodes],
# validate_data=True,
)
else:
mixed_qkv_non_spec = None
query_spec, key_spec, value_spec = self.rearrange_mixed_qkv(
mixed_qkv_spec)
query_non_spec, key_non_spec, value_non_spec = self.rearrange_mixed_qkv(
mixed_qkv_non_spec)
# NOTE(review): this value is immediately overwritten by
# fused_gdn_gating on the next line — the b.sigmoid() result looks
# dead; confirm and remove.
beta = b.sigmoid()
g, beta = fused_gdn_gating(self.A_log, a, b, self.dt_bias)
# Split the gating tensors along the token dim (dim 1) to match the
# spec / non-spec token partition made above.
if spec_sequence_masks is not None:
if (attn_metadata.num_prefills == 0
and attn_metadata.num_decodes == 0):
g_spec = g
beta_spec = beta
g_non_spec = None
beta_non_spec = None
else:
g_spec = g.index_select(1, spec_token_indx)
beta_spec = beta.index_select(1, spec_token_indx)
g_non_spec = g.index_select(1, non_spec_token_indx)
beta_non_spec = beta.index_select(1, non_spec_token_indx)
else:
g_spec = None
beta_spec = None
g_non_spec = g
beta_non_spec = beta
# 2. Recurrent attention
# 2.1: Process the multi-query part
if spec_sequence_masks is not None:
core_attn_out_spec, last_recurrent_state = (
fused_recurrent_gated_delta_rule(
q=query_spec,
k=key_spec,
v=value_spec,
g=g_spec,
beta=beta_spec,
initial_state=ssm_state,
inplace_final_state=True,
cu_seqlens=spec_query_start_loc[:attn_metadata.
num_spec_decodes + 1],
ssm_state_indices=spec_state_indices_tensor,
num_accepted_tokens=num_accepted_tokens,
use_qk_l2norm_in_kernel=True,
))
else:
core_attn_out_spec, last_recurrent_state = None, None
# 2.2: process the remaining part (numbering mirrors section 1.2)
if attn_metadata.num_prefills > 0:
initial_state = ssm_state[
non_spec_state_indices_tensor].contiguous()
# Zero the state for sequences with no prior cached state.
initial_state[~has_initial_state, ...] = 0
batch_size = initial_state.shape[0]
# NOTE(review): this rebinds the ``core_attn_out`` output argument
# to a local Python list; the merge in section 3 below then slices
# the same name. If the prefill path is meant to write into the
# caller's buffer, this shadowing looks suspicious — confirm
# against the upstream vLLM implementation (this is a diff excerpt
# with indentation stripped, so the true nesting may differ).
core_attn_out = []
last_recurrent_state = []
# Prefill sequences are processed one at a time through the chunked
# gated delta rule, slicing each sequence out via its query offsets.
for b_idx in range(batch_size):
start, end = non_spec_query_start_loc[
b_idx], non_spec_query_start_loc[b_idx + 1]
cur_q = query_non_spec[:, start:end, ...]
cur_k = key_non_spec[:, start:end, ...]
cur_v = value_non_spec[:, start:end, ...]
cur_g = g_non_spec[:, start:end, ...]
cur_b = beta_non_spec[:, start:end, ...]
cur_state = initial_state[b_idx].unsqueeze(0)
(
cur_core_attn_out_non_spec,
cur_last_recurrent_state,
) = chunk.chunk_gated_delta_rule(
query=cur_q,
key=cur_k,
value=cur_v,
g=cur_g,
beta=cur_b,
initial_state=cur_state,
output_final_state=True,
use_qk_l2norm_in_kernel=True,
)
core_attn_out.append(cur_core_attn_out_non_spec)
last_recurrent_state.append(cur_last_recurrent_state)
# Stitch the per-sequence outputs back into one token-major tensor.
tar_dtype = core_attn_out[0].dtype
tar_device = core_attn_out[0].device
tar_shape = list(core_attn_out[0].shape)
tar_shape[1] = non_spec_query_start_loc[-1]
core_attn_out_non_spec = torch.empty(tar_shape,
dtype=tar_dtype,
device=tar_device)
for b_idx in range(batch_size):
cur_core_attn_out = core_attn_out[b_idx]
start, end = non_spec_query_start_loc[
b_idx], non_spec_query_start_loc[b_idx + 1]
core_attn_out_non_spec[:, start:end, ...] = cur_core_attn_out
last_recurrent_state = torch.cat(last_recurrent_state, dim=0)
# Write the final recurrent state back into the SSM state cache.
ssm_state[non_spec_state_indices_tensor] = last_recurrent_state.to(
ssm_state.dtype)
elif attn_metadata.num_decodes > 0:
core_attn_out_non_spec, last_recurrent_state = (
fused_recurrent_gated_delta_rule(
q=query_non_spec,
k=key_non_spec,
v=value_non_spec,
g=g_non_spec,
beta=beta_non_spec,
initial_state=ssm_state,
inplace_final_state=True,
cu_seqlens=non_spec_query_start_loc[:attn_metadata.
num_decodes + 1],
ssm_state_indices=non_spec_state_indices_tensor,
use_qk_l2norm_in_kernel=True,
))
else:
core_attn_out_non_spec, last_recurrent_state = None, None
# 3. Merge core attention output
# Scatter spec and non-spec results back to their original token
# positions, writing into the output buffer in place.
if spec_sequence_masks is not None and core_attn_out_non_spec is not None:
merged_out = torch.empty(
(1, num_actual_tokens, *core_attn_out_spec.shape[2:]),
dtype=core_attn_out_non_spec.dtype,
device=core_attn_out_non_spec.device,
)
merged_out.index_copy_(1, spec_token_indx, core_attn_out_spec)
merged_out.index_copy_(1, non_spec_token_indx,
core_attn_out_non_spec)
core_attn_out[:num_actual_tokens] = merged_out.squeeze(0)
elif spec_sequence_masks is not None:
core_attn_out[:num_actual_tokens] = core_attn_out_spec.squeeze(0)
else:
core_attn_out[:num_actual_tokens] = core_attn_out_non_spec.squeeze(
0)
class CustomQwen3NextDecoderLayer(Qwen3NextDecoderLayer):