Revert PTA upgrade PR (#3352)

We noticed that torch npu 0919 doesn't work. This PR reverts the related changes that rely on the 0919 version. Reverted PRs: #3295 #3205 #3102. Related: #3353 - vLLM version: v0.11.0
2025-10-10 14:09:53 +08:00
parent 601a37aeff
commit ba19dd3183
15 changed files with 57 additions and 312 deletions
--- a/vllm_ascend/ascend_forward_context.py
+++ b/vllm_ascend/ascend_forward_context.py
@@ -156,14 +156,12 @@ def set_ascend_forward_context(
        # Once the necessary conditions are met, support for MOE models will also be added.
        from vllm_ascend.quantization.quant_config import AscendQuantConfig
        addrmsnorm_quant_fusion_enabled = isinstance(vllm_config.quant_config, AscendQuantConfig) and \
-            vllm_config.model_config.hf_config.model_type in ["llama", "qwen2", "qwen3", "qwen3_moe"] and \
+            vllm_config.model_config.hf_config.model_type in ["llama", "qwen2", "qwen3"] and \
            forward_context.layer_idx is not None
        if addrmsnorm_quant_fusion_enabled:
            forward_context.model_instance = model_instance
            forward_context.num_hidden_layers = vllm_config.model_config.hf_config.num_hidden_layers
            forward_context.fusion_linear = "gate_up_dense" if forward_context.layer_idx == 0 else "qkv_dense"
-            if vllm_config.model_config.hf_config.model_type == "qwen3_moe":
-                forward_context.fusion_linear = "gate_moe" if forward_context.layer_idx == 0 else "qkv_moe"
        forward_context.addrmsnorm_quant_fusion_enabled = addrmsnorm_quant_fusion_enabled

        if num_tokens is None and attn_metadata is not None:
--- a/vllm_ascend/attention/attention_v1.py
+++ b/vllm_ascend/attention/attention_v1.py
@@ -34,8 +34,7 @@ from vllm.v1.kv_cache_interface import AttentionSpec
 from vllm_ascend.attention.utils import (AscendCommonAttentionMetadata,
                                         maybe_save_kv_layer_to_connector,
                                         wait_for_kv_layer_from_connector)
-from vllm_ascend.compilation.acl_graph import (get_graph_params,
-                                               update_graph_params_workspaces)
+from vllm_ascend.compilation.acl_graph import get_graph_params
 from vllm_ascend.ops.attention import vanilla_chunked_prefill
 from vllm_ascend.utils import (ACL_FORMAT_FRACTAL_NZ, aligned_16, is_310p,
                               nd_to_nz_2d, nd_to_nz_spec)
@@ -394,28 +393,13 @@ class AscendAttentionBackendImpl(AttentionImpl):
            forward_context: ForwardContext = get_forward_context()
            num_tokens = query.shape[0]
            if forward_context.capturing:
-                # Get workspace from cache or calculate it if not present.
-                workspace = graph_params.workspaces.get(num_tokens)
-                if workspace is None:
-                    workspace = torch_npu._npu_paged_attention_get_workspace(
-                        query=query,
-                        key_cache=self.key_cache,
-                        value_cache=self.value_cache,
-                        num_kv_heads=self.num_kv_heads,
-                        num_heads=self.num_heads,
-                        scale_value=self.scale,
-                        block_table=attn_metadata.block_tables,
-                        context_lens=attn_metadata.seq_lens,
-                        out=output)
-                    update_graph_params_workspaces(num_tokens, workspace)
-
-                # Handle graph capturing mode
                stream = torch_npu.npu.current_stream()

                event = torch.npu.ExternalEvent()
                event.wait(stream)
                event.reset(stream)
                graph_params.events[num_tokens].append(event)
+
                graph_params.attn_params[num_tokens].append((
                    query,
                    self.key_cache,
@@ -429,7 +413,6 @@ class AscendAttentionBackendImpl(AttentionImpl):
                ))

                torch.npu.graph_task_group_begin(stream)
-
                torch_npu._npu_paged_attention(
                    query=query,
                    key_cache=self.key_cache,
@@ -439,8 +422,7 @@ class AscendAttentionBackendImpl(AttentionImpl):
                    scale_value=self.scale,
                    block_table=attn_metadata.block_tables,
                    context_lens=attn_metadata.seq_lens,
-                    out=output,
-                    workspace=workspace)
+                    out=output)
                handle = torch.npu.graph_task_group_end(stream)
                graph_params.handles[num_tokens].append(handle)
            else:
--- a/vllm_ascend/compilation/acl_graph.py
+++ b/vllm_ascend/compilation/acl_graph.py
@@ -215,17 +215,15 @@ def update_attn_params(update_stream, forward_context, runtime_shape):

        with torch.npu.stream(update_stream):
            torch.npu.graph_task_update_begin(update_stream, handle)
-            torch_npu._npu_paged_attention(
-                query=query,
-                key_cache=key_cache,
-                value_cache=value_cache,
-                num_kv_heads=num_kv_heads,
-                num_heads=num_heads,
-                scale_value=scale,
-                block_table=block_table,
-                context_lens=seq_lens,
-                out=output,
-                workspace=graph_params.workspaces.get(runtime_shape))
+            torch_npu._npu_paged_attention(query=query,
+                                           key_cache=key_cache,
+                                           value_cache=value_cache,
+                                           num_kv_heads=num_kv_heads,
+                                           num_heads=num_heads,
+                                           scale_value=scale,
+                                           block_table=block_table,
+                                           context_lens=seq_lens,
+                                           out=output)
            torch.npu.graph_task_update_end(update_stream)

            event.record(update_stream)
@@ -258,11 +256,5 @@ def set_graph_params(aclgraph_capture_sizes: set[int]):
    )


-def update_graph_params_workspaces(num_tokens: int, workspace: int):
-    global _graph_params
-    if _graph_params is not None:
-        _graph_params.workspaces[num_tokens] = workspace
-
-
 def get_graph_params():
    return _graph_params
--- a/vllm_ascend/ops/layernorm.py
+++ b/vllm_ascend/ops/layernorm.py
@@ -15,10 +15,9 @@
 # This file is a part of the vllm-ascend project.
 #

-from typing import Optional, Tuple, Union
+from typing import Optional, Tuple, Union, cast

 import torch
-from vllm.config import get_current_vllm_config
 from vllm.forward_context import get_forward_context
 from vllm.model_executor.layers.layernorm import GemmaRMSNorm, RMSNorm

@@ -28,7 +27,6 @@ def _addrmsnorm_forward_oot(
    x: torch.Tensor,
    residual: torch.Tensor,
    layer: Optional[torch.nn.Module] = None,
-    bias: Optional[torch.nn.Parameter] = None,
 ) -> Union[torch.Tensor, Tuple[torch.Tensor, torch.Tensor]]:
    import torch_npu

@@ -41,7 +39,6 @@ def _addrmsnorm_forward_oot(
            self.weight,
            layer.aclnn_input_scale,
            layer.aclnn_input_offset,
-            beta=bias,
            epsilon=self.variance_epsilon)
    else:
        if is_310p():
@@ -53,31 +50,12 @@ def _addrmsnorm_forward_oot(
        else:
            x, _, residual = torch_npu.npu_add_rms_norm(
                x, residual, self.weight, self.variance_epsilon)
-        if bias is not None:
-            x.add_(bias)
    torch.ops.vllm.maybe_wait_prefetch_done(x)
    return x, residual


 class AscendRMSNorm(RMSNorm):

-    def __init__(
-        self,
-        hidden_size: int,
-        eps: float = 1e-6,
-        var_hidden_size: Optional[int] = None,
-        has_weight: bool = True,
-        dtype: Optional[torch.dtype] = None,
-    ) -> None:
-        super().__init__(hidden_size, eps, var_hidden_size, has_weight, dtype)
-        vllm_config = get_current_vllm_config()
-        self.bias = None
-        # quantization with anti_method m4 will generate none-zero norm bias
-        if vllm_config is not None and vllm_config.quant_config is not None and \
-                any("norm.bias" in name for name in vllm_config.quant_config.quant_description.keys()):
-            self.bias = torch.nn.Parameter(torch.zeros(hidden_size),
-                                           requires_grad=False)
-
    def forward_oot(
        self,
        x: torch.Tensor,
@@ -89,13 +67,10 @@ class AscendRMSNorm(RMSNorm):
            residual = torch.ops.vllm.maybe_chunk_residual(x, residual)
            assert x.size(0) == residual.size(0)
            x, residual = _addrmsnorm_forward_oot(
-                self, x, residual, self.next_need_quant_fusion_linear,
-                self.bias)
+                self, x, residual, self.next_need_quant_fusion_linear)
            return x, residual
        x, residual = torch_npu.npu_rms_norm(x, self.weight,
                                             self.variance_epsilon)
-        if self.bias is not None:
-            x.add_(self.bias)
        return x

    @property
@@ -125,13 +100,6 @@ class AscendRMSNorm(RMSNorm):
            # does not need to be repeated
            if not forward_context.prefetch_mlp_enabled:
                forward_context.layer_idx += 1
-        elif fusion_linear == "qkv_moe":
-            next_linear = model_instance.model.layers[
-                layer_idx].self_attn.qkv_proj
-            forward_context.fusion_linear = "gate_moe"
-        elif fusion_linear == "gate_moe":
-            forward_context.fusion_linear = "qkv_moe"
-            forward_context.layer_idx += 1
        from vllm_ascend.quantization.w8a8 import AscendW8A8LinearMethod
        if next_linear is not None and \
            not isinstance(next_linear.quant_method.quant_method, AscendW8A8LinearMethod):
@@ -139,6 +107,31 @@ class AscendRMSNorm(RMSNorm):
        return next_linear


+class AscendQuantRMSNorm(AscendRMSNorm):
+
+    def __init__(
+        self,
+        hidden_size: int,
+        eps: float = 1e-6,
+        var_hidden_size: Optional[int] = None,
+        has_weight: bool = True,
+        dtype: Optional[torch.dtype] = None,
+    ) -> None:
+        super().__init__(hidden_size, eps, var_hidden_size, has_weight, dtype)
+        self.bias = torch.nn.Parameter(torch.zeros(hidden_size),
+                                       requires_grad=False)
+
+    def forward_oot(
+        self,
+        x: torch.Tensor,
+        residual: Optional[torch.Tensor] = None,
+    ) -> Union[torch.Tensor, Tuple[torch.Tensor, torch.Tensor]]:
+        if residual is not None:
+            x, residual = super().forward_oot(x, residual)
+            return x.add_(self.bias), residual
+        return cast(torch.Tensor, super().forward_oot(x)).add_(self.bias)
+
+
 class AscendGemmaRMSNorm(GemmaRMSNorm):

    def forward_oot(
--- a/vllm_ascend/utils.py
+++ b/vllm_ascend/utils.py
@@ -501,7 +501,8 @@ def register_ascend_customop(vllm_config: Optional[VllmConfig] = None):
    from vllm_ascend.ops.activation import AscendQuickGELU, AscendSiluAndMul
    from vllm_ascend.ops.common_fused_moe import (AscendFusedMoE,
                                                  AscendSharedFusedMoE)
-    from vllm_ascend.ops.layernorm import AscendGemmaRMSNorm, AscendRMSNorm
+    from vllm_ascend.ops.layernorm import (AscendGemmaRMSNorm,
+                                           AscendQuantRMSNorm, AscendRMSNorm)
    from vllm_ascend.ops.linear import (AscendColumnParallelLinear,
                                        AscendMergedColumnParallelLinear,
                                        AscendQKVParallelLinear,
@@ -532,6 +533,11 @@ def register_ascend_customop(vllm_config: Optional[VllmConfig] = None):
        "MultiHeadLatentAttention": AscendMultiHeadLatentAttention,
    }

+    if vllm_config is not None and \
+        vllm_config.quant_config is not None and \
+        any("norm.bias" in name for name in vllm_config.quant_config.quant_description.keys()):
+        REGISTERED_ASCEND_OPS["RMSNorm"] = AscendQuantRMSNorm
+
    for name, op_cls in REGISTERED_ASCEND_OPS.items():
        CustomOp.register_oot(_decorated_op_cls=op_cls, name=name)