From a955e5d4046e4e3a55976e88774073b3c56b463b Mon Sep 17 00:00:00 2001
From: Wang Yixuan <88923622+hust17yixuan@users.noreply.github.com>
Date: Thu, 28 Aug 2025 09:10:03 +0800
Subject: [PATCH] [4/N][refactor]delete torchair from quantization (#2535)

### What this PR does / why we need it?
After moved torchair related quantization section into
torchair_quantization, split the torchair from the origin quantization

### Does this PR introduce _any_ user-facing change?
NO

### How was this patch tested?
vLLM version: main
vLLM main:
https://github.com/vllm-project/vllm/commit/ab9f2cfd1942f7ddfee658ce86ea96b4789862af


- vLLM version: v0.10.1.1
- vLLM main:
https://github.com/vllm-project/vllm/commit/69244e67e6822f1c15816f887659e1ccc18c2632

Signed-off-by: hust17yixuan <303660421@qq.com>
---
 tests/ut/quantization/test_w4a8_dynamic.py |  8 ++---
 vllm_ascend/quantization/w4a8_dynamic.py   | 16 +++-------
 vllm_ascend/quantization/w8a8_dynamic.py   | 34 +++++++---------------
 3 files changed, 16 insertions(+), 42 deletions(-)

diff --git a/tests/ut/quantization/test_w4a8_dynamic.py b/tests/ut/quantization/test_w4a8_dynamic.py
index 7bee119..d7fdf82 100644
--- a/tests/ut/quantization/test_w4a8_dynamic.py
+++ b/tests/ut/quantization/test_w4a8_dynamic.py
@@ -39,14 +39,10 @@ class TestAscendW4A8DynamicFusedMoEMethod(TestBase):
 
     @patch('vllm_ascend.quantization.w4a8_dynamic.get_current_vllm_config')
     @patch('vllm_ascend.quantization.w4a8_dynamic.get_ep_group')
-    @patch("vllm_ascend.ascend_config.get_ascend_config")
     @patch('vllm_ascend.quantization.w4a8_dynamic.get_mc2_group')
     @patch('torch.distributed.get_rank', return_value=0)
-    def setUp(self, mock_get_rank, mock_get_mc2_group, mock_get_ascend_config,
-              mock_get_ep_group, get_current_vllm_config):
-        mock_ascend_config = Mock()
-        mock_ascend_config.torchair_graph_config = Mock(enabled=False)
-        mock_get_ascend_config.return_value = mock_ascend_config
+    def setUp(self, mock_get_rank, mock_get_mc2_group, mock_get_ep_group,
+              get_current_vllm_config):
         mock_vllm_config = Mock()
         mock_vllm_config.quant_config = Mock(quant_description={
             "group_size": self.group_size,
diff --git a/vllm_ascend/quantization/w4a8_dynamic.py b/vllm_ascend/quantization/w4a8_dynamic.py
index a724615..329b3eb 100644
--- a/vllm_ascend/quantization/w4a8_dynamic.py
+++ b/vllm_ascend/quantization/w4a8_dynamic.py
@@ -24,13 +24,11 @@ from vllm.config import get_current_vllm_config
 from vllm.distributed import get_ep_group
 from vllm.forward_context import get_forward_context
 
-from vllm_ascend.ascend_config import get_ascend_config
 from vllm_ascend.ascend_forward_context import FusedMoEState
 from vllm_ascend.distributed.parallel_state import get_mc2_group
 from vllm_ascend.ops.layers.experts_selector import select_experts
 from vllm_ascend.quantization.w8a8_dynamic import (fused_experts_with_all2all,
                                                    fused_experts_with_mc2)
-from vllm_ascend.torchair.utils import npu_stream_switch, npu_wait_tensor
 
 
 class AscendW4A8DynamicLinearMethod:
@@ -133,9 +131,6 @@ class AscendW4A8DynamicFusedMoEMethod:
 
         self.ep_group = get_ep_group()
 
-        ascend_config = get_ascend_config()
-        self.torchair_graph_enabled = ascend_config.torchair_graph_config.enabled
-
         vllm_config = get_current_vllm_config()
         self.group_size = vllm_config.quant_config.quant_description.get(
             "group_size", 256)
@@ -284,12 +279,10 @@ class AscendW4A8DynamicFusedMoEMethod:
         fused_moe_state = get_forward_context().fused_moe_state
         shared_gate_up, shared_dequant_scale = None, None
         if shared_experts is not None and fused_moe_state == FusedMoEState.MC2:
-            with npu_stream_switch("moe_secondary", 0):
-                npu_wait_tensor(quantized_x_for_share, router_logits)
-                share_up_out, _ = shared_experts.gate_up_proj(
-                    (quantized_x_for_share, dynamic_scale_for_share))
-                shared_gate_up, shared_dequant_scale = share_up_out[
-                    0], share_up_out[1]
+            share_up_out, _ = shared_experts.gate_up_proj(
+                (quantized_x_for_share, dynamic_scale_for_share))
+            shared_gate_up, shared_dequant_scale = share_up_out[
+                0], share_up_out[1]
 
         # this is a naive implementation for experts load balance so as
         # to avoid accumulating too much tokens on a single rank.
@@ -315,7 +308,6 @@ class AscendW4A8DynamicFusedMoEMethod:
                 log2phy=log2phy,
                 global_redundant_expert_num=global_redundant_expert_num,
                 shared_experts=shared_experts,
-                is_torchair=self.torchair_graph_enabled,
                 quantized_x_for_share=shared_gate_up,
                 dynamic_scale_for_share=shared_dequant_scale,
                 mc2_mask=kwargs.get("mc2_mask", None))
diff --git a/vllm_ascend/quantization/w8a8_dynamic.py b/vllm_ascend/quantization/w8a8_dynamic.py
index cba090b..1d6a61b 100644
--- a/vllm_ascend/quantization/w8a8_dynamic.py
+++ b/vllm_ascend/quantization/w8a8_dynamic.py
@@ -24,11 +24,9 @@ from vllm.distributed import GroupCoordinator, get_ep_group
 from vllm.forward_context import get_forward_context
 
 import vllm_ascend.envs as envs_ascend
-from vllm_ascend.ascend_config import get_ascend_config
 from vllm_ascend.ascend_forward_context import FusedMoEState
 from vllm_ascend.distributed.parallel_state import get_mc2_group
 from vllm_ascend.ops.layers.experts_selector import select_experts
-from vllm_ascend.torchair.utils import npu_stream_switch, npu_wait_tensor
 from vllm_ascend.utils import (ACL_FORMAT_FRACTAL_NZ, AscendSocVersion,
                                dispose_tensor, get_ascend_soc_version)
 
@@ -213,7 +211,6 @@ def fused_experts_with_mc2(
     log2phy: torch.Tensor = None,
     global_redundant_expert_num: int = 0,
     shared_experts: Optional[Any] = None,
-    is_torchair: bool = False,
     quantized_x_for_share: Optional[Any] = None,
     dynamic_scale_for_share: Optional[Any] = None,
     mc2_mask: Optional[torch.Tensor] = None,
@@ -232,8 +229,7 @@ def fused_experts_with_mc2(
     ep_world_size = ep_group.world_size
 
     # NOTE: Currently, when in A3 or in torchair graph, we need to pass in some extra param into dispatch & combine
-    need_extra_args = (get_ascend_soc_version() == AscendSocVersion.A3
-                       or is_torchair)
+    need_extra_args = (get_ascend_soc_version() == AscendSocVersion.A3)
 
     # NOTE: Currently, when in A3, we need to pass in some extra param into dispatch & combine
     a3_need_extra_args = get_ascend_soc_version() == AscendSocVersion.A3
@@ -282,11 +278,9 @@ def fused_experts_with_mc2(
         0:5]
 
     if shared_experts is not None:
-        with npu_stream_switch("moe_secondary", 0):
-            npu_wait_tensor(shared_gate_up, expand_x)
-            shared_act_out = shared_experts.act_fn(
-                (shared_gate_up, shared_dequant_scale))
-            shared_act, swiglu_out_scale = shared_act_out[0], shared_act_out[1]
+        shared_act_out = shared_experts.act_fn(
+            (shared_gate_up, shared_dequant_scale))
+        shared_act, swiglu_out_scale = shared_act_out[0], shared_act_out[1]
 
     # `expand_x` will be disposed in the `apply_mlp` function
     if w1_scale_bias is None:
@@ -358,10 +352,8 @@ def fused_experts_with_mc2(
     if shared_experts is None:
         return hidden_states
     else:
-        with npu_stream_switch("moe_secondary", 0):
-            npu_wait_tensor(shared_act, down_out_list)
-            shared_output, _ = shared_experts.down_proj(
-                (shared_act, swiglu_out_scale))
+        shared_output, _ = shared_experts.down_proj(
+            (shared_act, swiglu_out_scale))
         return hidden_states, shared_output
 
 
@@ -806,9 +798,6 @@ class AscendW8A8DynamicFusedMoEMethod:
 
         self.ep_group = get_ep_group()
 
-        ascend_config = get_ascend_config()
-        self.torchair_graph_enabled = ascend_config.torchair_graph_config.enabled
-
         try:
             device_group = get_mc2_group().device_group
             # TODO: Try local_rank = ep_group.rank_in_group
@@ -904,12 +893,10 @@ class AscendW8A8DynamicFusedMoEMethod:
         fused_moe_state = get_forward_context().fused_moe_state
         shared_gate_up, shared_dequant_scale = None, None
         if shared_experts is not None and fused_moe_state == FusedMoEState.MC2:
-            with npu_stream_switch("moe_secondary", 0):
-                npu_wait_tensor(quantized_x_for_share, router_logits)
-                share_up_out, _ = shared_experts.gate_up_proj(
-                    (quantized_x_for_share, dynamic_scale_for_share))
-                shared_gate_up, shared_dequant_scale = share_up_out[
-                    0], share_up_out[1]
+            share_up_out, _ = shared_experts.gate_up_proj(
+                (quantized_x_for_share, dynamic_scale_for_share))
+            shared_gate_up, shared_dequant_scale = share_up_out[
+                0], share_up_out[1]
 
         # this is a naive implementation for experts load balance so as
         # to avoid accumulating too much tokens on a single rank.
@@ -944,7 +931,6 @@ class AscendW8A8DynamicFusedMoEMethod:
                 log2phy=log2phy,
                 global_redundant_expert_num=global_redundant_expert_num,
                 shared_experts=shared_experts,
-                is_torchair=self.torchair_graph_enabled,
                 mc2_mask=kwargs.get("mc2_mask", None),
                 shared_gate_up=shared_gate_up,
                 shared_dequant_scale=shared_dequant_scale)