[refactor] Refactoring AscendFusedMoE (#1229)

### What this PR does / why we need it? This PR resolves [issue 1147](https://github.com/vllm-project/vllm-ascend/issues/1147): 1. Move the fused_moe code into a single file, `fused_moe.py`. 2. Consolidate the branch conditions into the function `get_fused_moe_state`.  ### Does this PR introduce _any_ user-facing change? 1. This PR removes the env `VLLM_ENABLE_MC2`. I think this env is unnecessary: the decision can be made from the current scenario without it, and keeping it only adds complexity. 2. This PR removes the env `USING_LCCL_COM`, because this env is already obsolete. 3. `additional_config.expert_tensor_parallel_size` is already obsolete; we now use the `enable_expert_parallel` parameter, consistent with vLLM.  ### How was this patch tested?  Signed-off-by: zzzzwwjj <1183291235@qq.com>
2025-06-17 17:49:03 +08:00
parent 05dec7eda9
commit 23ca68d0c8
9 changed files with 150 additions and 204 deletions
--- a/vllm_ascend/worker/worker_v1.py
+++ b/vllm_ascend/worker/worker_v1.py
@@ -38,7 +38,6 @@ from vllm.v1.kv_cache_interface import KVCacheConfig, KVCacheSpec
 from vllm.v1.outputs import ModelRunnerOutput
 from vllm.v1.worker.worker_base import WorkerBase

-import vllm_ascend.envs as envs_ascend
 from vllm_ascend.ascend_config import init_ascend_config
 from vllm_ascend.device_allocator.camem import CaMemAllocator
 from vllm_ascend.distributed.parallel_state import init_ascend_model_parallel
@@ -247,15 +246,15 @@ class NPUWorker(WorkerBase):

    def execute_dummy_batch(self) -> None:
        runner = self.model_runner
-        num_tokens = 1
+        max_num_tokens = 1
+        with_prefill = False
        if runner.dp_size > 1:
            max_num_tokens, with_prefill = runner._get_forward_metadata_across_dp(
-                1, False)
-        if envs_ascend.VLLM_ENABLE_MC2 or runner.torchair_graph_enabled:
-            if not with_prefill:
-                num_tokens = max_num_tokens
-            num_tokens = runner.select_torchair_padded_batch_size(num_tokens)
-        runner._dummy_run(num_tokens,
+                max_num_tokens, with_prefill)
+        if runner.torchair_graph_enabled and not with_prefill:
+            max_num_tokens = runner.select_torchair_padded_batch_size(
+                max_num_tokens)
+        runner._dummy_run(max_num_tokens,
                          is_compile=False,
                          with_prefill=with_prefill)