[refactor] Refactoring AscendFusedMoE (#1229)

### What this PR does / why we need it? This PR resolves [issue 1147](https://github.com/vllm-project/vllm-ascend/issues/1147). 1. Move the fused_moe code into one file, `fused_moe.py`. 2. Integrate the branch conditions into the function `get_fused_moe_state`.  ### Does this PR introduce _any_ user-facing change? 1. This PR removes the env `VLLM_ENABLE_MC2`: the env is unnecessary because the same decision can be made from the current scenario, and keeping it only adds complexity. 2. This PR removes the env `USING_LCCL_COM`, because this env is already obsolete. 3. `additional_config.expert_tensor_parallel_size` is already obsolete; the `enable_expert_parallel` parameter is now used instead, consistent with vLLM.  ### How was this patch tested?  Signed-off-by: zzzzwwjj <1183291235@qq.com>
2025-06-17 17:49:03 +08:00
parent 05dec7eda9
commit 23ca68d0c8
9 changed files with 150 additions and 204 deletions
--- a/vllm_ascend/worker/model_runner_v1.py
+++ b/vllm_ascend/worker/model_runner_v1.py
@@ -348,15 +348,10 @@ class NPUModelRunner(LoRAModelRunnerMixin):
            self.init_torchair_graph_batch_sizes()

        if len(self.torchair_graph_batch_sizes) == 0:
-            #If MC2 is enabled, torchair_graph_batch_size should pad to tp_size
-            if envs_ascend.VLLM_ENABLE_MC2:
-                self.torchair_graph_batch_sizes = [
-                    self.scheduler_config.max_num_seqs
-                ]
-            else:
-                self.torchair_graph_batch_sizes = [
-                    1, self.scheduler_config.max_num_seqs
-                ]
+            # TODO(zzzzwwjj): check torchair_graph_batch_sizes init code
+            self.torchair_graph_batch_sizes = [
+                self.scheduler_config.max_num_seqs
+            ]

        torch._dynamo.cache_size.config.cache_size_limit += len(
            self.torchair_graph_batch_sizes)
@@ -569,10 +564,12 @@ class NPUModelRunner(LoRAModelRunnerMixin):
            self.input_batch.refresh_sampling_metadata()

    def _get_forward_metadata_across_dp(
-            self, batch_size: int, with_prefill: bool) -> tuple[int, bool]:
-        forward_metadata = torch.tensor([batch_size, with_prefill],
-                                        device="cpu",
-                                        dtype=torch.int32)
+            self, total_num_scheduled_tokens: int,
+            with_prefill: bool) -> tuple[int, bool]:
+        forward_metadata = torch.tensor(
+            [total_num_scheduled_tokens, with_prefill],
+            device="cpu",
+            dtype=torch.int32)
        dist.all_reduce(forward_metadata,
                        op=ReduceOp.MAX,
                        group=get_dp_group().cpu_group)
@@ -901,11 +898,11 @@ class NPUModelRunner(LoRAModelRunnerMixin):
        if self.dp_size > 1:
            max_num_tokens, with_prefill = self._get_forward_metadata_across_dp(
                total_num_scheduled_tokens, with_prefill)
+            extra_builder_kwargs['max_num_tokens_across_dp'] = max_num_tokens
            extra_builder_kwargs['with_prefill_across_dp'] = with_prefill

        # Add graph_pad_size here
-        if envs_ascend.VLLM_ENABLE_MC2 or (self.torchair_graph_enabled
-                                           and not with_prefill):
+        if self.torchair_graph_enabled and not with_prefill:
            if self.dp_size > 1:
                padded_batch_size = self.select_torchair_padded_batch_size(
                    max_num_tokens)
@@ -984,8 +981,7 @@ class NPUModelRunner(LoRAModelRunnerMixin):
        else:
            positions = self.positions[:num_input_tokens]

-        if (envs_ascend.VLLM_ENABLE_MC2
-                or self.torchair_graph_enabled) and not with_prefill:
+        if self.torchair_graph_enabled and not with_prefill:
            input_ids = self.input_ids[:padded_batch_size]
            positions = self.positions[:padded_batch_size]

@@ -1885,20 +1881,15 @@ class NPUModelRunner(LoRAModelRunnerMixin):
        return spec_token_ids

    def init_torchair_graph_batch_sizes(self):
+        start_graph_batch_size = 4
        tp_size = get_tensor_model_parallel_world_size()
-        batch_size_step = 8
-        largest_batch_size = 1

-        if envs_ascend.VLLM_ENABLE_MC2:
-            batch_size_step = max(batch_size_step, tp_size)
-            largest_batch_size = batch_size_step
-        while (largest_batch_size < 8):
-            self.torchair_graph_batch_sizes.append(largest_batch_size)
-            largest_batch_size *= 2
+        # NOTE: When using all2all or mc2, we need to slice the `num_tokens` dimension into `tp_size` blocks
+        start_graph_batch_size = max(start_graph_batch_size, tp_size)

-        while (largest_batch_size <= self.scheduler_config.max_num_seqs):
-            self.torchair_graph_batch_sizes.append(largest_batch_size)
-            largest_batch_size += batch_size_step
+        while (start_graph_batch_size <= self.scheduler_config.max_num_seqs):
+            self.torchair_graph_batch_sizes.append(start_graph_batch_size)
+            start_graph_batch_size *= 2

    def select_torchair_padded_batch_size(self, batch_size: int):
        selected_batch_size = self.max_num_reqs
--- a/vllm_ascend/worker/worker_v1.py
+++ b/vllm_ascend/worker/worker_v1.py
@@ -38,7 +38,6 @@ from vllm.v1.kv_cache_interface import KVCacheConfig, KVCacheSpec
 from vllm.v1.outputs import ModelRunnerOutput
 from vllm.v1.worker.worker_base import WorkerBase

-import vllm_ascend.envs as envs_ascend
 from vllm_ascend.ascend_config import init_ascend_config
 from vllm_ascend.device_allocator.camem import CaMemAllocator
 from vllm_ascend.distributed.parallel_state import init_ascend_model_parallel
@@ -247,15 +246,15 @@ class NPUWorker(WorkerBase):

    def execute_dummy_batch(self) -> None:
        runner = self.model_runner
-        num_tokens = 1
+        max_num_tokens = 1
+        with_prefill = False
        if runner.dp_size > 1:
            max_num_tokens, with_prefill = runner._get_forward_metadata_across_dp(
-                1, False)
-        if envs_ascend.VLLM_ENABLE_MC2 or runner.torchair_graph_enabled:
-            if not with_prefill:
-                num_tokens = max_num_tokens
-            num_tokens = runner.select_torchair_padded_batch_size(num_tokens)
-        runner._dummy_run(num_tokens,
+                max_num_tokens, with_prefill)
+        if runner.torchair_graph_enabled and not with_prefill:
+            max_num_tokens = runner.select_torchair_padded_batch_size(
+                max_num_tokens)
+        runner._dummy_run(max_num_tokens,
                          is_compile=False,
                          with_prefill=with_prefill)