[refactor] refactor deepseek-related files (#2849)

### What this PR does / why we need it? This PR deletes ~2K lines of code about deepseek modeling. It falls back CustomDeepseekV2 modules to original vllm implementations and adapts some modifications in vllm about deepseek and moe. ### Does this PR introduce _any_ user-facing change? No. ### How was this patch tested? E2E vllm serving with torchair graph mode and eager mode. - vLLM version: v0.10.2 - vLLM main: 759ef49b15 --------- Signed-off-by: linfeng-yuan <1102311262@qq.com> Signed-off-by: Yizhou Liu <liu_yizhou@outlook.com> Co-authored-by: yiz-liu <136800916+yiz-liu@users.noreply.github.com> Co-authored-by: Yizhou Liu <liu_yizhou@outlook.com>
2025-09-16 14:13:07 +08:00
parent 18ca7861f6
commit 1c5900327b
18 changed files with 295 additions and 1899 deletions
--- a/vllm_ascend/torchair/torchair_model_runner.py
+++ b/vllm_ascend/torchair/torchair_model_runner.py
@@ -49,11 +49,17 @@ from vllm_ascend.worker.model_runner_v1 import NPUModelRunner
 class NPUTorchairModelRunner(NPUModelRunner):

    def __init__(self, vllm_config: VllmConfig, device: torch.device):
+        ascend_config = get_ascend_config()
+        self.enable_shared_expert_dp = ascend_config.enable_shared_expert_dp
        super().__init__(vllm_config, device)
        self.attn_metadata_builder = self.attn_backend.get_builder_cls()(
            None, None, vllm_config, device)

-        ascend_config = get_ascend_config()
+        register_torchair_model()
+        torchair_ops_patch()
+        torchair_quant_method_register()
+        if self.enable_shared_expert_dp:
+            return
        self.new_kv_cache_bytes = -1
        self.torchair_compiled_model = None  # type: ignore
        self.torchair_compiled_models = {}  # type: ignore
@@ -72,14 +78,14 @@ class NPUTorchairModelRunner(NPUModelRunner):
            recompiles=envs_ascend.VLLM_ASCEND_TRACE_RECOMPILES)

        self._check_batch_sizes_consistency()
-        register_torchair_model()
-        torchair_ops_patch()
-        torchair_quant_method_register()

    def _sync_metadata_across_dp(
            self, num_tokens: int, with_prefill: bool, enable_dbo: bool
    ) -> tuple[int, Optional[torch.Tensor], bool, bool]:
        """Override from NPUModelRunner to pad num_tokens"""
+        if self.enable_shared_expert_dp:
+            return super()._sync_metadata_across_dp(num_tokens, with_prefill,
+                                                    enable_dbo)
        if self.dp_size == 1:
            if not with_prefill:
                maybe_padded_num_tokens = self.select_torchair_padded_batch_size(
@@ -115,7 +121,10 @@ class NPUTorchairModelRunner(NPUModelRunner):
    def _build_attention_metadata(self, with_prefill, num_reqs, skip_attn):
        # NOTE: If torchair graph mode and not with_prefill,
        # we can't skip_attn, it will cause graph recompile.
-        if not with_prefill:
+        if with_prefill or self.enable_shared_expert_dp:
+            attn_metadata = super()._build_attention_metadata(
+                with_prefill, num_reqs, skip_attn)
+        else:
            common_attn_metadata = TorchairCommonAttentionMetadata(
                num_reqs=num_reqs,
                num_actual_tokens=1,
@@ -126,17 +135,19 @@ class NPUTorchairModelRunner(NPUModelRunner):
            )
            attn_metadata = self.attn_metadata_builder.build_torchair_graph_dummy(
                common_attn_metadata)
-        else:
-            attn_metadata = super()._build_attention_metadata(
-                with_prefill, num_reqs, skip_attn)
        return attn_metadata

    def _generate_dummy_run_hidden_states(self, with_prefill,
                                          is_torchair_compile, input_ids,
                                          positions, attn_metadata, num_tokens,
                                          intermediate_tensors, inputs_embeds):
-
-        if not with_prefill:
+        if with_prefill or self.enable_shared_expert_dp:
+            if is_310p():
+                converting_weight_acl_format(self.model, ACL_FORMAT_FRACTAL_ND)
+            hidden_states = super()._generate_dummy_run_hidden_states(
+                with_prefill, is_torchair_compile, input_ids, positions,
+                attn_metadata, num_tokens, intermediate_tensors, inputs_embeds)
+        else:
            # Only mark static while compiling
            if is_torchair_compile:
                torch._dynamo.mark_static(input_ids)
@@ -168,15 +179,11 @@ class NPUTorchairModelRunner(NPUModelRunner):
                inputs_embeds=None,
                **model_kwargs,
            )
-        else:
-            if is_310p():
-                converting_weight_acl_format(self.model, ACL_FORMAT_FRACTAL_ND)
-            hidden_states = super()._generate_dummy_run_hidden_states(
-                with_prefill, is_torchair_compile, input_ids, positions,
-                attn_metadata, num_tokens, intermediate_tensors, inputs_embeds)
        return hidden_states

    def _convert_torch_format(self, kv_cache):
+        if self.enable_shared_expert_dp:
+            return super()._convert_torch_format(kv_cache)
        kv_cache = torch_npu.npu_format_cast(kv_cache, ACL_FORMAT_FRACTAL_ND)
        return kv_cache

@@ -194,6 +201,8 @@ class NPUTorchairModelRunner(NPUModelRunner):

    def _capture_model(self):
        """Override from NPUModelRunner to use torchair graph capture."""
+        if self.enable_shared_expert_dp:
+            return super()._capture_model()
        # TODO(NeverRaR): Calling graph_capture(device=self.device) in
        # torchair graph capture can cause some issues, so now we just
        # temporarily split the codepath for the two different graph patterns.
@@ -233,6 +242,8 @@ class NPUTorchairModelRunner(NPUModelRunner):
                                         self.new_kv_cache_bytes)

    def _use_aclgraph(self) -> bool:
+        if self.enable_shared_expert_dp:
+            return super()._use_aclgraph()
        return False

    def _check_batch_sizes_consistency(self) -> None:
@@ -258,10 +269,10 @@ class NPUTorchairModelRunner(NPUModelRunner):
            )

    def _update_graph_pad_size(self, with_prefill, graph_pad_size):
-        if not with_prefill:
-            self.graph_pad_size = graph_pad_size
-        else:
+        if with_prefill or self.enable_shared_expert_dp:
            super()._update_graph_pad_size(with_prefill, graph_pad_size)
+        else:
+            self.graph_pad_size = graph_pad_size

    def _update_input_ids_and_positions(self, input_ids, positions,
                                        num_input_tokens, with_prefill,
@@ -271,7 +282,9 @@ class NPUTorchairModelRunner(NPUModelRunner):
            input_ids, positions, num_input_tokens, with_prefill,
            padded_num_tokens_across_dp)

-        if not with_prefill:
+        if with_prefill or self.enable_shared_expert_dp:
+            return input_ids, positions
+        else:
            input_ids = self.input_ids[:padded_num_tokens_across_dp]
            positions = self.positions[:padded_num_tokens_across_dp]
        return input_ids, positions
@@ -284,6 +297,10 @@ class NPUTorchairModelRunner(NPUModelRunner):
        if attn_metadata is not None and isinstance(attn_metadata, dict):
            attn_metadata = attn_metadata['model.layers.0.self_attn.attn']

+        if self.enable_shared_expert_dp:
+            return super()._generate_process_reqs_hidden_states(
+                attn_metadata, with_prefill, padded_num_tokens_across_dp,
+                input_ids, positions, intermediate_tensors, inputs_embeds)
        model_kwargs = {
            "kv_caches": self.kv_caches,
            "attn_metadata": attn_metadata
@@ -468,8 +485,7 @@ class NPUTorchairModelRunner(NPUModelRunner):
            self.torchair_graph_batch_sizes = new_graph_batch_sizes

    def _build_drafter_prepare_inputs_torchair_param(self):
-        return True
-
-    def get_dp_padding(self, num_tokens):
-        """Override from NPUModelRunner to get dp padding"""
-        return 0, None
+        if self.enable_shared_expert_dp:
+            return super()._build_drafter_prepare_inputs_torchair_param()
+        else:
+            return True