[refactor] refactor deepseek-related files (#2849)

### What this PR does / why we need it?
This PR deletes ~2K lines of DeepSeek modeling code. It falls back from the
CustomDeepseekV2 modules to the original vLLM implementations and adapts to
the related changes in vLLM around DeepSeek and MoE.
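The key mechanism: the torchair model runner now reads `enable_shared_expert_dp` from the ascend config in `__init__`, and every torchair-specific override delegates back to the base `NPUModelRunner` path when that flag is set, as the diff below shows. A minimal sketch of the pattern (simplified names, not the literal vllm-ascend code):

```python
class BaseRunner:
    def _capture_model(self):
        # Base (eager / ACL-graph) capture path.
        print("base capture")


class TorchairRunner(BaseRunner):
    def __init__(self, enable_shared_expert_dp: bool):
        self.enable_shared_expert_dp = enable_shared_expert_dp

    def _capture_model(self):
        # With shared-expert DP enabled, skip the torchair-specific
        # branch and fall back to the parent implementation.
        if self.enable_shared_expert_dp:
            return super()._capture_model()
        print("torchair graph capture")
```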
### Does this PR introduce _any_ user-facing change?
No.
### How was this patch tested?
E2E vLLM serving with torchair graph mode and eager mode.
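For reference, a smoke test along these lines can be driven through vLLM's `additional_config` plumbing; the `torchair_graph_config` knob below is the usual vllm-ascend switch and the model path is a placeholder, so treat this as a sketch rather than the exact test that was run:

```python
from vllm import LLM

# Exercise both code paths: torchair graph mode on and off (eager).
for graph_enabled in (True, False):
    llm = LLM(
        model="deepseek-ai/DeepSeek-V2-Lite",  # placeholder model
        additional_config={"torchair_graph_config": {"enabled": graph_enabled}},
    )
    print(llm.generate(["Hello, my name is"]))
```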

- vLLM version: v0.10.2
- vLLM main: 759ef49b15

---------

Signed-off-by: linfeng-yuan <1102311262@qq.com>
Signed-off-by: Yizhou Liu <liu_yizhou@outlook.com>
Co-authored-by: yiz-liu <136800916+yiz-liu@users.noreply.github.com>
Co-authored-by: Yizhou Liu <liu_yizhou@outlook.com>
linfeng-yuan authored 2025-09-16 14:13:07 +08:00, committed by GitHub
parent 18ca7861f6, commit 1c5900327b
18 changed files with 295 additions and 1899 deletions


@@ -49,11 +49,17 @@ from vllm_ascend.worker.model_runner_v1 import NPUModelRunner
 class NPUTorchairModelRunner(NPUModelRunner):
 
     def __init__(self, vllm_config: VllmConfig, device: torch.device):
+        ascend_config = get_ascend_config()
+        self.enable_shared_expert_dp = ascend_config.enable_shared_expert_dp
         super().__init__(vllm_config, device)
         self.attn_metadata_builder = self.attn_backend.get_builder_cls()(
             None, None, vllm_config, device)
-        ascend_config = get_ascend_config()
+        register_torchair_model()
+        torchair_ops_patch()
+        torchair_quant_method_register()
+        if self.enable_shared_expert_dp:
+            return
         self.new_kv_cache_bytes = -1
         self.torchair_compiled_model = None  # type: ignore
         self.torchair_compiled_models = {}  # type: ignore
@@ -72,14 +78,14 @@ class NPUTorchairModelRunner(NPUModelRunner):
             recompiles=envs_ascend.VLLM_ASCEND_TRACE_RECOMPILES)
         self._check_batch_sizes_consistency()
-        register_torchair_model()
-        torchair_ops_patch()
-        torchair_quant_method_register()
 
     def _sync_metadata_across_dp(
             self, num_tokens: int, with_prefill: bool, enable_dbo: bool
     ) -> tuple[int, Optional[torch.Tensor], bool, bool]:
         """Override from NPUModelRunner to pad num_tokens"""
+        if self.enable_shared_expert_dp:
+            return super()._sync_metadata_across_dp(num_tokens, with_prefill,
+                                                    enable_dbo)
         if self.dp_size == 1:
             if not with_prefill:
                 maybe_padded_num_tokens = self.select_torchair_padded_batch_size(
@@ -115,7 +121,10 @@ class NPUTorchairModelRunner(NPUModelRunner):
     def _build_attention_metadata(self, with_prefill, num_reqs, skip_attn):
         # NOTE: If torchair graph mode and not with_prefill,
         # we can't skip_attn, it will cause graph recompile.
-        if not with_prefill:
+        if with_prefill or self.enable_shared_expert_dp:
+            attn_metadata = super()._build_attention_metadata(
+                with_prefill, num_reqs, skip_attn)
+        else:
             common_attn_metadata = TorchairCommonAttentionMetadata(
                 num_reqs=num_reqs,
                 num_actual_tokens=1,
@@ -126,17 +135,19 @@ class NPUTorchairModelRunner(NPUModelRunner):
             )
             attn_metadata = self.attn_metadata_builder.build_torchair_graph_dummy(
                 common_attn_metadata)
-        else:
-            attn_metadata = super()._build_attention_metadata(
-                with_prefill, num_reqs, skip_attn)
         return attn_metadata
 
     def _generate_dummy_run_hidden_states(self, with_prefill,
                                           is_torchair_compile, input_ids,
                                           positions, attn_metadata, num_tokens,
                                           intermediate_tensors, inputs_embeds):
-        if not with_prefill:
+        if with_prefill or self.enable_shared_expert_dp:
+            if is_310p():
+                converting_weight_acl_format(self.model, ACL_FORMAT_FRACTAL_ND)
+            hidden_states = super()._generate_dummy_run_hidden_states(
+                with_prefill, is_torchair_compile, input_ids, positions,
+                attn_metadata, num_tokens, intermediate_tensors, inputs_embeds)
+        else:
             # Only mark static while compiling
             if is_torchair_compile:
                 torch._dynamo.mark_static(input_ids)
@@ -168,15 +179,11 @@ class NPUTorchairModelRunner(NPUModelRunner):
                 inputs_embeds=None,
                 **model_kwargs,
             )
-        else:
-            if is_310p():
-                converting_weight_acl_format(self.model, ACL_FORMAT_FRACTAL_ND)
-            hidden_states = super()._generate_dummy_run_hidden_states(
-                with_prefill, is_torchair_compile, input_ids, positions,
-                attn_metadata, num_tokens, intermediate_tensors, inputs_embeds)
         return hidden_states
 
     def _convert_torch_format(self, kv_cache):
+        if self.enable_shared_expert_dp:
+            return super()._convert_torch_format(kv_cache)
         kv_cache = torch_npu.npu_format_cast(kv_cache, ACL_FORMAT_FRACTAL_ND)
         return kv_cache
@@ -194,6 +201,8 @@ class NPUTorchairModelRunner(NPUModelRunner):
 
     def _capture_model(self):
         """Override from NPUModelRunner to use torchair graph capture."""
+        if self.enable_shared_expert_dp:
+            return super()._capture_model()
         # TODO(NeverRaR): Calling graph_capture(device=self.device) in
         # torchair graph capture can cause some issues, so now we just
         # temporarily split the codepath for the two different graph patterns.
@@ -233,6 +242,8 @@ class NPUTorchairModelRunner(NPUModelRunner):
             self.new_kv_cache_bytes)
 
     def _use_aclgraph(self) -> bool:
+        if self.enable_shared_expert_dp:
+            return super()._use_aclgraph()
         return False
 
     def _check_batch_sizes_consistency(self) -> None:
@@ -258,10 +269,10 @@ class NPUTorchairModelRunner(NPUModelRunner):
         )
 
     def _update_graph_pad_size(self, with_prefill, graph_pad_size):
-        if not with_prefill:
-            self.graph_pad_size = graph_pad_size
-        else:
+        if with_prefill or self.enable_shared_expert_dp:
             super()._update_graph_pad_size(with_prefill, graph_pad_size)
+        else:
+            self.graph_pad_size = graph_pad_size
 
     def _update_input_ids_and_positions(self, input_ids, positions,
                                         num_input_tokens, with_prefill,
@@ -271,7 +282,9 @@ class NPUTorchairModelRunner(NPUModelRunner):
             input_ids, positions, num_input_tokens, with_prefill,
             padded_num_tokens_across_dp)
-        if not with_prefill:
+        if with_prefill or self.enable_shared_expert_dp:
+            return input_ids, positions
+        else:
             input_ids = self.input_ids[:padded_num_tokens_across_dp]
             positions = self.positions[:padded_num_tokens_across_dp]
         return input_ids, positions
@@ -284,6 +297,10 @@ class NPUTorchairModelRunner(NPUModelRunner):
         if attn_metadata is not None and isinstance(attn_metadata, dict):
             attn_metadata = attn_metadata['model.layers.0.self_attn.attn']
+        if self.enable_shared_expert_dp:
+            return super()._generate_process_reqs_hidden_states(
+                attn_metadata, with_prefill, padded_num_tokens_across_dp,
+                input_ids, positions, intermediate_tensors, inputs_embeds)
         model_kwargs = {
             "kv_caches": self.kv_caches,
             "attn_metadata": attn_metadata
@@ -468,8 +485,7 @@ class NPUTorchairModelRunner(NPUModelRunner):
         self.torchair_graph_batch_sizes = new_graph_batch_sizes
 
     def _build_drafter_prepare_inputs_torchair_param(self):
-        return True
-
-    def get_dp_padding(self, num_tokens):
-        """Override from NPUModelRunner to get dp padding"""
-        return 0, None
+        if self.enable_shared_expert_dp:
+            return super()._build_drafter_prepare_inputs_torchair_param()
+        else:
+            return True
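The `_sync_metadata_across_dp` override above exists because the torchair-compiled graph only accepts the pre-captured batch shapes, so `num_tokens` is padded up to the nearest entry in `torchair_graph_batch_sizes`. A rough sketch of that selection (a hypothetical helper mirroring `select_torchair_padded_batch_size`, not the actual implementation):

```python
from bisect import bisect_left

def select_padded_batch_size(num_tokens: int, graph_batch_sizes: list[int]) -> int:
    """Return the smallest pre-captured batch size >= num_tokens, so the
    compiled graph never sees an unseen shape (which would force a recompile)."""
    idx = bisect_left(graph_batch_sizes, num_tokens)
    if idx == len(graph_batch_sizes):
        raise ValueError(f"{num_tokens} exceeds the largest captured batch size")
    return graph_batch_sizes[idx]

# e.g. with captured sizes [1, 8, 16, 32], 10 tokens pad up to 16
assert select_padded_batch_size(10, [1, 8, 16, 32]) == 16
```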