[refactor] refactor deepseek-related files (#2849)
### What this PR does / why we need it?
This PR deletes ~2K lines of code about deepseek modeling. It falls back
CustomDeepseekV2 modules to original vllm implementations and adapts
some modifications in vllm about deepseek and moe.
### Does this PR introduce _any_ user-facing change?
No.
### How was this patch tested?
E2E vllm serving with torchair graph mode and eager mode.
- vLLM version: v0.10.2
- vLLM main:
759ef49b15
---------
Signed-off-by: linfeng-yuan <1102311262@qq.com>
Signed-off-by: Yizhou Liu <liu_yizhou@outlook.com>
Co-authored-by: yiz-liu <136800916+yiz-liu@users.noreply.github.com>
Co-authored-by: Yizhou Liu <liu_yizhou@outlook.com>
This commit is contained in:
@@ -49,11 +49,17 @@ from vllm_ascend.worker.model_runner_v1 import NPUModelRunner
|
||||
class NPUTorchairModelRunner(NPUModelRunner):
|
||||
|
||||
def __init__(self, vllm_config: VllmConfig, device: torch.device):
|
||||
ascend_config = get_ascend_config()
|
||||
self.enable_shared_expert_dp = ascend_config.enable_shared_expert_dp
|
||||
super().__init__(vllm_config, device)
|
||||
self.attn_metadata_builder = self.attn_backend.get_builder_cls()(
|
||||
None, None, vllm_config, device)
|
||||
|
||||
ascend_config = get_ascend_config()
|
||||
register_torchair_model()
|
||||
torchair_ops_patch()
|
||||
torchair_quant_method_register()
|
||||
if self.enable_shared_expert_dp:
|
||||
return
|
||||
self.new_kv_cache_bytes = -1
|
||||
self.torchair_compiled_model = None # type: ignore
|
||||
self.torchair_compiled_models = {} # type: ignore
|
||||
@@ -72,14 +78,14 @@ class NPUTorchairModelRunner(NPUModelRunner):
|
||||
recompiles=envs_ascend.VLLM_ASCEND_TRACE_RECOMPILES)
|
||||
|
||||
self._check_batch_sizes_consistency()
|
||||
register_torchair_model()
|
||||
torchair_ops_patch()
|
||||
torchair_quant_method_register()
|
||||
|
||||
def _sync_metadata_across_dp(
|
||||
self, num_tokens: int, with_prefill: bool, enable_dbo: bool
|
||||
) -> tuple[int, Optional[torch.Tensor], bool, bool]:
|
||||
"""Override from NPUModelRunner to pad num_tokens"""
|
||||
if self.enable_shared_expert_dp:
|
||||
return super()._sync_metadata_across_dp(num_tokens, with_prefill,
|
||||
enable_dbo)
|
||||
if self.dp_size == 1:
|
||||
if not with_prefill:
|
||||
maybe_padded_num_tokens = self.select_torchair_padded_batch_size(
|
||||
@@ -115,7 +121,10 @@ class NPUTorchairModelRunner(NPUModelRunner):
|
||||
def _build_attention_metadata(self, with_prefill, num_reqs, skip_attn):
|
||||
# NOTE: If torchair graph mode and not with_prefill,
|
||||
# we can't skip_attn, it will cause graph recompile.
|
||||
if not with_prefill:
|
||||
if with_prefill or self.enable_shared_expert_dp:
|
||||
attn_metadata = super()._build_attention_metadata(
|
||||
with_prefill, num_reqs, skip_attn)
|
||||
else:
|
||||
common_attn_metadata = TorchairCommonAttentionMetadata(
|
||||
num_reqs=num_reqs,
|
||||
num_actual_tokens=1,
|
||||
@@ -126,17 +135,19 @@ class NPUTorchairModelRunner(NPUModelRunner):
|
||||
)
|
||||
attn_metadata = self.attn_metadata_builder.build_torchair_graph_dummy(
|
||||
common_attn_metadata)
|
||||
else:
|
||||
attn_metadata = super()._build_attention_metadata(
|
||||
with_prefill, num_reqs, skip_attn)
|
||||
return attn_metadata
|
||||
|
||||
def _generate_dummy_run_hidden_states(self, with_prefill,
|
||||
is_torchair_compile, input_ids,
|
||||
positions, attn_metadata, num_tokens,
|
||||
intermediate_tensors, inputs_embeds):
|
||||
|
||||
if not with_prefill:
|
||||
if with_prefill or self.enable_shared_expert_dp:
|
||||
if is_310p():
|
||||
converting_weight_acl_format(self.model, ACL_FORMAT_FRACTAL_ND)
|
||||
hidden_states = super()._generate_dummy_run_hidden_states(
|
||||
with_prefill, is_torchair_compile, input_ids, positions,
|
||||
attn_metadata, num_tokens, intermediate_tensors, inputs_embeds)
|
||||
else:
|
||||
# Only mark static while compiling
|
||||
if is_torchair_compile:
|
||||
torch._dynamo.mark_static(input_ids)
|
||||
@@ -168,15 +179,11 @@ class NPUTorchairModelRunner(NPUModelRunner):
|
||||
inputs_embeds=None,
|
||||
**model_kwargs,
|
||||
)
|
||||
else:
|
||||
if is_310p():
|
||||
converting_weight_acl_format(self.model, ACL_FORMAT_FRACTAL_ND)
|
||||
hidden_states = super()._generate_dummy_run_hidden_states(
|
||||
with_prefill, is_torchair_compile, input_ids, positions,
|
||||
attn_metadata, num_tokens, intermediate_tensors, inputs_embeds)
|
||||
return hidden_states
|
||||
|
||||
def _convert_torch_format(self, kv_cache):
|
||||
if self.enable_shared_expert_dp:
|
||||
return super()._convert_torch_format(kv_cache)
|
||||
kv_cache = torch_npu.npu_format_cast(kv_cache, ACL_FORMAT_FRACTAL_ND)
|
||||
return kv_cache
|
||||
|
||||
@@ -194,6 +201,8 @@ class NPUTorchairModelRunner(NPUModelRunner):
|
||||
|
||||
def _capture_model(self):
|
||||
"""Override from NPUModelRunner to use torchair graph capture."""
|
||||
if self.enable_shared_expert_dp:
|
||||
return super()._capture_model()
|
||||
# TODO(NeverRaR): Calling graph_capture(device=self.device) in
|
||||
# torchair graph capture can cause some issues, so now we just
|
||||
# temporarily split the codepath for the two different graph patterns.
|
||||
@@ -233,6 +242,8 @@ class NPUTorchairModelRunner(NPUModelRunner):
|
||||
self.new_kv_cache_bytes)
|
||||
|
||||
def _use_aclgraph(self) -> bool:
|
||||
if self.enable_shared_expert_dp:
|
||||
return super()._use_aclgraph()
|
||||
return False
|
||||
|
||||
def _check_batch_sizes_consistency(self) -> None:
|
||||
@@ -258,10 +269,10 @@ class NPUTorchairModelRunner(NPUModelRunner):
|
||||
)
|
||||
|
||||
def _update_graph_pad_size(self, with_prefill, graph_pad_size):
|
||||
if not with_prefill:
|
||||
self.graph_pad_size = graph_pad_size
|
||||
else:
|
||||
if with_prefill or self.enable_shared_expert_dp:
|
||||
super()._update_graph_pad_size(with_prefill, graph_pad_size)
|
||||
else:
|
||||
self.graph_pad_size = graph_pad_size
|
||||
|
||||
def _update_input_ids_and_positions(self, input_ids, positions,
|
||||
num_input_tokens, with_prefill,
|
||||
@@ -271,7 +282,9 @@ class NPUTorchairModelRunner(NPUModelRunner):
|
||||
input_ids, positions, num_input_tokens, with_prefill,
|
||||
padded_num_tokens_across_dp)
|
||||
|
||||
if not with_prefill:
|
||||
if with_prefill or self.enable_shared_expert_dp:
|
||||
return input_ids, positions
|
||||
else:
|
||||
input_ids = self.input_ids[:padded_num_tokens_across_dp]
|
||||
positions = self.positions[:padded_num_tokens_across_dp]
|
||||
return input_ids, positions
|
||||
@@ -284,6 +297,10 @@ class NPUTorchairModelRunner(NPUModelRunner):
|
||||
if attn_metadata is not None and isinstance(attn_metadata, dict):
|
||||
attn_metadata = attn_metadata['model.layers.0.self_attn.attn']
|
||||
|
||||
if self.enable_shared_expert_dp:
|
||||
return super()._generate_process_reqs_hidden_states(
|
||||
attn_metadata, with_prefill, padded_num_tokens_across_dp,
|
||||
input_ids, positions, intermediate_tensors, inputs_embeds)
|
||||
model_kwargs = {
|
||||
"kv_caches": self.kv_caches,
|
||||
"attn_metadata": attn_metadata
|
||||
@@ -468,8 +485,7 @@ class NPUTorchairModelRunner(NPUModelRunner):
|
||||
self.torchair_graph_batch_sizes = new_graph_batch_sizes
|
||||
|
||||
def _build_drafter_prepare_inputs_torchair_param(self):
|
||||
return True
|
||||
|
||||
def get_dp_padding(self, num_tokens):
|
||||
"""Override from NPUModelRunner to get dp padding"""
|
||||
return 0, None
|
||||
if self.enable_shared_expert_dp:
|
||||
return super()._build_drafter_prepare_inputs_torchair_param()
|
||||
else:
|
||||
return True
|
||||
|
||||
Reference in New Issue
Block a user