[Feat] Support routing replay (#6696)

### What this PR does / why we need it?

[Feat] Support routing replay
Same as https://github.com/vllm-project/vllm-ascend/pull/6666.
Resubmitted because of a DOC failure.

### Does this PR introduce _any_ user-facing change?

### How was this patch tested?

- vLLM version: v0.15.0
- vLLM main: 9562912cea

---------

Signed-off-by: liyongwen <1310439159@qq.com>
Signed-off-by: Li-Yongwen <63399187+Li-Yongwen@users.noreply.github.com>
Co-authored-by: wangxiyuan <wangxiyuan1007@gmail.com>
This commit is contained in:
Li-Yongwen
2026-02-26 10:22:47 +08:00
committed by GitHub
parent a9cca0c5c4
commit 2870f7c8ad
7 changed files with 190 additions and 0 deletions

View File

@@ -129,6 +129,7 @@ from vllm_ascend.ascend_forward_context import ( # isort: skip
set_mc2_mask,
set_mc2_tokens_capacity,
)
from vllm.model_executor.layers.fused_moe.routed_experts_capturer import RoutedExpertsCapturer
if TYPE_CHECKING:
import xgrammar as xgr # type: ignore[import-untyped]
@@ -373,6 +374,7 @@ class NPUModelRunner(GPUModelRunner):
self.intermediate_tensors: IntermediateTensors | None = None
self.reorder_batch_threshold: int | None = None
self.long_seq_metadata = None
self.cpu_slot_mapping = None
@property
def use_cp(self) -> bool:
@@ -1050,6 +1052,12 @@ class NPUModelRunner(GPUModelRunner):
scheduler_output: "SchedulerOutput",
intermediate_tensors: IntermediateTensors | None = None,
) -> ModelRunnerOutput | IntermediateTensors | None:
if self.vllm_config.model_config.enable_return_routed_experts:
capturer = RoutedExpertsCapturer.get_instance()
if capturer is not None:
capturer.clear_buffer()
else:
logger.warning("RoutedExpertsCapturer is not initialized.")
if self.execute_model_state is not None:
raise RuntimeError("State error: sample_tokens() must be called after execute_model() returns None.")
# self._draft_token_ids is None when `input_fits_in_drafter=False`
@@ -1428,6 +1436,14 @@ class NPUModelRunner(GPUModelRunner):
if has_kv_transfer_group():
get_kv_transfer_group().clear_connector_metadata()
if self.model_config.enable_return_routed_experts:
capturer = RoutedExpertsCapturer.get_instance()
if capturer is not None:
capturer.save_captured_experts(indices=self.cpu_slot_mapping)
else:
logger.warning("RoutedExpertsCapturer is not initialized.")
model_runner_output = ModelRunnerOutput(
req_ids=req_ids_output_copy,
req_id_to_index=req_id_to_index_output_copy,
@@ -1902,6 +1918,8 @@ class NPUModelRunner(GPUModelRunner):
num_tokens_padded,
slot_mapping,
)
if self.model_config.enable_return_routed_experts and kv_cache_gid == 0:
self.cpu_slot_mapping = slot_mapping.cpu().numpy()
return blk_table_tensor, slot_mapping
block_table_gid_0, slot_mapping_gid_0 = _get_block_table_and_slot_mapping(0)
@@ -2364,6 +2382,9 @@ class NPUModelRunner(GPUModelRunner):
if has_kv_transfer_group():
get_kv_transfer_group().register_kv_caches(kv_caches)
if self.model_config.enable_return_routed_experts:
self.init_routed_experts_capturer()
def _align_memory(self, tensor: torch.Tensor, alignment: int) -> torch.Tensor:
data_ptr = tensor.data_ptr()
aligned_addr = (data_ptr + alignment - 1) // alignment * alignment