[Feat] Support routing replay (#6696)

### What this PR does / why we need it?

[Feat] Support routing replay
same as https://github.com/vllm-project/vllm-ascend/pull/6666
Resubmitted because the previous PR failed the documentation (DOC) CI check.

### Does this PR introduce _any_ user-facing change?

### How was this patch tested?

- vLLM version: v0.15.0
- vLLM main:
9562912cea

---------

Signed-off-by: liyongwen <1310439159@qq.com>
Signed-off-by: Li-Yongwen <63399187+Li-Yongwen@users.noreply.github.com>
Co-authored-by: wangxiyuan <wangxiyuan1007@gmail.com>
This commit is contained in:
Li-Yongwen
2026-02-26 10:22:47 +08:00
committed by GitHub
parent a9cca0c5c4
commit 2870f7c8ad
7 changed files with 190 additions and 0 deletions

View File

@@ -26,6 +26,7 @@ from vllm.forward_context import get_forward_context
from vllm.logger import logger
from vllm.model_executor.layers.fused_moe.config import FusedMoEConfig
from vllm.model_executor.layers.fused_moe.layer import FusedMoE, UnquantizedFusedMoEMethod, get_compressed_expert_map
from vllm.model_executor.layers.fused_moe.routed_experts_capturer import RoutedExpertsCapturer
from vllm.model_executor.layers.fused_moe.shared_fused_moe import SharedFusedMoE
from vllm_ascend.utils import vllm_version_is
@@ -122,6 +123,13 @@ class AscendUnquantizedFusedMoEMethod(UnquantizedFusedMoEMethod):
e_score_correction_bias=e_score_correction_bias,
global_num_experts=global_num_experts,
)
if layer.vllm_config.model_config is not None and layer.vllm_config.model_config.enable_return_routed_experts:
capturer = RoutedExpertsCapturer.get_instance()
if capturer is not None:
capturer.capture(
layer_id=layer.layer_id,
topk_ids=topk_ids,
)
if zero_expert_num > 0 and zero_expert_type is not None:
topk_ids, topk_weights, zero_expert_result = zero_experts_compute(

View File

@@ -34,5 +34,6 @@ import vllm_ascend.patch.worker.patch_qwen3_next # noqa
import vllm_ascend.patch.worker.patch_v2_eagle # noqa
import vllm_ascend.patch.worker.patch_v2_uva # noqa
import vllm_ascend.patch.worker.patch_huanyuan_vl # noqa
import vllm_ascend.patch.worker.patch_routed_experts_capturer # noqa
import vllm_ascend.patch.worker.patch_npugraph_ex_triton # noqa
import vllm_ascend.patch.worker.patch_kimi_k25 # noqa

View File

@@ -0,0 +1,68 @@
import numpy as np
import torch
from vllm.config import VllmConfig
from vllm.distributed import get_tensor_model_parallel_rank
from vllm.model_executor.layers.fused_moe.routed_experts_capturer import (
_BUFFER_PREFIX,
_LOCK_FILE_PREFIX,
RoutedExpertsCapturer,
_create_or_attach_shared_memory,
logger,
)
from vllm.platforms import current_platform
def init_buffer(
    self,
    max_num_batched_tokens: int,
    max_num_kv_tokens: int,
    vllm_config: VllmConfig,
) -> None:
    """
    Initialize the device buffer and optionally shared memory buffer.
    Args:
        max_num_batched_tokens: Maximum number of tokens in a batch.
        max_num_kv_tokens: Maximum number of KV tokens for shared memory.
        vllm_config: vllm configuration containing layer and expert info.
    """
    # Guard against double initialization of the capture buffer.
    if self._device_buffer is not None:
        raise RuntimeError("Device buffer has already been initialized")

    text_config = vllm_config.model_config.hf_text_config
    layer_count = text_config.num_hidden_layers
    topk = text_config.num_experts_per_tok

    # Device-side buffer: one int32 expert id per (token, layer, top-k slot).
    # Allocated via current_platform.device_name instead of a hard-coded
    # "cuda" so it lands on the NPU.
    self._device_buffer = torch.zeros(
        (max_num_batched_tokens, layer_count, topk),
        dtype=torch.int32,
        device=current_platform.device_name,
    )
    self.dp_rank = vllm_config.parallel_config.data_parallel_rank

    # Only tensor-parallel rank 0 owns the shared-memory host buffer;
    # all other TP ranks stop after the device buffer is in place.
    if get_tensor_model_parallel_rank() != 0:
        return

    # Host-side shared-memory buffer sized for max_num_kv_tokens tokens.
    shm_shape = (max_num_kv_tokens, layer_count, topk)
    nbytes = int(np.prod(shm_shape)) * np.dtype(np.int32).itemsize
    instance_id = vllm_config.instance_id
    self._lock_file = f"{_LOCK_FILE_PREFIX}_{instance_id}_{self.dp_rank}.lock"
    shm_name = f"{_BUFFER_PREFIX}_{instance_id}_{self.dp_rank}"
    self._shm = _create_or_attach_shared_memory(shm_name, nbytes, self._lock_file)
    self._host_buffer_view = np.ndarray(shm_shape, dtype=np.int32, buffer=self._shm.buf)
    self._host_buffer_view.fill(0)
    logger.debug(
        "Created shared memory buffer '%s' with shape %s",
        shm_name,
        shm_shape,
    )


# Patch for _device_buffer's initialization(device="cuda" -> device=current_platform.device_name).
# TODO Remove this patch when pr(https://github.com/vllm-project/vllm/pull/34336) is merged.
RoutedExpertsCapturer.init_buffer = init_buffer

View File

@@ -129,6 +129,7 @@ from vllm_ascend.ascend_forward_context import ( # isort: skip
set_mc2_mask,
set_mc2_tokens_capacity,
)
from vllm.model_executor.layers.fused_moe.routed_experts_capturer import RoutedExpertsCapturer
if TYPE_CHECKING:
import xgrammar as xgr # type: ignore[import-untyped]
@@ -373,6 +374,7 @@ class NPUModelRunner(GPUModelRunner):
self.intermediate_tensors: IntermediateTensors | None = None
self.reorder_batch_threshold: int | None = None
self.long_seq_metadata = None
self.cpu_slot_mapping = None
@property
def use_cp(self) -> bool:
@@ -1050,6 +1052,12 @@ class NPUModelRunner(GPUModelRunner):
scheduler_output: "SchedulerOutput",
intermediate_tensors: IntermediateTensors | None = None,
) -> ModelRunnerOutput | IntermediateTensors | None:
if self.vllm_config.model_config.enable_return_routed_experts:
capturer = RoutedExpertsCapturer.get_instance()
if capturer is not None:
capturer.clear_buffer()
else:
logger.warning("RoutedExpertsCapturer is not initialized.")
if self.execute_model_state is not None:
raise RuntimeError("State error: sample_tokens() must be called after execute_model() returns None.")
# self._draft_token_ids is None when `input_fits_in_drafter=False`
@@ -1428,6 +1436,14 @@ class NPUModelRunner(GPUModelRunner):
if has_kv_transfer_group():
get_kv_transfer_group().clear_connector_metadata()
if self.model_config.enable_return_routed_experts:
capturer = RoutedExpertsCapturer.get_instance()
if capturer is not None:
capturer.save_captured_experts(indices=self.cpu_slot_mapping)
else:
logger.warning("RoutedExpertsCapturer is not initialized.")
model_runner_output = ModelRunnerOutput(
req_ids=req_ids_output_copy,
req_id_to_index=req_id_to_index_output_copy,
@@ -1902,6 +1918,8 @@ class NPUModelRunner(GPUModelRunner):
num_tokens_padded,
slot_mapping,
)
if self.model_config.enable_return_routed_experts and kv_cache_gid == 0:
self.cpu_slot_mapping = slot_mapping.cpu().numpy()
return blk_table_tensor, slot_mapping
block_table_gid_0, slot_mapping_gid_0 = _get_block_table_and_slot_mapping(0)
@@ -2364,6 +2382,9 @@ class NPUModelRunner(GPUModelRunner):
if has_kv_transfer_group():
get_kv_transfer_group().register_kv_caches(kv_caches)
if self.model_config.enable_return_routed_experts:
self.init_routed_experts_capturer()
def _align_memory(self, tensor: torch.Tensor, alignment: int) -> torch.Tensor:
data_ptr = tensor.data_ptr()
aligned_addr = (data_ptr + alignment - 1) // alignment * alignment