xc-llm-ascend/vllm_ascend/patch/worker/patch_routed_experts_capturer.py
Li-Yongwen 2870f7c8ad [Feat] Support routing replay (#6696)
### What this PR does / why we need it?

[Feat] Support routing replay
Same as https://github.com/vllm-project/vllm-ascend/pull/6666,
resubmitted because of a DOC check failure.

### Does this PR introduce _any_ user-facing change?

### How was this patch tested?

- vLLM version: v0.15.0
- vLLM main: 9562912cea

---------

Signed-off-by: liyongwen <1310439159@qq.com>
Signed-off-by: Li-Yongwen <63399187+Li-Yongwen@users.noreply.github.com>
Co-authored-by: wangxiyuan <wangxiyuan1007@gmail.com>
2026-02-26 10:22:47 +08:00
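
For context, the capturer this patch targets records, for each batched token, the top-k expert IDs chosen at every MoE layer. A minimal sketch of that `(tokens, layers, experts_per_tok)` layout, with toy sizes and made-up expert IDs (nothing here is vLLM API):

```python
import torch

# Toy sizes; the real values come from the scheduler and the HF config.
max_num_batched_tokens, num_layers, num_experts_per_tok = 8, 2, 4

# One int32 slot per (token, layer, top-k position), as in init_buffer below.
buffer = torch.zeros(
    (max_num_batched_tokens, num_layers, num_experts_per_tok),
    dtype=torch.int32,
)

# After a forward pass, row (t, l) would hold the k expert IDs that token t
# was routed to at MoE layer l; these values are made up for illustration.
buffer[0, 0] = torch.tensor([3, 17, 42, 5], dtype=torch.int32)
print(buffer[0, 0])
```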


import numpy as np
import torch
from vllm.config import VllmConfig
from vllm.distributed import get_tensor_model_parallel_rank
from vllm.model_executor.layers.fused_moe.routed_experts_capturer import (
    _BUFFER_PREFIX,
    _LOCK_FILE_PREFIX,
    RoutedExpertsCapturer,
    _create_or_attach_shared_memory,
    logger,
)
from vllm.platforms import current_platform


def init_buffer(
    self,
    max_num_batched_tokens: int,
    max_num_kv_tokens: int,
    vllm_config: VllmConfig,
) -> None:
    """
    Initialize the device buffer and, on TP rank 0, the shared-memory buffer.

    Args:
        max_num_batched_tokens: Maximum number of tokens in a batch.
        max_num_kv_tokens: Maximum number of KV tokens for shared memory.
        vllm_config: vLLM configuration containing layer and expert info.
    """
    if self._device_buffer is not None:
        raise RuntimeError("Device buffer has already been initialized")
    hf_config = vllm_config.model_config.hf_text_config
    num_layers = hf_config.num_hidden_layers
    num_experts_per_tok = hf_config.num_experts_per_tok
    # Initialize the device buffer on the current platform's device (e.g.
    # "npu") instead of the hardcoded "cuda" used upstream; this is the
    # point of the patch (see the note below).
    self._device_buffer = torch.zeros(
        (max_num_batched_tokens, num_layers, num_experts_per_tok),
        dtype=torch.int32,
        device=current_platform.device_name,
    )
    self.dp_rank = vllm_config.parallel_config.data_parallel_rank
    if get_tensor_model_parallel_rank() != 0:
        return
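    # From this point only TP rank 0 executes. NOTE (assumption, not stated
    # in the source): routed expert IDs are replicated across TP ranks, so a
    # single host-side shared-memory copy per DP rank is sufficient.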
    # Initialize shared memory
    shape = (max_num_kv_tokens, num_layers, num_experts_per_tok)
    buffer_size = int(np.prod(shape)) * np.dtype(np.int32).itemsize
    instance_id = vllm_config.instance_id
    self._lock_file = f"{_LOCK_FILE_PREFIX}_{instance_id}_{self.dp_rank}.lock"
    shm_name = f"{_BUFFER_PREFIX}_{instance_id}_{self.dp_rank}"
    self._shm = _create_or_attach_shared_memory(shm_name, buffer_size,
                                                self._lock_file)
    self._host_buffer_view = np.ndarray(shape,
                                        dtype=np.int32,
                                        buffer=self._shm.buf)
    self._host_buffer_view.fill(0)
    logger.debug(
        "Created shared memory buffer '%s' with shape %s",
        shm_name,
        shape,
    )


# Patch for _device_buffer's initialization (device="cuda" ->
# device=current_platform.device_name).
# TODO: Remove this patch once https://github.com/vllm-project/vllm/pull/34336
# is merged.
RoutedExpertsCapturer.init_buffer = init_buffer
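
For reference, `_create_or_attach_shared_memory` is a vLLM-internal helper. A minimal sketch of what such a create-or-attach routine typically does on POSIX, using only the standard library (the function name, signature, and locking details below are assumptions, not vLLM's actual implementation):

```python
import fcntl
from multiprocessing.shared_memory import SharedMemory


def create_or_attach(name: str, size: int, lock_path: str) -> SharedMemory:
    """Illustrative sketch only; vLLM's helper may differ."""
    # An exclusive file lock serializes racing processes, so exactly one
    # of them creates the segment and the others attach to it.
    with open(lock_path, "w") as lock:
        fcntl.flock(lock, fcntl.LOCK_EX)
        try:
            return SharedMemory(name=name, create=True, size=size)
        except FileExistsError:
            return SharedMemory(name=name)
        # The lock is released when the file closes on leaving the block.
```

The patched `init_buffer` then wraps `self._shm.buf` in a NumPy view so the host-side buffer can be read and written in place, without copies.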