[Bugfix][Model] Fix fusedmoe and make modelrunner_v1 compatible with latest vllm (#867)

### What this PR does / why we need it?
This PR fixes the CI failure caused by recent upstream changes in vllm.
1. add moe_config for fused_moe
2. adapt to the kv-cache-group change from vllm. Currently vllm-ascend
doesn't support this feature, so this is just a quick fix for backward
compatibility.

fix: #872

---------

Signed-off-by: MengqingCao <cmq0113@163.com>
This commit is contained in:
Mengqing Cao
2025-05-16 12:14:55 +08:00
committed by GitHub
parent fd515cd60b
commit 7a325b2e2d
4 changed files with 137 additions and 79 deletions

View File

@@ -111,8 +111,10 @@ class NPUModelRunner:
self.scheduler_config = vllm_config.scheduler_config
self.chunked_prefill_enabled = vllm_config.scheduler_config.chunked_prefill_enabled
self.device = device
self.is_multimodal_model = self.model_config.is_multimodal_model
self.block_size = vllm_config.cache_config.block_size
self.max_num_blocks_per_req = cdiv(self.model_config.max_model_len,
self.block_size)
self.max_num_tokens = self.scheduler_config.max_num_batched_tokens
@@ -155,24 +157,6 @@ class NPUModelRunner:
raise NotImplementedError(
"Non-Attention backend is not supported by V1 NPUModelRunner.")
self.attn_backend = get_attn_backend(
self.head_size,
self.dtype,
self.kv_cache_dtype,
self.block_size,
self.model_config.is_attention_free,
use_mla=self.model_config.use_mla,
)
if self.attn_backend is None:
error_msg = (
f"Error with get_att_backend: {self.head_size=}, "
f"{self.dtype=}, {self.kv_cache_dtype=}, {self.block_size=}, "
f"{self.model_config.is_attention_free=}, "
f"{self.model_config.use_mla=}")
logger.error(error_msg)
raise NotImplementedError(
"Non-Attention backend is not supported by V1 GPUModelRunner.")
self.attn_metadata_builder = self.attn_backend.get_builder_cls()(
weakref.proxy(self))
@@ -205,17 +189,6 @@ class NPUModelRunner:
pin_memory=True,
vocab_size=self.model_config.get_vocab_size(),
)
else:
self.input_batch = InputBatch(
max_num_reqs=self.max_num_reqs,
max_model_len=self.model_config.max_model_len,
max_num_blocks_per_req=self.max_num_blocks_per_req,
max_num_batched_tokens=self.max_num_tokens,
device=self.device,
pin_memory=True,
vocab_size=self.model_config.get_vocab_size(),
)
self.input_ids = torch.zeros(self.max_num_tokens,
dtype=torch.int32,
device=self.device)
@@ -562,7 +535,10 @@ class NPUModelRunner:
block_table_indices = (req_indices * self.max_num_blocks_per_req +
positions_np // self.block_size)
block_table_cpu = self.input_batch.block_table.get_cpu_tensor()
if vllm_version_is("0.8.5") or vllm_version_is("0.8.5.post1"):
block_table_cpu = self.input_batch.block_table.get_cpu_tensor()
else:
block_table_cpu = self.input_batch.block_table[0].get_cpu_tensor()
block_numbers = block_table_cpu.flatten()[block_table_indices].numpy()
block_offsets = positions_np % self.block_size
np.add(block_numbers * self.block_size,
@@ -976,6 +952,17 @@ class NPUModelRunner:
"""
import torch_npu
kv_caches: Dict[str, torch.Tensor] = {}
if not (vllm_version_is("0.8.5") or vllm_version_is("0.8.5.post1")):
self.input_batch = InputBatch(
max_num_reqs=self.max_num_reqs,
max_model_len=self.model_config.max_model_len,
max_num_batched_tokens=self.max_num_tokens,
device=self.device,
pin_memory=True,
vocab_size=self.model_config.get_vocab_size(),
kv_cache_config=kv_cache_config,
)
for kv_cache_group in kv_cache_config.kv_cache_groups:
kv_cache_spec = kv_cache_group.kv_cache_spec
for layer_name in kv_cache_group.layer_names: