[Bugfix] Add max_num_batched_tokens to InputBatch to make main CI pass (#806)

### What this PR does / why we need it?

1. Fix V1 error found by
[nightly_ci](https://github.com/vllm-project/vllm-ascend/actions/runs/14950004754/job/41998136610),
broken by [[v1] Pass BlockTable and KVCacheSpec to
AttentionMetadataBuilders
#17483](https://github.com/vllm-project/vllm/pull/17483), make
`InputBatch` parameter consistent with vllm.
2. Disable benchmark tests and fix them in upstream.

### Does this PR introduce _any_ user-facing change?

No


### How was this patch tested?

CI passed

---------

Signed-off-by: wangli <wangli858794774@gmail.com>
Signed-off-by: Yikun Jiang <yikunkero@gmail.com>
Co-authored-by: Yikun Jiang <yikunkero@gmail.com>
This commit is contained in:
Li Wang
2025-05-12 00:36:56 +08:00
committed by GitHub
parent 218f21de21
commit cdece86f2c
2 changed files with 23 additions and 8 deletions

View File

@@ -39,6 +39,8 @@ norecursedirs =
     vllm-empty/tests/neuron
     ; fastsafetensors not support npu now
     vllm-empty/tests/fastsafetensors_loader
+    ; Enable after https://github.com/vllm-project/vllm-ascend/issues/808 resolved
+    vllm-empty/tests/benchmarks
 addopts = --ignore=vllm-empty/tests/test_utils.py
     --ignore=vllm-empty/tests/test_config.py

View File

@@ -55,6 +55,7 @@ from vllm.v1.worker.gpu_input_batch import CachedRequestState, InputBatch
 from vllm_ascend.attention.attention import AttentionMaskBuilder
 from vllm_ascend.attention.attention_v1 import AscendAttentionState
 from vllm_ascend.platform import NPUPlatform
+from vllm_ascend.utils import vllm_version_is

 if TYPE_CHECKING:
     import xgrammar as xgr  # type: ignore[import-untyped]
@@ -187,14 +188,26 @@ class NPUModelRunner:
         # Request states.
         self.requests: Dict[str, CachedRequestState] = {}
         # Persistent batch.
-        self.input_batch = InputBatch(
-            max_num_reqs=self.max_num_reqs,
-            max_model_len=self.model_config.max_model_len,
-            max_num_blocks_per_req=self.max_num_blocks_per_req,
-            device=self.device,
-            pin_memory=True,
-            vocab_size=self.model_config.get_vocab_size(),
-        )
+        # Remove this after we drop 0.8.5 support
+        if vllm_version_is("0.8.5") or vllm_version_is("0.8.5.post1"):
+            self.input_batch = InputBatch(
+                max_num_reqs=self.max_num_reqs,
+                max_model_len=self.model_config.max_model_len,
+                max_num_blocks_per_req=self.max_num_blocks_per_req,
+                device=self.device,
+                pin_memory=True,
+                vocab_size=self.model_config.get_vocab_size(),
+            )
+        else:
+            self.input_batch = InputBatch(
+                max_num_reqs=self.max_num_reqs,
+                max_model_len=self.model_config.max_model_len,
+                max_num_blocks_per_req=self.max_num_blocks_per_req,
+                max_num_batched_tokens=self.max_num_tokens,
+                device=self.device,
+                pin_memory=True,
+                vocab_size=self.model_config.get_vocab_size(),
+            )
         self.input_ids = torch.zeros(self.max_num_tokens,
                                      dtype=torch.int32,