[Bugfix] Add max_num_batched_tokens to InputBatch to make main CI pass (#806)
### What this PR does / why we need it? 1. Fix V1 error found by [nightly_ci](https://github.com/vllm-project/vllm-ascend/actions/runs/14950004754/job/41998136610), broken by [[v1] Pass BlockTable and KVCacheSpec to AttentionMetadataBuilders #17483](https://github.com/vllm-project/vllm/pull/17483), make `InputBatch` parameter consistent with vllm. 2. Disable benmark and fix it in upstream. ### Does this PR introduce _any_ user-facing change? No ### How was this patch tested? CI passed --------- Signed-off-by: wangli <wangli858794774@gmail.com> Signed-off-by: Yikun Jiang <yikunkero@gmail.com> Co-authored-by: Yikun Jiang <yikunkero@gmail.com>
This commit is contained in:
@@ -39,6 +39,8 @@ norecursedirs =
|
|||||||
vllm-empty/tests/neuron
|
vllm-empty/tests/neuron
|
||||||
; fastsafetensors not support npu now
|
; fastsafetensors not support npu now
|
||||||
vllm-empty/tests/fastsafetensors_loader
|
vllm-empty/tests/fastsafetensors_loader
|
||||||
|
; Enable after https://github.com/vllm-project/vllm-ascend/issues/808 resolved
|
||||||
|
vllm-empty/tests/benchmarks
|
||||||
|
|
||||||
addopts = --ignore=vllm-empty/tests/test_utils.py
|
addopts = --ignore=vllm-empty/tests/test_utils.py
|
||||||
--ignore=vllm-empty/tests/test_config.py
|
--ignore=vllm-empty/tests/test_config.py
|
||||||
|
|||||||
@@ -55,6 +55,7 @@ from vllm.v1.worker.gpu_input_batch import CachedRequestState, InputBatch
|
|||||||
from vllm_ascend.attention.attention import AttentionMaskBuilder
|
from vllm_ascend.attention.attention import AttentionMaskBuilder
|
||||||
from vllm_ascend.attention.attention_v1 import AscendAttentionState
|
from vllm_ascend.attention.attention_v1 import AscendAttentionState
|
||||||
from vllm_ascend.platform import NPUPlatform
|
from vllm_ascend.platform import NPUPlatform
|
||||||
|
from vllm_ascend.utils import vllm_version_is
|
||||||
|
|
||||||
if TYPE_CHECKING:
|
if TYPE_CHECKING:
|
||||||
import xgrammar as xgr # type: ignore[import-untyped]
|
import xgrammar as xgr # type: ignore[import-untyped]
|
||||||
@@ -187,14 +188,26 @@ class NPUModelRunner:
|
|||||||
# Request states.
|
# Request states.
|
||||||
self.requests: Dict[str, CachedRequestState] = {}
|
self.requests: Dict[str, CachedRequestState] = {}
|
||||||
# Persistent batch.
|
# Persistent batch.
|
||||||
self.input_batch = InputBatch(
|
# Remove this after we drop 0.8.5 support
|
||||||
max_num_reqs=self.max_num_reqs,
|
if vllm_version_is("0.8.5") or vllm_version_is("0.8.5.post1"):
|
||||||
max_model_len=self.model_config.max_model_len,
|
self.input_batch = InputBatch(
|
||||||
max_num_blocks_per_req=self.max_num_blocks_per_req,
|
max_num_reqs=self.max_num_reqs,
|
||||||
device=self.device,
|
max_model_len=self.model_config.max_model_len,
|
||||||
pin_memory=True,
|
max_num_blocks_per_req=self.max_num_blocks_per_req,
|
||||||
vocab_size=self.model_config.get_vocab_size(),
|
device=self.device,
|
||||||
)
|
pin_memory=True,
|
||||||
|
vocab_size=self.model_config.get_vocab_size(),
|
||||||
|
)
|
||||||
|
else:
|
||||||
|
self.input_batch = InputBatch(
|
||||||
|
max_num_reqs=self.max_num_reqs,
|
||||||
|
max_model_len=self.model_config.max_model_len,
|
||||||
|
max_num_blocks_per_req=self.max_num_blocks_per_req,
|
||||||
|
max_num_batched_tokens=self.max_num_tokens,
|
||||||
|
device=self.device,
|
||||||
|
pin_memory=True,
|
||||||
|
vocab_size=self.model_config.get_vocab_size(),
|
||||||
|
)
|
||||||
|
|
||||||
self.input_ids = torch.zeros(self.max_num_tokens,
|
self.input_ids = torch.zeros(self.max_num_tokens,
|
||||||
dtype=torch.int32,
|
dtype=torch.int32,
|
||||||
|
|||||||
Reference in New Issue
Block a user