diff --git a/pytest.ini b/pytest.ini
index 8889df7..4b0a039 100644
--- a/pytest.ini
+++ b/pytest.ini
@@ -39,6 +39,8 @@ norecursedirs =
     vllm-empty/tests/neuron
     ; fastsafetensors not support npu now
    vllm-empty/tests/fastsafetensors_loader
+    ; Enable after https://github.com/vllm-project/vllm-ascend/issues/808 resolved
+    vllm-empty/tests/benchmarks
 addopts =
     --ignore=vllm-empty/tests/test_utils.py
     --ignore=vllm-empty/tests/test_config.py
diff --git a/vllm_ascend/worker/model_runner_v1.py b/vllm_ascend/worker/model_runner_v1.py
index 9398da0..08475c4 100644
--- a/vllm_ascend/worker/model_runner_v1.py
+++ b/vllm_ascend/worker/model_runner_v1.py
@@ -55,6 +55,7 @@ from vllm.v1.worker.gpu_input_batch import CachedRequestState, InputBatch
 from vllm_ascend.attention.attention import AttentionMaskBuilder
 from vllm_ascend.attention.attention_v1 import AscendAttentionState
 from vllm_ascend.platform import NPUPlatform
+from vllm_ascend.utils import vllm_version_is
 
 if TYPE_CHECKING:
     import xgrammar as xgr # type: ignore[import-untyped]
@@ -187,14 +188,26 @@ class NPUModelRunner:
         # Request states.
         self.requests: Dict[str, CachedRequestState] = {}
         # Persistent batch.
-        self.input_batch = InputBatch(
-            max_num_reqs=self.max_num_reqs,
-            max_model_len=self.model_config.max_model_len,
-            max_num_blocks_per_req=self.max_num_blocks_per_req,
-            device=self.device,
-            pin_memory=True,
-            vocab_size=self.model_config.get_vocab_size(),
-        )
+        # Remove this after we drop 0.8.5 support
+        if vllm_version_is("0.8.5") or vllm_version_is("0.8.5.post1"):
+            self.input_batch = InputBatch(
+                max_num_reqs=self.max_num_reqs,
+                max_model_len=self.model_config.max_model_len,
+                max_num_blocks_per_req=self.max_num_blocks_per_req,
+                device=self.device,
+                pin_memory=True,
+                vocab_size=self.model_config.get_vocab_size(),
+            )
+        else:
+            self.input_batch = InputBatch(
+                max_num_reqs=self.max_num_reqs,
+                max_model_len=self.model_config.max_model_len,
+                max_num_blocks_per_req=self.max_num_blocks_per_req,
+                max_num_batched_tokens=self.max_num_tokens,
+                device=self.device,
+                pin_memory=True,
+                vocab_size=self.model_config.get_vocab_size(),
+            )
         self.input_ids = torch.zeros(self.max_num_tokens,
                                      dtype=torch.int32,