[CI] Upgrade vLLM version (#3139)

Upgrade vLLM version to the newest commit.
- Fix the break change introduced by
969b4da3a6
- Add a patch to quick fix torhcair
de94289a98
- fix the ut error introduced by
de94289a98

Close: https://github.com/vllm-project/vllm-ascend/issues/3138


- vLLM version: v0.10.2
- vLLM main:
f225ea7dd9

---------

Signed-off-by: wangxiyuan <wangxiyuan1007@gmail.com>
Signed-off-by: MengqingCao <cmq0113@163.com>
Co-authored-by: MengqingCao <cmq0113@163.com>
This commit is contained in:
wangxiyuan
2025-09-25 07:36:51 +08:00
committed by GitHub
parent 464270e4ca
commit a055183821
9 changed files with 105 additions and 15 deletions

View File

@@ -304,17 +304,24 @@ class NPUModelRunner(LoRAModelRunnerMixin):
(self.max_num_tokens, self.model_config.get_hidden_size()),
dtype=self.dtype,
device=self.device)
# Set up Attention
self.attn_backend = get_attn_backend(
0,
self.dtype,
None,
self.block_size,
self.model_config.is_attention_free,
use_mla=self.model_config.use_mla,
)
if vllm_version_is("0.10.2"):
self.attn_backend = get_attn_backend(
0,
self.dtype,
None,
self.block_size,
self.model_config.is_attention_free,
use_mla=self.model_config.use_mla,
)
else:
self.attn_backend = get_attn_backend(
0,
self.dtype,
None,
self.block_size,
use_mla=self.model_config.use_mla,
)
if torch.version.cann.startswith("8.3"):
self.attn_mask_builder = AttentionMaskBuilder(
self.scheduler_config.max_num_batched_tokens, self.dtype,