Upgrade to 0.11.1 newest vllm commit (#3762)

### What this PR does / why we need it?

c9461e05a4

Fix ```spec decode rejection sampler```, caused by
https://github.com/vllm-project/vllm/pull/26060
Fix some ```import```, caused by
https://github.com/vllm-project/vllm/pull/27374
Fix ```scheduler_config.send_delta_data```, caused by
https://github.com/vllm-project/vllm-ascend/pull/3719
Fix ```init_with_cudagraph_sizes```, caused by
https://github.com/vllm-project/vllm/pull/26016
Fix ```vl model``` by replacing PatchEmbed's conv3d with a linear layer,
caused by https://github.com/vllm-project/vllm/pull/27418

### Does this PR introduce _any_ user-facing change?
N/A

### How was this patch tested?
CI passed with newly added and existing tests.


- vLLM version: v0.11.0rc3
- vLLM main:
c9461e05a4

---------

Signed-off-by: Icey <1790571317@qq.com>
This commit is contained in:
Icey
2025-10-28 14:55:03 +08:00
committed by GitHub
parent f846bd20e4
commit a7450db1bd
12 changed files with 175 additions and 51 deletions

View File

@@ -72,7 +72,7 @@ from vllm.pooling_params import PoolingParams
from vllm.sampling_params import SamplingType
from vllm.sequence import IntermediateTensors
from vllm.tasks import GenerationTask, PoolingTask, SupportedTask
from vllm.utils import cdiv, is_pin_memory_available
from vllm.utils import cdiv
from vllm.utils.jsontree import json_map_leaves
from vllm.v1.attention.backends.gdn_attn import GDNAttentionMetadataBuilder
from vllm.v1.attention.backends.utils import (
@@ -159,13 +159,14 @@ else:
if vllm_version_is("0.11.0"):
from vllm.attention.layer import Attention
from vllm.config import CompilationLevel
from vllm.utils import LazyLoader
from vllm.utils import LazyLoader, is_pin_memory_available
from vllm_ascend.models.layers.mla import AscendMultiHeadLatentAttention
else:
from vllm.attention.layer import MLAAttention
from vllm.config import CompilationMode
from vllm.utils.import_utils import LazyLoader
from vllm.utils.platform_utils import is_pin_memory_available
if TYPE_CHECKING:
import xgrammar as xgr # type: ignore[import-untyped]
@@ -386,7 +387,11 @@ class NPUModelRunner(LoRAModelRunnerMixin):
self.drafter = get_spec_decode_method(
self.speculative_config.method, self.vllm_config,
self.device, self)
self.rejection_sampler = AscendRejectionSampler()
if vllm_version_is("0.11.0"):
self.rejection_sampler = AscendRejectionSampler()
else:
self.rejection_sampler = AscendRejectionSampler(
self.sampler)
self.actual_seq_lengths_q = list(
range(self.decode_token_per_req, self.max_num_tokens + 1,
self.decode_token_per_req))
@@ -1885,6 +1890,9 @@ class NPUModelRunner(LoRAModelRunnerMixin):
# TODO: Optimize the CPU -> NPU copy.
cu_num_draft_tokens = torch.from_numpy(cu_num_draft_tokens).to(
self.device, non_blocking=True)
if not vllm_version_is("0.11.0"):
cu_num_sampled_tokens = torch.from_numpy(cu_num_sampled_tokens).to(
self.device, non_blocking=True)
logits_indices = torch.from_numpy(logits_indices).to(self.device,
non_blocking=True)
target_logits_indices = torch.from_numpy(target_logits_indices).to(
@@ -1896,15 +1904,25 @@ class NPUModelRunner(LoRAModelRunnerMixin):
# draft_token_indices: [ 1, 2, 3, 105, 106, 208]
draft_token_ids = self.input_ids[logits_indices]
draft_token_ids = draft_token_ids[target_logits_indices + 1]
metadata = SpecDecodeMetadata(
draft_token_ids=draft_token_ids,
num_draft_tokens=num_draft_tokens.tolist(),
cu_num_draft_tokens=cu_num_draft_tokens,
target_logits_indices=target_logits_indices,
bonus_logits_indices=bonus_logits_indices,
logits_indices=logits_indices,
)
if vllm_version_is("0.11.0"):
metadata = SpecDecodeMetadata(
draft_token_ids=draft_token_ids,
num_draft_tokens=num_draft_tokens.tolist(),
cu_num_draft_tokens=cu_num_draft_tokens,
target_logits_indices=target_logits_indices,
bonus_logits_indices=bonus_logits_indices,
logits_indices=logits_indices,
)
else:
metadata = SpecDecodeMetadata(
draft_token_ids=draft_token_ids,
num_draft_tokens=num_draft_tokens.tolist(),
cu_num_draft_tokens=cu_num_draft_tokens,
cu_num_sampled_tokens=cu_num_sampled_tokens,
target_logits_indices=target_logits_indices,
bonus_logits_indices=bonus_logits_indices,
logits_indices=logits_indices,
)
return metadata
def apply_grammar_bitmask(