upgrade main to 0212 (#6712)

### What this PR does / why we need it?
Fixes the `transformers_utils/processors/__init__` import error introduced by
https://github.com/vllm-project/vllm/pull/33247
Fixes the Fused MoE breakage introduced by the `MoERunner` abstraction in
https://github.com/vllm-project/vllm/pull/32344

> TODO: delete `AscendMoERunner` once
> https://github.com/vllm-project/vllm/pull/35178 is merged

Fixes the Qwen3VL breakage introduced by `Make Qwen3VL compatible with Transformers v5`,
https://github.com/vllm-project/vllm/pull/34262
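
The first fix (the `transformers_utils/processors` move) is typical of upstream module relocations; one common way to stay importable on both sides of such a move is a try/except guard. A minimal sketch, where the except-branch path and the symbol `SomeProcessor` are illustrative assumptions, not the symbols this PR actually touches:

```python
# Minimal sketch of an import guard across an upstream module move.
# `SomeProcessor` and the fallback path are illustrative assumptions,
# not the actual patch in this PR.
try:
    # Post-refactor location on vllm main.
    from vllm.transformers_utils.processors import SomeProcessor
except ImportError:
    # Pre-refactor location on older vllm releases.
    from vllm.transformers_utils.processor import SomeProcessor
```

Since this PR pins to a single upstream commit, it can simply adopt the new locations; the guard pattern mainly helps plugins that must span several vllm versions at once.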

### Does this PR introduce _any_ user-facing change?

### How was this patch tested?

- vLLM version: v0.15.0
- vLLM main: 9562912cea

---------

Signed-off-by: wxsIcey <1790571317@qq.com>
Author: Icey
Date: 2026-02-25 09:17:29 +08:00
Committed by: GitHub
Parent: 0331f16a50
Commit: ee59429015
11 changed files with 167 additions and 32 deletions


@@ -236,22 +236,22 @@ class NPUModelRunner310(NPUModelRunner):
             prev_draft_token_indices.extend(range(start, start + draft_len))
             indices_match &= prev_index == flattened_index
             max_flattened_index = max(max_flattened_index, flattened_index)
-        num_commmon_tokens = len(sample_flattened_indices)
+        num_common_tokens = len(sample_flattened_indices)
         total_without_spec = total_num_scheduled_tokens - total_num_spec_tokens
-        if num_commmon_tokens < total_without_spec:
+        if num_common_tokens < total_without_spec:
             self.input_ids.copy_to_gpu(total_num_scheduled_tokens)
             if self.enable_prompt_embeds:
                 self.inputs_embeds.copy_to_gpu(total_num_scheduled_tokens)
                 self.is_token_ids.copy_to_gpu(total_num_scheduled_tokens)
-        if num_commmon_tokens == 0:
+        if num_common_tokens == 0:
             return
-        if indices_match and max_flattened_index == (num_commmon_tokens - 1):
+        if indices_match and max_flattened_index == (num_common_tokens - 1):
             # NOTE: Override the copy_ function here
-            indices = torch.arange(num_commmon_tokens, device=self.input_ids.gpu.device)
-            source = self.input_batch.prev_sampled_token_ids[:num_commmon_tokens, 0]
+            indices = torch.arange(num_common_tokens, device=self.input_ids.gpu.device)
+            source = self.input_batch.prev_sampled_token_ids[:num_common_tokens, 0]
             self.input_ids.gpu.index_copy_(0, indices, source)
             if self.enable_prompt_embeds:
-                self.is_token_ids.gpu[:num_commmon_tokens] = True
+                self.is_token_ids.gpu[:num_common_tokens] = True
             return
         # Upload the index tensors asynchronously so the scatter can be non-blocking.
         sampled_tokens_index_tensor = torch.tensor(
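
The hunk above renames `num_commmon_tokens` to `num_common_tokens` around an existing optimization: when the previously sampled tokens target the contiguous prefix `[0, n)` of `input_ids`, one `index_copy_` replaces a general scatter; otherwise the index tensor is uploaded and a scatter is issued. A minimal, self-contained sketch of that pattern, with function and tensor names that are illustrative rather than the runner's actual API:

```python
import torch

def copy_prev_sampled_tokens(input_ids_gpu: torch.Tensor,
                             prev_sampled: torch.Tensor,
                             flattened_indices: list[int]) -> None:
    """Sketch of the copy strategy in the hunk above; names are illustrative."""
    n = len(flattened_indices)
    if n == 0:
        return
    if flattened_indices == list(range(n)):
        # Fast path: destinations are exactly the contiguous prefix [0, n),
        # so a single index_copy_ replaces a general scatter.
        idx = torch.arange(n, device=input_ids_gpu.device)
        input_ids_gpu.index_copy_(0, idx, prev_sampled[:n, 0])
    else:
        # General path: build the index tensor on the host, upload it, then
        # scatter. With pinned host memory the upload can be non-blocking,
        # which is what the "asynchronously" comment in the diff refers to.
        idx = torch.tensor(flattened_indices, dtype=torch.long).to(
            input_ids_gpu.device, non_blocking=True)
        input_ids_gpu.scatter_(0, idx, prev_sampled[:n, 0])
```

The fast-path check runs on host-side Python ints, so it costs nothing on device; in the real hunk the equivalent condition is tracked incrementally via `indices_match` and `max_flattened_index` instead of comparing whole lists.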