upgrade main to 0212 (#6712)
### What this PR does / why we need it?
Fixes `transformers_utils/processors/__init__` import error, due to
https://github.com/vllm-project/vllm/pull/33247
Fixes Fused MoE break introduced by `MoERunner abstraction,` due to
https://github.com/vllm-project/vllm/pull/32344
> delete AscendMoERunner when
https://github.com/vllm-project/vllm/pull/35178 is merged
Fixes `Make Qwen3VL compatible with Transformers v5`, due to
https://github.com/vllm-project/vllm/pull/34262
### Does this PR introduce _any_ user-facing change?
### How was this patch tested?
- vLLM version: v0.15.0
- vLLM main:
9562912cea
---------
Signed-off-by: wxsIcey <1790571317@qq.com>
This commit is contained in:
@@ -236,22 +236,22 @@ class NPUModelRunner310(NPUModelRunner):
|
||||
prev_draft_token_indices.extend(range(start, start + draft_len))
|
||||
indices_match &= prev_index == flattened_index
|
||||
max_flattened_index = max(max_flattened_index, flattened_index)
|
||||
num_commmon_tokens = len(sample_flattened_indices)
|
||||
num_common_tokens = len(sample_flattened_indices)
|
||||
total_without_spec = total_num_scheduled_tokens - total_num_spec_tokens
|
||||
if num_commmon_tokens < total_without_spec:
|
||||
if num_common_tokens < total_without_spec:
|
||||
self.input_ids.copy_to_gpu(total_num_scheduled_tokens)
|
||||
if self.enable_prompt_embeds:
|
||||
self.inputs_embeds.copy_to_gpu(total_num_scheduled_tokens)
|
||||
self.is_token_ids.copy_to_gpu(total_num_scheduled_tokens)
|
||||
if num_commmon_tokens == 0:
|
||||
if num_common_tokens == 0:
|
||||
return
|
||||
if indices_match and max_flattened_index == (num_commmon_tokens - 1):
|
||||
if indices_match and max_flattened_index == (num_common_tokens - 1):
|
||||
# NOTE: Override the copy_ function here
|
||||
indices = torch.arange(num_commmon_tokens, device=self.input_ids.gpu.device)
|
||||
source = self.input_batch.prev_sampled_token_ids[:num_commmon_tokens, 0]
|
||||
indices = torch.arange(num_common_tokens, device=self.input_ids.gpu.device)
|
||||
source = self.input_batch.prev_sampled_token_ids[:num_common_tokens, 0]
|
||||
self.input_ids.gpu.index_copy_(0, indices, source)
|
||||
if self.enable_prompt_embeds:
|
||||
self.is_token_ids.gpu[:num_commmon_tokens] = True
|
||||
self.is_token_ids.gpu[:num_common_tokens] = True
|
||||
return
|
||||
# Upload the index tensors asynchronously so the scatter can be non-blocking.
|
||||
sampled_tokens_index_tensor = torch.tensor(
|
||||
|
||||
Reference in New Issue
Block a user