upgrade main to 0212 (#6712)
### What this PR does / why we need it?
Fixes `transformers_utils/processors/__init__` import error, due to
https://github.com/vllm-project/vllm/pull/33247
Fixes Fused MoE break introduced by `MoERunner abstraction,` due to
https://github.com/vllm-project/vllm/pull/32344
> delete AscendMoERunner when
https://github.com/vllm-project/vllm/pull/35178 is merged
Fixes `Make Qwen3VL compatible with Transformers v5`, due to
https://github.com/vllm-project/vllm/pull/34262
### Does this PR introduce _any_ user-facing change?
### How was this patch tested?
- vLLM version: v0.15.0
- vLLM main:
9562912cea
---------
Signed-off-by: wxsIcey <1790571317@qq.com>
This commit is contained in:
@@ -236,22 +236,22 @@ class NPUModelRunner310(NPUModelRunner):
|
||||
prev_draft_token_indices.extend(range(start, start + draft_len))
|
||||
indices_match &= prev_index == flattened_index
|
||||
max_flattened_index = max(max_flattened_index, flattened_index)
|
||||
num_commmon_tokens = len(sample_flattened_indices)
|
||||
num_common_tokens = len(sample_flattened_indices)
|
||||
total_without_spec = total_num_scheduled_tokens - total_num_spec_tokens
|
||||
if num_commmon_tokens < total_without_spec:
|
||||
if num_common_tokens < total_without_spec:
|
||||
self.input_ids.copy_to_gpu(total_num_scheduled_tokens)
|
||||
if self.enable_prompt_embeds:
|
||||
self.inputs_embeds.copy_to_gpu(total_num_scheduled_tokens)
|
||||
self.is_token_ids.copy_to_gpu(total_num_scheduled_tokens)
|
||||
if num_commmon_tokens == 0:
|
||||
if num_common_tokens == 0:
|
||||
return
|
||||
if indices_match and max_flattened_index == (num_commmon_tokens - 1):
|
||||
if indices_match and max_flattened_index == (num_common_tokens - 1):
|
||||
# NOTE: Override the copy_ function here
|
||||
indices = torch.arange(num_commmon_tokens, device=self.input_ids.gpu.device)
|
||||
source = self.input_batch.prev_sampled_token_ids[:num_commmon_tokens, 0]
|
||||
indices = torch.arange(num_common_tokens, device=self.input_ids.gpu.device)
|
||||
source = self.input_batch.prev_sampled_token_ids[:num_common_tokens, 0]
|
||||
self.input_ids.gpu.index_copy_(0, indices, source)
|
||||
if self.enable_prompt_embeds:
|
||||
self.is_token_ids.gpu[:num_commmon_tokens] = True
|
||||
self.is_token_ids.gpu[:num_common_tokens] = True
|
||||
return
|
||||
# Upload the index tensors asynchronously so the scatter can be non-blocking.
|
||||
sampled_tokens_index_tensor = torch.tensor(
|
||||
|
||||
Reference in New Issue
Block a user