upgrade vLLM to main (#4608)

1. fix https://github.com/vllm-project/vllm/pull/28542
The model structure modifications we involved in are:
     - Qwen2.5-VL (some patches still exist)
     - Qwen2-VL
     - Qwen2
     - DeepSeek series
     - Qwen-moe series
2. fix https://github.com/vllm-project/vllm/pull/29121
   the output token type has now changed from np.ndarray to `list[list[int]]`

3. fix https://github.com/vllm-project/vllm/pull/29262
    `xformers` backend for multimodal now has been deprecated
4. fix https://github.com/vllm-project/vllm/pull/29342

5. fix https://github.com/vllm-project/vllm/pull/28579
6. fix https://github.com/vllm-project/vllm/pull/28718
7. fix https://github.com/vllm-project/vllm/issues/28665
8. fix https://github.com/vllm-project/vllm/pull/26847
vLLM introduced the `optimization-level` option; some default configs have been
changed, and the `--enforce-eager` param has been deprecated
9. fix https://github.com/vllm-project/vllm/pull/29223 it returns a tuple
for the sampler.
10. fix https://github.com/vllm-project/vllm/pull/29471 we'll remove the
related patch to avoid this kind of error.

Co-authored-by: hfadzxy <starmoon_zhang@163.com>
Co-authored-by: wangli <wangli858794774@gmail.com>


- vLLM version: v0.11.2

---------

Signed-off-by: wangxiyuan <wangxiyuan1007@gmail.com>
Signed-off-by: wangli <wangli858794774@gmail.com>
Signed-off-by: hfadzxy <starmoon_zhang@163.com>
Co-authored-by: wangli <wangli858794774@gmail.com>
Co-authored-by: hfadzxy <starmoon_zhang@163.com>
This commit is contained in:
wangxiyuan
2025-12-02 22:10:52 +08:00
committed by GitHub
parent 4588cdac02
commit 7f2673ea2d
60 changed files with 383 additions and 374 deletions

View File

@@ -276,7 +276,7 @@ class AscendAttentionMetadataBuilder:
AscendAttentionMetadataBuilder.reorder_batch_threshold = self.decode_threshold
scheduler_config = vllm_config.scheduler_config
self.chunked_prefill_enabled = scheduler_config.chunked_prefill_enabled
self.chunked_prefill_enabled = scheduler_config.enable_chunked_prefill
def reorder_batch(self, input_batch,
scheduler_output: "SchedulerOutput") -> bool:

View File

@@ -226,7 +226,7 @@ class AscendMLAMetadataBuilder:
self.block_size = vllm_config.cache_config.block_size
self.max_blocks = (vllm_config.model_config.max_model_len +
self.block_size - 1) // self.block_size
self.chunked_prefill_enabled = scheduler_config.chunked_prefill_enabled
self.chunked_prefill_enabled = scheduler_config.enable_chunked_prefill
self.speculative_config = vllm_config.speculative_config
self.decode_threshold = 1

View File

@@ -456,7 +456,7 @@ class RecomputeScheduler(SchedulerInterface):
# chunked prefill has to be enabled explicitly to allow
# pooling requests to be chunked
if not self.scheduler_config.chunked_prefill_enabled and \
if not self.scheduler_config.enable_chunked_prefill and \
num_new_tokens > token_budget:
self.waiting.pop_request()
skipped_waiting_requests.prepend_request(request)

View File

@@ -70,7 +70,7 @@ class AscendScheduler(Scheduler):
self._initialize_common()
def schedule(self) -> SchedulerOutput:
if self.scheduler_config.chunked_prefill_enabled:
if self.scheduler_config.enable_chunked_prefill:
return super().schedule()
scheduled_new_reqs: list[Request] = []
scheduled_resumed_reqs: list[Request] = []
@@ -534,7 +534,7 @@ class AscendScheduler(Scheduler):
return True
def _get_prompt_limit(self, request: Request) -> int:
if (self.scheduler_config.chunked_prefill_enabled
if (self.scheduler_config.enable_chunked_prefill
and not self.scheduler_config.is_multi_step):
prompt_limit = self.vllm_config.model_config.max_model_len
else:

View File

@@ -404,7 +404,7 @@ class SchedulerDynamicBatch(Scheduler):
# chunked prefill has to be enabled explicitly to allow
# pooling requests to be chunked
if not self.scheduler_config.chunked_prefill_enabled and \
if not self.scheduler_config.enable_chunked_prefill and \
num_new_tokens > token_budget:
self.waiting.pop_request()
skipped_waiting_requests.prepend_request(request)

View File

@@ -9,14 +9,14 @@ from dataclasses import dataclass
from typing import TYPE_CHECKING, Any, Optional, Sequence
import torch
from vllm.attention import AttentionType
from vllm.attention.backends.abstract import AttentionType
from vllm.attention.layer import Attention
from vllm.config import VllmConfig
from vllm.distributed.kv_transfer.kv_connector.v1.base import (
KVConnectorBase_V1, KVConnectorMetadata, KVConnectorRole)
from vllm.distributed.parallel_state import get_pp_group, get_tp_group
from vllm.logger import logger
from vllm.model_executor.layers.fused_moe import FusedMoE
from vllm.utils import logger
from vllm.v1.core.sched.output import SchedulerOutput
from vllm.v1.kv_cache_interface import (FullAttentionSpec, KVCacheSpec,
MLAAttentionSpec)

View File

@@ -2,7 +2,8 @@ import time
from collections import defaultdict
from typing import Optional
from vllm.utils import logger, sha256
from vllm.logger import logger
from vllm.utils.hashing import sha256
from vllm.v1.core.block_pool import BlockPool
from vllm.v1.core.kv_cache_utils import (BlockHash, KVCacheBlock,
PrefixCachingMetrics)

View File

@@ -9,7 +9,7 @@ import torch
import vllm.envs as envs
import zmq
from vllm.config import KVTransferConfig, VllmConfig
from vllm.utils import logger
from vllm.logger import logger
from vllm.utils.network_utils import make_zmq_socket
from vllm.utils.torch_utils import get_dtype_size
from vllm.v1.kv_cache_interface import AttentionSpec

View File

@@ -8,7 +8,7 @@ from vllm.config import VllmConfig
from vllm.distributed.kv_transfer.kv_connector.v1.base import (
KVConnectorBase_V1, KVConnectorMetadata, KVConnectorRole)
from vllm.forward_context import ForwardContext
from vllm.utils import logger
from vllm.logger import logger
from vllm.utils.network_utils import make_zmq_socket
from vllm.v1.core.kv_cache_manager import KVCacheBlocks
from vllm.v1.core.sched.output import SchedulerOutput

View File

@@ -3,7 +3,7 @@ from enum import Enum
import torch
from vllm.config import ParallelConfig
from vllm.utils import logger
from vllm.logger import logger
from vllm_ascend.distributed.kvpool.backend.backend import Backend

View File

@@ -7,7 +7,7 @@ from typing import Union
# Third Party
from vllm.config import ParallelConfig
from vllm.utils import logger
from vllm.logger import logger
from vllm.utils.network_utils import get_ip
from vllm_ascend.distributed.kvpool.backend.backend import Backend

View File

@@ -3,7 +3,7 @@ from typing import Iterable, List, Optional, Tuple, Union
from vllm.distributed.kv_transfer.kv_connector.v1.base import \
KVConnectorMetadata
from vllm.utils import logger
from vllm.logger import logger
from vllm.utils.math_utils import cdiv
from vllm.v1.core.kv_cache_utils import BlockHash
from vllm.v1.core.sched.output import NewRequestData

View File

@@ -4,7 +4,7 @@ from concurrent.futures import ThreadPoolExecutor
from typing import Any, Optional
import torch
from vllm.utils import logger
from vllm.logger import logger
from vllm.v1.core.kv_cache_utils import BlockHash
from vllm_ascend.distributed.kvpool.backend.backend import Backend

View File

@@ -5,7 +5,7 @@ import zmq
from vllm.config import VllmConfig
from vllm.distributed.kv_transfer.kv_connector.v1.base import \
KVConnectorMetadata
from vllm.utils import logger
from vllm.logger import logger
from vllm.utils.network_utils import make_zmq_socket
from vllm.v1.core.kv_cache_manager import KVCacheBlocks
from vllm.v1.core.kv_cache_utils import BlockHash

View File

@@ -8,7 +8,7 @@ from vllm.distributed import (get_decode_context_model_parallel_rank,
get_decode_context_model_parallel_world_size,
get_tensor_model_parallel_rank,
get_tensor_model_parallel_world_size)
from vllm.utils import logger
from vllm.logger import logger
from vllm.v1.core.kv_cache_utils import BlockHash
from vllm_ascend.distributed.kvpool.backend.backend import Backend

View File

@@ -25,7 +25,7 @@ from vllm.distributed.kv_transfer.kv_connector.v1.base import (
from vllm.distributed.parallel_state import (get_dcp_group, get_tp_group,
get_world_group)
from vllm.forward_context import ForwardContext
from vllm.utils import logger
from vllm.logger import logger
from vllm.v1.core.kv_cache_manager import KVCacheBlocks
from vllm.v1.core.sched.output import SchedulerOutput
from vllm.v1.kv_cache_interface import KVCacheConfig

View File

@@ -29,7 +29,7 @@ from vllm.distributed.parallel_state import (
get_decode_context_model_parallel_rank,
get_decode_context_model_parallel_world_size,
get_tensor_model_parallel_rank, get_tp_group)
from vllm.utils import logger
from vllm.logger import logger
from vllm.v1.core.sched.output import SchedulerOutput
from vllm.v1.kv_cache_interface import KVCacheConfig
from vllm.v1.request import RequestStatus

View File

@@ -27,7 +27,7 @@ from vllm.distributed.kv_transfer.kv_connector.v1.base import (
KVConnectorBase_V1, KVConnectorMetadata, KVConnectorRole)
from vllm.distributed.parallel_state import (get_tensor_model_parallel_rank,
get_tp_group, get_world_group)
from vllm.utils import logger
from vllm.logger import logger
from vllm.utils.network_utils import get_ip, make_zmq_path, make_zmq_socket
from vllm.v1.core.sched.output import SchedulerOutput
from vllm.v1.kv_cache_interface import KVCacheConfig

View File

@@ -1,6 +1,6 @@
import numpy as np
import torch
from vllm.attention import AttentionBackend
from vllm.attention.backends.abstract import AttentionBackend
from vllm.logger import init_logger
from vllm.utils.platform_utils import is_pin_memory_available
from vllm.v1.kv_offload.mediums import CPULoadStoreSpec, GPULoadStoreSpec

View File

@@ -23,7 +23,7 @@ from typing import Optional
import torch
from torch import nn
from vllm.attention import AttentionMetadata
from vllm.attention.backends.abstract import AttentionMetadata
from vllm.attention.layer import MLAAttention
from vllm.config import CacheConfig, get_current_vllm_config
from vllm.distributed import get_tensor_model_parallel_world_size

View File

@@ -27,8 +27,7 @@ from transformers.models.qwen2_5_vl.configuration_qwen2_5_vl import \
from transformers.models.qwen2_vl.configuration_qwen2_vl import \
Qwen2VLVisionConfig
from vllm.attention.backends.registry import AttentionBackendEnum
from vllm.attention.layer import (check_upstream_fa_availability,
maybe_get_vit_flash_attn_backend)
from vllm.attention.layer import maybe_get_vit_flash_attn_backend
from vllm.model_executor.layers.activation import get_act_and_mul_fn
from vllm.model_executor.layers.layernorm import RMSNorm
from vllm.model_executor.layers.quantization import QuantizationConfig
@@ -65,7 +64,6 @@ class AscendQwen2_5_VisionAttention(nn.Module):
rotary_pos_emb_cos: torch.Tensor,
rotary_pos_emb_sin: torch.Tensor,
max_seqlen: torch.Tensor,
seqlens: torch.Tensor = None,
) -> torch.Tensor:
# [s, b, c] --> [s, b, head * 3 * head_dim]
x, _ = self.qkv(x)
@@ -141,7 +139,6 @@ class AscendQwen2VisionBlock(nn.Module):
rotary_pos_emb_cos: torch.Tensor,
rotary_pos_emb_sin: torch.Tensor,
max_seqlen: int | None = None, # Only used for Flash Attention
seqlens: list[int] | None = None, # Only used for xFormers
) -> torch.Tensor:
x = x + self.attn(
self.norm1(x),
@@ -149,7 +146,6 @@ class AscendQwen2VisionBlock(nn.Module):
rotary_pos_emb_cos=rotary_pos_emb_cos,
rotary_pos_emb_sin=rotary_pos_emb_sin,
max_seqlen=max_seqlen,
seqlens=seqlens,
)
x = x + self.mlp(self.norm2(x))
return x
@@ -198,7 +194,6 @@ class AscendQwen2VisionTransformer(nn.Module):
head_size=head_dim,
rotary_dim=head_dim // 2,
max_position=8192,
base=10000.0,
is_neox_style=True,
)
@@ -228,10 +223,6 @@ class AscendQwen2VisionTransformer(nn.Module):
attn_backend_override=attn_backend_override,
)
if (self.attn_backend != AttentionBackendEnum.FLASH_ATTN
and check_upstream_fa_availability(torch.get_default_dtype())):
self.attn_backend = AttentionBackendEnum.FLASH_ATTN
def rot_pos_emb(
self,
grid_thw: list[list[int]]) -> tuple[torch.Tensor, torch.Tensor]:
@@ -300,7 +291,7 @@ class AscendQwen2VisionTransformer(nn.Module):
x = x.unsqueeze(1)
# pre-compute seqlens for attn mask to reduce cuMemcpy operations
max_seqlen, seqlens = self.compute_attn_mask_seqlen(cu_seqlens)
max_seqlen = self.compute_attn_mask_seqlen(cu_seqlens)
for blk in self.blocks:
x = blk(
x,
@@ -308,7 +299,6 @@ class AscendQwen2VisionTransformer(nn.Module):
rotary_pos_emb_cos=rotary_pos_emb_cos,
rotary_pos_emb_sin=rotary_pos_emb_sin,
max_seqlen=max_seqlen,
seqlens=seqlens,
)
# adapter
@@ -326,7 +316,6 @@ class AscendQwen2_5_VisionBlock(nn.Module):
rotary_pos_emb_cos: torch.Tensor,
rotary_pos_emb_sin: torch.Tensor,
max_seqlen: torch.Tensor, # Only used for Flash Attention
seqlens: torch.Tensor, # Only used for xFormers
) -> torch.Tensor:
x_attn = self.attn(
self.norm1(x),
@@ -334,7 +323,6 @@ class AscendQwen2_5_VisionBlock(nn.Module):
rotary_pos_emb_cos=rotary_pos_emb_cos,
rotary_pos_emb_sin=rotary_pos_emb_sin,
max_seqlen=max_seqlen,
seqlens=seqlens,
)
x_fused_norm, residual = self.norm2(x, residual=x_attn)
x = residual + self.mlp(x_fused_norm)
@@ -388,11 +376,9 @@ class AscendQwen2_5_VisionTransformer(nn.Module):
head_size=head_dim,
rotary_dim=head_dim // 2,
max_position=8192,
base=10000.0,
is_neox_style=True,
)
use_upstream_fa = False
self.attn_backend = get_vit_attn_backend(
head_size=head_dim,
dtype=torch.get_default_dtype(),
@@ -402,7 +388,6 @@ class AscendQwen2_5_VisionTransformer(nn.Module):
self.attn_backend, self.flash_attn_varlen_func = (
maybe_get_vit_flash_attn_backend(
self.attn_backend,
use_upstream_fa,
attn_backend_override=attn_backend_override,
))
@@ -418,7 +403,6 @@ class AscendQwen2_5_VisionTransformer(nn.Module):
prefix=f"{prefix}.blocks.{layer_idx}",
use_data_parallel=use_data_parallel,
attn_backend=self.attn_backend,
use_upstream_fa=use_upstream_fa,
attn_backend_override=attn_backend_override,
) for layer_idx in range(depth)
])
@@ -553,10 +537,8 @@ class AscendQwen2_5_VisionTransformer(nn.Module):
# transformers
# pre-compute seqlens for window/full attn to reduce cuMemcpy operations
max_seqlen_full, seqlens_full = self.compute_attn_mask_seqlen(
cu_seqlens)
max_seqlen_window, seqlens_window = self.compute_attn_mask_seqlen(
cu_window_seqlens)
max_seqlen_full = self.compute_attn_mask_seqlen(cu_seqlens)
max_seqlen_window = self.compute_attn_mask_seqlen(cu_window_seqlens)
cu_seqlens = cu_seqlens.to( # type: ignore[attr-defined]
device=self.device,
@@ -587,11 +569,9 @@ class AscendQwen2_5_VisionTransformer(nn.Module):
if layer_num in self.fullatt_block_indexes:
cu_seqlens_now = cu_seqlens
max_seqlen_now = max_seqlen_full
seqlens_now = seqlens_full
else:
cu_seqlens_now = cu_window_seqlens
max_seqlen_now = max_seqlen_window
seqlens_now = seqlens_window
hidden_states = blk(
hidden_states,
@@ -599,7 +579,6 @@ class AscendQwen2_5_VisionTransformer(nn.Module):
rotary_pos_emb_cos=rotary_pos_emb_cos,
rotary_pos_emb_sin=rotary_pos_emb_sin,
max_seqlen=max_seqlen_now,
seqlens=seqlens_now,
)
# For Qwen2.5-VL-3B, float16 will overflow at last block

View File

@@ -23,7 +23,6 @@ import torch.nn as nn
from transformers.models.qwen3_vl.configuration_qwen3_vl import \
Qwen3VLVisionConfig
from vllm.attention.backends.registry import AttentionBackendEnum
from vllm.attention.layer import check_upstream_fa_availability
from vllm.model_executor.layers.activation import _ACTIVATION_REGISTRY
from vllm.model_executor.layers.quantization import QuantizationConfig
from vllm.model_executor.layers.rotary_embedding import get_rope
@@ -101,7 +100,6 @@ class AscendQwen3_VisionTransformer(nn.Module):
head_size=head_dim,
rotary_dim=head_dim // 2,
max_position=8192,
base=10000.0,
is_neox_style=True,
)
@@ -133,17 +131,10 @@ class AscendQwen3_VisionTransformer(nn.Module):
dtype=torch.get_default_dtype(),
attn_backend_override=attn_backend_override,
)
use_upstream_fa = False
if (self.attn_backend != AttentionBackendEnum.FLASH_ATTN
and self.attn_backend != AttentionBackendEnum.ROCM_AITER_FA
and check_upstream_fa_availability(torch.get_default_dtype())):
self.attn_backend = AttentionBackendEnum.FLASH_ATTN
use_upstream_fa = True
if self.attn_backend not in {
AttentionBackendEnum.FLASH_ATTN,
AttentionBackendEnum.TORCH_SDPA,
AttentionBackendEnum.XFORMERS,
AttentionBackendEnum.ROCM_AITER_FA,
}:
raise RuntimeError(
@@ -159,7 +150,6 @@ class AscendQwen3_VisionTransformer(nn.Module):
prefix=f"{prefix}.blocks.{layer_idx}",
use_data_parallel=use_data_parallel,
attn_backend=self.attn_backend,
use_upstream_fa=use_upstream_fa,
) for layer_idx in range(vision_config.depth)
])

View File

@@ -157,6 +157,7 @@ class NPUPlatform(Platform):
compilation_config.splitting_ops = []
compilation_config.cudagraph_num_of_warmups = 1
compilation_config.pass_config.enable_fusion = False
if compilation_config.mode not in [
CompilationMode.NONE, CompilationMode.VLLM_COMPILE
@@ -310,7 +311,7 @@ class NPUPlatform(Platform):
vllm_config.scheduler_config.scheduler_cls = (
"vllm_ascend.core.scheduler_dynamic_batch.SchedulerDynamicBatch"
)
vllm_config.scheduler_config.chunked_prefill_enabled = True
vllm_config.scheduler_config.enable_chunked_prefill = True
vllm_config.scheduler_config.SLO_limits_for_dynamic_batch = ascend_config.SLO_limits_for_dynamic_batch
if vllm_config.kv_transfer_config is not None and \

View File

@@ -138,7 +138,8 @@ class EagleProposer(Proposer):
dummy_compute_logits(self.hidden_states)
def generate_token_ids(self,
valid_sampled_token_ids: list[np.ndarray],
valid_sampled_token_ids: torch.Tensor
| list[list[int]],
sampling_metadata: SamplingMetadata = None,
scheduler_output: SchedulerOutput = None,
spec_decode_metadata: SpecDecodeMetadata = None,
@@ -151,7 +152,7 @@ class EagleProposer(Proposer):
attn_metadata = self._get_eagle_atten_dict(scheduler_output)
next_token_ids: list[int] = []
for i, token_ids in enumerate(valid_sampled_token_ids):
if token_ids.shape[0] > 0:
if token_ids:
# Common case.
next_token_id = token_ids[-1]
else:
@@ -163,7 +164,7 @@ class EagleProposer(Proposer):
scheduler_output.num_scheduled_tokens[req_id])
next_token_id = req_state.get_token_id(seq_len)
next_token_ids.append(next_token_id.item())
next_token_ids.append(next_token_id)
next_token_ids = torch.tensor(next_token_ids,
dtype=torch.int32,
device=self.device)
@@ -183,7 +184,7 @@ class EagleProposer(Proposer):
else:
num_draft_tokens = spec_decode_metadata.num_draft_tokens
num_rejected_tokens = [
n + 1 - valid_sampled_token_ids[i].shape[0] if n > 0 else 0
n + 1 - len(valid_sampled_token_ids[i]) if n > 0 else 0
for i, n in enumerate(num_draft_tokens)
]
num_rejected_tokens = torch.tensor(

View File

@@ -1,7 +1,6 @@
import enum
from typing import Optional
import numpy as np
import torch
from vllm.config import CUDAGraphMode, VllmConfig
from vllm.v1.core.sched.output import SchedulerOutput
@@ -42,7 +41,7 @@ class Proposer:
raise NotImplementedError
def generate_token_ids(self,
valid_sampled_token_ids: list[np.ndarray],
valid_sampled_token_ids: list[list[int]],
sampling_metadata: SamplingMetadata = None,
scheduler_output: SchedulerOutput = None,
spec_decode_metadata: SpecDecodeMetadata = None,

View File

@@ -7,7 +7,7 @@ import torch.nn as nn
import torch.nn.functional as F
from vllm.config import (CUDAGraphMode, VllmConfig,
get_layers_from_vllm_config, set_current_vllm_config)
from vllm.forward_context import BatchDescriptor, get_forward_context
from vllm.forward_context import get_forward_context
from vllm.logger import init_logger
from vllm.model_executor.layers.attention_layer_base import AttentionLayerBase
from vllm.model_executor.model_loader import get_model_loader
@@ -314,8 +314,7 @@ class MtpProposer(Proposer):
break
def generate_token_ids(self,
sampled_token_ids: Union[torch.Tensor,
list[np.ndarray]],
sampled_token_ids: torch.Tensor | list[list[int]],
sampling_metadata: SamplingMetadata = None,
scheduler_output: SchedulerOutput = None,
spec_decode_metadata: SpecDecodeMetadata = None,
@@ -392,7 +391,6 @@ class MtpProposer(Proposer):
common_attn_metadata.query_start_loc = \
query_start_loc_pcp_full[:num_reqs + 1]
if self.speculative_config.disable_padded_drafter_batch:
assert isinstance(sampled_token_ids, list)
# NOTE: Currently, MTP-fullgraph is incompatibility with pcp
token_indices_to_sample = None
common_attn_metadata, token_indices =\
@@ -451,7 +449,7 @@ class MtpProposer(Proposer):
def _prepare_inputs(
self,
common_attn_metadata: CommonAttentionMetadata,
sampled_token_ids: list[np.ndarray],
sampled_token_ids: list[list[int]],
num_draft_tokens: list[int],
) -> tuple[CommonAttentionMetadata, torch.Tensor]:
"""
@@ -695,13 +693,11 @@ class MtpProposer(Proposer):
2))) and (scheduler_output.total_num_scheduled_tokens
== self.runner.input_batch.num_reqs *
(self.num_speculative_tokens + 1))
batch_descriptor = BatchDescriptor(num_tokens=num_input_tokens,
uniform_decode=uniform_decode)
else:
batch_descriptor = BatchDescriptor(num_tokens=num_input_tokens,
uniform_decode=False)
uniform_decode = False
has_lora = len(self.runner.input_batch.lora_id_to_lora_request) > 0
aclgraph_runtime_mode, batch_descriptor = \
self.runner.aclgraph_dispatcher.dispatch(batch_descriptor)
self.runner.aclgraph_dispatcher.dispatch(num_tokens=num_input_tokens, uniform_decode=uniform_decode, has_lora=has_lora)
if self.vllm_config.compilation_config.cudagraph_mode.has_full_cudagraphs(
) and aclgraph_runtime_mode == CUDAGraphMode.FULL:
@@ -929,7 +925,7 @@ class MtpProposer(Proposer):
def prepare_next_token_ids_cpu(
self,
sampled_token_ids: list[np.ndarray],
sampled_token_ids: list[list[int]],
requests: dict[str, CachedRequestState],
gpu_input_batch: InputBatch,
num_scheduled_tokens: dict[str, int],
@@ -944,7 +940,7 @@ class MtpProposer(Proposer):
req_ids = gpu_input_batch.req_ids
next_token_ids: list[int] = []
for i, token_ids in enumerate(sampled_token_ids):
if token_ids.shape[0] > 0:
if token_ids:
# Common case.
next_token_id = token_ids[-1]
else:
@@ -955,7 +951,7 @@ class MtpProposer(Proposer):
seq_len = req_state.num_computed_tokens + num_scheduled_tokens[
req_id]
next_token_id = req_state.get_token_id(seq_len)
next_token_ids.append(next_token_id.item())
next_token_ids.append(next_token_id)
next_token_ids = torch.tensor(next_token_ids,
dtype=torch.int32,
device=self.input_ids.device)

View File

@@ -1,4 +1,3 @@
import numpy as np
import torch
from vllm.config import CUDAGraphMode
from vllm.v1.spec_decode.ngram_proposer import \
@@ -32,7 +31,7 @@ class NgramProposer(VllmNgramProposer, Proposer):
pass
def generate_token_ids(self,
valid_sampled_token_ids: list[np.ndarray],
valid_sampled_token_ids,
sampling_metadata=None,
scheduler_output=None,
spec_decode_metadata=None,
@@ -43,7 +42,7 @@ class NgramProposer(VllmNgramProposer, Proposer):
aux_hidden_states=None) -> list[list[int]]:
valid_ngram_requests = []
for i, sampled_ids in enumerate(valid_sampled_token_ids):
num_sampled_ids = sampled_ids.shape[0]
num_sampled_ids = len(sampled_ids)
if not num_sampled_ids:
continue

View File

@@ -23,7 +23,7 @@ import torch.nn.functional as F
import vllm
from torch import nn
from transformers import Qwen2Config
from vllm.attention import AttentionMetadata, AttentionType
from vllm.attention.backends.abstract import AttentionMetadata, AttentionType
from vllm.compilation.decorators import support_torch_compile
from vllm.config import CacheConfig, VllmConfig
from vllm.distributed import (get_pp_group, tensor_model_parallel_all_gather,
@@ -40,6 +40,7 @@ from vllm.model_executor.models.qwen2 import Qwen2MLP, Qwen2Model
from vllm.model_executor.models.utils import (AutoWeightsLoader,
PPMissingLayer, maybe_prefix)
from vllm.sequence import IntermediateTensors
from vllm.transformers_utils.config import set_default_rope_theta
from vllm_ascend.ascend_config import get_ascend_config
from vllm_ascend.attention.attention_v1 import AscendAttentionState
@@ -72,11 +73,10 @@ class CustomQwen2Attention(Qwen2Attention):
hidden_size: int,
num_heads: int,
num_kv_heads: int,
rope_parameters: Optional[dict[str, Any]] = None,
max_position: int = 4096 * 32,
rope_theta: float = 10000,
cache_config: Optional[CacheConfig] = None,
quant_config: Optional[QuantizationConfig] = None,
rope_scaling: Optional[tuple] = None,
prefix: str = "",
attn_type: str = AttentionType.DECODER,
dual_chunk_attention_config: Optional[dict[str, Any]] = None,
@@ -86,13 +86,13 @@ class CustomQwen2Attention(Qwen2Attention):
num_heads=num_heads,
num_kv_heads=num_kv_heads,
max_position=max_position,
rope_theta=rope_theta,
cache_config=cache_config,
quant_config=quant_config,
rope_scaling=rope_scaling,
prefix=prefix,
attn_type=attn_type,
dual_chunk_attention_config=dual_chunk_attention_config)
dual_chunk_attention_config=dual_chunk_attention_config,
rope_parameters=rope_parameters)
ascend_config = get_ascend_config()
self.torchair_graph_enabled = ascend_config.torchair_graph_config.enabled
@@ -145,9 +145,9 @@ class CustomQwen2DecoderLayer(nn.Module):
) -> None:
super().__init__()
self.hidden_size = config.hidden_size
# Requires transformers > 4.32.0
rope_theta = getattr(config, "rope_theta", 1000000)
rope_scaling = getattr(config, "rope_scaling", None)
set_default_rope_theta(config, default_theta=1000000)
dual_chunk_attention_config = getattr(config,
"dual_chunk_attention_config",
None)
@@ -166,10 +166,9 @@ class CustomQwen2DecoderLayer(nn.Module):
num_heads=config.num_attention_heads,
max_position=config.max_position_embeddings,
num_kv_heads=config.num_key_value_heads,
rope_theta=rope_theta,
rope_parameters=config.rope_parameters,
cache_config=cache_config,
quant_config=quant_config,
rope_scaling=rope_scaling,
prefix=f"{prefix}.self_attn",
attn_type=attn_type,
dual_chunk_attention_config=dual_chunk_attention_config,

View File

@@ -21,7 +21,8 @@ from typing import Any, List, Optional, Union
import torch
from torch import nn
from transformers import PretrainedConfig
from vllm.attention import Attention, AttentionMetadata
from vllm.attention.backends.abstract import AttentionMetadata
from vllm.attention.layer import Attention
from vllm.compilation.decorators import support_torch_compile
from vllm.config import CacheConfig, CompilationMode, VllmConfig
from vllm.distributed import get_pp_group, get_tensor_model_parallel_world_size
@@ -137,8 +138,7 @@ class CustomQwen3MoeAttention(Qwen3MoeAttention):
hidden_size: int,
num_heads: int,
num_kv_heads: int,
rope_theta: float = 10000,
rope_scaling: Optional[dict[str, Any]] = None,
rope_parameters: dict[str, Any],
max_position_embeddings: int = 8192,
head_dim: Optional[int] = None,
rms_norm_eps: float = 1e-06,
@@ -167,7 +167,6 @@ class CustomQwen3MoeAttention(Qwen3MoeAttention):
self.q_size = self.num_heads * self.head_dim
self.kv_size = self.num_kv_heads * self.head_dim
self.scaling = self.head_dim**-0.5
self.rope_theta = rope_theta
self.max_position_embeddings = max_position_embeddings
self.qkv_proj = QKVParallelLinear(hidden_size,
@@ -188,8 +187,7 @@ class CustomQwen3MoeAttention(Qwen3MoeAttention):
self.head_dim,
rotary_dim=self.head_dim,
max_position=max_position_embeddings,
base=rope_theta,
rope_scaling=rope_scaling,
rope_parameters=rope_parameters,
)
self.attn = Attention(self.num_heads,
self.head_dim,
@@ -270,16 +268,13 @@ class CustomQwen3MoeDecoderLayer(Qwen3MoeDecoderLayer):
nn.Module.__init__(self)
self.hidden_size = config.hidden_size
rope_theta = getattr(config, "rope_theta", 10000)
rope_scaling = getattr(config, "rope_scaling", None)
max_position_embeddings = getattr(config, "max_position_embeddings",
8192)
self.self_attn = CustomQwen3MoeAttention(
hidden_size=self.hidden_size,
num_heads=config.num_attention_heads,
num_kv_heads=config.num_key_value_heads,
rope_theta=rope_theta,
rope_scaling=rope_scaling,
rope_parameters=config.rope_parameters,
max_position_embeddings=max_position_embeddings,
rms_norm_eps=config.rms_norm_eps,
qkv_bias=getattr(config, 'attention_bias', False),

View File

@@ -25,13 +25,13 @@
# # vllm-project/vllm/vllm/model_executor/models/deepseek_v2.py
# """Inference-only DeepseekV2/DeepseekV3 model."""
from typing import Any, Callable, Dict, Iterable, List, Optional, Tuple, Union
from typing import Callable, Iterable, List, Optional, Tuple, Union
import torch
import torch_npu
from torch import nn
from transformers import PretrainedConfig
from vllm.attention import AttentionMetadata
from vllm.attention.backends.abstract import AttentionMetadata
from vllm.attention.layer import MLAAttention
from vllm.config import CacheConfig, ModelConfig, VllmConfig
from vllm.distributed import (get_pp_group, get_tensor_model_parallel_rank,
@@ -492,8 +492,6 @@ class TorchairDeepseekV2MLAAttention(DeepseekV2MLAAttention):
v_head_dim: int,
q_lora_rank: Optional[int],
kv_lora_rank: int,
rope_theta: float = 10000,
rope_scaling: Optional[Dict[str, Any]] = None,
max_position_embeddings: int = 8192,
cache_config: Optional[CacheConfig] = None,
quant_config: Optional[QuantizationConfig] = None,
@@ -518,7 +516,6 @@ class TorchairDeepseekV2MLAAttention(DeepseekV2MLAAttention):
self.first_k_dense_replace = config.first_k_dense_replace
self.scaling = self.qk_head_dim**-0.5
self.rope_theta = rope_theta
self.max_position_embeddings = max_position_embeddings
self.prefix = prefix
@@ -592,17 +589,17 @@ class TorchairDeepseekV2MLAAttention(DeepseekV2MLAAttention):
quant_config=quant_config,
prefix=f"{prefix}.o_proj")
if rope_scaling:
rope_scaling["rope_type"] = 'deepseek_yarn'
if config.rope_parameters["rope_type"] != "default":
config.rope_parameters["rope_type"] = "deepseek_yarn"
self.rotary_emb = get_rope(qk_rope_head_dim,
rotary_dim=qk_rope_head_dim,
max_position=max_position_embeddings,
base=rope_theta,
rope_scaling=rope_scaling,
rope_parameters=config.rope_parameters,
is_neox_style=False)
if rope_scaling:
mscale_all_dim = rope_scaling.get("mscale_all_dim", False)
scaling_factor = rope_scaling["factor"]
if config.rope_parameters["rope_type"] != "default":
mscale_all_dim = config.rope_parameters.get(
"mscale_all_dim", False)
scaling_factor = config.rope_parameters["factor"]
mscale = yarn_get_mscale(scaling_factor, float(mscale_all_dim))
self.scaling = self.scaling * mscale * mscale
@@ -708,8 +705,6 @@ class TorchairDeepseekV2SFAAttention(DeepseekV2MLAAttention):
v_head_dim: int,
q_lora_rank: Optional[int],
kv_lora_rank: int,
rope_theta: float = 10000,
rope_scaling: Optional[Dict[str, Any]] = None,
max_position_embeddings: int = 8192,
cache_config: Optional[CacheConfig] = None,
quant_config: Optional[QuantizationConfig] = None,
@@ -734,7 +729,6 @@ class TorchairDeepseekV2SFAAttention(DeepseekV2MLAAttention):
self.first_k_dense_replace = config.first_k_dense_replace
self.scaling = self.qk_head_dim**-0.5
self.rope_theta = rope_theta
self.max_position_embeddings = max_position_embeddings
self.prefix = prefix
@@ -814,17 +808,19 @@ class TorchairDeepseekV2SFAAttention(DeepseekV2MLAAttention):
return_bias=False,
)
if rope_scaling:
rope_scaling["rope_type"] = 'deepseek_yarn'
self.rotary_emb = get_rope(qk_rope_head_dim,
rotary_dim=qk_rope_head_dim,
max_position=max_position_embeddings,
base=rope_theta,
rope_scaling=rope_scaling,
is_neox_style=False)
if rope_scaling:
mscale_all_dim = rope_scaling.get("mscale_all_dim", False)
scaling_factor = rope_scaling["factor"]
if config.rope_parameters["rope_type"] != "default":
config.rope_parameters["rope_type"] = "deepseek_yarn"
self.rotary_emb = get_rope(
qk_rope_head_dim,
rotary_dim=qk_rope_head_dim,
max_position=max_position_embeddings,
rope_parameters=config.rope_parameters,
is_neox_style=False,
)
if config.rope_parameters["rope_type"] != "default":
mscale_all_dim = config.rope_parameters.get(
"mscale_all_dim", False)
scaling_factor = config.rope_parameters["factor"]
mscale = yarn_get_mscale(scaling_factor, float(mscale_all_dim))
self.scaling = self.scaling * mscale * mscale
@@ -921,8 +917,6 @@ class TorchairDeepseekV2DecoderLayer(DeepseekV2DecoderLayer):
) -> None:
nn.Module.__init__(self)
self.hidden_size = config.hidden_size
rope_theta = getattr(config, "rope_theta", 10000)
rope_scaling = getattr(config, "rope_scaling", None)
max_position_embeddings = getattr(config, "max_position_embeddings",
8192)
# DecoderLayers are created with `make_layers` which passes the prefix
@@ -955,8 +949,6 @@ class TorchairDeepseekV2DecoderLayer(DeepseekV2DecoderLayer):
q_lora_rank=config.q_lora_rank
if hasattr(config, "q_lora_rank") else None,
kv_lora_rank=config.kv_lora_rank,
rope_theta=rope_theta,
rope_scaling=rope_scaling,
max_position_embeddings=max_position_embeddings,
cache_config=cache_config,
quant_config=quant_config,

View File

@@ -24,7 +24,8 @@ import torch_npu
from torch import nn
from torch.nn import Parameter
from transformers import PretrainedConfig
from vllm.attention import Attention, AttentionMetadata
from vllm.attention.backends.abstract import AttentionMetadata
from vllm.attention.layer import Attention
from vllm.compilation.decorators import support_torch_compile
from vllm.config import CacheConfig, VllmConfig
from vllm.distributed import (divide, get_pp_group,
@@ -539,8 +540,7 @@ class PanguProMoEAttention(nn.Module):
hidden_size: int,
num_heads: int,
num_kv_heads: int,
rope_theta: float = 10000,
rope_scaling: Optional[Dict[str, Any]] = None,
rope_parameters: Dict[str, Any],
max_position_embeddings: int = 8192,
cache_config: Optional[CacheConfig] = None,
quant_config: Optional[QuantizationConfig] = None,
@@ -566,7 +566,6 @@ class PanguProMoEAttention(nn.Module):
self.q_size = self.num_heads * self.head_dim
self.kv_size = self.num_kv_heads * self.head_dim
self.scaling = self.head_dim**-0.5
self.rope_theta = rope_theta
self.max_position_embeddings = max_position_embeddings
self.qkv_proj = QKVParallelLinear(
@@ -600,8 +599,7 @@ class PanguProMoEAttention(nn.Module):
self.head_dim,
rotary_dim=self.head_dim,
max_position=max_position_embeddings,
base=rope_theta,
rope_scaling=rope_scaling,
rope_parameters=rope_parameters,
)
self.attn = Attention(
self.num_heads,
@@ -654,8 +652,6 @@ class PanguProMoEDecoderLayer(nn.Module):
) -> None:
super().__init__()
self.hidden_size = config.hidden_size
rope_theta = getattr(config, "rope_theta", 10000)
rope_scaling = getattr(config, "rope_scaling", None)
max_position_embeddings = getattr(config, "max_position_embeddings",
8192)
@@ -663,8 +659,7 @@ class PanguProMoEDecoderLayer(nn.Module):
hidden_size=self.hidden_size,
num_heads=config.num_attention_heads,
num_kv_heads=config.num_key_value_heads,
rope_theta=rope_theta,
rope_scaling=rope_scaling,
rope_parameters=config.rope_parameters,
max_position_embeddings=max_position_embeddings,
cache_config=cache_config,
quant_config=quant_config,

View File

@@ -993,6 +993,7 @@ class TorchairAscendFusedMoE(FusedMoE):
tp_size=tp_size,
ep_size=ep_size,
dp_size=dp_size,
pcp_size=1,
prefix=prefix,
custom_routing_function=custom_routing_function,
scoring_func=scoring_func,
@@ -1011,6 +1012,8 @@ class TorchairAscendFusedMoE(FusedMoE):
self.moe_parallel_config = FusedMoEParallelConfig.make(
tp_size_=(tp_size if tp_size is not None else
get_tensor_model_parallel_world_size()),
# TODO: support pcp
pcp_size_=1,
dp_size_=(dp_size
if dp_size is not None else get_dp_group().world_size),
vllm_parallel_config=vllm_config.parallel_config)

View File

@@ -170,7 +170,7 @@ class AscendMLATorchairMetadataBuilder:
self.block_size = vllm_config.cache_config.block_size
self.max_blocks = (vllm_config.model_config.max_model_len +
self.block_size - 1) // self.block_size
self.chunked_prefill_enabled = scheduler_config.chunked_prefill_enabled
self.chunked_prefill_enabled = scheduler_config.enable_chunked_prefill
if self.chunked_prefill_enabled:
self.chunked_prefill_workspace_size = min(
# Max sure there is enough for 8 full length request or at least

View File

@@ -1,13 +1,12 @@
import types
import numpy as np
import torch
import torch.nn as nn
import torchair
from torchair import patch_for_hcom
from vllm.config import (CUDAGraphMode, VllmConfig,
get_layers_from_vllm_config, set_current_vllm_config)
from vllm.forward_context import BatchDescriptor, get_forward_context
from vllm.forward_context import get_forward_context
from vllm.model_executor.layers.attention_layer_base import AttentionLayerBase
from vllm.model_executor.model_loader import get_model_loader
from vllm.model_executor.model_loader.utils import \
@@ -149,7 +148,7 @@ class TorchairMtpProposer(MtpProposer):
break
def generate_token_ids(self,
valid_sampled_token_ids: list[np.ndarray],
valid_sampled_token_ids: list[list[int]],
sampling_metadata: SamplingMetadata = None,
scheduler_output: SchedulerOutput = None,
spec_decode_metadata: SpecDecodeMetadata = None,
@@ -162,7 +161,7 @@ class TorchairMtpProposer(MtpProposer):
attn_metadata = attn_metadata['model.layers.0.self_attn.attn']
next_token_ids: list[int] = []
for i, token_ids in enumerate(valid_sampled_token_ids):
if token_ids.shape[0] > 0:
if token_ids:
# Common case.
next_token_id = token_ids[-1]
else:
@@ -173,7 +172,7 @@ class TorchairMtpProposer(MtpProposer):
seq_len = (req_state.num_computed_tokens +
scheduler_output.num_scheduled_tokens[req_id])
next_token_id = req_state.get_token_id(seq_len)
next_token_ids.append(next_token_id.item())
next_token_ids.append(next_token_id)
next_token_ids = torch.tensor(next_token_ids,
dtype=torch.int32,
device=self.device)
@@ -189,7 +188,7 @@ class TorchairMtpProposer(MtpProposer):
# TODO(woosuk): Refactor this.
num_draft_tokens = spec_decode_metadata.num_draft_tokens
num_rejected_tokens = [
n + 1 - valid_sampled_token_ids[i].shape[0] if n > 0 else 0
n + 1 - len(valid_sampled_token_ids[i]) if n > 0 else 0
for i, n in enumerate(num_draft_tokens)
]
num_rejected_tokens = torch.tensor(
@@ -343,12 +342,7 @@ class TorchairMtpProposer(MtpProposer):
# torchair mode can reuse self.runner.num_tokens_across_dp
num_tokens_across_dp = self.runner.num_tokens_across_dp
with_prefill = self.runner.with_prefill
moe_comm_type = self.runner._select_moe_comm_method(num_input_tokens)
batch_descriptor = BatchDescriptor(num_tokens=num_input_tokens,
uniform_decode=False)
aclgraph_runtime_mode, batch_descriptor = \
self.runner.aclgraph_dispatcher.dispatch(batch_descriptor)
for step in range(self.num_speculative_tokens):
with set_ascend_forward_context(
@@ -359,7 +353,6 @@ class TorchairMtpProposer(MtpProposer):
num_tokens_across_dp=num_tokens_across_dp,
reserved_mc2_mask=self.runner.reserved_mc2_mask,
moe_comm_type=moe_comm_type,
aclgraph_runtime_mode=aclgraph_runtime_mode,
in_profile_run=self.runner.in_profile_run,
num_actual_tokens=num_tokens):
with ProfileExecuteDuration().capture_async('mtp_forward'):

View File

@@ -171,7 +171,7 @@ class AscendSFATorchairMetadataBuilder:
self.block_size = vllm_config.cache_config.block_size
self.max_blocks = (vllm_config.model_config.max_model_len +
self.block_size - 1) // self.block_size
self.chunked_prefill_enabled = scheduler_config.chunked_prefill_enabled
self.chunked_prefill_enabled = scheduler_config.enable_chunked_prefill
if self.chunked_prefill_enabled:
self.chunked_prefill_workspace_size = min(
# Max sure there is enough for 8 full length request or at least

View File

@@ -483,6 +483,13 @@ def update_aclgraph_sizes(vllm_config: VllmConfig) -> None:
compilation_config.cudagraph_capture_sizes, None
# Calculate parallel configuration factor
if not vllm_config.model_config:
logger.warning(
"Got empty model config. This typically occurs when an empty vllm_config is "
"initialized (e.g., in unit tests), where config updates are intentionally skipped."
)
return
hf_config = vllm_config.model_config.hf_config
if hasattr(hf_config, 'num_hidden_layers'):
num_hidden_layers = hf_config.num_hidden_layers

View File

@@ -39,9 +39,9 @@ import torch._dynamo.cache_size
import torch.distributed as dist
import torch.nn as nn
from tqdm import tqdm # type: ignore
from vllm.attention import AttentionType, get_attn_backend
from vllm.attention.backends.abstract import AttentionBackend
from vllm.attention.backends.abstract import AttentionBackend, AttentionType
from vllm.attention.layer import Attention, MLAAttention
from vllm.attention.selector import get_attn_backend
from vllm.compilation.counter import compilation_counter
from vllm.compilation.monitor import set_cudagraph_capturing_enabled
from vllm.config import (CompilationMode, CUDAGraphMode, VllmConfig,
@@ -53,7 +53,7 @@ from vllm.distributed.kv_transfer.kv_connector.v1 import KVConnectorBase_V1
from vllm.distributed.parallel_state import (get_dcp_group, get_dp_group,
get_pp_group, get_tp_group,
is_global_first_rank)
from vllm.forward_context import BatchDescriptor, get_forward_context
from vllm.forward_context import get_forward_context
from vllm.logger import logger
from vllm.model_executor.layers.attention_layer_base import AttentionLayerBase
from vllm.model_executor.layers.mamba.abstract import MambaBase
@@ -244,11 +244,9 @@ class AsyncNPUModelRunnerOutput(AsyncModelRunnerOutput):
# Release the device tensor once the copy has completed
del self._sampled_token_ids
valid_sampled_token_ids: list[np.ndarray] = [
row for row in self._sampled_token_ids_cpu.numpy()
]
valid_sampled_token_ids = self._sampled_token_ids_cpu.tolist()
for i in self._invalid_req_indices:
valid_sampled_token_ids[i] = np.array([])
valid_sampled_token_ids[i].clear()
output = self._model_runner_output
output.sampled_token_ids = valid_sampled_token_ids
@@ -332,7 +330,7 @@ class NPUModelRunner(LoRAModelRunnerMixin):
# Ascend-specific configurations
self.ascend_config = get_ascend_config()
if self.ascend_config.ascend_scheduler_config.enabled:
self.chunked_prefill_enabled = self.scheduler_config.chunked_prefill_enabled
self.chunked_prefill_enabled = self.scheduler_config.enable_chunked_prefill
else:
self.chunked_prefill_enabled = True
self.weight_prefetch_method = WeightPrefetchMethod(
@@ -2130,7 +2128,7 @@ class NPUModelRunner(LoRAModelRunnerMixin):
def propose_draft_token_ids(
self,
valid_sampled_token_ids: Union[torch.Tensor, list[np.ndarray]],
valid_sampled_token_ids: torch.Tensor | list[list[int]],
sampling_metadata: SamplingMetadata,
scheduler_output: "SchedulerOutput",
spec_decode_metadata: SpecDecodeMetadata,
@@ -2309,10 +2307,9 @@ class NPUModelRunner(LoRAModelRunnerMixin):
uniform_decode = (max_query_len == self.uniform_decode_query_len) and (
scheduler_output.total_num_scheduled_tokens
== self.input_batch.num_reqs * max_query_len)
batch_descriptor = BatchDescriptor(num_tokens=num_input_tokens,
uniform_decode=uniform_decode)
has_lora = len(self.input_batch.lora_id_to_lora_request) > 0
aclgraph_runtime_mode, batch_descriptor = \
self.aclgraph_dispatcher.dispatch(batch_descriptor)
self.aclgraph_dispatcher.dispatch(num_tokens=num_input_tokens, uniform_decode=uniform_decode, has_lora=has_lora)
# Run forward pass
with ProfileExecuteDuration().capture_async("forward"):
@@ -2510,18 +2507,16 @@ class NPUModelRunner(LoRAModelRunnerMixin):
max_gen_len = sampled_token_ids.shape[-1]
if max_gen_len == 1:
# No spec decode tokens. It's a tensor.
valid_sampled_token_ids: list[np.ndarray] = [
row for row in sampled_token_ids.cpu().numpy()
]
valid_sampled_token_ids = sampled_token_ids.tolist()
else:
# Includes spec decode tokens. It's a numpy array
valid_sampled_token_ids = self.rejection_sampler.parse_output(
valid_sampled_token_ids, _ = self.rejection_sampler.parse_output(
sampled_token_ids,
self.input_batch.vocab_size,
)
# Mask out the sampled tokens that should not be sampled.
for i in discard_sampled_tokens_req_indices:
valid_sampled_token_ids[int(i)] = np.array([])
valid_sampled_token_ids[int(i)].clear()
else:
valid_sampled_token_ids = []
invalid_req_indices = discard_sampled_tokens_req_indices.tolist(
@@ -2547,17 +2542,16 @@ class NPUModelRunner(LoRAModelRunnerMixin):
# the sampled tokens back, because there's no direct communication
# between the first-stage worker and the last-stage worker.
for req_idx in range(num_sampled_tokens):
sampled_ids: np.ndarray | None
if self.use_async_scheduling:
sampled_ids = (np.array([-1]) if req_idx
not in invalid_req_indices_set else None)
sampled_ids = [-1] * 1 if \
req_idx not in invalid_req_indices_set else None
else:
sampled_ids = valid_sampled_token_ids[req_idx]
if sampled_ids is None or sampled_ids.shape[0] == 0:
if not sampled_ids:
continue
start_idx = self.input_batch.num_tokens_no_spec[req_idx]
end_idx = start_idx + sampled_ids.shape[0]
end_idx = start_idx + len(sampled_ids)
assert end_idx <= self.model_config.max_model_len, (
"Sampled token IDs exceed the max model length. "
f"Total number of tokens: {end_idx} > max_model_len: "
@@ -2571,7 +2565,7 @@ class NPUModelRunner(LoRAModelRunnerMixin):
self.input_batch.num_tokens[req_idx] = end_idx
req_id = self.input_batch.req_ids[req_idx]
req_state = self.requests[req_id]
req_state.output_token_ids.extend(sampled_ids.tolist())
req_state.output_token_ids.extend(sampled_ids)
def propose_draft_token_ids(sampled_token_ids):
assert self.spec_decode_common_attn_metadata is not None
@@ -2877,7 +2871,6 @@ class NPUModelRunner(LoRAModelRunnerMixin):
assert aclgraph_runtime_mode is None or aclgraph_runtime_mode in {
CUDAGraphMode.NONE, CUDAGraphMode.PIECEWISE, CUDAGraphMode.FULL
}
# In multi-DP scenarios, there may be situations where all DP groups are executing dummy runs.
# If sequence parallelism is enabled, it is essential to ensure that num_tokens is divisible by tp_size.
if self.use_aclgraph and enable_sp(self.vllm_config):
@@ -2971,19 +2964,18 @@ class NPUModelRunner(LoRAModelRunnerMixin):
k: v[:num_tokens]
for k, v in self.intermediate_tensors.items()
})
has_lora = True if self.lora_config and self.compilation_config.cudagraph_specialize_lora else False
# filter out the valid batch descriptor
_ag_mode, batch_descriptor = \
self.aclgraph_dispatcher.dispatch(
BatchDescriptor(num_tokens=num_tokens,
uniform_decode=uniform_decode))
self.aclgraph_dispatcher.dispatch(num_tokens=num_tokens, uniform_decode=uniform_decode, has_lora=has_lora)
if aclgraph_runtime_mode is not None:
# we allow forcing NONE when the dispatcher disagrees to support
# warm ups for aclgraph capture
assert aclgraph_runtime_mode == CUDAGraphMode.NONE or \
aclgraph_runtime_mode == _ag_mode, (
f"Aclgraph runtime mode mismatch at dummy_run. "
f"Expected {_ag_mode}, but got {aclgraph_runtime_mode}.")
if aclgraph_runtime_mode != CUDAGraphMode.NONE and aclgraph_runtime_mode != _ag_mode:
raise ValueError(
f"Aclgraph runtime mode mismatch at dummy_run. "
f"Expected {_ag_mode}, but got {aclgraph_runtime_mode}."
)
else:
aclgraph_runtime_mode = _ag_mode
@@ -4466,18 +4458,3 @@ class NPUModelRunner(LoRAModelRunnerMixin):
self.input_ids_pcp_full_cpu[:total_num_scheduled_tokens_pcp_full],
non_blocking=True,
)
def _to_list(self, sampled_token_ids: torch.Tensor) -> list[np.ndarray]:
# This is a short term mitigation for issue mentioned in
# https://github.com/vllm-project/vllm/issues/22754.
# `tolist` would trigger a cuda wise stream sync, which
# would block other copy ops from other cuda streams.
# A cuda event sync would avoid such a situation. Since
# this is in the critical path of every single model
# forward loop, this has caused perf issue for a disagg
# setup.
pinned = self.sampled_token_ids_pinned_cpu[:sampled_token_ids.shape[0]]
pinned.copy_(sampled_token_ids, non_blocking=True)
self.transfer_event.record()
self.transfer_event.synchronize()
return [row for row in pinned.numpy()]