Upgrade to new vllm commit (#3719)

### What this PR does / why we need it?
Upgrade to new vllm commit:
c9461e05a4

- Fix many imports, caused by
https://github.com/vllm-project/vllm/pull/26908
- Fix import ```sha256```, caused by
https://github.com/vllm-project/vllm/pull/27169
- Remove ```SchedulerConfig.send_delta_data```, caused by
https://github.com/vllm-project/vllm/pull/27142
- Fix ```FusedMoE``` because of dual stream execution, caused by
https://github.com/vllm-project/vllm/pull/26440

### Does this PR introduce _any_ user-facing change?
N/A

### How was this patch tested?
CI passed with new added/existing test.


- vLLM version: v0.11.0rc3
- vLLM main:
17c540a993

---------

Signed-off-by: MengqingCao <cmq0113@163.com>
Signed-off-by: Icey <1790571317@qq.com>
Co-authored-by: MengqingCao <cmq0113@163.com>
This commit is contained in:
Icey
2025-10-25 15:36:32 +08:00
committed by GitHub
parent 226f832c0b
commit d9cdc65854
37 changed files with 229 additions and 71 deletions

View File

@@ -99,9 +99,6 @@ class AscendSchedulerConfig(SchedulerConfig):
raise NotImplementedError(
f"currently AscendScheduler only supports fcfs policy, got {self.policy}"
)
if self.send_delta_data:
raise NotImplementedError(
"currently AscendScheduler doesn't support send_delta_data.")
if getattr(self, "scheduler_delay_factor", 0) > 0:
raise NotImplementedError(
"currently AscendScheduler doesn't support scheduler_delay_factor."

View File

@@ -9,11 +9,18 @@ import torch
import vllm.envs as envs
import zmq
from vllm.config import KVTransferConfig, VllmConfig
from vllm.utils import get_dtype_size, logger, make_zmq_socket
from vllm.utils import logger
from vllm.v1.kv_cache_interface import AttentionSpec
from vllm_ascend.distributed.cpu_offload_manager.cpu_kv_cache_manager import \
CPUKVCacheManager
from vllm_ascend.utils import vllm_version_is
if vllm_version_is("0.11.0"):
from vllm.utils import get_dtype_size, make_zmq_socket
else:
from vllm.utils.network_utils import make_zmq_socket
from vllm.utils.torch_utils import get_dtype_size
@dataclass

View File

@@ -25,19 +25,25 @@ from vllm.distributed.kv_transfer.kv_connector.v1.base import (
from vllm.distributed.parallel_state import (get_dcp_group, get_tp_group,
get_world_group)
from vllm.forward_context import ForwardContext
from vllm.utils import get_ip, logger
from vllm.utils import logger
from vllm.v1.core.kv_cache_manager import KVCacheBlocks
from vllm.v1.core.sched.output import SchedulerOutput
from vllm.v1.request import Request, RequestStatus
import vllm_ascend.envs as envs_ascend
from vllm_ascend.utils import (AscendSocVersion, get_ascend_soc_version,
prefill_context_parallel_enable)
prefill_context_parallel_enable,
vllm_version_is)
if prefill_context_parallel_enable():
from vllm.distributed.parallel_state import \
get_prefill_context_model_parallel_rank
if vllm_version_is("0.11.0"):
from vllm.utils import get_ip
else:
from vllm.utils.network_utils import get_ip
TORCH_DTYPE_TO_NPU_DTYPE = {
torch.half: llm_datadist.DataType.DT_FLOAT16,
torch.float16: llm_datadist.DataType.DT_FLOAT16,

View File

@@ -7,7 +7,7 @@ from typing import Generator, List, Optional, Union
# Third Party
import torch
from vllm.config import VllmConfig
from vllm.utils import get_kv_cache_torch_dtype, logger
from vllm.utils import logger
from vllm_ascend.distributed.mooncake.config_data import (
ChunkedTokenDatabase, LasyerMultiBlockReqMeta, MooncakeConnectorMetadata,
@@ -16,6 +16,12 @@ from vllm_ascend.distributed.mooncake.kv_transfer import (
KVCacheStoreLayerRecvingThread, KVCacheStoreLayerSendingThread,
KVCacheStoreRecvingThread, KVCacheStoreSendingThread, KVTransferThread)
from vllm_ascend.distributed.mooncake.mooncake_store import Mooncakestore
from vllm_ascend.utils import vllm_version_is
if vllm_version_is("0.11.0"):
from vllm.utils import get_kv_cache_torch_dtype
else:
from vllm.utils.torch_utils import get_kv_cache_torch_dtype
class MooncakeEngine:

View File

@@ -26,13 +26,19 @@ from vllm.distributed.kv_transfer.kv_connector.v1.base import (
KVConnectorBase_V1, KVConnectorMetadata, KVConnectorRole)
from vllm.distributed.parallel_state import (get_tensor_model_parallel_rank,
get_tp_group)
from vllm.utils import get_ip, logger, make_zmq_path, make_zmq_socket
from vllm.utils import logger
from vllm.v1.core.sched.output import SchedulerOutput
from vllm.v1.request import RequestStatus
import vllm_ascend.envs as envs_ascend
from vllm_ascend.ascend_config import get_ascend_config, init_ascend_config
from vllm_ascend.distributed.mooncake.transfer_engine import get_global_te
from vllm_ascend.utils import vllm_version_is
if vllm_version_is("0.11.0"):
from vllm.utils import get_ip, make_zmq_path, make_zmq_socket
else:
from vllm.utils.network_utils import get_ip, make_zmq_path, make_zmq_socket
if TYPE_CHECKING:
from vllm.attention.backends.abstract import AttentionMetadata

View File

@@ -26,7 +26,7 @@ from vllm.distributed.kv_transfer.kv_connector.v1.base import (
KVConnectorBase_V1, KVConnectorMetadata, KVConnectorRole)
from vllm.distributed.parallel_state import (get_tensor_model_parallel_rank,
get_tp_group, get_world_group)
from vllm.utils import get_ip, logger, make_zmq_path, make_zmq_socket
from vllm.utils import logger
from vllm.v1.core.sched.output import SchedulerOutput
from vllm.v1.request import RequestStatus
@@ -34,6 +34,12 @@ import vllm_ascend.envs as envs_ascend
from vllm_ascend.ascend_config import get_ascend_config
from vllm_ascend.distributed.utils import (align_memory,
kv_alltoall_and_rearrange)
from vllm_ascend.utils import vllm_version_is
if vllm_version_is("0.11.0"):
from vllm.utils import get_ip, make_zmq_path, make_zmq_socket
else:
from vllm.utils.network_utils import get_ip, make_zmq_path, make_zmq_socket
if TYPE_CHECKING:
from vllm.attention.backends.abstract import AttentionMetadata

View File

@@ -28,12 +28,19 @@ from vllm.model_executor.model_loader import register_model_loader
from vllm.model_executor.model_loader.base_loader import BaseModelLoader
from vllm.model_executor.model_loader.default_loader import DefaultModelLoader
from vllm.model_executor.model_loader.utils import (
initialize_model, process_weights_after_loading, set_default_torch_dtype)
initialize_model, process_weights_after_loading)
from vllm_ascend.utils import vllm_version_is
from .interaction.elastic import ElasticServer
from .load import elastic_load
from .utils import find_free_port, is_valid_path_prefix
if vllm_version_is("0.11.0"):
from vllm.model_executor.model_loader.utils import set_default_torch_dtype
else:
from vllm.utils.torch_utils import set_default_torch_dtype
@register_model_loader("netloader")
class ModelNetLoaderElastic(BaseModelLoader):
@@ -200,7 +207,10 @@ class ModelNetLoaderElastic(BaseModelLoader):
if model is not None and (
(self.listen_port and self.listen_port in range(1024, 65535)) or
(self.listen_port is None)):
from vllm.utils import get_ip
if vllm_version_is("0.11.0"):
from vllm.utils import get_ip
else:
from vllm.utils.network_utils import get_ip
driver_ip = get_ip()
if driver_ip == '0.0.0.0':

View File

@@ -29,7 +29,6 @@ from vllm.distributed import get_tensor_model_parallel_world_size
from vllm.forward_context import ForwardContext, get_forward_context
from vllm.model_executor.layers.mla import MLAModules
from vllm.model_executor.layers.quantization import QuantizationConfig
from vllm.utils import direct_register_custom_op
from vllm_ascend.ascend_config import get_ascend_config
from vllm_ascend.utils import vllm_version_is
@@ -38,9 +37,11 @@ if vllm_version_is("0.11.0"):
from vllm.attention import Attention
from vllm.model_executor.layers.mla import \
MultiHeadLatentAttention as MultiHeadLatentAttentionWrapper
from vllm.utils import direct_register_custom_op
else:
from vllm.attention.layer import MLAAttention
from vllm.model_executor.layers.mla import MultiHeadLatentAttentionWrapper
from vllm.utils.torch_utils import direct_register_custom_op
if vllm_version_is("0.11.0"):
from vllm.attention import Attention

View File

@@ -31,7 +31,6 @@ from vllm.forward_context import ForwardContext, get_forward_context
from vllm.model_executor.layers.linear import ReplicatedLinear
from vllm.model_executor.layers.mla import MLAModules
from vllm.model_executor.layers.quantization import QuantizationConfig
from vllm.utils import direct_register_custom_op
from vllm_ascend.ascend_config import get_ascend_config
from vllm_ascend.utils import vllm_version_is
@@ -40,9 +39,11 @@ if vllm_version_is("0.11.0"):
from vllm.attention import Attention
from vllm.model_executor.layers.mla import \
MultiHeadLatentAttention as MultiHeadLatentAttentionWrapper
from vllm.utils import direct_register_custom_op
else:
from vllm.attention.layer import MLAAttention
from vllm.model_executor.layers.mla import MultiHeadLatentAttentionWrapper
from vllm.utils.torch_utils import direct_register_custom_op
@dataclass

View File

@@ -435,10 +435,12 @@ class AscendSharedFusedMoE(SharedFusedMoE, AscendFusedMoE):
def __init__(
self,
shared_experts: torch.nn.Module,
gate: Optional[torch.nn.Module] = None,
use_overlapped: bool = True,
**kwargs,
):
AscendFusedMoE.__init__(self, **kwargs)
self._shared_experts = shared_experts
self.use_overlapped = use_overlapped
self.shared_expert_stream = None
@@ -449,6 +451,16 @@ class AscendSharedFusedMoE(SharedFusedMoE, AscendFusedMoE):
"Sequence parallelism is enabled, shared experts are replicated for best performance."
)
self._gate = gate
@property
def gate(self) -> Optional[torch.nn.Module]:
return self._gate if self.use_overlapped else None
@property
def is_internal_router(self) -> bool:
return False
def forward(
self,
hidden_states: torch.Tensor,

View File

@@ -7,12 +7,17 @@ from vllm.distributed import (get_dp_group, get_ep_group,
tensor_model_parallel_all_reduce,
tensor_model_parallel_reduce_scatter)
from vllm.forward_context import get_forward_context
from vllm.utils import direct_register_custom_op
import vllm_ascend.envs as envs_ascend
from vllm_ascend.ascend_forward_context import MoECommType
from vllm_ascend.ops.weight_prefetch import maybe_npu_prefetch
from vllm_ascend.utils import npu_stream_switch, prefetch_stream
from vllm_ascend.utils import (npu_stream_switch, prefetch_stream,
vllm_version_is)
if vllm_version_is("0.11.0"):
from vllm.utils import direct_register_custom_op
else:
from vllm.utils.torch_utils import direct_register_custom_op
def _maybe_all_gather_and_maybe_unpad_impl(

View File

@@ -3,9 +3,16 @@ import vllm.model_executor.models.config
from vllm.logger import init_logger
from vllm.model_executor.models import ModelRegistry
from vllm.model_executor.models.config import MambaModelConfig
from vllm.utils import STR_DTYPE_TO_TORCH_DTYPE, cdiv
from vllm.utils import cdiv
from vllm.v1.kv_cache_interface import FullAttentionSpec, MambaSpec
from vllm_ascend.utils import vllm_version_is
if vllm_version_is("0.11.0"):
from vllm.utils import STR_DTYPE_TO_TORCH_DTYPE
else:
from vllm.utils.torch_utils import STR_DTYPE_TO_TORCH_DTYPE
@classmethod
def verify_and_update_config(cls, vllm_config) -> None:

View File

@@ -8,13 +8,21 @@ import vllm.v1.executor.multiproc_executor
from vllm import envs
from vllm.config import VllmConfig
from vllm.distributed.device_communicators.shm_broadcast import MessageQueue
from vllm.utils import (get_distributed_init_method, get_loopback_ip,
get_mp_context, get_open_port)
from vllm.utils import get_mp_context
from vllm.v1.executor.abstract import FailureCallback
from vllm.v1.executor.multiproc_executor import (
MultiprocExecutor, UnreadyWorkerProcHandle, WorkerProc,
set_multiprocessing_worker_envs)
from vllm_ascend.utils import vllm_version_is
if vllm_version_is("0.11.0"):
from vllm.utils import (get_distributed_init_method, get_loopback_ip,
get_open_port)
else:
from vllm.utils.network_utils import (get_distributed_init_method,
get_loopback_ip, get_open_port)
class AscendMultiprocExecutor(MultiprocExecutor):
supports_pp: bool = True

View File

@@ -3,7 +3,13 @@ from torch.nn.parameter import Parameter
from vllm.logger import init_logger
from vllm.model_executor.layers.linear import UnquantizedLinearMethod
from vllm.model_executor.utils import set_weight_attrs
from vllm.utils import GiB_bytes
from vllm_ascend.utils import vllm_version_is
if vllm_version_is("0.11.0"):
from vllm.utils import GiB_bytes
else:
from vllm.utils.mem_constants import GiB_bytes
logger = init_logger(__name__)

View File

@@ -24,6 +24,9 @@ import vllm.envs as envs_vllm
from vllm.logger import logger
from vllm.platforms import Platform, PlatformEnum
# todo: please remove it when solve cuda hard code in vllm
os.environ["VLLM_DISABLE_SHARED_EXPERTS_STREAM"] = "True"
from vllm_ascend.ascend_config import (check_ascend_config, get_ascend_config,
init_ascend_config)
from vllm_ascend.torchair.utils import (check_torchair_cache_exist,
@@ -142,7 +145,6 @@ class NPUPlatform(Platform):
if not model_config.is_multimodal_model and \
structured_outputs_config.backend == "auto" and \
not getattr(scheduler_config, "scheduler_delay_factor", 0) > 0 and \
not scheduler_config.send_delta_data and \
scheduler_config.policy == "fcfs":
ascend_scheduler_config.enabled = True
chunked_prefill_enabled_in_ascend_scheduler = getattr(

View File

@@ -9,8 +9,8 @@ from vllm.config import (CUDAGraphMode, VllmConfig,
from vllm.forward_context import BatchDescriptor, get_forward_context
from vllm.model_executor.layers.attention_layer_base import AttentionLayerBase
from vllm.model_executor.model_loader import get_model_loader
from vllm.model_executor.model_loader.utils import (
process_weights_after_loading, set_default_torch_dtype)
from vllm.model_executor.model_loader.utils import \
process_weights_after_loading
from vllm.model_executor.models.deepseek_mtp import DeepSeekMTP
from vllm.v1.core.sched.output import SchedulerOutput
from vllm.v1.sample.metadata import SamplingMetadata
@@ -24,7 +24,13 @@ from vllm_ascend.torchair.models.torchair_deepseek_mtp import \
TorchairDeepSeekMTP
from vllm_ascend.torchair.utils import (TORCHAIR_CACHE_DIR,
TorchairCommonAttentionMetadata)
from vllm_ascend.utils import ProfileExecuteDuration, lmhead_tp_enable
from vllm_ascend.utils import (ProfileExecuteDuration, lmhead_tp_enable,
vllm_version_is)
if vllm_version_is("0.11.0"):
from vllm.model_executor.model_loader.utils import set_default_torch_dtype
else:
from vllm.utils.torch_utils import set_default_torch_dtype
PADDING_SLOT_ID = -1

View File

@@ -72,8 +72,7 @@ from vllm.pooling_params import PoolingParams
from vllm.sampling_params import SamplingType
from vllm.sequence import IntermediateTensors
from vllm.tasks import GenerationTask, PoolingTask, SupportedTask
from vllm.utils import (STR_DTYPE_TO_TORCH_DTYPE, DeviceMemoryProfiler, cdiv,
get_dtype_size, is_pin_memory_available)
from vllm.utils import cdiv, is_pin_memory_available
from vllm.utils.jsontree import json_map_leaves
from vllm.v1.attention.backends.gdn_attn import GDNAttentionMetadataBuilder
from vllm.v1.attention.backends.utils import (
@@ -145,6 +144,13 @@ if prefill_context_parallel_enable():
get_prefill_context_model_parallel_rank,
get_prefill_context_model_parallel_world_size)
if vllm_version_is("0.11.0"):
from vllm.utils import (STR_DTYPE_TO_TORCH_DTYPE, DeviceMemoryProfiler,
get_dtype_size)
else:
from vllm.utils.mem_utils import DeviceMemoryProfiler
from vllm.utils.torch_utils import STR_DTYPE_TO_TORCH_DTYPE, get_dtype_size
# yapf: enable
if vllm_version_is("0.11.0"):

View File

@@ -44,7 +44,7 @@ from vllm_ascend.worker.block_table import MultiGroupBlockTable
if vllm_version_is("0.11.0"):
from vllm.utils import swap_dict_values
else:
from vllm.utils.collections import swap_dict_values
from vllm.utils.collection_utils import swap_dict_values
@dataclass

View File

@@ -35,7 +35,6 @@ from vllm.logger import logger
from vllm.lora.request import LoRARequest
from vllm.sequence import IntermediateTensors
from vllm.tasks import SupportedTask
from vllm.utils import STR_DTYPE_TO_TORCH_DTYPE, GiB_bytes
from vllm.v1.core.sched.output import SchedulerOutput
from vllm.v1.kv_cache_interface import KVCacheConfig, KVCacheSpec
from vllm.v1.outputs import (EMPTY_MODEL_RUNNER_OUTPUT, AsyncModelRunnerOutput,
@@ -51,7 +50,7 @@ from vllm_ascend.platform import NPUPlatform
from vllm_ascend.utils import (init_ascend_soc_version,
prefill_context_parallel_enable,
register_ascend_customop, sleep_mode_enabled,
try_register_lib)
try_register_lib, vllm_version_is)
from vllm_ascend.worker.model_runner_v1 import NPUModelRunner
torch._dynamo.trace_rules.clear_lru_cache() # noqa: E402
@@ -66,6 +65,12 @@ torch_non_c_binding_in_graph_functions_npu[
torch._dynamo.trace_rules.torch_name_rule_map.append(
torch_non_c_binding_in_graph_functions_npu) # noqa: E402
if vllm_version_is("0.11.0"):
from vllm.utils import STR_DTYPE_TO_TORCH_DTYPE, GiB_bytes
else:
from vllm.utils.mem_constants import GiB_bytes
from vllm.utils.torch_utils import STR_DTYPE_TO_TORCH_DTYPE
class NPUWorker(WorkerBase):