Upgrade to new vllm commit (#3719)
### What this PR does / why we need it?
Upgrade to new vllm commit: c9461e05a4

- Fix many imports, caused by https://github.com/vllm-project/vllm/pull/26908
- Fix import ```sha256```, caused by https://github.com/vllm-project/vllm/pull/27169
- Remove ```SchedulerConfig.send_delta_data```, caused by https://github.com/vllm-project/vllm/pull/27142
- Fix ```FusedMoE``` because of dual stream execution, caused by https://github.com/vllm-project/vllm/pull/26440

### Does this PR introduce _any_ user-facing change?
N/A

### How was this patch tested?
CI passed with new added/existing test.

- vLLM version: v0.11.0rc3
- vLLM main: 17c540a993

---------

Signed-off-by: MengqingCao <cmq0113@163.com>
Signed-off-by: Icey <1790571317@qq.com>
Co-authored-by: MengqingCao <cmq0113@163.com>
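Most of the import fixes follow one compatibility pattern: keep the old `vllm.utils` imports when running against vLLM v0.11.0, and pull the relocated helpers from the new topic-specific modules on newer vLLM main. A minimal illustrative sketch of that pattern (the concrete symbols gated this way vary per file, as the diff below shows):

```python
# Illustrative sketch of the version-gated import shim applied across vllm-ascend
# in this PR. vllm_version_is() checks the installed vLLM release, so the same
# code works both on v0.11.0 and on newer vLLM main, where the monolithic
# vllm.utils module was split into topic-specific submodules (vllm#26908).
from vllm_ascend.utils import vllm_version_is

if vllm_version_is("0.11.0"):
    # Old layout: helpers live directly under vllm.utils.
    from vllm.utils import direct_register_custom_op, get_ip
else:
    # New layout: helpers are grouped by topic.
    from vllm.utils.network_utils import get_ip
    from vllm.utils.torch_utils import direct_register_custom_op
```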
@@ -99,9 +99,6 @@ class AscendSchedulerConfig(SchedulerConfig):
             raise NotImplementedError(
                 f"currently AscendScheduler only supports fcfs policy, got {self.policy}"
             )
-        if self.send_delta_data:
-            raise NotImplementedError(
-                "currently AscendScheduler doesn't support send_delta_data.")
         if getattr(self, "scheduler_delay_factor", 0) > 0:
             raise NotImplementedError(
                 "currently AscendScheduler doesn't support scheduler_delay_factor."
@@ -9,11 +9,18 @@ import torch
 import vllm.envs as envs
 import zmq
 from vllm.config import KVTransferConfig, VllmConfig
-from vllm.utils import get_dtype_size, logger, make_zmq_socket
+from vllm.utils import logger
 from vllm.v1.kv_cache_interface import AttentionSpec
 
 from vllm_ascend.distributed.cpu_offload_manager.cpu_kv_cache_manager import \
     CPUKVCacheManager
+from vllm_ascend.utils import vllm_version_is
+
+if vllm_version_is("0.11.0"):
+    from vllm.utils import get_dtype_size, make_zmq_socket
+else:
+    from vllm.utils.network_utils import make_zmq_socket
+    from vllm.utils.torch_utils import get_dtype_size
 
 
 @dataclass
@@ -25,19 +25,25 @@ from vllm.distributed.kv_transfer.kv_connector.v1.base import (
 from vllm.distributed.parallel_state import (get_dcp_group, get_tp_group,
                                              get_world_group)
 from vllm.forward_context import ForwardContext
-from vllm.utils import get_ip, logger
+from vllm.utils import logger
 from vllm.v1.core.kv_cache_manager import KVCacheBlocks
 from vllm.v1.core.sched.output import SchedulerOutput
 from vllm.v1.request import Request, RequestStatus
 
 import vllm_ascend.envs as envs_ascend
 from vllm_ascend.utils import (AscendSocVersion, get_ascend_soc_version,
-                               prefill_context_parallel_enable)
+                               prefill_context_parallel_enable,
+                               vllm_version_is)
 
 if prefill_context_parallel_enable():
     from vllm.distributed.parallel_state import \
         get_prefill_context_model_parallel_rank
+
+if vllm_version_is("0.11.0"):
+    from vllm.utils import get_ip
+else:
+    from vllm.utils.network_utils import get_ip
 
 TORCH_DTYPE_TO_NPU_DTYPE = {
     torch.half: llm_datadist.DataType.DT_FLOAT16,
     torch.float16: llm_datadist.DataType.DT_FLOAT16,
@@ -7,7 +7,7 @@ from typing import Generator, List, Optional, Union
 # Third Party
 import torch
 from vllm.config import VllmConfig
-from vllm.utils import get_kv_cache_torch_dtype, logger
+from vllm.utils import logger
 
 from vllm_ascend.distributed.mooncake.config_data import (
     ChunkedTokenDatabase, LasyerMultiBlockReqMeta, MooncakeConnectorMetadata,
@@ -16,6 +16,12 @@ from vllm_ascend.distributed.mooncake.kv_transfer import (
     KVCacheStoreLayerRecvingThread, KVCacheStoreLayerSendingThread,
     KVCacheStoreRecvingThread, KVCacheStoreSendingThread, KVTransferThread)
 from vllm_ascend.distributed.mooncake.mooncake_store import Mooncakestore
+from vllm_ascend.utils import vllm_version_is
+
+if vllm_version_is("0.11.0"):
+    from vllm.utils import get_kv_cache_torch_dtype
+else:
+    from vllm.utils.torch_utils import get_kv_cache_torch_dtype
 
 
 class MooncakeEngine:
@@ -26,13 +26,19 @@ from vllm.distributed.kv_transfer.kv_connector.v1.base import (
     KVConnectorBase_V1, KVConnectorMetadata, KVConnectorRole)
 from vllm.distributed.parallel_state import (get_tensor_model_parallel_rank,
                                              get_tp_group)
-from vllm.utils import get_ip, logger, make_zmq_path, make_zmq_socket
+from vllm.utils import logger
 from vllm.v1.core.sched.output import SchedulerOutput
 from vllm.v1.request import RequestStatus
 
 import vllm_ascend.envs as envs_ascend
 from vllm_ascend.ascend_config import get_ascend_config, init_ascend_config
 from vllm_ascend.distributed.mooncake.transfer_engine import get_global_te
+from vllm_ascend.utils import vllm_version_is
+
+if vllm_version_is("0.11.0"):
+    from vllm.utils import get_ip, make_zmq_path, make_zmq_socket
+else:
+    from vllm.utils.network_utils import get_ip, make_zmq_path, make_zmq_socket
 
 if TYPE_CHECKING:
     from vllm.attention.backends.abstract import AttentionMetadata
@@ -26,7 +26,7 @@ from vllm.distributed.kv_transfer.kv_connector.v1.base import (
     KVConnectorBase_V1, KVConnectorMetadata, KVConnectorRole)
 from vllm.distributed.parallel_state import (get_tensor_model_parallel_rank,
                                              get_tp_group, get_world_group)
-from vllm.utils import get_ip, logger, make_zmq_path, make_zmq_socket
+from vllm.utils import logger
 from vllm.v1.core.sched.output import SchedulerOutput
 from vllm.v1.request import RequestStatus
 
@@ -34,6 +34,12 @@ import vllm_ascend.envs as envs_ascend
 from vllm_ascend.ascend_config import get_ascend_config
 from vllm_ascend.distributed.utils import (align_memory,
                                            kv_alltoall_and_rearrange)
+from vllm_ascend.utils import vllm_version_is
+
+if vllm_version_is("0.11.0"):
+    from vllm.utils import get_ip, make_zmq_path, make_zmq_socket
+else:
+    from vllm.utils.network_utils import get_ip, make_zmq_path, make_zmq_socket
 
 if TYPE_CHECKING:
     from vllm.attention.backends.abstract import AttentionMetadata
@@ -28,12 +28,19 @@ from vllm.model_executor.model_loader import register_model_loader
 from vllm.model_executor.model_loader.base_loader import BaseModelLoader
 from vllm.model_executor.model_loader.default_loader import DefaultModelLoader
 from vllm.model_executor.model_loader.utils import (
-    initialize_model, process_weights_after_loading, set_default_torch_dtype)
+    initialize_model, process_weights_after_loading)
 
+from vllm_ascend.utils import vllm_version_is
+
 from .interaction.elastic import ElasticServer
 from .load import elastic_load
 from .utils import find_free_port, is_valid_path_prefix
 
+if vllm_version_is("0.11.0"):
+    from vllm.model_executor.model_loader.utils import set_default_torch_dtype
+else:
+    from vllm.utils.torch_utils import set_default_torch_dtype
+
 
 @register_model_loader("netloader")
 class ModelNetLoaderElastic(BaseModelLoader):
@@ -200,7 +207,10 @@ class ModelNetLoaderElastic(BaseModelLoader):
         if model is not None and (
             (self.listen_port and self.listen_port in range(1024, 65535)) or
             (self.listen_port is None)):
-            from vllm.utils import get_ip
+            if vllm_version_is("0.11.0"):
+                from vllm.utils import get_ip
+            else:
+                from vllm.utils.network_utils import get_ip
             driver_ip = get_ip()
 
             if driver_ip == '0.0.0.0':
@@ -29,7 +29,6 @@ from vllm.distributed import get_tensor_model_parallel_world_size
 from vllm.forward_context import ForwardContext, get_forward_context
 from vllm.model_executor.layers.mla import MLAModules
 from vllm.model_executor.layers.quantization import QuantizationConfig
-from vllm.utils import direct_register_custom_op
 
 from vllm_ascend.ascend_config import get_ascend_config
 from vllm_ascend.utils import vllm_version_is
@@ -38,9 +37,11 @@ if vllm_version_is("0.11.0"):
     from vllm.attention import Attention
     from vllm.model_executor.layers.mla import \
         MultiHeadLatentAttention as MultiHeadLatentAttentionWrapper
+    from vllm.utils import direct_register_custom_op
 else:
     from vllm.attention.layer import MLAAttention
     from vllm.model_executor.layers.mla import MultiHeadLatentAttentionWrapper
+    from vllm.utils.torch_utils import direct_register_custom_op
 
 if vllm_version_is("0.11.0"):
     from vllm.attention import Attention
@@ -31,7 +31,6 @@ from vllm.forward_context import ForwardContext, get_forward_context
 from vllm.model_executor.layers.linear import ReplicatedLinear
 from vllm.model_executor.layers.mla import MLAModules
 from vllm.model_executor.layers.quantization import QuantizationConfig
-from vllm.utils import direct_register_custom_op
 
 from vllm_ascend.ascend_config import get_ascend_config
 from vllm_ascend.utils import vllm_version_is
@@ -40,9 +39,11 @@ if vllm_version_is("0.11.0"):
     from vllm.attention import Attention
     from vllm.model_executor.layers.mla import \
         MultiHeadLatentAttention as MultiHeadLatentAttentionWrapper
+    from vllm.utils import direct_register_custom_op
 else:
     from vllm.attention.layer import MLAAttention
     from vllm.model_executor.layers.mla import MultiHeadLatentAttentionWrapper
+    from vllm.utils.torch_utils import direct_register_custom_op
 
 
 @dataclass
@@ -435,10 +435,12 @@ class AscendSharedFusedMoE(SharedFusedMoE, AscendFusedMoE):
     def __init__(
         self,
         shared_experts: torch.nn.Module,
+        gate: Optional[torch.nn.Module] = None,
         use_overlapped: bool = True,
         **kwargs,
     ):
         AscendFusedMoE.__init__(self, **kwargs)
 
         self._shared_experts = shared_experts
         self.use_overlapped = use_overlapped
+        self.shared_expert_stream = None
@@ -449,6 +451,16 @@ class AscendSharedFusedMoE(SharedFusedMoE, AscendFusedMoE):
                 "Sequence parallelism is enabled, shared experts are replicated for best performance."
             )
 
+        self._gate = gate
+
+    @property
+    def gate(self) -> Optional[torch.nn.Module]:
+        return self._gate if self.use_overlapped else None
+
+    @property
+    def is_internal_router(self) -> bool:
+        return False
+
     def forward(
         self,
         hidden_states: torch.Tensor,
@@ -7,12 +7,17 @@ from vllm.distributed import (get_dp_group, get_ep_group,
                               tensor_model_parallel_all_reduce,
                               tensor_model_parallel_reduce_scatter)
 from vllm.forward_context import get_forward_context
-from vllm.utils import direct_register_custom_op
 
 import vllm_ascend.envs as envs_ascend
 from vllm_ascend.ascend_forward_context import MoECommType
 from vllm_ascend.ops.weight_prefetch import maybe_npu_prefetch
-from vllm_ascend.utils import npu_stream_switch, prefetch_stream
+from vllm_ascend.utils import (npu_stream_switch, prefetch_stream,
+                               vllm_version_is)
+
+if vllm_version_is("0.11.0"):
+    from vllm.utils import direct_register_custom_op
+else:
+    from vllm.utils.torch_utils import direct_register_custom_op
 
 
 def _maybe_all_gather_and_maybe_unpad_impl(
@@ -3,9 +3,16 @@ import vllm.model_executor.models.config
 from vllm.logger import init_logger
 from vllm.model_executor.models import ModelRegistry
 from vllm.model_executor.models.config import MambaModelConfig
-from vllm.utils import STR_DTYPE_TO_TORCH_DTYPE, cdiv
+from vllm.utils import cdiv
 from vllm.v1.kv_cache_interface import FullAttentionSpec, MambaSpec
 
+from vllm_ascend.utils import vllm_version_is
+
+if vllm_version_is("0.11.0"):
+    from vllm.utils import STR_DTYPE_TO_TORCH_DTYPE
+else:
+    from vllm.utils.torch_utils import STR_DTYPE_TO_TORCH_DTYPE
+
 
 @classmethod
 def verify_and_update_config(cls, vllm_config) -> None:
@@ -8,13 +8,21 @@ import vllm.v1.executor.multiproc_executor
 from vllm import envs
 from vllm.config import VllmConfig
 from vllm.distributed.device_communicators.shm_broadcast import MessageQueue
-from vllm.utils import (get_distributed_init_method, get_loopback_ip,
-                        get_mp_context, get_open_port)
+from vllm.utils import get_mp_context
 from vllm.v1.executor.abstract import FailureCallback
 from vllm.v1.executor.multiproc_executor import (
     MultiprocExecutor, UnreadyWorkerProcHandle, WorkerProc,
     set_multiprocessing_worker_envs)
 
+from vllm_ascend.utils import vllm_version_is
+
+if vllm_version_is("0.11.0"):
+    from vllm.utils import (get_distributed_init_method, get_loopback_ip,
+                            get_open_port)
+else:
+    from vllm.utils.network_utils import (get_distributed_init_method,
+                                          get_loopback_ip, get_open_port)
+
 
 class AscendMultiprocExecutor(MultiprocExecutor):
     supports_pp: bool = True
@@ -3,7 +3,13 @@ from torch.nn.parameter import Parameter
 from vllm.logger import init_logger
 from vllm.model_executor.layers.linear import UnquantizedLinearMethod
 from vllm.model_executor.utils import set_weight_attrs
-from vllm.utils import GiB_bytes
 
+from vllm_ascend.utils import vllm_version_is
+
+if vllm_version_is("0.11.0"):
+    from vllm.utils import GiB_bytes
+else:
+    from vllm.utils.mem_constants import GiB_bytes
 
 logger = init_logger(__name__)
@@ -24,6 +24,9 @@ import vllm.envs as envs_vllm
 from vllm.logger import logger
 from vllm.platforms import Platform, PlatformEnum
 
+# todo: please remove it when solve cuda hard code in vllm
+os.environ["VLLM_DISABLE_SHARED_EXPERTS_STREAM"] = "True"
+
 from vllm_ascend.ascend_config import (check_ascend_config, get_ascend_config,
                                        init_ascend_config)
 from vllm_ascend.torchair.utils import (check_torchair_cache_exist,
@@ -142,7 +145,6 @@ class NPUPlatform(Platform):
             if not model_config.is_multimodal_model and \
                 structured_outputs_config.backend == "auto" and \
                 not getattr(scheduler_config, "scheduler_delay_factor", 0) > 0 and \
-                not scheduler_config.send_delta_data and \
                 scheduler_config.policy == "fcfs":
                 ascend_scheduler_config.enabled = True
                 chunked_prefill_enabled_in_ascend_scheduler = getattr(
@@ -9,8 +9,8 @@ from vllm.config import (CUDAGraphMode, VllmConfig,
 from vllm.forward_context import BatchDescriptor, get_forward_context
 from vllm.model_executor.layers.attention_layer_base import AttentionLayerBase
 from vllm.model_executor.model_loader import get_model_loader
-from vllm.model_executor.model_loader.utils import (
-    process_weights_after_loading, set_default_torch_dtype)
+from vllm.model_executor.model_loader.utils import \
+    process_weights_after_loading
 from vllm.model_executor.models.deepseek_mtp import DeepSeekMTP
 from vllm.v1.core.sched.output import SchedulerOutput
 from vllm.v1.sample.metadata import SamplingMetadata
@@ -24,7 +24,13 @@ from vllm_ascend.torchair.models.torchair_deepseek_mtp import \
     TorchairDeepSeekMTP
 from vllm_ascend.torchair.utils import (TORCHAIR_CACHE_DIR,
                                         TorchairCommonAttentionMetadata)
-from vllm_ascend.utils import ProfileExecuteDuration, lmhead_tp_enable
+from vllm_ascend.utils import (ProfileExecuteDuration, lmhead_tp_enable,
+                               vllm_version_is)
+
+if vllm_version_is("0.11.0"):
+    from vllm.model_executor.model_loader.utils import set_default_torch_dtype
+else:
+    from vllm.utils.torch_utils import set_default_torch_dtype
 
 PADDING_SLOT_ID = -1
@@ -72,8 +72,7 @@ from vllm.pooling_params import PoolingParams
 from vllm.sampling_params import SamplingType
 from vllm.sequence import IntermediateTensors
 from vllm.tasks import GenerationTask, PoolingTask, SupportedTask
-from vllm.utils import (STR_DTYPE_TO_TORCH_DTYPE, DeviceMemoryProfiler, cdiv,
-                        get_dtype_size, is_pin_memory_available)
+from vllm.utils import cdiv, is_pin_memory_available
 from vllm.utils.jsontree import json_map_leaves
 from vllm.v1.attention.backends.gdn_attn import GDNAttentionMetadataBuilder
 from vllm.v1.attention.backends.utils import (
@@ -145,6 +144,13 @@ if prefill_context_parallel_enable():
         get_prefill_context_model_parallel_rank,
         get_prefill_context_model_parallel_world_size)
 
+if vllm_version_is("0.11.0"):
+    from vllm.utils import (STR_DTYPE_TO_TORCH_DTYPE, DeviceMemoryProfiler,
+                            get_dtype_size)
+else:
+    from vllm.utils.mem_utils import DeviceMemoryProfiler
+    from vllm.utils.torch_utils import STR_DTYPE_TO_TORCH_DTYPE, get_dtype_size
+
 # yapf: enable
 
 if vllm_version_is("0.11.0"):
@@ -44,7 +44,7 @@ from vllm_ascend.worker.block_table import MultiGroupBlockTable
 if vllm_version_is("0.11.0"):
     from vllm.utils import swap_dict_values
 else:
-    from vllm.utils.collections import swap_dict_values
+    from vllm.utils.collection_utils import swap_dict_values
 
 
 @dataclass
@@ -35,7 +35,6 @@ from vllm.logger import logger
 from vllm.lora.request import LoRARequest
 from vllm.sequence import IntermediateTensors
 from vllm.tasks import SupportedTask
-from vllm.utils import STR_DTYPE_TO_TORCH_DTYPE, GiB_bytes
 from vllm.v1.core.sched.output import SchedulerOutput
 from vllm.v1.kv_cache_interface import KVCacheConfig, KVCacheSpec
 from vllm.v1.outputs import (EMPTY_MODEL_RUNNER_OUTPUT, AsyncModelRunnerOutput,
@@ -51,7 +50,7 @@ from vllm_ascend.platform import NPUPlatform
 from vllm_ascend.utils import (init_ascend_soc_version,
                                prefill_context_parallel_enable,
                                register_ascend_customop, sleep_mode_enabled,
-                               try_register_lib)
+                               try_register_lib, vllm_version_is)
 from vllm_ascend.worker.model_runner_v1 import NPUModelRunner
 
 torch._dynamo.trace_rules.clear_lru_cache()  # noqa: E402
@@ -66,6 +65,12 @@ torch_non_c_binding_in_graph_functions_npu[
 torch._dynamo.trace_rules.torch_name_rule_map.append(
     torch_non_c_binding_in_graph_functions_npu)  # noqa: E402
 
+if vllm_version_is("0.11.0"):
+    from vllm.utils import STR_DTYPE_TO_TORCH_DTYPE, GiB_bytes
+else:
+    from vllm.utils.mem_constants import GiB_bytes
+    from vllm.utils.torch_utils import STR_DTYPE_TO_TORCH_DTYPE
+
 
 class NPUWorker(WorkerBase):