diff --git a/.github/workflows/format_pr_body.yaml b/.github/workflows/format_pr_body.yaml
index bf77ae37..2d8a729d 100644
--- a/.github/workflows/format_pr_body.yaml
+++ b/.github/workflows/format_pr_body.yaml
@@ -36,7 +36,7 @@ jobs:
 
       - name: Get vLLM version
         run: |
-          VLLM_COMMIT=17c540a993af88204ad1b78345c8a865cf58ce44
+          VLLM_COMMIT=c9461e05a4ed3557cfbf4b15ded1e26761cc39ca
           echo "VLLM_COMMIT=https://github.com/vllm-project/vllm/commit/$VLLM_COMMIT" >> $GITHUB_ENV
 
       - name: Checkout repository
diff --git a/.github/workflows/vllm_ascend_test.yaml b/.github/workflows/vllm_ascend_test.yaml
index eee83d26..cc504b8e 100644
--- a/.github/workflows/vllm_ascend_test.yaml
+++ b/.github/workflows/vllm_ascend_test.yaml
@@ -42,7 +42,7 @@ jobs:
   lint:
     uses: ./.github/workflows/pre-commit.yml
     with:
-      vllm: 17c540a993af88204ad1b78345c8a865cf58ce44
+      vllm: c9461e05a4ed3557cfbf4b15ded1e26761cc39ca
 
   changes:
     runs-on: ubuntu-latest
@@ -83,7 +83,7 @@ jobs:
       VLLM_USE_MODELSCOPE: True
     strategy:
       matrix:
-        vllm_version: [17c540a993af88204ad1b78345c8a865cf58ce44, v0.11.0]
+        vllm_version: [c9461e05a4ed3557cfbf4b15ded1e26761cc39ca, v0.11.0]
     steps:
       - name: Install packages
         run: |
@@ -140,7 +140,7 @@ jobs:
     name: e2e-light
     strategy:
       matrix:
-        vllm_version: [17c540a993af88204ad1b78345c8a865cf58ce44, v0.11.0]
+        vllm_version: [c9461e05a4ed3557cfbf4b15ded1e26761cc39ca, v0.11.0]
     # Note (yikun): If CI resource are limited we can split job into two chain jobs
     needs: [lint, changes]
     # only trigger e2e test after lint passed and the change is e2e related with pull request.
diff --git a/.github/workflows/vllm_ascend_test_full.yaml b/.github/workflows/vllm_ascend_test_full.yaml
index 461bae45..a821263f 100644
--- a/.github/workflows/vllm_ascend_test_full.yaml
+++ b/.github/workflows/vllm_ascend_test_full.yaml
@@ -69,7 +69,7 @@ jobs:
     name: e2e-full
     strategy:
       matrix:
-        vllm_version: [17c540a993af88204ad1b78345c8a865cf58ce44, v0.11.0]
+        vllm_version: [c9461e05a4ed3557cfbf4b15ded1e26761cc39ca, v0.11.0]
     needs: [changes]
     if: ${{ needs.changes.outputs.e2e_tracker == 'true' }}
     uses: ./.github/workflows/_e2e_test.yaml
diff --git a/examples/offline_data_parallel.py b/examples/offline_data_parallel.py
index 63e0bf9f..62ef99bf 100644
--- a/examples/offline_data_parallel.py
+++ b/examples/offline_data_parallel.py
@@ -63,7 +63,11 @@ import torch
 from vllm import LLM, SamplingParams
 from vllm.distributed.parallel_state import (  # noqa E402
     destroy_distributed_environment, destroy_model_parallel)
-from vllm.utils import get_open_port
+from vllm_ascend.utils import vllm_version_is
+
+if vllm_version_is("0.11.0"):
+    from vllm.utils import get_open_port
+else:
+    from vllm.utils.network_utils import get_open_port
 
 os.environ["VLLM_USE_MODELSCOPE"] = "True"
 os.environ["VLLM_WORKER_MULTIPROC_METHOD"] = "spawn"
diff --git a/examples/offline_external_launcher.py b/examples/offline_external_launcher.py
index 17f844b3..35d5fcfd 100644
--- a/examples/offline_external_launcher.py
+++ b/examples/offline_external_launcher.py
@@ -65,9 +65,15 @@ from time import sleep
 import torch
 from vllm import LLM, SamplingParams
 from vllm.distributed.parallel_state import (  # noqa E402
-    destroy_distributed_environment, destroy_model_parallel, get_tp_group)
-from vllm.utils import get_open_port, GiB_bytes
+    destroy_distributed_environment, destroy_model_parallel, get_tp_group)
 from safetensors.torch import load_file
+from vllm_ascend.utils import vllm_version_is
+
+if vllm_version_is("0.11.0"):
+    from vllm.utils import GiB_bytes, get_open_port
+
+else:
+    from vllm.utils.mem_constants import GiB_bytes
+    from vllm.utils.network_utils import get_open_port
 
 os.environ["VLLM_USE_MODELSCOPE"] = "True"
 os.environ["VLLM_WORKER_MULTIPROC_METHOD"] = "spawn"
diff --git a/examples/offline_inference_sleep_mode_npu.py b/examples/offline_inference_sleep_mode_npu.py
index 5ffcff6f..53c59357 100644
--- a/examples/offline_inference_sleep_mode_npu.py
+++ b/examples/offline_inference_sleep_mode_npu.py
@@ -20,7 +20,11 @@ import os
 
 import torch
 from vllm import LLM, SamplingParams
-from vllm.utils import GiB_bytes
+from vllm_ascend.utils import vllm_version_is
+
+if vllm_version_is("0.11.0"):
+    from vllm.utils import GiB_bytes
+else:
+    from vllm.utils.mem_constants import GiB_bytes
 
 os.environ["VLLM_USE_MODELSCOPE"] = "True"
 os.environ["VLLM_WORKER_MULTIPROC_METHOD"] = "spawn"
diff --git a/examples/offline_weight_load.py b/examples/offline_weight_load.py
index a08ed2d2..c24ac3f5 100644
--- a/examples/offline_weight_load.py
+++ b/examples/offline_weight_load.py
@@ -66,8 +66,14 @@ import torch
 from vllm import LLM, SamplingParams
 from vllm.distributed.parallel_state import (  # noqa E402
     destroy_distributed_environment, destroy_model_parallel, get_tp_group)
-from vllm.utils import get_open_port, GiB_bytes
 from safetensors.torch import load_file
+from vllm_ascend.utils import vllm_version_is
+
+if vllm_version_is("0.11.0"):
+    from vllm.utils import GiB_bytes, get_open_port
+
+else:
+    from vllm.utils.mem_constants import GiB_bytes
+    from vllm.utils.network_utils import get_open_port
 
 os.environ["VLLM_USE_MODELSCOPE"] = "True"
 os.environ["VLLM_WORKER_MULTIPROC_METHOD"] = "spawn"
diff --git a/tests/e2e/conftest.py b/tests/e2e/conftest.py
index da715724..6940f4bf 100644
--- a/tests/e2e/conftest.py
+++ b/tests/e2e/conftest.py
@@ -45,7 +45,6 @@ from vllm.inputs import TextPrompt
 from vllm.outputs import RequestOutput
 from vllm.platforms import current_platform
 from vllm.transformers_utils.utils import maybe_model_redirect
-from vllm.utils import get_open_port
 
 from tests.e2e.model_utils import (TokensTextLogprobs,
                                    TokensTextLogprobsPromptLogprobs)
@@ -55,6 +54,12 @@ from vllm_ascend.ascend_config import clear_ascend_config
 # we not explicitly patch here, some of them might be effectiveless
 # in pytest scenario
 from vllm_ascend.utils import adapt_patch  # noqa E402
+from vllm_ascend.utils import vllm_version_is
+
+if vllm_version_is("0.11.0"):
+    from vllm.utils import get_open_port
+else:
+    from vllm.utils.network_utils import get_open_port
 
 adapt_patch(True)
 adapt_patch(False)
diff --git a/tests/e2e/multicard/test_single_request_aclgraph.py b/tests/e2e/multicard/test_single_request_aclgraph.py
index 1a0e6f93..f7ef5d3e 100644
--- a/tests/e2e/multicard/test_single_request_aclgraph.py
+++ b/tests/e2e/multicard/test_single_request_aclgraph.py
@@ -19,9 +19,14 @@ from typing import Any
 
 import openai
 import pytest
-from vllm.utils import get_open_port
 
 from tests.e2e.conftest import RemoteOpenAIServer
+from vllm_ascend.utils import vllm_version_is
+
+if vllm_version_is("0.11.0"):
+    from vllm.utils import get_open_port
+else:
+    from vllm.utils.network_utils import get_open_port
 
 MODELS = [
     "Qwen/Qwen3-30B-A3B",
diff --git a/tests/e2e/nightly/models/test_qwen3_32b.py b/tests/e2e/nightly/models/test_qwen3_32b.py
index 15161728..267d56f9 100644
--- a/tests/e2e/nightly/models/test_qwen3_32b.py
+++ b/tests/e2e/nightly/models/test_qwen3_32b.py
@@ -18,10 +18,15 @@ from typing import Any
 
 import openai
 import pytest
-from vllm.utils import get_open_port
 
 from tests.e2e.conftest import RemoteOpenAIServer
 from tools.aisbench import run_aisbench_cases
+from vllm_ascend.utils import vllm_version_is
+
+if vllm_version_is("0.11.0"):
+    from vllm.utils import get_open_port
+else:
+    from vllm.utils.network_utils import get_open_port
 
 MODELS = [
     "Qwen/Qwen3-32B",
diff --git a/tests/e2e/singlecard/test_camem.py b/tests/e2e/singlecard/test_camem.py
index 3f1f92b7..04643c80 100644
--- a/tests/e2e/singlecard/test_camem.py
+++ b/tests/e2e/singlecard/test_camem.py
@@ -21,11 +21,16 @@ import gc
 
 import torch
 from vllm import SamplingParams
-from vllm.utils import GiB_bytes
 
 from tests.e2e.conftest import VllmRunner
 from tests.e2e.utils import fork_new_process_for_each_test
 from vllm_ascend.device_allocator.camem import CaMemAllocator
+from vllm_ascend.utils import vllm_version_is
+
+if vllm_version_is("0.11.0"):
+    from vllm.utils import GiB_bytes
+else:
+    from vllm.utils.mem_constants import GiB_bytes
 
 
 @fork_new_process_for_each_test
diff --git a/tests/ut/core/test_schedule_config.py b/tests/ut/core/test_schedule_config.py
index 84fd6430..16d06be7 100644
--- a/tests/ut/core/test_schedule_config.py
+++ b/tests/ut/core/test_schedule_config.py
@@ -78,21 +78,6 @@ class TestAscendSchedulerConfig(TestBase):
             str(context.exception),
         )
 
-    def test_not_implemented_send_delta_data(self):
-        with self.assertRaises(NotImplementedError) as context:
-            AscendSchedulerConfig.initialize_from_config(
-                self.basic_scheduler_config,
-                AscendSchedulerConfig(
-                    send_delta_data=True,
-                    max_num_batched_tokens=2048,
-                    max_model_len=2048,
-                ),
-            )
-        self.assertIn(
-            "currently AscendScheduler doesn't support send_delta_data",
-            str(context.exception),
-        )
-
     def test_no_override(self):
         ascend_config = AscendSchedulerConfig.initialize_from_config(
             self.basic_scheduler_config, {})
diff --git a/tests/ut/core/test_scheduler.py b/tests/ut/core/test_scheduler.py
index 7f0108a1..ac8bff8a 100644
--- a/tests/ut/core/test_scheduler.py
+++ b/tests/ut/core/test_scheduler.py
@@ -9,7 +9,6 @@ from vllm.config import (CacheConfig, KVTransferConfig, ModelConfig,
 from vllm.multimodal.inputs import (MultiModalFeatureSpec,
                                     MultiModalKwargsItem, PlaceholderRange)
 from vllm.sampling_params import SamplingParams
-from vllm.utils import sha256
 from vllm.v1.core.kv_cache_utils import (get_request_block_hasher,
                                          init_none_hash)
 from vllm.v1.core.sched.output import SchedulerOutput
@@ -24,6 +23,11 @@ from vllm_ascend.core.scheduler import AscendScheduler
 from vllm_ascend.core.scheduler_dynamic_batch import SchedulerDynamicBatch
 from vllm_ascend.utils import vllm_version_is
 
+if vllm_version_is("0.11.0"):
+    from vllm.utils import sha256
+else:
+    from vllm.utils.hashing import sha256
+
 EOS_TOKEN_ID = 50256
 MODEL = "Qwen3-0.6B"
 ENABLE_PREFIX_CACHING = None
diff --git a/tests/ut/kv_connector/test_mooncake_connector.py b/tests/ut/kv_connector/test_mooncake_connector.py
index 6c6c609d..0d801df5 100644
--- a/tests/ut/kv_connector/test_mooncake_connector.py
+++ b/tests/ut/kv_connector/test_mooncake_connector.py
@@ -12,7 +12,13 @@ from unittest.mock import MagicMock, patch
 
 import msgspec
 import zmq
-from vllm.utils import make_zmq_path
+
+from vllm_ascend.utils import vllm_version_is
+
+if vllm_version_is("0.11.0"):
+    from vllm.utils import make_zmq_path
+else:
+    from vllm.utils.network_utils import make_zmq_path
 
 fake_engine = types.ModuleType("mooncake.engine")
 fake_engine.TransferEngine = MagicMock()  # type: ignore[attr-defined]
diff --git a/tests/ut/kv_connector/utils.py b/tests/ut/kv_connector/utils.py
index 68e49e7d..8c25ded6 100644
--- a/tests/ut/kv_connector/utils.py
+++ b/tests/ut/kv_connector/utils.py
@@ -10,7 +10,6 @@ import torch
 from vllm import SamplingParams
 from vllm.config import (CacheConfig, DeviceConfig, KVTransferConfig,
                          ModelConfig, SchedulerConfig, VllmConfig)
-from vllm.utils import sha256
 from vllm.v1.core.kv_cache_utils import (get_request_block_hasher,
                                          init_none_hash)
 from vllm.v1.core.sched.scheduler import Scheduler
@@ -22,6 +21,11 @@ from vllm.v1.structured_output import StructuredOutputManager
 
 from vllm_ascend.utils import vllm_version_is
 
+if vllm_version_is("0.11.0"):
+    from vllm.utils import sha256
+else:
+    from vllm.utils.hashing import sha256
+
 EOS_TOKEN_ID = 50256
 
 os.environ["VLLM_USE_V1"] = "1"
diff --git a/tests/ut/model_loader/netloader/test_netloader.py b/tests/ut/model_loader/netloader/test_netloader.py
index 6658823a..9e0fd18c 100644
--- a/tests/ut/model_loader/netloader/test_netloader.py
+++ b/tests/ut/model_loader/netloader/test_netloader.py
@@ -22,6 +22,7 @@ import torch
 from torch import nn
 
 from vllm_ascend.model_loader.netloader.netloader import ModelNetLoaderElastic
+from vllm_ascend.utils import vllm_version_is
 
 
 class DummyDeviceConfig:
@@ -173,7 +174,11 @@ def test_load_model_elastic_success(mock_logger, monkeypatch, tmp_path):
         "vllm_ascend.model_loader.netloader.netloader.process_weights_after_loading",
         lambda *a, **k: None)
     # patch get_ip
-    monkeypatch.setattr("vllm.utils.get_ip", lambda: "127.0.0.1")
+    if vllm_version_is("0.11.0"):
+        monkeypatch.setattr("vllm.utils.get_ip", lambda: "127.0.0.1")
+    else:
+        monkeypatch.setattr("vllm.utils.network_utils.get_ip",
+                            lambda: "127.0.0.1")
     # patch find_free_port
     monkeypatch.setattr(
         "vllm_ascend.model_loader.netloader.netloader.find_free_port",
diff --git a/tests/ut/worker/test_input_batch.py b/tests/ut/worker/test_input_batch.py
index 703098d2..cdff8e07 100644
--- a/tests/ut/worker/test_input_batch.py
+++ b/tests/ut/worker/test_input_batch.py
@@ -20,14 +20,19 @@ import numpy as np
 import pytest
 import torch
 from vllm.sampling_params import SamplingParams
-from vllm.utils import make_tensor_with_pad
 from vllm.v1.pool.metadata import PoolingMetadata
 from vllm.v1.sample.logits_processor import LogitsProcessors
 from vllm.v1.sample.metadata import SamplingMetadata
 
+from vllm_ascend.utils import vllm_version_is
 from vllm_ascend.worker.block_table import BlockTable, MultiGroupBlockTable
 from vllm_ascend.worker.npu_input_batch import CachedRequestState, InputBatch
 
+if vllm_version_is("0.11.0"):
+    from vllm.utils import make_tensor_with_pad
+else:
+    from vllm.utils.torch_utils import make_tensor_with_pad
+
 VOCAB_SIZE = 1024
 NUM_OUTPUT_TOKENS = 20
 MAX_PROMPT_SIZE = 100
diff --git a/tests/ut/worker/test_worker_v1.py b/tests/ut/worker/test_worker_v1.py
index f77e0563..1ead0c57 100644
--- a/tests/ut/worker/test_worker_v1.py
+++ b/tests/ut/worker/test_worker_v1.py
@@ -5,6 +5,7 @@ import torch
 from vllm.config import CacheConfig, ModelConfig, ParallelConfig, VllmConfig
 
 from tests.ut.base import TestBase
+from vllm_ascend.utils import vllm_version_is
 
 
 class TestNPUWorker(TestBase):
@@ -178,15 +179,26 @@ class TestNPUWorker(TestBase):
         # Create NPUWorker instance
         from vllm_ascend.worker.worker_v1 import NPUWorker
 
-        with patch("vllm.utils.STR_DTYPE_TO_TORCH_DTYPE",
-                   {"float32": torch.float32}):
-            worker = NPUWorker(
-                vllm_config=self.vllm_config_mock,
-                local_rank=self.local_rank,
-                rank=self.rank,
-                distributed_init_method=self.distributed_init_method,
-                is_driver_worker=self.is_driver_worker,
-            )
+        if vllm_version_is("0.11.0"):
+            with patch("vllm.utils.STR_DTYPE_TO_TORCH_DTYPE",
+                       {"float32": torch.float32}):
+                worker = NPUWorker(
+                    vllm_config=self.vllm_config_mock,
+                    local_rank=self.local_rank,
+                    rank=self.rank,
+                    distributed_init_method=self.distributed_init_method,
+                    is_driver_worker=self.is_driver_worker,
+                )
+        else:
+            with patch("vllm.utils.torch_utils.STR_DTYPE_TO_TORCH_DTYPE",
+                       {"float32": torch.float32}):
+                worker = NPUWorker(
+                    vllm_config=self.vllm_config_mock,
+                    local_rank=self.local_rank,
+                    rank=self.rank,
+                    distributed_init_method=self.distributed_init_method,
+                    is_driver_worker=self.is_driver_worker,
+                )
 
         # Verify cache_dtype is set to custom value
         self.assertEqual(worker.cache_dtype, torch.float32)
diff --git a/vllm_ascend/core/schedule_config.py b/vllm_ascend/core/schedule_config.py
index a20c97c5..32d63cbc 100644
--- a/vllm_ascend/core/schedule_config.py
+++ b/vllm_ascend/core/schedule_config.py
@@ -99,9 +99,6 @@ class AscendSchedulerConfig(SchedulerConfig):
             raise NotImplementedError(
                 f"currently AscendScheduler only supports fcfs policy, got {self.policy}"
             )
-        if self.send_delta_data:
-            raise NotImplementedError(
-                "currently AscendScheduler doesn't support send_delta_data.")
         if getattr(self, "scheduler_delay_factor", 0) > 0:
             raise NotImplementedError(
                 "currently AscendScheduler doesn't support scheduler_delay_factor."
diff --git a/vllm_ascend/distributed/cpu_offload_manager/metadata.py b/vllm_ascend/distributed/cpu_offload_manager/metadata.py
index ddfd37c8..7f07a624 100644
--- a/vllm_ascend/distributed/cpu_offload_manager/metadata.py
+++ b/vllm_ascend/distributed/cpu_offload_manager/metadata.py
@@ -9,11 +9,18 @@ import torch
 import vllm.envs as envs
 import zmq
 from vllm.config import KVTransferConfig, VllmConfig
-from vllm.utils import get_dtype_size, logger, make_zmq_socket
+from vllm.utils import logger
 from vllm.v1.kv_cache_interface import AttentionSpec
 
 from vllm_ascend.distributed.cpu_offload_manager.cpu_kv_cache_manager import \
     CPUKVCacheManager
+from vllm_ascend.utils import vllm_version_is
+
+if vllm_version_is("0.11.0"):
+    from vllm.utils import get_dtype_size, make_zmq_socket
+else:
+    from vllm.utils.network_utils import make_zmq_socket
+    from vllm.utils.torch_utils import get_dtype_size
 
 
 @dataclass
diff --git a/vllm_ascend/distributed/llmdatadist_c_mgr_connector.py b/vllm_ascend/distributed/llmdatadist_c_mgr_connector.py
index 61bbc1cf..e72f4eba 100644
--- a/vllm_ascend/distributed/llmdatadist_c_mgr_connector.py
+++ b/vllm_ascend/distributed/llmdatadist_c_mgr_connector.py
@@ -25,19 +25,25 @@ from vllm.distributed.kv_transfer.kv_connector.v1.base import (
 from vllm.distributed.parallel_state import (get_dcp_group, get_tp_group,
                                              get_world_group)
 from vllm.forward_context import ForwardContext
-from vllm.utils import get_ip, logger
+from vllm.utils import logger
 from vllm.v1.core.kv_cache_manager import KVCacheBlocks
 from vllm.v1.core.sched.output import SchedulerOutput
 from vllm.v1.request import Request, RequestStatus
 
 import vllm_ascend.envs as envs_ascend
 from vllm_ascend.utils import (AscendSocVersion, get_ascend_soc_version,
-                               prefill_context_parallel_enable)
+                               prefill_context_parallel_enable,
+                               vllm_version_is)
 
 if prefill_context_parallel_enable():
     from vllm.distributed.parallel_state import \
         get_prefill_context_model_parallel_rank
 
+if vllm_version_is("0.11.0"):
+    from vllm.utils import get_ip
+else:
+    from vllm.utils.network_utils import get_ip
+
 TORCH_DTYPE_TO_NPU_DTYPE = {
     torch.half: llm_datadist.DataType.DT_FLOAT16,
     torch.float16: llm_datadist.DataType.DT_FLOAT16,
diff --git a/vllm_ascend/distributed/mooncake/mooncake_engine.py b/vllm_ascend/distributed/mooncake/mooncake_engine.py
index cc58877b..ac00e22c 100644
--- a/vllm_ascend/distributed/mooncake/mooncake_engine.py
+++ b/vllm_ascend/distributed/mooncake/mooncake_engine.py
@@ -7,7 +7,7 @@ from typing import Generator, List, Optional, Union
 # Third Party
 import torch
 from vllm.config import VllmConfig
-from vllm.utils import get_kv_cache_torch_dtype, logger
+from vllm.utils import logger
 
 from vllm_ascend.distributed.mooncake.config_data import (
     ChunkedTokenDatabase, LasyerMultiBlockReqMeta, MooncakeConnectorMetadata,
@@ -16,6 +16,12 @@ from vllm_ascend.distributed.mooncake.kv_transfer import (
     KVCacheStoreLayerRecvingThread, KVCacheStoreLayerSendingThread,
     KVCacheStoreRecvingThread, KVCacheStoreSendingThread, KVTransferThread)
 from vllm_ascend.distributed.mooncake.mooncake_store import Mooncakestore
+from vllm_ascend.utils import vllm_version_is
+
+if vllm_version_is("0.11.0"):
+    from vllm.utils import get_kv_cache_torch_dtype
+else:
+    from vllm.utils.torch_utils import get_kv_cache_torch_dtype
 
 
 class MooncakeEngine:
diff --git a/vllm_ascend/distributed/mooncake_connector.py b/vllm_ascend/distributed/mooncake_connector.py
index 57b4494a..b9e06229 100644
--- a/vllm_ascend/distributed/mooncake_connector.py
+++ b/vllm_ascend/distributed/mooncake_connector.py
@@ -26,13 +26,19 @@ from vllm.distributed.kv_transfer.kv_connector.v1.base import (
     KVConnectorBase_V1, KVConnectorMetadata, KVConnectorRole)
 from vllm.distributed.parallel_state import (get_tensor_model_parallel_rank,
                                              get_tp_group)
-from vllm.utils import get_ip, logger, make_zmq_path, make_zmq_socket
+from vllm.utils import logger
 from vllm.v1.core.sched.output import SchedulerOutput
 from vllm.v1.request import RequestStatus
 
 import vllm_ascend.envs as envs_ascend
 from vllm_ascend.ascend_config import get_ascend_config, init_ascend_config
 from vllm_ascend.distributed.mooncake.transfer_engine import get_global_te
+from vllm_ascend.utils import vllm_version_is
+
+if vllm_version_is("0.11.0"):
+    from vllm.utils import get_ip, make_zmq_path, make_zmq_socket
+else:
+    from vllm.utils.network_utils import get_ip, make_zmq_path, make_zmq_socket
 
 if TYPE_CHECKING:
     from vllm.attention.backends.abstract import AttentionMetadata
diff --git a/vllm_ascend/distributed/mooncake_layerwise_connector.py b/vllm_ascend/distributed/mooncake_layerwise_connector.py
index 457c7378..a45aaa17 100644
--- a/vllm_ascend/distributed/mooncake_layerwise_connector.py
+++ b/vllm_ascend/distributed/mooncake_layerwise_connector.py
@@ -26,7 +26,7 @@ from vllm.distributed.kv_transfer.kv_connector.v1.base import (
     KVConnectorBase_V1, KVConnectorMetadata, KVConnectorRole)
 from vllm.distributed.parallel_state import (get_tensor_model_parallel_rank,
                                              get_tp_group, get_world_group)
-from vllm.utils import get_ip, logger, make_zmq_path, make_zmq_socket
+from vllm.utils import logger
 from vllm.v1.core.sched.output import SchedulerOutput
 from vllm.v1.request import RequestStatus
 
@@ -34,6 +34,12 @@ import vllm_ascend.envs as envs_ascend
 from vllm_ascend.ascend_config import get_ascend_config
 from vllm_ascend.distributed.utils import (align_memory,
                                            kv_alltoall_and_rearrange)
+from vllm_ascend.utils import vllm_version_is
+
+if vllm_version_is("0.11.0"):
+    from vllm.utils import get_ip, make_zmq_path, make_zmq_socket
+else:
+    from vllm.utils.network_utils import get_ip, make_zmq_path, make_zmq_socket
 
 if TYPE_CHECKING:
     from vllm.attention.backends.abstract import AttentionMetadata
diff --git a/vllm_ascend/model_loader/netloader/netloader.py b/vllm_ascend/model_loader/netloader/netloader.py
index 9c2d8307..d613d2a7 100644
--- a/vllm_ascend/model_loader/netloader/netloader.py
+++ b/vllm_ascend/model_loader/netloader/netloader.py
@@ -28,12 +28,19 @@ from vllm.model_executor.model_loader import register_model_loader
 from vllm.model_executor.model_loader.base_loader import BaseModelLoader
 from vllm.model_executor.model_loader.default_loader import DefaultModelLoader
 from vllm.model_executor.model_loader.utils import (
-    initialize_model, process_weights_after_loading, set_default_torch_dtype)
+    initialize_model, process_weights_after_loading)
+
+from vllm_ascend.utils import vllm_version_is
 
 from .interaction.elastic import ElasticServer
 from .load import elastic_load
 from .utils import find_free_port, is_valid_path_prefix
 
+if vllm_version_is("0.11.0"):
+    from vllm.model_executor.model_loader.utils import set_default_torch_dtype
+else:
+    from vllm.utils.torch_utils import set_default_torch_dtype
+
 
 @register_model_loader("netloader")
 class ModelNetLoaderElastic(BaseModelLoader):
@@ -200,7 +207,10 @@ class ModelNetLoaderElastic(BaseModelLoader):
         if model is not None and (
             (self.listen_port and self.listen_port in range(1024, 65535)) or
             (self.listen_port is None)):
-            from vllm.utils import get_ip
+            if vllm_version_is("0.11.0"):
+                from vllm.utils import get_ip
+            else:
+                from vllm.utils.network_utils import get_ip
 
             driver_ip = get_ip()
             if driver_ip == '0.0.0.0':
diff --git a/vllm_ascend/models/layers/mla.py b/vllm_ascend/models/layers/mla.py
index abd27e5f..f77f4677 100644
--- a/vllm_ascend/models/layers/mla.py
+++ b/vllm_ascend/models/layers/mla.py
@@ -29,7 +29,6 @@ from vllm.distributed import get_tensor_model_parallel_world_size
 from vllm.forward_context import ForwardContext, get_forward_context
 from vllm.model_executor.layers.mla import MLAModules
 from vllm.model_executor.layers.quantization import QuantizationConfig
-from vllm.utils import direct_register_custom_op
 
 from vllm_ascend.ascend_config import get_ascend_config
 from vllm_ascend.utils import vllm_version_is
@@ -38,9 +37,11 @@ if vllm_version_is("0.11.0"):
     from vllm.attention import Attention
     from vllm.model_executor.layers.mla import \
         MultiHeadLatentAttention as MultiHeadLatentAttentionWrapper
+    from vllm.utils import direct_register_custom_op
 else:
     from vllm.attention.layer import MLAAttention
     from vllm.model_executor.layers.mla import MultiHeadLatentAttentionWrapper
+    from vllm.utils.torch_utils import direct_register_custom_op
 
 if vllm_version_is("0.11.0"):
     from vllm.attention import Attention
diff --git a/vllm_ascend/models/layers/sfa.py b/vllm_ascend/models/layers/sfa.py
index 5c15ebb2..53343716 100644
--- a/vllm_ascend/models/layers/sfa.py
+++ b/vllm_ascend/models/layers/sfa.py
@@ -31,7 +31,6 @@ from vllm.forward_context import ForwardContext, get_forward_context
 from vllm.model_executor.layers.linear import ReplicatedLinear
 from vllm.model_executor.layers.mla import MLAModules
 from vllm.model_executor.layers.quantization import QuantizationConfig
-from vllm.utils import direct_register_custom_op
 
 from vllm_ascend.ascend_config import get_ascend_config
 from vllm_ascend.utils import vllm_version_is
@@ -40,9 +39,11 @@ if vllm_version_is("0.11.0"):
     from vllm.attention import Attention
     from vllm.model_executor.layers.mla import \
         MultiHeadLatentAttention as MultiHeadLatentAttentionWrapper
+    from vllm.utils import direct_register_custom_op
 else:
     from vllm.attention.layer import MLAAttention
     from vllm.model_executor.layers.mla import MultiHeadLatentAttentionWrapper
+    from vllm.utils.torch_utils import direct_register_custom_op
 
 
 @dataclass
diff --git a/vllm_ascend/ops/fused_moe/fused_moe.py b/vllm_ascend/ops/fused_moe/fused_moe.py
index 4df79481..f206b1eb 100644
--- a/vllm_ascend/ops/fused_moe/fused_moe.py
+++ b/vllm_ascend/ops/fused_moe/fused_moe.py
@@ -435,10 +435,12 @@ class AscendSharedFusedMoE(SharedFusedMoE, AscendFusedMoE):
     def __init__(
         self,
         shared_experts: torch.nn.Module,
+        gate: Optional[torch.nn.Module] = None,
         use_overlapped: bool = True,
         **kwargs,
     ):
         AscendFusedMoE.__init__(self, **kwargs)
+
         self._shared_experts = shared_experts
         self.use_overlapped = use_overlapped
         self.shared_expert_stream = None
@@ -449,6 +451,16 @@ class AscendSharedFusedMoE(SharedFusedMoE, AscendFusedMoE):
                 "Sequence parallelism is enabled, shared experts are replicated for best performance."
             )
 
+        self._gate = gate
+
+    @property
+    def gate(self) -> Optional[torch.nn.Module]:
+        return self._gate if self.use_overlapped else None
+
+    @property
+    def is_internal_router(self) -> bool:
+        return False
+
     def forward(
         self,
         hidden_states: torch.Tensor,
diff --git a/vllm_ascend/ops/register_custom_ops.py b/vllm_ascend/ops/register_custom_ops.py
index 69e220ea..c4b410d4 100644
--- a/vllm_ascend/ops/register_custom_ops.py
+++ b/vllm_ascend/ops/register_custom_ops.py
@@ -7,12 +7,17 @@ from vllm.distributed import (get_dp_group, get_ep_group,
                               tensor_model_parallel_all_reduce,
                               tensor_model_parallel_reduce_scatter)
 from vllm.forward_context import get_forward_context
-from vllm.utils import direct_register_custom_op
 
 import vllm_ascend.envs as envs_ascend
 from vllm_ascend.ascend_forward_context import MoECommType
 from vllm_ascend.ops.weight_prefetch import maybe_npu_prefetch
-from vllm_ascend.utils import npu_stream_switch, prefetch_stream
+from vllm_ascend.utils import (npu_stream_switch, prefetch_stream,
+                               vllm_version_is)
+
+if vllm_version_is("0.11.0"):
+    from vllm.utils import direct_register_custom_op
+else:
+    from vllm.utils.torch_utils import direct_register_custom_op
 
 
 def _maybe_all_gather_and_maybe_unpad_impl(
diff --git a/vllm_ascend/patch/platform/patch_mamba_config.py b/vllm_ascend/patch/platform/patch_mamba_config.py
index 1afb9e16..ad083f51 100644
--- a/vllm_ascend/patch/platform/patch_mamba_config.py
+++ b/vllm_ascend/patch/platform/patch_mamba_config.py
@@ -3,9 +3,16 @@ import vllm.model_executor.models.config
 from vllm.logger import init_logger
 from vllm.model_executor.models import ModelRegistry
 from vllm.model_executor.models.config import MambaModelConfig
-from vllm.utils import STR_DTYPE_TO_TORCH_DTYPE, cdiv
+from vllm.utils import cdiv
 from vllm.v1.kv_cache_interface import FullAttentionSpec, MambaSpec
 
+from vllm_ascend.utils import vllm_version_is
+
+if vllm_version_is("0.11.0"):
+    from vllm.utils import STR_DTYPE_TO_TORCH_DTYPE
+else:
+    from vllm.utils.torch_utils import STR_DTYPE_TO_TORCH_DTYPE
+
 
 @classmethod
 def verify_and_update_config(cls, vllm_config) -> None:
diff --git a/vllm_ascend/patch/platform/patch_multiproc_executor.py b/vllm_ascend/patch/platform/patch_multiproc_executor.py
index 82b16fc4..525a585b 100644
--- a/vllm_ascend/patch/platform/patch_multiproc_executor.py
+++ b/vllm_ascend/patch/platform/patch_multiproc_executor.py
@@ -8,13 +8,21 @@ import vllm.v1.executor.multiproc_executor
 from vllm import envs
 from vllm.config import VllmConfig
 from vllm.distributed.device_communicators.shm_broadcast import MessageQueue
-from vllm.utils import (get_distributed_init_method, get_loopback_ip,
-                        get_mp_context, get_open_port)
+from vllm.utils import get_mp_context
 from vllm.v1.executor.abstract import FailureCallback
 from vllm.v1.executor.multiproc_executor import (
     MultiprocExecutor, UnreadyWorkerProcHandle, WorkerProc,
     set_multiprocessing_worker_envs)
 
+from vllm_ascend.utils import vllm_version_is
+
+if vllm_version_is("0.11.0"):
+    from vllm.utils import (get_distributed_init_method, get_loopback_ip,
+                            get_open_port)
+else:
+    from vllm.utils.network_utils import (get_distributed_init_method,
+                                          get_loopback_ip, get_open_port)
+
 
 class AscendMultiprocExecutor(MultiprocExecutor):
     supports_pp: bool = True
diff --git a/vllm_ascend/patch/worker/patch_weight_loader.py b/vllm_ascend/patch/worker/patch_weight_loader.py
index ec3da9d7..cbbace8b 100644
--- a/vllm_ascend/patch/worker/patch_weight_loader.py
+++ b/vllm_ascend/patch/worker/patch_weight_loader.py
@@ -3,7 +3,13 @@ from torch.nn.parameter import Parameter
 from vllm.logger import init_logger
 from vllm.model_executor.layers.linear import UnquantizedLinearMethod
 from vllm.model_executor.utils import set_weight_attrs
-from vllm.utils import GiB_bytes
+
+from vllm_ascend.utils import vllm_version_is
+
+if vllm_version_is("0.11.0"):
+    from vllm.utils import GiB_bytes
+else:
+    from vllm.utils.mem_constants import GiB_bytes
 
 logger = init_logger(__name__)
diff --git a/vllm_ascend/platform.py b/vllm_ascend/platform.py
index a074e9c8..fa4e802c 100644
--- a/vllm_ascend/platform.py
+++ b/vllm_ascend/platform.py
@@ -24,6 +24,9 @@ import vllm.envs as envs_vllm
 from vllm.logger import logger
 from vllm.platforms import Platform, PlatformEnum
 
+# todo: please remove it when solve cuda hard code in vllm
+os.environ["VLLM_DISABLE_SHARED_EXPERTS_STREAM"] = "True"
+
 from vllm_ascend.ascend_config import (check_ascend_config, get_ascend_config,
                                        init_ascend_config)
 from vllm_ascend.torchair.utils import (check_torchair_cache_exist,
@@ -142,7 +145,6 @@ class NPUPlatform(Platform):
         if not model_config.is_multimodal_model and \
             structured_outputs_config.backend == "auto" and \
             not getattr(scheduler_config, "scheduler_delay_factor", 0) > 0 and \
-            not scheduler_config.send_delta_data and \
             scheduler_config.policy == "fcfs":
             ascend_scheduler_config.enabled = True
             chunked_prefill_enabled_in_ascend_scheduler = getattr(
diff --git a/vllm_ascend/spec_decode/mtp_proposer.py b/vllm_ascend/spec_decode/mtp_proposer.py
index ef3925a8..e817cc43 100644
--- a/vllm_ascend/spec_decode/mtp_proposer.py
+++ b/vllm_ascend/spec_decode/mtp_proposer.py
@@ -9,8 +9,8 @@ from vllm.config import (CUDAGraphMode, VllmConfig,
 from vllm.forward_context import BatchDescriptor, get_forward_context
 from vllm.model_executor.layers.attention_layer_base import AttentionLayerBase
 from vllm.model_executor.model_loader import get_model_loader
-from vllm.model_executor.model_loader.utils import (
-    process_weights_after_loading, set_default_torch_dtype)
+from vllm.model_executor.model_loader.utils import \
+    process_weights_after_loading
 from vllm.model_executor.models.deepseek_mtp import DeepSeekMTP
 from vllm.v1.core.sched.output import SchedulerOutput
 from vllm.v1.sample.metadata import SamplingMetadata
@@ -24,7 +24,13 @@ from vllm_ascend.torchair.models.torchair_deepseek_mtp import \
     TorchairDeepSeekMTP
 from vllm_ascend.torchair.utils import (TORCHAIR_CACHE_DIR,
                                         TorchairCommonAttentionMetadata)
-from vllm_ascend.utils import ProfileExecuteDuration, lmhead_tp_enable
+from vllm_ascend.utils import (ProfileExecuteDuration, lmhead_tp_enable,
+                               vllm_version_is)
+
+if vllm_version_is("0.11.0"):
+    from vllm.model_executor.model_loader.utils import set_default_torch_dtype
+else:
+    from vllm.utils.torch_utils import set_default_torch_dtype
 
 PADDING_SLOT_ID = -1
diff --git a/vllm_ascend/worker/model_runner_v1.py b/vllm_ascend/worker/model_runner_v1.py
index 1cc93533..7627c582 100644
--- a/vllm_ascend/worker/model_runner_v1.py
+++ b/vllm_ascend/worker/model_runner_v1.py
@@ -72,8 +72,7 @@ from vllm.pooling_params import PoolingParams
 from vllm.sampling_params import SamplingType
 from vllm.sequence import IntermediateTensors
 from vllm.tasks import GenerationTask, PoolingTask, SupportedTask
-from vllm.utils import (STR_DTYPE_TO_TORCH_DTYPE, DeviceMemoryProfiler, cdiv,
-                        get_dtype_size, is_pin_memory_available)
+from vllm.utils import cdiv, is_pin_memory_available
 from vllm.utils.jsontree import json_map_leaves
 from vllm.v1.attention.backends.gdn_attn import GDNAttentionMetadataBuilder
 from vllm.v1.attention.backends.utils import (
@@ -145,6 +144,13 @@ if prefill_context_parallel_enable():
         get_prefill_context_model_parallel_rank,
         get_prefill_context_model_parallel_world_size)
 
+if vllm_version_is("0.11.0"):
+    from vllm.utils import (STR_DTYPE_TO_TORCH_DTYPE, DeviceMemoryProfiler,
+                            get_dtype_size)
+else:
+    from vllm.utils.mem_utils import DeviceMemoryProfiler
+    from vllm.utils.torch_utils import STR_DTYPE_TO_TORCH_DTYPE, get_dtype_size
+
 # yapf: enable
 
 if vllm_version_is("0.11.0"):
diff --git a/vllm_ascend/worker/npu_input_batch.py b/vllm_ascend/worker/npu_input_batch.py
index d2286fb8..51972d0d 100644
--- a/vllm_ascend/worker/npu_input_batch.py
+++ b/vllm_ascend/worker/npu_input_batch.py
@@ -44,7 +44,7 @@ from vllm_ascend.worker.block_table import MultiGroupBlockTable
 if vllm_version_is("0.11.0"):
     from vllm.utils import swap_dict_values
 else:
-    from vllm.utils.collections import swap_dict_values
+    from vllm.utils.collection_utils import swap_dict_values
 
 
 @dataclass
diff --git a/vllm_ascend/worker/worker_v1.py b/vllm_ascend/worker/worker_v1.py
index 021c440d..e8729925 100644
--- a/vllm_ascend/worker/worker_v1.py
+++ b/vllm_ascend/worker/worker_v1.py
@@ -35,7 +35,6 @@ from vllm.logger import logger
 from vllm.lora.request import LoRARequest
 from vllm.sequence import IntermediateTensors
 from vllm.tasks import SupportedTask
-from vllm.utils import STR_DTYPE_TO_TORCH_DTYPE, GiB_bytes
 from vllm.v1.core.sched.output import SchedulerOutput
 from vllm.v1.kv_cache_interface import KVCacheConfig, KVCacheSpec
 from vllm.v1.outputs import (EMPTY_MODEL_RUNNER_OUTPUT, AsyncModelRunnerOutput,
@@ -51,7 +50,7 @@ from vllm_ascend.platform import NPUPlatform
 from vllm_ascend.utils import (init_ascend_soc_version,
                                prefill_context_parallel_enable,
                                register_ascend_customop, sleep_mode_enabled,
-                               try_register_lib)
+                               try_register_lib, vllm_version_is)
 from vllm_ascend.worker.model_runner_v1 import NPUModelRunner
 
 torch._dynamo.trace_rules.clear_lru_cache()  # noqa: E402
@@ -66,6 +65,12 @@ torch_non_c_binding_in_graph_functions_npu[
 torch._dynamo.trace_rules.torch_name_rule_map.append(
     torch_non_c_binding_in_graph_functions_npu)  # noqa: E402
 
+if vllm_version_is("0.11.0"):
+    from vllm.utils import STR_DTYPE_TO_TORCH_DTYPE, GiB_bytes
+else:
+    from vllm.utils.mem_constants import GiB_bytes
+    from vllm.utils.torch_utils import STR_DTYPE_TO_TORCH_DTYPE
+
 
 class NPUWorker(WorkerBase):