Upgrade to new vllm commit (#3719)

### What this PR does / why we need it?
Upgrade to new vllm commit:
c9461e05a4

- Fix many imports broken by
https://github.com/vllm-project/vllm/pull/26908 (see the import shim sketched after this list)
- Fix the ```sha256``` import, broken by
https://github.com/vllm-project/vllm/pull/27169
- Remove ```SchedulerConfig.send_delta_data```, which was dropped upstream in
https://github.com/vllm-project/vllm/pull/27142
- Fix ```FusedMoE``` for the dual-stream shared-expert execution introduced in
https://github.com/vllm-project/vllm/pull/26440
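
Most of the import fixes reduce to one repeated compatibility shim: every helper that moved out of the flat ```vllm.utils``` module is now imported conditionally on the installed vLLM version. A minimal sketch of that pattern, using two helpers that appear in the diff below (the exact target submodule differs per symbol):

```python
# Version-gated import shim (sketch). vllm_version_is() is the
# vllm-ascend helper that compares the installed vLLM version.
from vllm_ascend.utils import vllm_version_is

if vllm_version_is("0.11.0"):
    # Released vLLM still exposes these helpers from the flat vllm.utils module.
    from vllm.utils import get_open_port, sha256
else:
    # Newer vLLM main splits vllm.utils into topical submodules
    # (network_utils, hashing, torch_utils, mem_constants, ...).
    from vllm.utils.hashing import sha256
    from vllm.utils.network_utils import get_open_port
```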

### Does this PR introduce _any_ user-facing change?
N/A

### How was this patch tested?
CI passed with newly added and existing tests.


- vLLM version: v0.11.0rc3
- vLLM main:
17c540a993

---------

Signed-off-by: MengqingCao <cmq0113@163.com>
Signed-off-by: Icey <1790571317@qq.com>
Co-authored-by: MengqingCao <cmq0113@163.com>
Author: Icey
Date: 2025-10-25 15:36:32 +08:00
Committed by: GitHub
Parent: 226f832c0b
Commit: d9cdc65854
37 changed files with 229 additions and 71 deletions

@@ -36,7 +36,7 @@ jobs:
       - name: Get vLLM version
         run: |
-          VLLM_COMMIT=17c540a993af88204ad1b78345c8a865cf58ce44
+          VLLM_COMMIT=c9461e05a4ed3557cfbf4b15ded1e26761cc39ca
           echo "VLLM_COMMIT=https://github.com/vllm-project/vllm/commit/$VLLM_COMMIT" >> $GITHUB_ENV
       - name: Checkout repository

@@ -42,7 +42,7 @@ jobs:
   lint:
     uses: ./.github/workflows/pre-commit.yml
     with:
-      vllm: 17c540a993af88204ad1b78345c8a865cf58ce44
+      vllm: c9461e05a4ed3557cfbf4b15ded1e26761cc39ca
   changes:
     runs-on: ubuntu-latest
@@ -83,7 +83,7 @@ jobs:
       VLLM_USE_MODELSCOPE: True
     strategy:
       matrix:
-        vllm_version: [17c540a993af88204ad1b78345c8a865cf58ce44, v0.11.0]
+        vllm_version: [c9461e05a4ed3557cfbf4b15ded1e26761cc39ca, v0.11.0]
     steps:
       - name: Install packages
         run: |
@@ -140,7 +140,7 @@ jobs:
     name: e2e-light
     strategy:
       matrix:
-        vllm_version: [17c540a993af88204ad1b78345c8a865cf58ce44, v0.11.0]
+        vllm_version: [c9461e05a4ed3557cfbf4b15ded1e26761cc39ca, v0.11.0]
     # Note (yikun): If CI resource are limited we can split job into two chain jobs
     needs: [lint, changes]
     # only trigger e2e test after lint passed and the change is e2e related with pull request.

@@ -69,7 +69,7 @@ jobs:
     name: e2e-full
     strategy:
       matrix:
-        vllm_version: [17c540a993af88204ad1b78345c8a865cf58ce44, v0.11.0]
+        vllm_version: [c9461e05a4ed3557cfbf4b15ded1e26761cc39ca, v0.11.0]
     needs: [changes]
     if: ${{ needs.changes.outputs.e2e_tracker == 'true' }}
     uses: ./.github/workflows/_e2e_test.yaml

@@ -63,7 +63,11 @@ import torch
 from vllm import LLM, SamplingParams
 from vllm.distributed.parallel_state import (  # noqa E402
     destroy_distributed_environment, destroy_model_parallel)
-from vllm.utils import get_open_port
+from vllm_ascend.utils import vllm_version_is
+
+if vllm_version_is("0.11.0"):
+    from vllm.utils import get_open_port
+else:
+    from vllm.utils.network_utils import get_open_port

 os.environ["VLLM_USE_MODELSCOPE"] = "True"
 os.environ["VLLM_WORKER_MULTIPROC_METHOD"] = "spawn"

@@ -66,8 +66,14 @@ import torch
 from vllm import LLM, SamplingParams
 from vllm.distributed.parallel_state import (  # noqa E402
     destroy_distributed_environment, destroy_model_parallel, get_tp_group)
-from vllm.utils import get_open_port, GiB_bytes
 from safetensors.torch import load_file
+from vllm_ascend.utils import vllm_version_is
+
+if vllm_version_is("0.11.0"):
+    from vllm.utils import GiB_bytes, get_open_port
+else:
+    from vllm.utils.mem_constants import GiB_bytes
+    from vllm.utils.network_utils import get_open_port

 os.environ["VLLM_USE_MODELSCOPE"] = "True"
 os.environ["VLLM_WORKER_MULTIPROC_METHOD"] = "spawn"

@@ -20,7 +20,11 @@ import os
 import torch
 from vllm import LLM, SamplingParams
-from vllm.utils import GiB_bytes
+from vllm_ascend.utils import vllm_version_is
+
+if vllm_version_is("0.11.0"):
+    from vllm.utils import GiB_bytes
+else:
+    from vllm.utils.mem_constants import GiB_bytes

 os.environ["VLLM_USE_MODELSCOPE"] = "True"
 os.environ["VLLM_WORKER_MULTIPROC_METHOD"] = "spawn"

@@ -66,8 +66,14 @@ import torch
 from vllm import LLM, SamplingParams
 from vllm.distributed.parallel_state import (  # noqa E402
     destroy_distributed_environment, destroy_model_parallel, get_tp_group)
-from vllm.utils import get_open_port, GiB_bytes
 from safetensors.torch import load_file
+from vllm_ascend.utils import vllm_version_is
+
+if vllm_version_is("0.11.0"):
+    from vllm.utils import GiB_bytes, get_open_port
+else:
+    from vllm.utils.mem_constants import GiB_bytes
+    from vllm.utils.network_utils import get_open_port

 os.environ["VLLM_USE_MODELSCOPE"] = "True"
 os.environ["VLLM_WORKER_MULTIPROC_METHOD"] = "spawn"

@@ -45,7 +45,6 @@ from vllm.inputs import TextPrompt
 from vllm.outputs import RequestOutput
 from vllm.platforms import current_platform
 from vllm.transformers_utils.utils import maybe_model_redirect
-from vllm.utils import get_open_port

 from tests.e2e.model_utils import (TokensTextLogprobs,
                                    TokensTextLogprobsPromptLogprobs)
@@ -55,6 +54,12 @@ from vllm_ascend.ascend_config import clear_ascend_config
 # we not explicitly patch here, some of them might be effectiveless
 # in pytest scenario
 from vllm_ascend.utils import adapt_patch  # noqa E402
+from vllm_ascend.utils import vllm_version_is
+
+if vllm_version_is("0.11.0"):
+    from vllm.utils import get_open_port
+else:
+    from vllm.utils.network_utils import get_open_port

 adapt_patch(True)
 adapt_patch(False)

@@ -19,9 +19,14 @@ from typing import Any
 import openai
 import pytest
-from vllm.utils import get_open_port

 from tests.e2e.conftest import RemoteOpenAIServer
+from vllm_ascend.utils import vllm_version_is
+
+if vllm_version_is("0.11.0"):
+    from vllm.utils import get_open_port
+else:
+    from vllm.utils.network_utils import get_open_port

 MODELS = [
     "Qwen/Qwen3-30B-A3B",

@@ -18,10 +18,15 @@ from typing import Any
 import openai
 import pytest
-from vllm.utils import get_open_port

 from tests.e2e.conftest import RemoteOpenAIServer
 from tools.aisbench import run_aisbench_cases
+from vllm_ascend.utils import vllm_version_is
+
+if vllm_version_is("0.11.0"):
+    from vllm.utils import get_open_port
+else:
+    from vllm.utils.network_utils import get_open_port

 MODELS = [
     "Qwen/Qwen3-32B",

@@ -21,11 +21,16 @@ import gc
 import torch
 from vllm import SamplingParams
-from vllm.utils import GiB_bytes

 from tests.e2e.conftest import VllmRunner
 from tests.e2e.utils import fork_new_process_for_each_test
 from vllm_ascend.device_allocator.camem import CaMemAllocator
+from vllm_ascend.utils import vllm_version_is
+
+if vllm_version_is("0.11.0"):
+    from vllm.utils import GiB_bytes
+else:
+    from vllm.utils.mem_constants import GiB_bytes


 @fork_new_process_for_each_test

@@ -78,21 +78,6 @@ class TestAscendSchedulerConfig(TestBase):
             str(context.exception),
         )

-    def test_not_implemented_send_delta_data(self):
-        with self.assertRaises(NotImplementedError) as context:
-            AscendSchedulerConfig.initialize_from_config(
-                self.basic_scheduler_config,
-                AscendSchedulerConfig(
-                    send_delta_data=True,
-                    max_num_batched_tokens=2048,
-                    max_model_len=2048,
-                ),
-            )
-        self.assertIn(
-            "currently AscendScheduler doesn't support send_delta_data",
-            str(context.exception),
-        )
-
     def test_no_override(self):
         ascend_config = AscendSchedulerConfig.initialize_from_config(
             self.basic_scheduler_config, {})

@@ -9,7 +9,6 @@ from vllm.config import (CacheConfig, KVTransferConfig, ModelConfig,
 from vllm.multimodal.inputs import (MultiModalFeatureSpec,
                                     MultiModalKwargsItem, PlaceholderRange)
 from vllm.sampling_params import SamplingParams
-from vllm.utils import sha256
 from vllm.v1.core.kv_cache_utils import (get_request_block_hasher,
                                          init_none_hash)
 from vllm.v1.core.sched.output import SchedulerOutput
@@ -24,6 +23,11 @@ from vllm_ascend.core.scheduler import AscendScheduler
 from vllm_ascend.core.scheduler_dynamic_batch import SchedulerDynamicBatch
 from vllm_ascend.utils import vllm_version_is

+if vllm_version_is("0.11.0"):
+    from vllm.utils import sha256
+else:
+    from vllm.utils.hashing import sha256
+
 EOS_TOKEN_ID = 50256
 MODEL = "Qwen3-0.6B"
 ENABLE_PREFIX_CACHING = None

@@ -12,7 +12,13 @@ from unittest.mock import MagicMock, patch
 import msgspec
 import zmq
-from vllm.utils import make_zmq_path

+from vllm_ascend.utils import vllm_version_is
+
+if vllm_version_is("0.11.0"):
+    from vllm.utils import make_zmq_path
+else:
+    from vllm.utils.network_utils import make_zmq_path

 fake_engine = types.ModuleType("mooncake.engine")
 fake_engine.TransferEngine = MagicMock()  # type: ignore[attr-defined]

@@ -10,7 +10,6 @@ import torch
 from vllm import SamplingParams
 from vllm.config import (CacheConfig, DeviceConfig, KVTransferConfig,
                          ModelConfig, SchedulerConfig, VllmConfig)
-from vllm.utils import sha256
 from vllm.v1.core.kv_cache_utils import (get_request_block_hasher,
                                          init_none_hash)
 from vllm.v1.core.sched.scheduler import Scheduler
@@ -22,6 +21,11 @@ from vllm.v1.structured_output import StructuredOutputManager
 from vllm_ascend.utils import vllm_version_is

+if vllm_version_is("0.11.0"):
+    from vllm.utils import sha256
+else:
+    from vllm.utils.hashing import sha256
+
 EOS_TOKEN_ID = 50256
 os.environ["VLLM_USE_V1"] = "1"

@@ -22,6 +22,7 @@ import torch
 from torch import nn

 from vllm_ascend.model_loader.netloader.netloader import ModelNetLoaderElastic
+from vllm_ascend.utils import vllm_version_is


 class DummyDeviceConfig:
@@ -173,7 +174,11 @@ def test_load_model_elastic_success(mock_logger, monkeypatch, tmp_path):
         "vllm_ascend.model_loader.netloader.netloader.process_weights_after_loading",
         lambda *a, **k: None)
     # patch get_ip
-    monkeypatch.setattr("vllm.utils.get_ip", lambda: "127.0.0.1")
+    if vllm_version_is("0.11.0"):
+        monkeypatch.setattr("vllm.utils.get_ip", lambda: "127.0.0.1")
+    else:
+        monkeypatch.setattr("vllm.utils.network_utils.get_ip",
+                            lambda: "127.0.0.1")
     # patch find_free_port
     monkeypatch.setattr(
         "vllm_ascend.model_loader.netloader.netloader.find_free_port",

@@ -20,14 +20,19 @@ import numpy as np
 import pytest
 import torch
 from vllm.sampling_params import SamplingParams
-from vllm.utils import make_tensor_with_pad
 from vllm.v1.pool.metadata import PoolingMetadata
 from vllm.v1.sample.logits_processor import LogitsProcessors
 from vllm.v1.sample.metadata import SamplingMetadata

+from vllm_ascend.utils import vllm_version_is
 from vllm_ascend.worker.block_table import BlockTable, MultiGroupBlockTable
 from vllm_ascend.worker.npu_input_batch import CachedRequestState, InputBatch

+if vllm_version_is("0.11.0"):
+    from vllm.utils import make_tensor_with_pad
+else:
+    from vllm.utils.torch_utils import make_tensor_with_pad
+
 VOCAB_SIZE = 1024
 NUM_OUTPUT_TOKENS = 20
 MAX_PROMPT_SIZE = 100

@@ -5,6 +5,7 @@ import torch
 from vllm.config import CacheConfig, ModelConfig, ParallelConfig, VllmConfig

 from tests.ut.base import TestBase
+from vllm_ascend.utils import vllm_version_is


 class TestNPUWorker(TestBase):
@@ -178,6 +179,7 @@ class TestNPUWorker(TestBase):
         # Create NPUWorker instance
         from vllm_ascend.worker.worker_v1 import NPUWorker

+        if vllm_version_is("0.11.0"):
             with patch("vllm.utils.STR_DTYPE_TO_TORCH_DTYPE",
                        {"float32": torch.float32}):
                 worker = NPUWorker(
@@ -187,6 +189,16 @@ class TestNPUWorker(TestBase):
                     distributed_init_method=self.distributed_init_method,
                     is_driver_worker=self.is_driver_worker,
                 )
+        else:
+            with patch("vllm.utils.torch_utils.STR_DTYPE_TO_TORCH_DTYPE",
+                       {"float32": torch.float32}):
+                worker = NPUWorker(
+                    vllm_config=self.vllm_config_mock,
+                    local_rank=self.local_rank,
+                    rank=self.rank,
+                    distributed_init_method=self.distributed_init_method,
+                    is_driver_worker=self.is_driver_worker,
+                )

         # Verify cache_dtype is set to custom value
         self.assertEqual(worker.cache_dtype, torch.float32)

@@ -99,9 +99,6 @@ class AscendSchedulerConfig(SchedulerConfig):
             raise NotImplementedError(
                 f"currently AscendScheduler only supports fcfs policy, got {self.policy}"
             )
-        if self.send_delta_data:
-            raise NotImplementedError(
-                "currently AscendScheduler doesn't support send_delta_data.")
         if getattr(self, "scheduler_delay_factor", 0) > 0:
             raise NotImplementedError(
                 "currently AscendScheduler doesn't support scheduler_delay_factor."

@@ -9,11 +9,18 @@ import torch
 import vllm.envs as envs
 import zmq
 from vllm.config import KVTransferConfig, VllmConfig
-from vllm.utils import get_dtype_size, logger, make_zmq_socket
+from vllm.utils import logger
 from vllm.v1.kv_cache_interface import AttentionSpec

 from vllm_ascend.distributed.cpu_offload_manager.cpu_kv_cache_manager import \
     CPUKVCacheManager
+from vllm_ascend.utils import vllm_version_is
+
+if vllm_version_is("0.11.0"):
+    from vllm.utils import get_dtype_size, make_zmq_socket
+else:
+    from vllm.utils.network_utils import make_zmq_socket
+    from vllm.utils.torch_utils import get_dtype_size


 @dataclass

@@ -25,19 +25,25 @@ from vllm.distributed.kv_transfer.kv_connector.v1.base import (
 from vllm.distributed.parallel_state import (get_dcp_group, get_tp_group,
                                              get_world_group)
 from vllm.forward_context import ForwardContext
-from vllm.utils import get_ip, logger
+from vllm.utils import logger
 from vllm.v1.core.kv_cache_manager import KVCacheBlocks
 from vllm.v1.core.sched.output import SchedulerOutput
 from vllm.v1.request import Request, RequestStatus

 import vllm_ascend.envs as envs_ascend
 from vllm_ascend.utils import (AscendSocVersion, get_ascend_soc_version,
-                               prefill_context_parallel_enable)
+                               prefill_context_parallel_enable,
+                               vllm_version_is)

 if prefill_context_parallel_enable():
     from vllm.distributed.parallel_state import \
         get_prefill_context_model_parallel_rank

+if vllm_version_is("0.11.0"):
+    from vllm.utils import get_ip
+else:
+    from vllm.utils.network_utils import get_ip
+
 TORCH_DTYPE_TO_NPU_DTYPE = {
     torch.half: llm_datadist.DataType.DT_FLOAT16,
     torch.float16: llm_datadist.DataType.DT_FLOAT16,

@@ -7,7 +7,7 @@ from typing import Generator, List, Optional, Union
 # Third Party
 import torch
 from vllm.config import VllmConfig
-from vllm.utils import get_kv_cache_torch_dtype, logger
+from vllm.utils import logger

 from vllm_ascend.distributed.mooncake.config_data import (
     ChunkedTokenDatabase, LasyerMultiBlockReqMeta, MooncakeConnectorMetadata,
@@ -16,6 +16,12 @@ from vllm_ascend.distributed.mooncake.kv_transfer import (
     KVCacheStoreLayerRecvingThread, KVCacheStoreLayerSendingThread,
     KVCacheStoreRecvingThread, KVCacheStoreSendingThread, KVTransferThread)
 from vllm_ascend.distributed.mooncake.mooncake_store import Mooncakestore
+from vllm_ascend.utils import vllm_version_is
+
+if vllm_version_is("0.11.0"):
+    from vllm.utils import get_kv_cache_torch_dtype
+else:
+    from vllm.utils.torch_utils import get_kv_cache_torch_dtype


 class MooncakeEngine:

@@ -26,13 +26,19 @@ from vllm.distributed.kv_transfer.kv_connector.v1.base import (
     KVConnectorBase_V1, KVConnectorMetadata, KVConnectorRole)
 from vllm.distributed.parallel_state import (get_tensor_model_parallel_rank,
                                              get_tp_group)
-from vllm.utils import get_ip, logger, make_zmq_path, make_zmq_socket
+from vllm.utils import logger
 from vllm.v1.core.sched.output import SchedulerOutput
 from vllm.v1.request import RequestStatus

 import vllm_ascend.envs as envs_ascend
 from vllm_ascend.ascend_config import get_ascend_config, init_ascend_config
 from vllm_ascend.distributed.mooncake.transfer_engine import get_global_te
+from vllm_ascend.utils import vllm_version_is
+
+if vllm_version_is("0.11.0"):
+    from vllm.utils import get_ip, make_zmq_path, make_zmq_socket
+else:
+    from vllm.utils.network_utils import get_ip, make_zmq_path, make_zmq_socket

 if TYPE_CHECKING:
     from vllm.attention.backends.abstract import AttentionMetadata

@@ -26,7 +26,7 @@ from vllm.distributed.kv_transfer.kv_connector.v1.base import (
     KVConnectorBase_V1, KVConnectorMetadata, KVConnectorRole)
 from vllm.distributed.parallel_state import (get_tensor_model_parallel_rank,
                                              get_tp_group, get_world_group)
-from vllm.utils import get_ip, logger, make_zmq_path, make_zmq_socket
+from vllm.utils import logger
 from vllm.v1.core.sched.output import SchedulerOutput
 from vllm.v1.request import RequestStatus
@@ -34,6 +34,12 @@ import vllm_ascend.envs as envs_ascend
 from vllm_ascend.ascend_config import get_ascend_config
 from vllm_ascend.distributed.utils import (align_memory,
                                            kv_alltoall_and_rearrange)
+from vllm_ascend.utils import vllm_version_is
+
+if vllm_version_is("0.11.0"):
+    from vllm.utils import get_ip, make_zmq_path, make_zmq_socket
+else:
+    from vllm.utils.network_utils import get_ip, make_zmq_path, make_zmq_socket

 if TYPE_CHECKING:
     from vllm.attention.backends.abstract import AttentionMetadata

@@ -28,12 +28,19 @@ from vllm.model_executor.model_loader import register_model_loader
 from vllm.model_executor.model_loader.base_loader import BaseModelLoader
 from vllm.model_executor.model_loader.default_loader import DefaultModelLoader
 from vllm.model_executor.model_loader.utils import (
-    initialize_model, process_weights_after_loading, set_default_torch_dtype)
+    initialize_model, process_weights_after_loading)
+
+from vllm_ascend.utils import vllm_version_is

 from .interaction.elastic import ElasticServer
 from .load import elastic_load
 from .utils import find_free_port, is_valid_path_prefix

+if vllm_version_is("0.11.0"):
+    from vllm.model_executor.model_loader.utils import set_default_torch_dtype
+else:
+    from vllm.utils.torch_utils import set_default_torch_dtype
+

 @register_model_loader("netloader")
 class ModelNetLoaderElastic(BaseModelLoader):
@@ -200,7 +207,10 @@ class ModelNetLoaderElastic(BaseModelLoader):
         if model is not None and (
             (self.listen_port and self.listen_port in range(1024, 65535)) or
             (self.listen_port is None)):
-            from vllm.utils import get_ip
+            if vllm_version_is("0.11.0"):
+                from vllm.utils import get_ip
+            else:
+                from vllm.utils.network_utils import get_ip
             driver_ip = get_ip()

             if driver_ip == '0.0.0.0':

@@ -29,7 +29,6 @@ from vllm.distributed import get_tensor_model_parallel_world_size
 from vllm.forward_context import ForwardContext, get_forward_context
 from vllm.model_executor.layers.mla import MLAModules
 from vllm.model_executor.layers.quantization import QuantizationConfig
-from vllm.utils import direct_register_custom_op

 from vllm_ascend.ascend_config import get_ascend_config
 from vllm_ascend.utils import vllm_version_is
@@ -38,9 +37,11 @@ if vllm_version_is("0.11.0"):
     from vllm.attention import Attention
     from vllm.model_executor.layers.mla import \
         MultiHeadLatentAttention as MultiHeadLatentAttentionWrapper
+    from vllm.utils import direct_register_custom_op
 else:
     from vllm.attention.layer import MLAAttention
     from vllm.model_executor.layers.mla import MultiHeadLatentAttentionWrapper
+    from vllm.utils.torch_utils import direct_register_custom_op

 if vllm_version_is("0.11.0"):
     from vllm.attention import Attention

@@ -31,7 +31,6 @@ from vllm.forward_context import ForwardContext, get_forward_context
 from vllm.model_executor.layers.linear import ReplicatedLinear
 from vllm.model_executor.layers.mla import MLAModules
 from vllm.model_executor.layers.quantization import QuantizationConfig
-from vllm.utils import direct_register_custom_op

 from vllm_ascend.ascend_config import get_ascend_config
 from vllm_ascend.utils import vllm_version_is
@@ -40,9 +39,11 @@ if vllm_version_is("0.11.0"):
     from vllm.attention import Attention
     from vllm.model_executor.layers.mla import \
         MultiHeadLatentAttention as MultiHeadLatentAttentionWrapper
+    from vllm.utils import direct_register_custom_op
 else:
     from vllm.attention.layer import MLAAttention
     from vllm.model_executor.layers.mla import MultiHeadLatentAttentionWrapper
+    from vllm.utils.torch_utils import direct_register_custom_op


 @dataclass

@@ -435,10 +435,12 @@ class AscendSharedFusedMoE(SharedFusedMoE, AscendFusedMoE):
     def __init__(
         self,
         shared_experts: torch.nn.Module,
+        gate: Optional[torch.nn.Module] = None,
         use_overlapped: bool = True,
         **kwargs,
     ):
         AscendFusedMoE.__init__(self, **kwargs)
         self._shared_experts = shared_experts
         self.use_overlapped = use_overlapped
         self.shared_expert_stream = None
@@ -449,6 +451,16 @@ class AscendSharedFusedMoE(SharedFusedMoE, AscendFusedMoE):
                 "Sequence parallelism is enabled, shared experts are replicated for best performance."
             )

+        self._gate = gate
+
+    @property
+    def gate(self) -> Optional[torch.nn.Module]:
+        return self._gate if self.use_overlapped else None
+
+    @property
+    def is_internal_router(self) -> bool:
+        return False
+
     def forward(
         self,
         hidden_states: torch.Tensor,

@@ -7,12 +7,17 @@ from vllm.distributed import (get_dp_group, get_ep_group,
                               tensor_model_parallel_all_reduce,
                               tensor_model_parallel_reduce_scatter)
 from vllm.forward_context import get_forward_context
-from vllm.utils import direct_register_custom_op

 import vllm_ascend.envs as envs_ascend
 from vllm_ascend.ascend_forward_context import MoECommType
 from vllm_ascend.ops.weight_prefetch import maybe_npu_prefetch
-from vllm_ascend.utils import npu_stream_switch, prefetch_stream
+from vllm_ascend.utils import (npu_stream_switch, prefetch_stream,
+                               vllm_version_is)
+
+if vllm_version_is("0.11.0"):
+    from vllm.utils import direct_register_custom_op
+else:
+    from vllm.utils.torch_utils import direct_register_custom_op


 def _maybe_all_gather_and_maybe_unpad_impl(

@@ -3,9 +3,16 @@ import vllm.model_executor.models.config
 from vllm.logger import init_logger
 from vllm.model_executor.models import ModelRegistry
 from vllm.model_executor.models.config import MambaModelConfig
-from vllm.utils import STR_DTYPE_TO_TORCH_DTYPE, cdiv
+from vllm.utils import cdiv
 from vllm.v1.kv_cache_interface import FullAttentionSpec, MambaSpec

+from vllm_ascend.utils import vllm_version_is
+
+if vllm_version_is("0.11.0"):
+    from vllm.utils import STR_DTYPE_TO_TORCH_DTYPE
+else:
+    from vllm.utils.torch_utils import STR_DTYPE_TO_TORCH_DTYPE
+

 @classmethod
 def verify_and_update_config(cls, vllm_config) -> None:

@@ -8,13 +8,21 @@ import vllm.v1.executor.multiproc_executor
 from vllm import envs
 from vllm.config import VllmConfig
 from vllm.distributed.device_communicators.shm_broadcast import MessageQueue
-from vllm.utils import (get_distributed_init_method, get_loopback_ip,
-                        get_mp_context, get_open_port)
+from vllm.utils import get_mp_context
 from vllm.v1.executor.abstract import FailureCallback
 from vllm.v1.executor.multiproc_executor import (
     MultiprocExecutor, UnreadyWorkerProcHandle, WorkerProc,
     set_multiprocessing_worker_envs)

+from vllm_ascend.utils import vllm_version_is
+
+if vllm_version_is("0.11.0"):
+    from vllm.utils import (get_distributed_init_method, get_loopback_ip,
+                            get_open_port)
+else:
+    from vllm.utils.network_utils import (get_distributed_init_method,
+                                          get_loopback_ip, get_open_port)
+

 class AscendMultiprocExecutor(MultiprocExecutor):
     supports_pp: bool = True

@@ -3,7 +3,13 @@ from torch.nn.parameter import Parameter
 from vllm.logger import init_logger
 from vllm.model_executor.layers.linear import UnquantizedLinearMethod
 from vllm.model_executor.utils import set_weight_attrs
-from vllm.utils import GiB_bytes
+
+from vllm_ascend.utils import vllm_version_is
+
+if vllm_version_is("0.11.0"):
+    from vllm.utils import GiB_bytes
+else:
+    from vllm.utils.mem_constants import GiB_bytes

 logger = init_logger(__name__)

@@ -24,6 +24,9 @@ import vllm.envs as envs_vllm
 from vllm.logger import logger
 from vllm.platforms import Platform, PlatformEnum

+# todo: please remove it when solve cuda hard code in vllm
+os.environ["VLLM_DISABLE_SHARED_EXPERTS_STREAM"] = "True"
+
 from vllm_ascend.ascend_config import (check_ascend_config, get_ascend_config,
                                        init_ascend_config)
 from vllm_ascend.torchair.utils import (check_torchair_cache_exist,
@@ -142,7 +145,6 @@ class NPUPlatform(Platform):
         if not model_config.is_multimodal_model and \
             structured_outputs_config.backend == "auto" and \
             not getattr(scheduler_config, "scheduler_delay_factor", 0) > 0 and \
-            not scheduler_config.send_delta_data and \
             scheduler_config.policy == "fcfs":
             ascend_scheduler_config.enabled = True
             chunked_prefill_enabled_in_ascend_scheduler = getattr(

@@ -9,8 +9,8 @@ from vllm.config import (CUDAGraphMode, VllmConfig,
 from vllm.forward_context import BatchDescriptor, get_forward_context
 from vllm.model_executor.layers.attention_layer_base import AttentionLayerBase
 from vllm.model_executor.model_loader import get_model_loader
-from vllm.model_executor.model_loader.utils import (
-    process_weights_after_loading, set_default_torch_dtype)
+from vllm.model_executor.model_loader.utils import \
+    process_weights_after_loading
 from vllm.model_executor.models.deepseek_mtp import DeepSeekMTP
 from vllm.v1.core.sched.output import SchedulerOutput
 from vllm.v1.sample.metadata import SamplingMetadata
@@ -24,7 +24,13 @@ from vllm_ascend.torchair.models.torchair_deepseek_mtp import \
     TorchairDeepSeekMTP
 from vllm_ascend.torchair.utils import (TORCHAIR_CACHE_DIR,
                                         TorchairCommonAttentionMetadata)
-from vllm_ascend.utils import ProfileExecuteDuration, lmhead_tp_enable
+from vllm_ascend.utils import (ProfileExecuteDuration, lmhead_tp_enable,
+                               vllm_version_is)
+
+if vllm_version_is("0.11.0"):
+    from vllm.model_executor.model_loader.utils import set_default_torch_dtype
+else:
+    from vllm.utils.torch_utils import set_default_torch_dtype

 PADDING_SLOT_ID = -1

@@ -72,8 +72,7 @@ from vllm.pooling_params import PoolingParams
 from vllm.sampling_params import SamplingType
 from vllm.sequence import IntermediateTensors
 from vllm.tasks import GenerationTask, PoolingTask, SupportedTask
-from vllm.utils import (STR_DTYPE_TO_TORCH_DTYPE, DeviceMemoryProfiler, cdiv,
-                        get_dtype_size, is_pin_memory_available)
+from vllm.utils import cdiv, is_pin_memory_available
 from vllm.utils.jsontree import json_map_leaves
 from vllm.v1.attention.backends.gdn_attn import GDNAttentionMetadataBuilder
 from vllm.v1.attention.backends.utils import (
@@ -145,6 +144,13 @@ if prefill_context_parallel_enable():
     get_prefill_context_model_parallel_rank,
     get_prefill_context_model_parallel_world_size)

+if vllm_version_is("0.11.0"):
+    from vllm.utils import (STR_DTYPE_TO_TORCH_DTYPE, DeviceMemoryProfiler,
+                            get_dtype_size)
+else:
+    from vllm.utils.mem_utils import DeviceMemoryProfiler
+    from vllm.utils.torch_utils import STR_DTYPE_TO_TORCH_DTYPE, get_dtype_size
+
 # yapf: enable

 if vllm_version_is("0.11.0"):

@@ -44,7 +44,7 @@ from vllm_ascend.worker.block_table import MultiGroupBlockTable
 if vllm_version_is("0.11.0"):
     from vllm.utils import swap_dict_values
 else:
-    from vllm.utils.collections import swap_dict_values
+    from vllm.utils.collection_utils import swap_dict_values


 @dataclass

@@ -35,7 +35,6 @@ from vllm.logger import logger
 from vllm.lora.request import LoRARequest
 from vllm.sequence import IntermediateTensors
 from vllm.tasks import SupportedTask
-from vllm.utils import STR_DTYPE_TO_TORCH_DTYPE, GiB_bytes
 from vllm.v1.core.sched.output import SchedulerOutput
 from vllm.v1.kv_cache_interface import KVCacheConfig, KVCacheSpec
 from vllm.v1.outputs import (EMPTY_MODEL_RUNNER_OUTPUT, AsyncModelRunnerOutput,
@@ -51,7 +50,7 @@ from vllm_ascend.platform import NPUPlatform
 from vllm_ascend.utils import (init_ascend_soc_version,
                                prefill_context_parallel_enable,
                                register_ascend_customop, sleep_mode_enabled,
-                               try_register_lib)
+                               try_register_lib, vllm_version_is)
 from vllm_ascend.worker.model_runner_v1 import NPUModelRunner

 torch._dynamo.trace_rules.clear_lru_cache()  # noqa: E402
@@ -66,6 +65,12 @@ torch_non_c_binding_in_graph_functions_npu[
 torch._dynamo.trace_rules.torch_name_rule_map.append(
     torch_non_c_binding_in_graph_functions_npu)  # noqa: E402

+if vllm_version_is("0.11.0"):
+    from vllm.utils import STR_DTYPE_TO_TORCH_DTYPE, GiB_bytes
+else:
+    from vllm.utils.mem_constants import GiB_bytes
+    from vllm.utils.torch_utils import STR_DTYPE_TO_TORCH_DTYPE
+

 class NPUWorker(WorkerBase):