Upgrade to new vllm commit (#3719)
### What this PR does / why we need it?
Upgrade to new vllm commit: c9461e05a4

- Fix many imports, caused by https://github.com/vllm-project/vllm/pull/26908
- Fix import ```sha256```, caused by https://github.com/vllm-project/vllm/pull/27169
- Remove ```SchedulerConfig.send_delta_data```, caused by https://github.com/vllm-project/vllm/pull/27142
- Fix ```FusedMoE``` for dual-stream execution, caused by https://github.com/vllm-project/vllm/pull/26440

### Does this PR introduce _any_ user-facing change?
N/A

### How was this patch tested?
CI passed with newly added and existing tests.

- vLLM version: v0.11.0rc3
- vLLM main: 17c540a993

---------

Signed-off-by: MengqingCao <cmq0113@163.com>
Signed-off-by: Icey <1790571317@qq.com>
Co-authored-by: MengqingCao <cmq0113@163.com>
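Most of the fixes below share one mechanism: vLLM main split the monolithic `vllm.utils` module into topical submodules (`network_utils`, `torch_utils`, `mem_constants`, `hashing`, ...), so each affected import in vllm-ascend is now gated on the installed vLLM version. A minimal sketch of the pattern, using two helpers that appear in the diff below:

```python
from vllm_ascend.utils import vllm_version_is

if vllm_version_is("0.11.0"):
    # Released vLLM still exports these helpers from the flat vllm.utils module.
    from vllm.utils import get_open_port, sha256
else:
    # vLLM main (c9461e05a4) moved them into topical submodules.
    from vllm.utils.hashing import sha256
    from vllm.utils.network_utils import get_open_port
```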
#### .github/workflows/format_pr_body.yaml (2 changed lines)

```diff
@@ -36,7 +36,7 @@ jobs:
 
       - name: Get vLLM version
         run: |
-          VLLM_COMMIT=17c540a993af88204ad1b78345c8a865cf58ce44
+          VLLM_COMMIT=c9461e05a4ed3557cfbf4b15ded1e26761cc39ca
           echo "VLLM_COMMIT=https://github.com/vllm-project/vllm/commit/$VLLM_COMMIT" >> $GITHUB_ENV
 
       - name: Checkout repository
```

#### .github/workflows/vllm_ascend_test.yaml (6 changed lines)

```diff
@@ -42,7 +42,7 @@ jobs:
   lint:
     uses: ./.github/workflows/pre-commit.yml
     with:
-      vllm: 17c540a993af88204ad1b78345c8a865cf58ce44
+      vllm: c9461e05a4ed3557cfbf4b15ded1e26761cc39ca
 
   changes:
     runs-on: ubuntu-latest
@@ -83,7 +83,7 @@ jobs:
       VLLM_USE_MODELSCOPE: True
     strategy:
       matrix:
-        vllm_version: [17c540a993af88204ad1b78345c8a865cf58ce44, v0.11.0]
+        vllm_version: [c9461e05a4ed3557cfbf4b15ded1e26761cc39ca, v0.11.0]
     steps:
       - name: Install packages
         run: |
@@ -140,7 +140,7 @@ jobs:
     name: e2e-light
     strategy:
       matrix:
-        vllm_version: [17c540a993af88204ad1b78345c8a865cf58ce44, v0.11.0]
+        vllm_version: [c9461e05a4ed3557cfbf4b15ded1e26761cc39ca, v0.11.0]
     # Note (yikun): If CI resource are limited we can split job into two chain jobs
     needs: [lint, changes]
     # only trigger e2e test after lint passed and the change is e2e related with pull request.
```

#### .github/workflows/vllm_ascend_test_full.yaml (2 changed lines)

```diff
@@ -69,7 +69,7 @@ jobs:
     name: e2e-full
     strategy:
       matrix:
-        vllm_version: [17c540a993af88204ad1b78345c8a865cf58ce44, v0.11.0]
+        vllm_version: [c9461e05a4ed3557cfbf4b15ded1e26761cc39ca, v0.11.0]
     needs: [changes]
     if: ${{ needs.changes.outputs.e2e_tracker == 'true' }}
     uses: ./.github/workflows/_e2e_test.yaml
```
```diff
@@ -63,7 +63,11 @@ import torch
 from vllm import LLM, SamplingParams
 from vllm.distributed.parallel_state import (  # noqa E402
     destroy_distributed_environment, destroy_model_parallel)
-from vllm.utils import get_open_port
+from vllm_ascend.utils import vllm_version_is
+
+if vllm_version_is("0.11.0"):
+    from vllm.utils import get_open_port
+else:
+    from vllm.utils.network_utils import get_open_port
 
 os.environ["VLLM_USE_MODELSCOPE"] = "True"
 os.environ["VLLM_WORKER_MULTIPROC_METHOD"] = "spawn"
@@ -66,8 +66,14 @@ import torch
 from vllm import LLM, SamplingParams
 from vllm.distributed.parallel_state import (  # noqa E402
     destroy_distributed_environment, destroy_model_parallel, get_tp_group)
-from vllm.utils import get_open_port, GiB_bytes
 from safetensors.torch import load_file
+from vllm_ascend.utils import vllm_version_is
+
+if vllm_version_is("0.11.0"):
+    from vllm.utils import GiB_bytes, get_open_port
+
+else:
+    from vllm.utils.mem_constants import GiB_bytes
+    from vllm.utils.network_utils import get_open_port
 
 os.environ["VLLM_USE_MODELSCOPE"] = "True"
 os.environ["VLLM_WORKER_MULTIPROC_METHOD"] = "spawn"
@@ -20,7 +20,11 @@ import os
 
 import torch
 from vllm import LLM, SamplingParams
-from vllm.utils import GiB_bytes
+from vllm_ascend.utils import vllm_version_is
+
+if vllm_version_is("0.11.0"):
+    from vllm.utils import GiB_bytes
+else:
+    from vllm.utils.mem_constants import GiB_bytes
 
 os.environ["VLLM_USE_MODELSCOPE"] = "True"
 os.environ["VLLM_WORKER_MULTIPROC_METHOD"] = "spawn"
```
```diff
@@ -45,7 +45,6 @@ from vllm.inputs import TextPrompt
 from vllm.outputs import RequestOutput
 from vllm.platforms import current_platform
 from vllm.transformers_utils.utils import maybe_model_redirect
-from vllm.utils import get_open_port
 
 from tests.e2e.model_utils import (TokensTextLogprobs,
                                    TokensTextLogprobsPromptLogprobs)
@@ -55,6 +54,12 @@ from vllm_ascend.ascend_config import clear_ascend_config
 # we not explicitly patch here, some of them might be effectiveless
 # in pytest scenario
 from vllm_ascend.utils import adapt_patch  # noqa E402
+from vllm_ascend.utils import vllm_version_is
+
+if vllm_version_is("0.11.0"):
+    from vllm.utils import get_open_port
+else:
+    from vllm.utils.network_utils import get_open_port
 
 adapt_patch(True)
 adapt_patch(False)
@@ -19,9 +19,14 @@ from typing import Any
 
 import openai
 import pytest
-from vllm.utils import get_open_port
 
 from tests.e2e.conftest import RemoteOpenAIServer
+from vllm_ascend.utils import vllm_version_is
+
+if vllm_version_is("0.11.0"):
+    from vllm.utils import get_open_port
+else:
+    from vllm.utils.network_utils import get_open_port
 
 MODELS = [
     "Qwen/Qwen3-30B-A3B",
@@ -18,10 +18,15 @@ from typing import Any
 
 import openai
 import pytest
-from vllm.utils import get_open_port
 
 from tests.e2e.conftest import RemoteOpenAIServer
 from tools.aisbench import run_aisbench_cases
+from vllm_ascend.utils import vllm_version_is
+
+if vllm_version_is("0.11.0"):
+    from vllm.utils import get_open_port
+else:
+    from vllm.utils.network_utils import get_open_port
 
 MODELS = [
     "Qwen/Qwen3-32B",
@@ -21,11 +21,16 @@ import gc
 
 import torch
 from vllm import SamplingParams
-from vllm.utils import GiB_bytes
 
 from tests.e2e.conftest import VllmRunner
 from tests.e2e.utils import fork_new_process_for_each_test
 from vllm_ascend.device_allocator.camem import CaMemAllocator
+from vllm_ascend.utils import vllm_version_is
+
+if vllm_version_is("0.11.0"):
+    from vllm.utils import GiB_bytes
+else:
+    from vllm.utils.mem_constants import GiB_bytes
 
 
 @fork_new_process_for_each_test
```
```diff
@@ -78,21 +78,6 @@ class TestAscendSchedulerConfig(TestBase):
             str(context.exception),
         )
 
-    def test_not_implemented_send_delta_data(self):
-        with self.assertRaises(NotImplementedError) as context:
-            AscendSchedulerConfig.initialize_from_config(
-                self.basic_scheduler_config,
-                AscendSchedulerConfig(
-                    send_delta_data=True,
-                    max_num_batched_tokens=2048,
-                    max_model_len=2048,
-                ),
-            )
-        self.assertIn(
-            "currently AscendScheduler doesn't support send_delta_data",
-            str(context.exception),
-        )
-
     def test_no_override(self):
         ascend_config = AscendSchedulerConfig.initialize_from_config(
             self.basic_scheduler_config, {})
```
```diff
@@ -9,7 +9,6 @@ from vllm.config import (CacheConfig, KVTransferConfig, ModelConfig,
 from vllm.multimodal.inputs import (MultiModalFeatureSpec,
                                     MultiModalKwargsItem, PlaceholderRange)
 from vllm.sampling_params import SamplingParams
-from vllm.utils import sha256
 from vllm.v1.core.kv_cache_utils import (get_request_block_hasher,
                                          init_none_hash)
 from vllm.v1.core.sched.output import SchedulerOutput
@@ -24,6 +23,11 @@ from vllm_ascend.core.scheduler import AscendScheduler
 from vllm_ascend.core.scheduler_dynamic_batch import SchedulerDynamicBatch
 from vllm_ascend.utils import vllm_version_is
 
+if vllm_version_is("0.11.0"):
+    from vllm.utils import sha256
+else:
+    from vllm.utils.hashing import sha256
+
 EOS_TOKEN_ID = 50256
 MODEL = "Qwen3-0.6B"
 ENABLE_PREFIX_CACHING = None
@@ -12,7 +12,13 @@ from unittest.mock import MagicMock, patch
 
 import msgspec
 import zmq
-from vllm.utils import make_zmq_path
+from vllm_ascend.utils import vllm_version_is
+
+if vllm_version_is("0.11.0"):
+    from vllm.utils import make_zmq_path
+else:
+    from vllm.utils.network_utils import make_zmq_path
 
 fake_engine = types.ModuleType("mooncake.engine")
 fake_engine.TransferEngine = MagicMock()  # type: ignore[attr-defined]
@@ -10,7 +10,6 @@ import torch
 from vllm import SamplingParams
 from vllm.config import (CacheConfig, DeviceConfig, KVTransferConfig,
                          ModelConfig, SchedulerConfig, VllmConfig)
-from vllm.utils import sha256
 from vllm.v1.core.kv_cache_utils import (get_request_block_hasher,
                                          init_none_hash)
 from vllm.v1.core.sched.scheduler import Scheduler
@@ -22,6 +21,11 @@ from vllm.v1.structured_output import StructuredOutputManager
 
 from vllm_ascend.utils import vllm_version_is
 
+if vllm_version_is("0.11.0"):
+    from vllm.utils import sha256
+else:
+    from vllm.utils.hashing import sha256
+
 EOS_TOKEN_ID = 50256
 os.environ["VLLM_USE_V1"] = "1"
 
```
```diff
@@ -22,6 +22,7 @@ import torch
 from torch import nn
 
 from vllm_ascend.model_loader.netloader.netloader import ModelNetLoaderElastic
+from vllm_ascend.utils import vllm_version_is
 
 
 class DummyDeviceConfig:
@@ -173,7 +174,11 @@ def test_load_model_elastic_success(mock_logger, monkeypatch, tmp_path):
         "vllm_ascend.model_loader.netloader.netloader.process_weights_after_loading",
         lambda *a, **k: None)
     # patch get_ip
-    monkeypatch.setattr("vllm.utils.get_ip", lambda: "127.0.0.1")
+    if vllm_version_is("0.11.0"):
+        monkeypatch.setattr("vllm.utils.get_ip", lambda: "127.0.0.1")
+    else:
+        monkeypatch.setattr("vllm.utils.network_utils.get_ip",
+                            lambda: "127.0.0.1")
     # patch find_free_port
     monkeypatch.setattr(
         "vllm_ascend.model_loader.netloader.netloader.find_free_port",
@@ -20,14 +20,19 @@ import numpy as np
 import pytest
 import torch
 from vllm.sampling_params import SamplingParams
-from vllm.utils import make_tensor_with_pad
 from vllm.v1.pool.metadata import PoolingMetadata
 from vllm.v1.sample.logits_processor import LogitsProcessors
 from vllm.v1.sample.metadata import SamplingMetadata
 
+from vllm_ascend.utils import vllm_version_is
 from vllm_ascend.worker.block_table import BlockTable, MultiGroupBlockTable
 from vllm_ascend.worker.npu_input_batch import CachedRequestState, InputBatch
 
+if vllm_version_is("0.11.0"):
+    from vllm.utils import make_tensor_with_pad
+else:
+    from vllm.utils.torch_utils import make_tensor_with_pad
+
 VOCAB_SIZE = 1024
 NUM_OUTPUT_TOKENS = 20
 MAX_PROMPT_SIZE = 100
@@ -5,6 +5,7 @@ import torch
 from vllm.config import CacheConfig, ModelConfig, ParallelConfig, VllmConfig
 
 from tests.ut.base import TestBase
+from vllm_ascend.utils import vllm_version_is
 
 
 class TestNPUWorker(TestBase):
@@ -178,6 +179,7 @@ class TestNPUWorker(TestBase):
         # Create NPUWorker instance
         from vllm_ascend.worker.worker_v1 import NPUWorker
 
+        if vllm_version_is("0.11.0"):
             with patch("vllm.utils.STR_DTYPE_TO_TORCH_DTYPE",
                        {"float32": torch.float32}):
                 worker = NPUWorker(
@@ -187,6 +189,16 @@ class TestNPUWorker(TestBase):
                     distributed_init_method=self.distributed_init_method,
                     is_driver_worker=self.is_driver_worker,
                 )
+        else:
+            with patch("vllm.utils.torch_utils.STR_DTYPE_TO_TORCH_DTYPE",
+                       {"float32": torch.float32}):
+                worker = NPUWorker(
+                    vllm_config=self.vllm_config_mock,
+                    local_rank=self.local_rank,
+                    rank=self.rank,
+                    distributed_init_method=self.distributed_init_method,
+                    is_driver_worker=self.is_driver_worker,
+                )
 
         # Verify cache_dtype is set to custom value
         self.assertEqual(worker.cache_dtype, torch.float32)
```
```diff
@@ -99,9 +99,6 @@ class AscendSchedulerConfig(SchedulerConfig):
             raise NotImplementedError(
                 f"currently AscendScheduler only supports fcfs policy, got {self.policy}"
             )
-        if self.send_delta_data:
-            raise NotImplementedError(
-                "currently AscendScheduler doesn't support send_delta_data.")
         if getattr(self, "scheduler_delay_factor", 0) > 0:
             raise NotImplementedError(
                 "currently AscendScheduler doesn't support scheduler_delay_factor."
```
```diff
@@ -9,11 +9,18 @@ import torch
 import vllm.envs as envs
 import zmq
 from vllm.config import KVTransferConfig, VllmConfig
-from vllm.utils import get_dtype_size, logger, make_zmq_socket
+from vllm.utils import logger
 from vllm.v1.kv_cache_interface import AttentionSpec
 
 from vllm_ascend.distributed.cpu_offload_manager.cpu_kv_cache_manager import \
     CPUKVCacheManager
+from vllm_ascend.utils import vllm_version_is
+
+if vllm_version_is("0.11.0"):
+    from vllm.utils import get_dtype_size, make_zmq_socket
+else:
+    from vllm.utils.network_utils import make_zmq_socket
+    from vllm.utils.torch_utils import get_dtype_size
 
 
 @dataclass
@@ -25,19 +25,25 @@ from vllm.distributed.kv_transfer.kv_connector.v1.base import (
 from vllm.distributed.parallel_state import (get_dcp_group, get_tp_group,
                                              get_world_group)
 from vllm.forward_context import ForwardContext
-from vllm.utils import get_ip, logger
+from vllm.utils import logger
 from vllm.v1.core.kv_cache_manager import KVCacheBlocks
 from vllm.v1.core.sched.output import SchedulerOutput
 from vllm.v1.request import Request, RequestStatus
 
 import vllm_ascend.envs as envs_ascend
 from vllm_ascend.utils import (AscendSocVersion, get_ascend_soc_version,
-                               prefill_context_parallel_enable)
+                               prefill_context_parallel_enable,
+                               vllm_version_is)
 
 if prefill_context_parallel_enable():
     from vllm.distributed.parallel_state import \
         get_prefill_context_model_parallel_rank
 
+if vllm_version_is("0.11.0"):
+    from vllm.utils import get_ip
+else:
+    from vllm.utils.network_utils import get_ip
+
 TORCH_DTYPE_TO_NPU_DTYPE = {
     torch.half: llm_datadist.DataType.DT_FLOAT16,
     torch.float16: llm_datadist.DataType.DT_FLOAT16,
```
```diff
@@ -7,7 +7,7 @@ from typing import Generator, List, Optional, Union
 # Third Party
 import torch
 from vllm.config import VllmConfig
-from vllm.utils import get_kv_cache_torch_dtype, logger
+from vllm.utils import logger
 
 from vllm_ascend.distributed.mooncake.config_data import (
     ChunkedTokenDatabase, LasyerMultiBlockReqMeta, MooncakeConnectorMetadata,
@@ -16,6 +16,12 @@ from vllm_ascend.distributed.mooncake.kv_transfer import (
     KVCacheStoreLayerRecvingThread, KVCacheStoreLayerSendingThread,
     KVCacheStoreRecvingThread, KVCacheStoreSendingThread, KVTransferThread)
 from vllm_ascend.distributed.mooncake.mooncake_store import Mooncakestore
+from vllm_ascend.utils import vllm_version_is
+
+if vllm_version_is("0.11.0"):
+    from vllm.utils import get_kv_cache_torch_dtype
+else:
+    from vllm.utils.torch_utils import get_kv_cache_torch_dtype
 
 
 class MooncakeEngine:
@@ -26,13 +26,19 @@ from vllm.distributed.kv_transfer.kv_connector.v1.base import (
     KVConnectorBase_V1, KVConnectorMetadata, KVConnectorRole)
 from vllm.distributed.parallel_state import (get_tensor_model_parallel_rank,
                                              get_tp_group)
-from vllm.utils import get_ip, logger, make_zmq_path, make_zmq_socket
+from vllm.utils import logger
 from vllm.v1.core.sched.output import SchedulerOutput
 from vllm.v1.request import RequestStatus
 
 import vllm_ascend.envs as envs_ascend
 from vllm_ascend.ascend_config import get_ascend_config, init_ascend_config
 from vllm_ascend.distributed.mooncake.transfer_engine import get_global_te
+from vllm_ascend.utils import vllm_version_is
+
+if vllm_version_is("0.11.0"):
+    from vllm.utils import get_ip, make_zmq_path, make_zmq_socket
+else:
+    from vllm.utils.network_utils import get_ip, make_zmq_path, make_zmq_socket
 
 if TYPE_CHECKING:
     from vllm.attention.backends.abstract import AttentionMetadata
@@ -26,7 +26,7 @@ from vllm.distributed.kv_transfer.kv_connector.v1.base import (
     KVConnectorBase_V1, KVConnectorMetadata, KVConnectorRole)
 from vllm.distributed.parallel_state import (get_tensor_model_parallel_rank,
                                              get_tp_group, get_world_group)
-from vllm.utils import get_ip, logger, make_zmq_path, make_zmq_socket
+from vllm.utils import logger
 from vllm.v1.core.sched.output import SchedulerOutput
 from vllm.v1.request import RequestStatus
 
@@ -34,6 +34,12 @@ import vllm_ascend.envs as envs_ascend
 from vllm_ascend.ascend_config import get_ascend_config
 from vllm_ascend.distributed.utils import (align_memory,
                                            kv_alltoall_and_rearrange)
+from vllm_ascend.utils import vllm_version_is
+
+if vllm_version_is("0.11.0"):
+    from vllm.utils import get_ip, make_zmq_path, make_zmq_socket
+else:
+    from vllm.utils.network_utils import get_ip, make_zmq_path, make_zmq_socket
 
 if TYPE_CHECKING:
     from vllm.attention.backends.abstract import AttentionMetadata
```
```diff
@@ -28,12 +28,19 @@ from vllm.model_executor.model_loader import register_model_loader
 from vllm.model_executor.model_loader.base_loader import BaseModelLoader
 from vllm.model_executor.model_loader.default_loader import DefaultModelLoader
 from vllm.model_executor.model_loader.utils import (
-    initialize_model, process_weights_after_loading, set_default_torch_dtype)
+    initialize_model, process_weights_after_loading)
 
+from vllm_ascend.utils import vllm_version_is
+
 from .interaction.elastic import ElasticServer
 from .load import elastic_load
 from .utils import find_free_port, is_valid_path_prefix
 
+if vllm_version_is("0.11.0"):
+    from vllm.model_executor.model_loader.utils import set_default_torch_dtype
+else:
+    from vllm.utils.torch_utils import set_default_torch_dtype
+
 
 @register_model_loader("netloader")
 class ModelNetLoaderElastic(BaseModelLoader):
@@ -200,7 +207,10 @@ class ModelNetLoaderElastic(BaseModelLoader):
         if model is not None and (
             (self.listen_port and self.listen_port in range(1024, 65535)) or
             (self.listen_port is None)):
+            if vllm_version_is("0.11.0"):
                 from vllm.utils import get_ip
+            else:
+                from vllm.utils.network_utils import get_ip
             driver_ip = get_ip()
 
             if driver_ip == '0.0.0.0':
```
```diff
@@ -29,7 +29,6 @@ from vllm.distributed import get_tensor_model_parallel_world_size
 from vllm.forward_context import ForwardContext, get_forward_context
 from vllm.model_executor.layers.mla import MLAModules
 from vllm.model_executor.layers.quantization import QuantizationConfig
-from vllm.utils import direct_register_custom_op
 
 from vllm_ascend.ascend_config import get_ascend_config
 from vllm_ascend.utils import vllm_version_is
@@ -38,9 +37,11 @@ if vllm_version_is("0.11.0"):
     from vllm.attention import Attention
     from vllm.model_executor.layers.mla import \
         MultiHeadLatentAttention as MultiHeadLatentAttentionWrapper
+    from vllm.utils import direct_register_custom_op
 else:
     from vllm.attention.layer import MLAAttention
     from vllm.model_executor.layers.mla import MultiHeadLatentAttentionWrapper
+    from vllm.utils.torch_utils import direct_register_custom_op
 
 if vllm_version_is("0.11.0"):
     from vllm.attention import Attention
@@ -31,7 +31,6 @@ from vllm.forward_context import ForwardContext, get_forward_context
 from vllm.model_executor.layers.linear import ReplicatedLinear
 from vllm.model_executor.layers.mla import MLAModules
 from vllm.model_executor.layers.quantization import QuantizationConfig
-from vllm.utils import direct_register_custom_op
 
 from vllm_ascend.ascend_config import get_ascend_config
 from vllm_ascend.utils import vllm_version_is
@@ -40,9 +39,11 @@ if vllm_version_is("0.11.0"):
     from vllm.attention import Attention
     from vllm.model_executor.layers.mla import \
         MultiHeadLatentAttention as MultiHeadLatentAttentionWrapper
+    from vllm.utils import direct_register_custom_op
 else:
     from vllm.attention.layer import MLAAttention
     from vllm.model_executor.layers.mla import MultiHeadLatentAttentionWrapper
+    from vllm.utils.torch_utils import direct_register_custom_op
 
 
 @dataclass
```
```diff
@@ -435,10 +435,12 @@ class AscendSharedFusedMoE(SharedFusedMoE, AscendFusedMoE):
     def __init__(
         self,
         shared_experts: torch.nn.Module,
+        gate: Optional[torch.nn.Module] = None,
         use_overlapped: bool = True,
         **kwargs,
     ):
         AscendFusedMoE.__init__(self, **kwargs)
+
         self._shared_experts = shared_experts
         self.use_overlapped = use_overlapped
         self.shared_expert_stream = None
@@ -449,6 +451,16 @@ class AscendSharedFusedMoE(SharedFusedMoE, AscendFusedMoE):
                 "Sequence parallelism is enabled, shared experts are replicated for best performance."
             )
 
+        self._gate = gate
+
+    @property
+    def gate(self) -> Optional[torch.nn.Module]:
+        return self._gate if self.use_overlapped else None
+
+    @property
+    def is_internal_router(self) -> bool:
+        return False
+
     def forward(
         self,
         hidden_states: torch.Tensor,
```
```diff
@@ -7,12 +7,17 @@ from vllm.distributed import (get_dp_group, get_ep_group,
                               tensor_model_parallel_all_reduce,
                               tensor_model_parallel_reduce_scatter)
 from vllm.forward_context import get_forward_context
-from vllm.utils import direct_register_custom_op
 
 import vllm_ascend.envs as envs_ascend
 from vllm_ascend.ascend_forward_context import MoECommType
 from vllm_ascend.ops.weight_prefetch import maybe_npu_prefetch
-from vllm_ascend.utils import npu_stream_switch, prefetch_stream
+from vllm_ascend.utils import (npu_stream_switch, prefetch_stream,
+                               vllm_version_is)
+
+if vllm_version_is("0.11.0"):
+    from vllm.utils import direct_register_custom_op
+else:
+    from vllm.utils.torch_utils import direct_register_custom_op
 
 
 def _maybe_all_gather_and_maybe_unpad_impl(
@@ -3,9 +3,16 @@ import vllm.model_executor.models.config
 from vllm.logger import init_logger
 from vllm.model_executor.models import ModelRegistry
 from vllm.model_executor.models.config import MambaModelConfig
-from vllm.utils import STR_DTYPE_TO_TORCH_DTYPE, cdiv
+from vllm.utils import cdiv
 from vllm.v1.kv_cache_interface import FullAttentionSpec, MambaSpec
 
+from vllm_ascend.utils import vllm_version_is
+
+if vllm_version_is("0.11.0"):
+    from vllm.utils import STR_DTYPE_TO_TORCH_DTYPE
+else:
+    from vllm.utils.torch_utils import STR_DTYPE_TO_TORCH_DTYPE
+
 
 @classmethod
 def verify_and_update_config(cls, vllm_config) -> None:
@@ -8,13 +8,21 @@ import vllm.v1.executor.multiproc_executor
 from vllm import envs
 from vllm.config import VllmConfig
 from vllm.distributed.device_communicators.shm_broadcast import MessageQueue
-from vllm.utils import (get_distributed_init_method, get_loopback_ip,
-                        get_mp_context, get_open_port)
+from vllm.utils import get_mp_context
 from vllm.v1.executor.abstract import FailureCallback
 from vllm.v1.executor.multiproc_executor import (
     MultiprocExecutor, UnreadyWorkerProcHandle, WorkerProc,
     set_multiprocessing_worker_envs)
 
+from vllm_ascend.utils import vllm_version_is
+
+if vllm_version_is("0.11.0"):
+    from vllm.utils import (get_distributed_init_method, get_loopback_ip,
+                            get_open_port)
+else:
+    from vllm.utils.network_utils import (get_distributed_init_method,
+                                          get_loopback_ip, get_open_port)
+
 
 class AscendMultiprocExecutor(MultiprocExecutor):
     supports_pp: bool = True
```
```diff
@@ -3,7 +3,13 @@ from torch.nn.parameter import Parameter
 from vllm.logger import init_logger
 from vllm.model_executor.layers.linear import UnquantizedLinearMethod
 from vllm.model_executor.utils import set_weight_attrs
-from vllm.utils import GiB_bytes
+
+from vllm_ascend.utils import vllm_version_is
+
+if vllm_version_is("0.11.0"):
+    from vllm.utils import GiB_bytes
+else:
+    from vllm.utils.mem_constants import GiB_bytes
 
 logger = init_logger(__name__)
 
@@ -24,6 +24,9 @@ import vllm.envs as envs_vllm
 from vllm.logger import logger
 from vllm.platforms import Platform, PlatformEnum
 
+# todo: please remove it when solve cuda hard code in vllm
+os.environ["VLLM_DISABLE_SHARED_EXPERTS_STREAM"] = "True"
+
 from vllm_ascend.ascend_config import (check_ascend_config, get_ascend_config,
                                        init_ascend_config)
 from vllm_ascend.torchair.utils import (check_torchair_cache_exist,
@@ -142,7 +145,6 @@ class NPUPlatform(Platform):
         if not model_config.is_multimodal_model and \
             structured_outputs_config.backend == "auto" and \
             not getattr(scheduler_config, "scheduler_delay_factor", 0) > 0 and \
-            not scheduler_config.send_delta_data and \
             scheduler_config.policy == "fcfs":
             ascend_scheduler_config.enabled = True
             chunked_prefill_enabled_in_ascend_scheduler = getattr(
```
```diff
@@ -9,8 +9,8 @@ from vllm.config import (CUDAGraphMode, VllmConfig,
 from vllm.forward_context import BatchDescriptor, get_forward_context
 from vllm.model_executor.layers.attention_layer_base import AttentionLayerBase
 from vllm.model_executor.model_loader import get_model_loader
-from vllm.model_executor.model_loader.utils import (
-    process_weights_after_loading, set_default_torch_dtype)
+from vllm.model_executor.model_loader.utils import \
+    process_weights_after_loading
 from vllm.model_executor.models.deepseek_mtp import DeepSeekMTP
 from vllm.v1.core.sched.output import SchedulerOutput
 from vllm.v1.sample.metadata import SamplingMetadata
@@ -24,7 +24,13 @@ from vllm_ascend.torchair.models.torchair_deepseek_mtp import \
     TorchairDeepSeekMTP
 from vllm_ascend.torchair.utils import (TORCHAIR_CACHE_DIR,
                                         TorchairCommonAttentionMetadata)
-from vllm_ascend.utils import ProfileExecuteDuration, lmhead_tp_enable
+from vllm_ascend.utils import (ProfileExecuteDuration, lmhead_tp_enable,
+                               vllm_version_is)
+
+if vllm_version_is("0.11.0"):
+    from vllm.model_executor.model_loader.utils import set_default_torch_dtype
+else:
+    from vllm.utils.torch_utils import set_default_torch_dtype
 
 PADDING_SLOT_ID = -1
 
@@ -72,8 +72,7 @@ from vllm.pooling_params import PoolingParams
 from vllm.sampling_params import SamplingType
 from vllm.sequence import IntermediateTensors
 from vllm.tasks import GenerationTask, PoolingTask, SupportedTask
-from vllm.utils import (STR_DTYPE_TO_TORCH_DTYPE, DeviceMemoryProfiler, cdiv,
-                        get_dtype_size, is_pin_memory_available)
+from vllm.utils import cdiv, is_pin_memory_available
 from vllm.utils.jsontree import json_map_leaves
 from vllm.v1.attention.backends.gdn_attn import GDNAttentionMetadataBuilder
 from vllm.v1.attention.backends.utils import (
@@ -145,6 +144,13 @@ if prefill_context_parallel_enable():
     get_prefill_context_model_parallel_rank,
     get_prefill_context_model_parallel_world_size)
 
+if vllm_version_is("0.11.0"):
+    from vllm.utils import (STR_DTYPE_TO_TORCH_DTYPE, DeviceMemoryProfiler,
+                            get_dtype_size)
+else:
+    from vllm.utils.mem_utils import DeviceMemoryProfiler
+    from vllm.utils.torch_utils import STR_DTYPE_TO_TORCH_DTYPE, get_dtype_size
+
 # yapf: enable
 
 if vllm_version_is("0.11.0"):
```
```diff
@@ -44,7 +44,7 @@ from vllm_ascend.worker.block_table import MultiGroupBlockTable
 if vllm_version_is("0.11.0"):
     from vllm.utils import swap_dict_values
 else:
-    from vllm.utils.collections import swap_dict_values
+    from vllm.utils.collection_utils import swap_dict_values
 
 
 @dataclass
@@ -35,7 +35,6 @@ from vllm.logger import logger
 from vllm.lora.request import LoRARequest
 from vllm.sequence import IntermediateTensors
 from vllm.tasks import SupportedTask
-from vllm.utils import STR_DTYPE_TO_TORCH_DTYPE, GiB_bytes
 from vllm.v1.core.sched.output import SchedulerOutput
 from vllm.v1.kv_cache_interface import KVCacheConfig, KVCacheSpec
 from vllm.v1.outputs import (EMPTY_MODEL_RUNNER_OUTPUT, AsyncModelRunnerOutput,
@@ -51,7 +50,7 @@ from vllm_ascend.platform import NPUPlatform
 from vllm_ascend.utils import (init_ascend_soc_version,
                                prefill_context_parallel_enable,
                                register_ascend_customop, sleep_mode_enabled,
-                               try_register_lib)
+                               try_register_lib, vllm_version_is)
 from vllm_ascend.worker.model_runner_v1 import NPUModelRunner
 
 torch._dynamo.trace_rules.clear_lru_cache()  # noqa: E402
@@ -66,6 +65,12 @@ torch_non_c_binding_in_graph_functions_npu[
 torch._dynamo.trace_rules.torch_name_rule_map.append(
     torch_non_c_binding_in_graph_functions_npu)  # noqa: E402
 
+if vllm_version_is("0.11.0"):
+    from vllm.utils import STR_DTYPE_TO_TORCH_DTYPE, GiB_bytes
+else:
+    from vllm.utils.mem_constants import GiB_bytes
+    from vllm.utils.torch_utils import STR_DTYPE_TO_TORCH_DTYPE
+
 
 class NPUWorker(WorkerBase):
 
```