Drop 0.11.0 support (#4377)
There is a lot hack code for v0.11.0, which makes the code hard to
upgrade to newer vLLM version. Since v0.11.0 will release soon. Let's
drop v0.11.0 support first. Then we'll upgrade to v0.11.2 soon.
- vLLM version: v0.11.0
- vLLM main:
2918c1b49c
Signed-off-by: wangxiyuan <wangxiyuan1007@gmail.com>
This commit is contained in:
@@ -32,7 +32,7 @@ on:
|
||||
description: how many pods will be pulled up via lws.yaml, indicates number of nodes we need
|
||||
vllm_version:
|
||||
required: false
|
||||
default: "v0.11.0"
|
||||
default: "2918c1b49c88c29783c86f78d2c4221cb9622379"
|
||||
type: string
|
||||
description: vllm version to use
|
||||
vllm_ascend_remote_url:
|
||||
|
||||
2
.github/workflows/nightly_benchmarks.yaml
vendored
2
.github/workflows/nightly_benchmarks.yaml
vendored
@@ -51,7 +51,7 @@ jobs:
|
||||
strategy:
|
||||
matrix:
|
||||
include:
|
||||
- vllm_branch: v0.11.0
|
||||
- vllm_branch: 2918c1b49c88c29783c86f78d2c4221cb9622379
|
||||
vllm_ascend_branch: main
|
||||
max-parallel: 1
|
||||
container:
|
||||
|
||||
4
.github/workflows/vllm_ascend_test.yaml
vendored
4
.github/workflows/vllm_ascend_test.yaml
vendored
@@ -83,7 +83,7 @@ jobs:
|
||||
VLLM_USE_MODELSCOPE: True
|
||||
strategy:
|
||||
matrix:
|
||||
vllm_version: [2918c1b49c88c29783c86f78d2c4221cb9622379, v0.11.0]
|
||||
vllm_version: [2918c1b49c88c29783c86f78d2c4221cb9622379]
|
||||
steps:
|
||||
- name: Install packages
|
||||
run: |
|
||||
@@ -138,7 +138,7 @@ jobs:
|
||||
name: e2e-light
|
||||
strategy:
|
||||
matrix:
|
||||
vllm_version: [2918c1b49c88c29783c86f78d2c4221cb9622379, v0.11.0]
|
||||
vllm_version: [2918c1b49c88c29783c86f78d2c4221cb9622379]
|
||||
# Note (yikun): If CI resource are limited we can split job into two chain jobs
|
||||
needs: [lint, changes]
|
||||
# only trigger e2e test after lint passed and the change is e2e related with pull request.
|
||||
|
||||
2
.github/workflows/vllm_ascend_test_full.yaml
vendored
2
.github/workflows/vllm_ascend_test_full.yaml
vendored
@@ -69,7 +69,7 @@ jobs:
|
||||
name: e2e-full
|
||||
strategy:
|
||||
matrix:
|
||||
vllm_version: [2918c1b49c88c29783c86f78d2c4221cb9622379, v0.11.0]
|
||||
vllm_version: [2918c1b49c88c29783c86f78d2c4221cb9622379]
|
||||
needs: [changes]
|
||||
if: ${{ needs.changes.outputs.e2e_tracker == 'true' }}
|
||||
uses: ./.github/workflows/_e2e_test.yaml
|
||||
|
||||
@@ -86,7 +86,7 @@ jobs:
|
||||
tests: tests/e2e/nightly/ops
|
||||
uses: ./.github/workflows/_e2e_nightly_single_node.yaml
|
||||
with:
|
||||
vllm: v0.11.0
|
||||
vllm: 2918c1b49c88c29783c86f78d2c4221cb9622379
|
||||
runner: ${{ matrix.test_config.os }}
|
||||
tests: ${{ matrix.test_config.tests }}
|
||||
image: 'swr.cn-southwest-2.myhuaweicloud.com/base_image/ascend-ci/vllm-ascend:nightly-a2'
|
||||
@@ -125,7 +125,7 @@ jobs:
|
||||
- Qwen3-Next-80B-A3B-Instruct
|
||||
uses: ./.github/workflows/_e2e_nightly_single_node_models.yaml
|
||||
with:
|
||||
vllm: v0.11.0
|
||||
vllm: 2918c1b49c88c29783c86f78d2c4221cb9622379
|
||||
runner: ${{ matrix.test_config.os }}
|
||||
model_list: ${{ toJson(matrix.test_config.model_list) }}
|
||||
image: swr.cn-southwest-2.myhuaweicloud.com/base_image/ascend-ci/cann:8.2.rc1-910b-ubuntu22.04-py3.11
|
||||
|
||||
@@ -136,7 +136,7 @@ jobs:
|
||||
tests: tests/e2e/nightly/models/test_deepseek_v3_2_exp_w8a8.py
|
||||
uses: ./.github/workflows/_e2e_nightly_single_node.yaml
|
||||
with:
|
||||
vllm: v0.11.0
|
||||
vllm: 2918c1b49c88c29783c86f78d2c4221cb9622379
|
||||
runner: ${{ matrix.test_config.os }}
|
||||
image: 'swr.cn-southwest-2.myhuaweicloud.com/base_image/ascend-ci/vllm-ascend:nightly-a3'
|
||||
tests: ${{ matrix.test_config.tests }}
|
||||
|
||||
@@ -72,7 +72,7 @@ jobs:
|
||||
- DeepSeek-V2-Lite
|
||||
uses: ./.github/workflows/_e2e_nightly_single_node_models.yaml
|
||||
with:
|
||||
vllm: v0.11.0
|
||||
vllm: 2918c1b49c88c29783c86f78d2c4221cb9622379
|
||||
runner: ${{ matrix.runner }}
|
||||
image: swr.cn-southwest-2.myhuaweicloud.com/base_image/ascend-ci/cann:8.3.rc1-910b-ubuntu22.04-py3.11
|
||||
model_list: ${{ toJson(matrix.model_list) }}
|
||||
|
||||
@@ -46,8 +46,10 @@ RUN pip config set global.index-url ${PIP_INDEX_URL}
|
||||
|
||||
# Install vLLM
|
||||
ARG VLLM_REPO=https://github.com/vllm-project/vllm.git
|
||||
ARG VLLM_TAG=v0.11.0
|
||||
RUN git clone --depth 1 $VLLM_REPO --branch $VLLM_TAG /vllm-workspace/vllm
|
||||
ARG VLLM_TAG=2918c1b49c88c29783c86f78d2c4221cb9622379
|
||||
# Revert this change once VLLM_TAG is specified to branch or tag
|
||||
# RUN git clone --depth 1 $VLLM_REPO --branch $VLLM_TAG /vllm-workspace/vllm
|
||||
RUN git clone $VLLM_REPO /vllm-workspace/vllm && (cd /vllm-workspace/vllm && git checkout $VLLM_TAG)
|
||||
# In x86, triton will be installed by vllm. But in Ascend, triton doesn't work correctly. we need to uninstall it.
|
||||
RUN VLLM_TARGET_DEVICE="empty" python3 -m pip install -v -e /vllm-workspace/vllm/[audio] --extra-index https://download.pytorch.org/whl/cpu/ && \
|
||||
python3 -m pip uninstall -y triton && \
|
||||
|
||||
@@ -37,8 +37,10 @@ RUN pip config set global.index-url ${PIP_INDEX_URL}
|
||||
|
||||
# Install vLLM
|
||||
ARG VLLM_REPO=https://github.com/vllm-project/vllm.git
|
||||
ARG VLLM_TAG=v0.11.0
|
||||
RUN git clone --depth 1 $VLLM_REPO --branch $VLLM_TAG /vllm-workspace/vllm
|
||||
ARG VLLM_TAG=2918c1b49c88c29783c86f78d2c4221cb9622379
|
||||
# Revert this change once VLLM_TAG is specified to branch or tag
|
||||
# RUN git clone --depth 1 $VLLM_REPO --branch $VLLM_TAG /vllm-workspace/vllm
|
||||
RUN git clone $VLLM_REPO /vllm-workspace/vllm && (cd /vllm-workspace/vllm && git checkout $VLLM_TAG)
|
||||
# In x86, triton will be installed by vllm. But in Ascend, triton doesn't work correctly. we need to uninstall it.
|
||||
RUN VLLM_TARGET_DEVICE="empty" python3 -m pip install -v -e /vllm-workspace/vllm/[audio] --extra-index https://download.pytorch.org/whl/cpu/ && \
|
||||
python3 -m pip uninstall -y triton && \
|
||||
|
||||
@@ -34,9 +34,10 @@ COPY . /vllm-workspace/vllm-ascend/
|
||||
|
||||
# Install vLLM
|
||||
ARG VLLM_REPO=https://github.com/vllm-project/vllm.git
|
||||
ARG VLLM_TAG=v0.11.0
|
||||
|
||||
RUN git clone --depth 1 $VLLM_REPO --branch $VLLM_TAG /vllm-workspace/vllm
|
||||
ARG VLLM_TAG=2918c1b49c88c29783c86f78d2c4221cb9622379
|
||||
# Revert this change once VLLM_TAG is specified to branch or tag
|
||||
# RUN git clone --depth 1 $VLLM_REPO --branch $VLLM_TAG /vllm-workspace/vllm
|
||||
RUN git clone $VLLM_REPO /vllm-workspace/vllm && (cd /vllm-workspace/vllm && git checkout $VLLM_TAG)
|
||||
# In x86, triton will be installed by vllm. But in Ascend, triton doesn't work correctly. we need to uninstall it.
|
||||
RUN VLLM_TARGET_DEVICE="empty" python3 -m pip install -e /vllm-workspace/vllm/[audio] --extra-index https://download.pytorch.org/whl/cpu/ && \
|
||||
python3 -m pip uninstall -y triton && \
|
||||
|
||||
@@ -45,8 +45,10 @@ RUN apt-get update -y && \
|
||||
|
||||
# Install vLLM
|
||||
ARG VLLM_REPO=https://github.com/vllm-project/vllm.git
|
||||
ARG VLLM_TAG=v0.11.0
|
||||
RUN git clone --depth 1 $VLLM_REPO --branch $VLLM_TAG /vllm-workspace/vllm
|
||||
ARG VLLM_TAG=2918c1b49c88c29783c86f78d2c4221cb9622379
|
||||
# Revert this change once VLLM_TAG is specified to branch or tag
|
||||
# RUN git clone --depth 1 $VLLM_REPO --branch $VLLM_TAG /vllm-workspace/vllm
|
||||
RUN git clone $VLLM_REPO /vllm-workspace/vllm && (cd /vllm-workspace/vllm && git checkout $VLLM_TAG)
|
||||
# In x86, triton will be installed by vllm. But in Ascend, triton doesn't work correctly. we need to uninstall it.
|
||||
RUN VLLM_TARGET_DEVICE="empty" python3 -m pip install -v -e /vllm-workspace/vllm/[audio] --extra-index https://download.pytorch.org/whl/cpu/ && \
|
||||
python3 -m pip uninstall -y triton && \
|
||||
|
||||
@@ -48,9 +48,10 @@ RUN yum update -y && \
|
||||
|
||||
# Install vLLM
|
||||
ARG VLLM_REPO=https://github.com/vllm-project/vllm.git
|
||||
ARG VLLM_TAG=v0.11.0
|
||||
|
||||
RUN git clone --depth 1 $VLLM_REPO --branch $VLLM_TAG /vllm-workspace/vllm
|
||||
ARG VLLM_TAG=2918c1b49c88c29783c86f78d2c4221cb9622379
|
||||
# Revert this change once VLLM_TAG is specified to branch or tag
|
||||
# RUN git clone --depth 1 $VLLM_REPO --branch $VLLM_TAG /vllm-workspace/vllm
|
||||
RUN git clone $VLLM_REPO /vllm-workspace/vllm && (cd /vllm-workspace/vllm && git checkout $VLLM_TAG)
|
||||
# In x86, triton will be installed by vllm. But in Ascend, triton doesn't work correctly. we need to uninstall it.
|
||||
RUN VLLM_TARGET_DEVICE="empty" python3 -m pip install -e /vllm-workspace/vllm/[audio] --extra-index https://download.pytorch.org/whl/cpu/ && \
|
||||
python3 -m pip uninstall -y triton && \
|
||||
|
||||
@@ -48,9 +48,10 @@ RUN yum update -y && \
|
||||
|
||||
# Install vLLM
|
||||
ARG VLLM_REPO=https://github.com/vllm-project/vllm.git
|
||||
ARG VLLM_TAG=v0.11.0
|
||||
|
||||
RUN git clone --depth 1 $VLLM_REPO --branch $VLLM_TAG /vllm-workspace/vllm
|
||||
ARG VLLM_TAG=2918c1b49c88c29783c86f78d2c4221cb9622379
|
||||
# Revert this change once VLLM_TAG is specified to branch or tag
|
||||
# RUN git clone --depth 1 $VLLM_REPO --branch $VLLM_TAG /vllm-workspace/vllm
|
||||
RUN git clone $VLLM_REPO /vllm-workspace/vllm && (cd /vllm-workspace/vllm && git checkout $VLLM_TAG)
|
||||
# In x86, triton will be installed by vllm. But in Ascend, triton doesn't work correctly. we need to uninstall it.
|
||||
RUN VLLM_TARGET_DEVICE="empty" python3 -m pip install -e /vllm-workspace/vllm/[audio] --extra-index https://download.pytorch.org/whl/cpu/ && \
|
||||
python3 -m pip uninstall -y triton && \
|
||||
|
||||
@@ -63,11 +63,7 @@ import torch
|
||||
from vllm import LLM, SamplingParams
|
||||
from vllm.distributed.parallel_state import ( # noqa E402
|
||||
destroy_distributed_environment, destroy_model_parallel)
|
||||
from vllm_ascend.utils import vllm_version_is
|
||||
if vllm_version_is("0.11.0"):
|
||||
from vllm.utils import get_open_port
|
||||
else:
|
||||
from vllm.utils.network_utils import get_open_port
|
||||
from vllm.utils.network_utils import get_open_port
|
||||
|
||||
os.environ["VLLM_USE_MODELSCOPE"] = "True"
|
||||
os.environ["VLLM_WORKER_MULTIPROC_METHOD"] = "spawn"
|
||||
|
||||
@@ -67,13 +67,8 @@ from vllm import LLM, SamplingParams
|
||||
from vllm.distributed.parallel_state import ( # noqa E402
|
||||
destroy_distributed_environment, destroy_model_parallel, get_tp_group)
|
||||
from safetensors.torch import load_file
|
||||
from vllm_ascend.utils import vllm_version_is
|
||||
if vllm_version_is("0.11.0"):
|
||||
from vllm.utils import GiB_bytes, get_open_port
|
||||
|
||||
else:
|
||||
from vllm.utils.mem_constants import GiB_bytes
|
||||
from vllm.utils.network_utils import get_open_port
|
||||
from vllm.utils.mem_constants import GiB_bytes
|
||||
from vllm.utils.network_utils import get_open_port
|
||||
|
||||
os.environ["VLLM_USE_MODELSCOPE"] = "True"
|
||||
os.environ["VLLM_WORKER_MULTIPROC_METHOD"] = "spawn"
|
||||
|
||||
@@ -20,11 +20,7 @@ import os
|
||||
|
||||
import torch
|
||||
from vllm import LLM, SamplingParams
|
||||
from vllm_ascend.utils import vllm_version_is
|
||||
if vllm_version_is("0.11.0"):
|
||||
from vllm.utils import GiB_bytes
|
||||
else:
|
||||
from vllm.utils.mem_constants import GiB_bytes
|
||||
from vllm.utils.mem_constants import GiB_bytes
|
||||
|
||||
os.environ["VLLM_USE_MODELSCOPE"] = "True"
|
||||
os.environ["VLLM_WORKER_MULTIPROC_METHOD"] = "spawn"
|
||||
|
||||
@@ -67,13 +67,8 @@ from vllm import LLM, SamplingParams
|
||||
from vllm.distributed.parallel_state import ( # noqa E402
|
||||
destroy_distributed_environment, destroy_model_parallel, get_tp_group)
|
||||
from safetensors.torch import load_file
|
||||
from vllm_ascend.utils import vllm_version_is
|
||||
if vllm_version_is("0.11.0"):
|
||||
from vllm.utils import GiB_bytes, get_open_port
|
||||
|
||||
else:
|
||||
from vllm.utils.mem_constants import GiB_bytes
|
||||
from vllm.utils.network_utils import get_open_port
|
||||
from vllm.utils.mem_constants import GiB_bytes
|
||||
from vllm.utils.network_utils import get_open_port
|
||||
|
||||
os.environ["VLLM_USE_MODELSCOPE"] = "True"
|
||||
os.environ["VLLM_WORKER_MULTIPROC_METHOD"] = "spawn"
|
||||
|
||||
@@ -45,6 +45,7 @@ from vllm.inputs import TextPrompt
|
||||
from vllm.outputs import RequestOutput
|
||||
from vllm.platforms import current_platform
|
||||
from vllm.transformers_utils.utils import maybe_model_redirect
|
||||
from vllm.utils.network_utils import get_open_port
|
||||
|
||||
from tests.e2e.model_utils import (TokensTextLogprobs,
|
||||
TokensTextLogprobsPromptLogprobs)
|
||||
@@ -54,12 +55,6 @@ from vllm_ascend.ascend_config import clear_ascend_config
|
||||
# we not explicitly patch here, some of them might be effectiveless
|
||||
# in pytest scenario
|
||||
from vllm_ascend.utils import adapt_patch # noqa E402
|
||||
from vllm_ascend.utils import vllm_version_is
|
||||
|
||||
if vllm_version_is("0.11.0"):
|
||||
from vllm.utils import get_open_port
|
||||
else:
|
||||
from vllm.utils.network_utils import get_open_port
|
||||
|
||||
adapt_patch(True)
|
||||
adapt_patch(False)
|
||||
|
||||
@@ -23,13 +23,7 @@ from unittest.mock import patch
|
||||
|
||||
import pytest
|
||||
import torch
|
||||
|
||||
from vllm_ascend.utils import vllm_version_is
|
||||
|
||||
if vllm_version_is("0.11.0"):
|
||||
from vllm.utils import get_open_port
|
||||
else:
|
||||
from vllm.utils.network_utils import get_open_port
|
||||
from vllm.utils.network_utils import get_open_port
|
||||
|
||||
MODELS = [
|
||||
"Qwen/Qwen3-0.6B",
|
||||
|
||||
@@ -19,14 +19,9 @@ from typing import Any
|
||||
|
||||
import openai
|
||||
import pytest
|
||||
from vllm.utils.network_utils import get_open_port
|
||||
|
||||
from tests.e2e.conftest import RemoteOpenAIServer
|
||||
from vllm_ascend.utils import vllm_version_is
|
||||
|
||||
if vllm_version_is("0.11.0"):
|
||||
from vllm.utils import get_open_port
|
||||
else:
|
||||
from vllm.utils.network_utils import get_open_port
|
||||
|
||||
MODELS = ["Qwen/Qwen3-30B-A3B", "vllm-ascend/DeepSeek-V2-Lite-W8A8"]
|
||||
|
||||
|
||||
@@ -18,15 +18,10 @@ from typing import Any
|
||||
|
||||
import openai
|
||||
import pytest
|
||||
from vllm.utils.network_utils import get_open_port
|
||||
|
||||
from tests.e2e.conftest import RemoteOpenAIServer
|
||||
from tools.aisbench import run_aisbench_cases
|
||||
from vllm_ascend.utils import vllm_version_is
|
||||
|
||||
if vllm_version_is("0.11.0"):
|
||||
from vllm.utils import get_open_port
|
||||
else:
|
||||
from vllm.utils.network_utils import get_open_port
|
||||
|
||||
MODELS = [
|
||||
"Qwen/Qwen3-32B",
|
||||
|
||||
@@ -25,7 +25,6 @@ from vllm.assets.image import ImageAsset
|
||||
|
||||
from tests.e2e.conftest import VllmRunner
|
||||
from tests.e2e.model_utils import check_outputs_equal
|
||||
from vllm_ascend.utils import vllm_version_is
|
||||
|
||||
MODELS = [
|
||||
"OpenGVLab/InternVL2-8B",
|
||||
@@ -34,13 +33,6 @@ MODELS = [
|
||||
"OpenGVLab/InternVL3_5-8B",
|
||||
]
|
||||
|
||||
# skip testing InternVL3-8B and InternVL3_5-8B on 0.11.0 due to https://github.com/vllm-project/vllm-ascend/issues/3925.
|
||||
if vllm_version_is("0.11.0"):
|
||||
MODELS = [
|
||||
"OpenGVLab/InternVL2-8B",
|
||||
"OpenGVLab/InternVL2_5-8B",
|
||||
]
|
||||
|
||||
|
||||
@pytest.mark.parametrize("model", MODELS)
|
||||
def test_internvl_basic(model: str):
|
||||
|
||||
@@ -23,16 +23,11 @@ from unittest.mock import patch
|
||||
|
||||
import torch
|
||||
from vllm import SamplingParams
|
||||
from vllm.utils.mem_constants import GiB_bytes
|
||||
|
||||
from tests.e2e.conftest import VllmRunner
|
||||
from tests.e2e.utils import fork_new_process_for_each_test
|
||||
from vllm_ascend.device_allocator.camem import CaMemAllocator
|
||||
from vllm_ascend.utils import vllm_version_is
|
||||
|
||||
if vllm_version_is("0.11.0"):
|
||||
from vllm.utils import GiB_bytes
|
||||
else:
|
||||
from vllm.utils.mem_constants import GiB_bytes
|
||||
|
||||
|
||||
@fork_new_process_for_each_test
|
||||
|
||||
@@ -9,6 +9,7 @@ from vllm.config import (CacheConfig, KVTransferConfig, ModelConfig,
|
||||
from vllm.multimodal.inputs import (MultiModalFeatureSpec,
|
||||
MultiModalKwargsItem, PlaceholderRange)
|
||||
from vllm.sampling_params import SamplingParams
|
||||
from vllm.utils.hashing import sha256
|
||||
from vllm.v1.core.kv_cache_utils import (get_request_block_hasher,
|
||||
init_none_hash)
|
||||
from vllm.v1.core.sched.output import SchedulerOutput
|
||||
@@ -21,12 +22,6 @@ from vllm.v1.structured_output import StructuredOutputManager
|
||||
from tests.ut.base import TestBase
|
||||
from vllm_ascend.core.scheduler import AscendScheduler
|
||||
from vllm_ascend.core.scheduler_dynamic_batch import SchedulerDynamicBatch
|
||||
from vllm_ascend.utils import vllm_version_is
|
||||
|
||||
if vllm_version_is("0.11.0"):
|
||||
from vllm.utils import sha256
|
||||
else:
|
||||
from vllm.utils.hashing import sha256
|
||||
|
||||
EOS_TOKEN_ID = 50256
|
||||
MODEL = "Qwen3-0.6B"
|
||||
@@ -181,23 +176,13 @@ class TestAscendScheduler(TestBase):
|
||||
)
|
||||
cache_config.num_gpu_blocks = 10000
|
||||
|
||||
if vllm_version_is("0.11.0"):
|
||||
scheduler = AscendScheduler(
|
||||
vllm_config=vllm_config,
|
||||
kv_cache_config=kv_cache_config,
|
||||
log_stats=True,
|
||||
structured_output_manager=MagicMock(
|
||||
spec=StructuredOutputManager),
|
||||
)
|
||||
else:
|
||||
scheduler = AscendScheduler(
|
||||
vllm_config=vllm_config,
|
||||
kv_cache_config=kv_cache_config,
|
||||
log_stats=True,
|
||||
block_size=block_size,
|
||||
structured_output_manager=MagicMock(
|
||||
spec=StructuredOutputManager),
|
||||
)
|
||||
scheduler = AscendScheduler(
|
||||
vllm_config=vllm_config,
|
||||
kv_cache_config=kv_cache_config,
|
||||
log_stats=True,
|
||||
block_size=block_size,
|
||||
structured_output_manager=MagicMock(spec=StructuredOutputManager),
|
||||
)
|
||||
|
||||
should_advance = MagicMock()
|
||||
should_advance.return_value = False
|
||||
|
||||
@@ -13,13 +13,7 @@ from unittest.mock import MagicMock, patch
|
||||
import msgspec
|
||||
import zmq
|
||||
from vllm.distributed.parallel_state import GroupCoordinator
|
||||
|
||||
from vllm_ascend.utils import vllm_version_is
|
||||
|
||||
if vllm_version_is("0.11.0"):
|
||||
from vllm.utils import make_zmq_path
|
||||
else:
|
||||
from vllm.utils.network_utils import make_zmq_path
|
||||
from vllm.utils.network_utils import make_zmq_path
|
||||
|
||||
fake_engine = types.ModuleType("mooncake.engine")
|
||||
fake_engine.TransferEngine = MagicMock() # type: ignore[attr-defined]
|
||||
|
||||
@@ -10,6 +10,7 @@ import torch
|
||||
from vllm import SamplingParams
|
||||
from vllm.config import (CacheConfig, DeviceConfig, KVTransferConfig,
|
||||
ModelConfig, SchedulerConfig, VllmConfig)
|
||||
from vllm.utils.hashing import sha256
|
||||
from vllm.v1.core.kv_cache_utils import (get_request_block_hasher,
|
||||
init_none_hash)
|
||||
from vllm.v1.core.sched.scheduler import Scheduler
|
||||
@@ -19,13 +20,6 @@ from vllm.v1.outputs import ModelRunnerOutput
|
||||
from vllm.v1.request import Request
|
||||
from vllm.v1.structured_output import StructuredOutputManager
|
||||
|
||||
from vllm_ascend.utils import vllm_version_is
|
||||
|
||||
if vllm_version_is("0.11.0"):
|
||||
from vllm.utils import sha256
|
||||
else:
|
||||
from vllm.utils.hashing import sha256
|
||||
|
||||
EOS_TOKEN_ID = 50256
|
||||
|
||||
|
||||
@@ -111,21 +105,14 @@ def create_scheduler(
|
||||
],
|
||||
)
|
||||
vllm_config.cache_config.num_gpu_blocks = num_blocks
|
||||
if vllm_version_is("0.11.0"):
|
||||
return Scheduler(
|
||||
vllm_config=vllm_config,
|
||||
kv_cache_config=kv_cache_config,
|
||||
log_stats=True,
|
||||
structured_output_manager=StructuredOutputManager(vllm_config),
|
||||
)
|
||||
else:
|
||||
return Scheduler(
|
||||
vllm_config=vllm_config,
|
||||
kv_cache_config=kv_cache_config,
|
||||
log_stats=True,
|
||||
block_size=block_size,
|
||||
structured_output_manager=StructuredOutputManager(vllm_config),
|
||||
)
|
||||
|
||||
return Scheduler(
|
||||
vllm_config=vllm_config,
|
||||
kv_cache_config=kv_cache_config,
|
||||
log_stats=True,
|
||||
block_size=block_size,
|
||||
structured_output_manager=StructuredOutputManager(vllm_config),
|
||||
)
|
||||
|
||||
|
||||
_none_hash_initialized = False
|
||||
|
||||
@@ -22,7 +22,6 @@ import torch
|
||||
from torch import nn
|
||||
|
||||
from vllm_ascend.model_loader.netloader.netloader import ModelNetLoaderElastic
|
||||
from vllm_ascend.utils import vllm_version_is
|
||||
|
||||
|
||||
class DummyDeviceConfig:
|
||||
@@ -174,11 +173,7 @@ def test_load_model_elastic_success(mock_logger, monkeypatch, tmp_path):
|
||||
"vllm_ascend.model_loader.netloader.netloader.process_weights_after_loading",
|
||||
lambda *a, **k: None)
|
||||
# patch get_ip
|
||||
if vllm_version_is("0.11.0"):
|
||||
monkeypatch.setattr("vllm.utils.get_ip", lambda: "127.0.0.1")
|
||||
else:
|
||||
monkeypatch.setattr("vllm.utils.network_utils.get_ip",
|
||||
lambda: "127.0.0.1")
|
||||
monkeypatch.setattr("vllm.utils.network_utils.get_ip", lambda: "127.0.0.1")
|
||||
# patch find_free_port
|
||||
monkeypatch.setattr(
|
||||
"vllm_ascend.model_loader.netloader.netloader.find_free_port",
|
||||
|
||||
@@ -9,7 +9,6 @@ from vllm.model_executor.layers.mla import MLAModules
|
||||
from tests.ut.base import TestBase
|
||||
from vllm_ascend.models.layers.mla import (AscendMultiHeadLatentAttention,
|
||||
IndexerWrapper)
|
||||
from vllm_ascend.utils import vllm_version_is
|
||||
|
||||
|
||||
class TestIndexerWrapper(TestBase):
|
||||
@@ -85,68 +84,35 @@ class TestAscendMultiHeadLatentAttention(TestBase):
|
||||
"vllm_ascend.models.layers.mla.get_tensor_model_parallel_world_size")
|
||||
def test_initialization(self, mock_tp_size, mock_ascend_config,
|
||||
mock_get_vllm_config):
|
||||
if vllm_version_is("0.11.0"):
|
||||
with patch("vllm_ascend.models.layers.mla.Attention",
|
||||
return_value=True):
|
||||
mock_tp_size.return_value = 1
|
||||
mock_ascend_config.return_value.enable_shared_expert_dp = False
|
||||
mock_vllm_config = MagicMock(spec=VllmConfig)
|
||||
mock_vllm_config.model_config.hf_config = MagicMock(
|
||||
num_hidden_layers=32, first_k_dense_replace=False)
|
||||
mock_get_vllm_config.return_value = mock_vllm_config
|
||||
mock_vllm_config.compilation_config = CompilationConfig()
|
||||
|
||||
attn = AscendMultiHeadLatentAttention(
|
||||
hidden_size=self.hidden_size,
|
||||
num_heads=self.num_heads,
|
||||
scale=self.scale,
|
||||
qk_nope_head_dim=self.qk_nope_head_dim,
|
||||
qk_rope_head_dim=self.qk_rope_head_dim,
|
||||
v_head_dim=self.v_head_dim,
|
||||
q_lora_rank=self.q_lora_rank,
|
||||
kv_lora_rank=self.kv_lora_rank,
|
||||
mla_modules=self.mock_mla_modules,
|
||||
cache_config=self.mock_cache_config,
|
||||
quant_config=self.mock_quant_config,
|
||||
prefix=self.prefix,
|
||||
)
|
||||
with patch("vllm_ascend.models.layers.mla.MLAAttention",
|
||||
return_value=True):
|
||||
mock_tp_size.return_value = 2
|
||||
mock_ascend_config.return_value.enable_shared_expert_dp = True
|
||||
mock_vllm_config = MagicMock(spec=VllmConfig)
|
||||
mock_vllm_config.model_config.hf_config = MagicMock(
|
||||
num_hidden_layers=32, first_k_dense_replace=True)
|
||||
mock_get_vllm_config.return_value = mock_vllm_config
|
||||
mock_vllm_config.compilation_config = CompilationConfig()
|
||||
|
||||
self.assertEqual(attn.hidden_size, self.hidden_size)
|
||||
self.assertEqual(attn.kv_lora_rank, self.kv_lora_rank)
|
||||
self.assertEqual(attn.debug_layer_idx, 0)
|
||||
self.assertIsNotNone(attn.mla_attn)
|
||||
self.assertIn(
|
||||
self.prefix,
|
||||
mock_vllm_config.compilation_config.static_forward_context)
|
||||
else:
|
||||
with patch("vllm_ascend.models.layers.mla.MLAAttention",
|
||||
return_value=True):
|
||||
mock_tp_size.return_value = 2
|
||||
mock_ascend_config.return_value.enable_shared_expert_dp = True
|
||||
mock_vllm_config = MagicMock(spec=VllmConfig)
|
||||
mock_vllm_config.model_config.hf_config = MagicMock(
|
||||
num_hidden_layers=32, first_k_dense_replace=True)
|
||||
mock_get_vllm_config.return_value = mock_vllm_config
|
||||
mock_vllm_config.compilation_config = CompilationConfig()
|
||||
attn = AscendMultiHeadLatentAttention(
|
||||
hidden_size=self.hidden_size,
|
||||
num_heads=self.num_heads,
|
||||
scale=self.scale,
|
||||
qk_nope_head_dim=self.qk_nope_head_dim,
|
||||
qk_rope_head_dim=self.qk_rope_head_dim,
|
||||
v_head_dim=self.v_head_dim,
|
||||
q_lora_rank=self.q_lora_rank,
|
||||
kv_lora_rank=self.kv_lora_rank,
|
||||
mla_modules=self.mock_mla_modules,
|
||||
cache_config=self.mock_cache_config,
|
||||
quant_config=self.mock_quant_config,
|
||||
prefix=self.prefix,
|
||||
)
|
||||
|
||||
attn = AscendMultiHeadLatentAttention(
|
||||
hidden_size=self.hidden_size,
|
||||
num_heads=self.num_heads,
|
||||
scale=self.scale,
|
||||
qk_nope_head_dim=self.qk_nope_head_dim,
|
||||
qk_rope_head_dim=self.qk_rope_head_dim,
|
||||
v_head_dim=self.v_head_dim,
|
||||
q_lora_rank=self.q_lora_rank,
|
||||
kv_lora_rank=self.kv_lora_rank,
|
||||
mla_modules=self.mock_mla_modules,
|
||||
cache_config=self.mock_cache_config,
|
||||
quant_config=self.mock_quant_config,
|
||||
prefix=self.prefix,
|
||||
)
|
||||
|
||||
self.assertEqual(attn.tp_size, 2)
|
||||
self.assertTrue(attn.enable_shared_expert_dp)
|
||||
self.assertIsNotNone(attn.mla_attn)
|
||||
self.assertEqual(attn.tp_size, 2)
|
||||
self.assertTrue(attn.enable_shared_expert_dp)
|
||||
self.assertIsNotNone(attn.mla_attn)
|
||||
|
||||
@patch("vllm_ascend.models.layers.mla.torch.ops.vllm.mla_forward")
|
||||
@patch("vllm_ascend.models.layers.mla.get_current_vllm_config")
|
||||
@@ -164,41 +130,22 @@ class TestAscendMultiHeadLatentAttention(TestBase):
|
||||
num_hidden_layers=32, first_k_dense_replace=False)
|
||||
mock_get_vllm_config.return_value = mock_vllm_config
|
||||
mock_vllm_config.compilation_config = CompilationConfig()
|
||||
|
||||
if vllm_version_is("0.11.0"):
|
||||
with patch("vllm_ascend.models.layers.mla.Attention",
|
||||
return_value=True):
|
||||
attn = AscendMultiHeadLatentAttention(
|
||||
hidden_size=self.hidden_size,
|
||||
num_heads=self.num_heads,
|
||||
scale=self.scale,
|
||||
qk_nope_head_dim=self.qk_nope_head_dim,
|
||||
qk_rope_head_dim=self.qk_rope_head_dim,
|
||||
v_head_dim=self.v_head_dim,
|
||||
q_lora_rank=self.q_lora_rank,
|
||||
kv_lora_rank=self.kv_lora_rank,
|
||||
mla_modules=self.mock_mla_modules,
|
||||
cache_config=self.mock_cache_config,
|
||||
quant_config=self.mock_quant_config,
|
||||
prefix=self.prefix,
|
||||
)
|
||||
else:
|
||||
with patch("vllm_ascend.models.layers.mla.MLAAttention",
|
||||
return_value=True):
|
||||
attn = AscendMultiHeadLatentAttention(
|
||||
hidden_size=self.hidden_size,
|
||||
num_heads=self.num_heads,
|
||||
scale=self.scale,
|
||||
qk_nope_head_dim=self.qk_nope_head_dim,
|
||||
qk_rope_head_dim=self.qk_rope_head_dim,
|
||||
v_head_dim=self.v_head_dim,
|
||||
q_lora_rank=self.q_lora_rank,
|
||||
kv_lora_rank=self.kv_lora_rank,
|
||||
mla_modules=self.mock_mla_modules,
|
||||
cache_config=self.mock_cache_config,
|
||||
quant_config=self.mock_quant_config,
|
||||
prefix=self.prefix,
|
||||
)
|
||||
with patch("vllm_ascend.models.layers.mla.MLAAttention",
|
||||
return_value=True):
|
||||
attn = AscendMultiHeadLatentAttention(
|
||||
hidden_size=self.hidden_size,
|
||||
num_heads=self.num_heads,
|
||||
scale=self.scale,
|
||||
qk_nope_head_dim=self.qk_nope_head_dim,
|
||||
qk_rope_head_dim=self.qk_rope_head_dim,
|
||||
v_head_dim=self.v_head_dim,
|
||||
q_lora_rank=self.q_lora_rank,
|
||||
kv_lora_rank=self.kv_lora_rank,
|
||||
mla_modules=self.mock_mla_modules,
|
||||
cache_config=self.mock_cache_config,
|
||||
quant_config=self.mock_quant_config,
|
||||
prefix=self.prefix,
|
||||
)
|
||||
positions = torch.tensor([0, 1, 2])
|
||||
hidden_states = torch.randn(3, self.hidden_size)
|
||||
|
||||
|
||||
@@ -3,18 +3,13 @@ from unittest.mock import MagicMock, patch
|
||||
|
||||
import pytest
|
||||
import torch
|
||||
from vllm.config.compilation import CUDAGraphMode
|
||||
from vllm.config.compilation import CompilationMode, CUDAGraphMode
|
||||
from vllm.engine.arg_utils import EngineArgs
|
||||
from vllm.platforms import PlatformEnum
|
||||
|
||||
from tests.ut.base import TestBase
|
||||
from vllm_ascend.platform import NPUPlatform
|
||||
from vllm_ascend.utils import ASCEND_QUANTIZATION_METHOD, vllm_version_is
|
||||
|
||||
if vllm_version_is("0.11.0"):
|
||||
from vllm.config.compilation import CompilationLevel
|
||||
else:
|
||||
from vllm.config.compilation import CompilationMode
|
||||
from vllm_ascend.utils import ASCEND_QUANTIZATION_METHOD
|
||||
|
||||
|
||||
class TestNPUPlatform(TestBase):
|
||||
@@ -313,16 +308,10 @@ class TestNPUPlatform(TestBase):
|
||||
self.assertTrue("Compilation disabled, using eager mode by default" in
|
||||
cm.output[0])
|
||||
|
||||
if vllm_version_is("0.11.0"):
|
||||
self.assertEqual(
|
||||
vllm_config.compilation_config.level,
|
||||
CompilationLevel.NO_COMPILATION,
|
||||
)
|
||||
else:
|
||||
self.assertEqual(
|
||||
vllm_config.compilation_config.mode,
|
||||
CompilationMode.NONE,
|
||||
)
|
||||
self.assertEqual(
|
||||
vllm_config.compilation_config.mode,
|
||||
CompilationMode.NONE,
|
||||
)
|
||||
|
||||
self.assertEqual(
|
||||
vllm_config.compilation_config.cudagraph_mode,
|
||||
@@ -348,10 +337,7 @@ class TestNPUPlatform(TestBase):
|
||||
mock_init_recompute.return_value = MagicMock()
|
||||
vllm_config.scheduler_config = MagicMock()
|
||||
|
||||
if vllm_version_is("0.11.0"):
|
||||
vllm_config.compilation_config.level = CompilationLevel.DYNAMO_ONCE
|
||||
else:
|
||||
vllm_config.compilation_config.mode = CompilationMode.DYNAMO_TRACE_ONCE
|
||||
vllm_config.compilation_config.mode = CompilationMode.DYNAMO_TRACE_ONCE
|
||||
|
||||
with self.assertLogs(logger="vllm", level="WARNING") as cm:
|
||||
from vllm_ascend import platform
|
||||
@@ -359,16 +345,11 @@ class TestNPUPlatform(TestBase):
|
||||
importlib.reload(platform)
|
||||
self.platform.check_and_update_config(vllm_config)
|
||||
self.assertTrue("NPU does not support" in cm.output[0])
|
||||
if vllm_version_is("0.11.0"):
|
||||
self.assertEqual(
|
||||
vllm_config.compilation_config.level,
|
||||
CompilationLevel.NO_COMPILATION,
|
||||
)
|
||||
else:
|
||||
self.assertEqual(
|
||||
vllm_config.compilation_config.mode,
|
||||
CompilationMode.NONE,
|
||||
)
|
||||
|
||||
self.assertEqual(
|
||||
vllm_config.compilation_config.mode,
|
||||
CompilationMode.NONE,
|
||||
)
|
||||
self.assertEqual(
|
||||
vllm_config.compilation_config.cudagraph_mode,
|
||||
CUDAGraphMode.NONE,
|
||||
@@ -396,16 +377,10 @@ class TestNPUPlatform(TestBase):
|
||||
"cudagraph_mode is not support on NPU. falling back to NONE" in
|
||||
cm.output[0])
|
||||
|
||||
if vllm_version_is("0.11.0"):
|
||||
self.assertEqual(
|
||||
vllm_config.compilation_config.level,
|
||||
CompilationLevel.NO_COMPILATION,
|
||||
)
|
||||
else:
|
||||
self.assertEqual(
|
||||
vllm_config.compilation_config.mode,
|
||||
CompilationMode.NONE,
|
||||
)
|
||||
self.assertEqual(
|
||||
vllm_config.compilation_config.mode,
|
||||
CompilationMode.NONE,
|
||||
)
|
||||
self.assertEqual(
|
||||
vllm_config.compilation_config.cudagraph_mode,
|
||||
CUDAGraphMode.NONE,
|
||||
@@ -431,10 +406,7 @@ class TestNPUPlatform(TestBase):
|
||||
mock_init_recompute.return_value = MagicMock()
|
||||
vllm_config.scheduler_config = MagicMock()
|
||||
|
||||
if vllm_version_is("0.11.0"):
|
||||
vllm_config.compilation_config.level = CompilationLevel.PIECEWISE
|
||||
else:
|
||||
vllm_config.compilation_config.mode = CompilationMode.VLLM_COMPILE
|
||||
vllm_config.compilation_config.mode = CompilationMode.VLLM_COMPILE
|
||||
|
||||
with self.assertLogs(logger="vllm", level="INFO") as cm:
|
||||
from vllm_ascend import platform
|
||||
@@ -443,16 +415,10 @@ class TestNPUPlatform(TestBase):
|
||||
self.platform.check_and_update_config(vllm_config)
|
||||
self.assertTrue("Torchair compilation enabled" in cm.output[0])
|
||||
|
||||
if vllm_version_is("0.11.0"):
|
||||
self.assertEqual(
|
||||
vllm_config.compilation_config.level,
|
||||
CompilationLevel.NO_COMPILATION,
|
||||
)
|
||||
else:
|
||||
self.assertEqual(
|
||||
vllm_config.compilation_config.mode,
|
||||
CompilationMode.NONE,
|
||||
)
|
||||
self.assertEqual(
|
||||
vllm_config.compilation_config.mode,
|
||||
CompilationMode.NONE,
|
||||
)
|
||||
self.assertEqual(
|
||||
vllm_config.compilation_config.cudagraph_mode,
|
||||
CUDAGraphMode.NONE,
|
||||
@@ -658,12 +624,9 @@ class TestNPUPlatform(TestBase):
|
||||
|
||||
def test_get_punica_wrapper(self):
|
||||
result = self.platform.get_punica_wrapper()
|
||||
if vllm_version_is("0.11.0"):
|
||||
self.assertEqual(
|
||||
result, "vllm_ascend.lora.punica_npu.PunicaWrapperNPU0110")
|
||||
else:
|
||||
self.assertEqual(result,
|
||||
"vllm_ascend.lora.punica_npu.PunicaWrapperNPU")
|
||||
|
||||
self.assertEqual(result,
|
||||
"vllm_ascend.lora.punica_npu.PunicaWrapperNPU")
|
||||
|
||||
@patch("torch.npu.reset_peak_memory_stats")
|
||||
@patch("torch.npu.max_memory_allocated")
|
||||
@@ -742,16 +705,11 @@ class TestNPUPlatform(TestBase):
|
||||
self.assertTrue(
|
||||
"PIECEWISE compilation enabled on NPU. use_inductor not supported - "
|
||||
"using only ACL Graph mode" in cm.output[0])
|
||||
if vllm_version_is("0.11.0"):
|
||||
self.assertEqual(
|
||||
VllmConfig.compilation_config.level,
|
||||
CompilationLevel.PIECEWISE,
|
||||
)
|
||||
else:
|
||||
self.assertEqual(
|
||||
VllmConfig.compilation_config.mode,
|
||||
CompilationMode.VLLM_COMPILE,
|
||||
)
|
||||
|
||||
self.assertEqual(
|
||||
VllmConfig.compilation_config.mode,
|
||||
CompilationMode.VLLM_COMPILE,
|
||||
)
|
||||
self.assertEqual(
|
||||
VllmConfig.compilation_config.cudagraph_mode,
|
||||
CUDAGraphMode.PIECEWISE,
|
||||
|
||||
@@ -274,46 +274,8 @@ class TestUtils(TestBase):
|
||||
utils.update_aclgraph_sizes(test_vllm_config)
|
||||
del os.environ['HCCL_OP_EXPANSION_MODE']
|
||||
|
||||
if utils.vllm_version_is("0.11.0"):
|
||||
self.assertEqual(
|
||||
137,
|
||||
len(test_vllm_config.compilation_config.cudagraph_capture_sizes
|
||||
))
|
||||
else:
|
||||
self.assertEqual(
|
||||
0,
|
||||
len(test_vllm_config.compilation_config.cudagraph_capture_sizes
|
||||
))
|
||||
return
|
||||
|
||||
test_vllm_config.speculative_config = mock.MagicMock()
|
||||
test_vllm_config.speculative_config.num_speculative_tokens = 2
|
||||
test_vllm_config.speculative_config.draft_model_config = mock.MagicMock(
|
||||
)
|
||||
test_vllm_config.speculative_config.draft_model_config.hf_config = mock.MagicMock(
|
||||
)
|
||||
test_vllm_config.speculative_config.draft_model_config.hf_config.num_hidden_layers = 2
|
||||
os.environ['HCCL_OP_EXPANSION_MODE'] = 'AIV'
|
||||
utils.update_aclgraph_sizes(test_vllm_config)
|
||||
del os.environ['HCCL_OP_EXPANSION_MODE']
|
||||
self.assertEqual(
|
||||
111,
|
||||
len(test_vllm_config.compilation_config.cudagraph_capture_sizes))
|
||||
|
||||
# max_num_batch_sizes >= len(original_sizes)
|
||||
test_compilation_config = CompilationConfig(
|
||||
cudagraph_capture_sizes=[1, 2, 3])
|
||||
test_vllm_config = VllmConfig(
|
||||
model_config=test_model_config,
|
||||
compilation_config=test_compilation_config,
|
||||
parallel_config=test_parallel_config,
|
||||
)
|
||||
utils.update_aclgraph_sizes(test_vllm_config)
|
||||
os.environ['HCCL_OP_EXPANSION_MODE'] = 'AIV'
|
||||
utils.update_aclgraph_sizes(test_vllm_config)
|
||||
del os.environ['HCCL_OP_EXPANSION_MODE']
|
||||
self.assertEqual(
|
||||
3,
|
||||
0,
|
||||
len(test_vllm_config.compilation_config.cudagraph_capture_sizes))
|
||||
|
||||
@mock.patch("vllm.model_executor.custom_op.CustomOp")
|
||||
|
||||
@@ -7,7 +7,6 @@ from vllm.config import CacheConfig, VllmConfig
|
||||
|
||||
from tests.ut.base import PytestBase
|
||||
from vllm_ascend.torchair.torchair_mtp_proposer import TorchairMtpProposer
|
||||
from vllm_ascend.utils import vllm_version_is
|
||||
|
||||
|
||||
class TestTorchairMtpProposer(PytestBase):
|
||||
@@ -40,14 +39,8 @@ class TestTorchairMtpProposer(PytestBase):
|
||||
mocker.patch(
|
||||
"vllm_ascend.torchair.torchair_mtp_proposer.MtpProposer.__init__",
|
||||
return_value=None)
|
||||
|
||||
if vllm_version_is("0.11.0"):
|
||||
mock_set_default_dtype = mocker.patch(
|
||||
'vllm.model_executor.model_loader.utils.set_default_torch_dtype'
|
||||
)
|
||||
else:
|
||||
mock_set_default_dtype = mocker.patch(
|
||||
'vllm.utils.torch_utils.set_default_torch_dtype')
|
||||
mock_set_default_dtype = mocker.patch(
|
||||
'vllm.utils.torch_utils.set_default_torch_dtype')
|
||||
mock_set_default_dtype.return_value.__enter__.return_value = None
|
||||
|
||||
mock_model_loader = MagicMock()
|
||||
|
||||
@@ -4,10 +4,8 @@ import torch
|
||||
from vllm.config import CacheConfig, ModelConfig, ParallelConfig, VllmConfig
|
||||
|
||||
from tests.ut.base import TestBase
|
||||
from vllm_ascend.utils import vllm_version_is
|
||||
|
||||
init_cache_hf_modules_path = "vllm.utils.init_cached_hf_modules" if vllm_version_is(
|
||||
"0.11.0") else "vllm.utils.import_utils.init_cached_hf_modules"
|
||||
init_cache_hf_modules_path = "vllm.utils.import_utils.init_cached_hf_modules"
|
||||
|
||||
|
||||
class TestNPUTorchairWorker(TestBase):
|
||||
|
||||
@@ -20,19 +20,14 @@ import numpy as np
|
||||
import pytest
|
||||
import torch
|
||||
from vllm.sampling_params import SamplingParams
|
||||
from vllm.utils.torch_utils import make_tensor_with_pad
|
||||
from vllm.v1.pool.metadata import PoolingMetadata
|
||||
from vllm.v1.sample.logits_processor import LogitsProcessors
|
||||
from vllm.v1.sample.metadata import SamplingMetadata
|
||||
|
||||
from vllm_ascend.utils import vllm_version_is
|
||||
from vllm_ascend.worker.block_table import BlockTable, MultiGroupBlockTable
|
||||
from vllm_ascend.worker.npu_input_batch import CachedRequestState, InputBatch
|
||||
|
||||
if vllm_version_is("0.11.0"):
|
||||
from vllm.utils import make_tensor_with_pad
|
||||
else:
|
||||
from vllm.utils.torch_utils import make_tensor_with_pad
|
||||
|
||||
VOCAB_SIZE = 1024
|
||||
NUM_OUTPUT_TOKENS = 20
|
||||
MAX_PROMPT_SIZE = 100
|
||||
|
||||
@@ -6,10 +6,8 @@ import torch
|
||||
from vllm.config import CacheConfig, ModelConfig, ParallelConfig, VllmConfig
|
||||
|
||||
from tests.ut.base import TestBase
|
||||
from vllm_ascend.utils import vllm_version_is
|
||||
|
||||
init_cached_hf_modules_path = "vllm.utils.init_cached_hf_modules" if vllm_version_is(
|
||||
"0.11.0") else "vllm.utils.import_utils.init_cached_hf_modules"
|
||||
init_cached_hf_modules_path = "vllm.utils.import_utils.init_cached_hf_modules"
|
||||
|
||||
|
||||
class TestNPUWorker(TestBase):
|
||||
@@ -189,26 +187,15 @@ class TestNPUWorker(TestBase):
|
||||
# Create NPUWorker instance
|
||||
from vllm_ascend.worker.worker_v1 import NPUWorker
|
||||
|
||||
if vllm_version_is("0.11.0"):
|
||||
with patch("vllm.utils.STR_DTYPE_TO_TORCH_DTYPE",
|
||||
{"float32": torch.float32}):
|
||||
worker = NPUWorker(
|
||||
vllm_config=self.vllm_config_mock,
|
||||
local_rank=self.local_rank,
|
||||
rank=self.rank,
|
||||
distributed_init_method=self.distributed_init_method,
|
||||
is_driver_worker=self.is_driver_worker,
|
||||
)
|
||||
else:
|
||||
with patch("vllm.utils.torch_utils.STR_DTYPE_TO_TORCH_DTYPE",
|
||||
{"float32": torch.float32}):
|
||||
worker = NPUWorker(
|
||||
vllm_config=self.vllm_config_mock,
|
||||
local_rank=self.local_rank,
|
||||
rank=self.rank,
|
||||
distributed_init_method=self.distributed_init_method,
|
||||
is_driver_worker=self.is_driver_worker,
|
||||
)
|
||||
with patch("vllm.utils.torch_utils.STR_DTYPE_TO_TORCH_DTYPE",
|
||||
{"float32": torch.float32}):
|
||||
worker = NPUWorker(
|
||||
vllm_config=self.vllm_config_mock,
|
||||
local_rank=self.local_rank,
|
||||
rank=self.rank,
|
||||
distributed_init_method=self.distributed_init_method,
|
||||
is_driver_worker=self.is_driver_worker,
|
||||
)
|
||||
|
||||
# Verify cache_dtype is set to custom value
|
||||
self.assertEqual(worker.cache_dtype, torch.float32)
|
||||
|
||||
@@ -31,14 +31,7 @@ from vllm.distributed import (get_dcp_group,
|
||||
get_decode_context_model_parallel_rank,
|
||||
get_decode_context_model_parallel_world_size)
|
||||
from vllm.forward_context import ForwardContext, get_forward_context
|
||||
|
||||
from vllm_ascend.utils import vllm_version_is
|
||||
|
||||
if vllm_version_is("0.11.0"):
|
||||
from vllm.utils import cdiv
|
||||
else:
|
||||
from vllm.utils.math_utils import cdiv
|
||||
|
||||
from vllm.utils.math_utils import cdiv
|
||||
from vllm.v1.attention.backends.utils import AttentionCGSupport
|
||||
from vllm.v1.core.sched.output import SchedulerOutput
|
||||
from vllm.v1.kv_cache_interface import AttentionSpec
|
||||
|
||||
@@ -20,14 +20,7 @@ from vllm.forward_context import ForwardContext, get_forward_context
|
||||
from vllm.logger import logger
|
||||
from vllm.model_executor.layers.linear import (LinearBase,
|
||||
UnquantizedLinearMethod)
|
||||
|
||||
from vllm_ascend.utils import vllm_version_is
|
||||
|
||||
if vllm_version_is("0.11.0"):
|
||||
from vllm.utils import cdiv, round_down
|
||||
else:
|
||||
from vllm.utils.math_utils import cdiv, round_down
|
||||
|
||||
from vllm.utils.math_utils import cdiv, round_down
|
||||
from vllm.v1.attention.backends.utils import AttentionCGSupport
|
||||
|
||||
from vllm_ascend import envs
|
||||
|
||||
@@ -55,8 +55,6 @@ from vllm.v1.spec_decode.metrics import SpecDecodingStats
|
||||
from vllm.v1.structured_output import StructuredOutputManager
|
||||
from vllm.v1.utils import ConstantList
|
||||
|
||||
from vllm_ascend.utils import vllm_version_is
|
||||
|
||||
|
||||
class RecomputeScheduler(SchedulerInterface):
|
||||
"""This Scheduler extends vllm's original v1 scheduler of version 0.11
|
||||
@@ -587,14 +585,9 @@ class RecomputeScheduler(SchedulerInterface):
|
||||
self.kv_cache_config.kv_cache_groups)
|
||||
if self.running:
|
||||
any_request = self.running[0]
|
||||
if vllm_version_is("0.11.0"):
|
||||
num_common_prefix_blocks = (
|
||||
self.kv_cache_manager.get_num_common_prefix_blocks(
|
||||
any_request, len(self.running)))
|
||||
else:
|
||||
num_common_prefix_blocks = (
|
||||
self.kv_cache_manager.get_num_common_prefix_blocks(
|
||||
any_request.request_id))
|
||||
num_common_prefix_blocks = (
|
||||
self.kv_cache_manager.get_num_common_prefix_blocks(
|
||||
any_request.request_id))
|
||||
|
||||
# Construct the scheduler output.
|
||||
new_reqs_data = [
|
||||
|
||||
@@ -22,14 +22,7 @@ from vllm.config import VllmConfig
|
||||
from vllm.distributed.kv_events import KVEventBatch
|
||||
from vllm.logger import logger
|
||||
from vllm.multimodal import MULTIMODAL_REGISTRY, MultiModalRegistry
|
||||
|
||||
from vllm_ascend.utils import vllm_version_is
|
||||
|
||||
if vllm_version_is("0.11.0"):
|
||||
from vllm.utils import cdiv
|
||||
else:
|
||||
from vllm.utils.math_utils import cdiv
|
||||
|
||||
from vllm.utils.math_utils import cdiv
|
||||
from vllm.v1.core.kv_cache_manager import KVCacheBlocks
|
||||
from vllm.v1.core.sched.output import NewRequestData, SchedulerOutput
|
||||
from vllm.v1.core.sched.scheduler import Scheduler
|
||||
@@ -39,8 +32,6 @@ from vllm.v1.outputs import ModelRunnerOutput
|
||||
from vllm.v1.request import Request, RequestStatus
|
||||
from vllm.v1.structured_output import StructuredOutputManager
|
||||
|
||||
from vllm_ascend.utils import vllm_version_is
|
||||
|
||||
|
||||
class AscendScheduler(Scheduler):
|
||||
"""This Scheduler extends vllm's original v1 scheduler
|
||||
@@ -71,14 +62,9 @@ class AscendScheduler(Scheduler):
|
||||
log_stats: bool = False,
|
||||
) -> None:
|
||||
# Call the parent class's __init__ method
|
||||
if vllm_version_is("0.11.0"):
|
||||
super().__init__(vllm_config, kv_cache_config,
|
||||
structured_output_manager, mm_registry,
|
||||
include_finished_set, log_stats)
|
||||
else:
|
||||
super().__init__(vllm_config, kv_cache_config,
|
||||
structured_output_manager, block_size,
|
||||
mm_registry, include_finished_set, log_stats)
|
||||
super().__init__(vllm_config, kv_cache_config,
|
||||
structured_output_manager, block_size, mm_registry,
|
||||
include_finished_set, log_stats)
|
||||
|
||||
# Initialize common attributes
|
||||
self._initialize_common()
|
||||
@@ -462,14 +448,9 @@ class AscendScheduler(Scheduler):
|
||||
self.kv_cache_config.kv_cache_groups)
|
||||
if self.running:
|
||||
any_request = self.running[0]
|
||||
if vllm_version_is("0.11.0"):
|
||||
num_common_prefix_blocks = (
|
||||
self.kv_cache_manager.get_num_common_prefix_blocks(
|
||||
any_request, len(self.running)))
|
||||
else:
|
||||
num_common_prefix_blocks = (
|
||||
self.kv_cache_manager.get_num_common_prefix_blocks(
|
||||
any_request.request_id))
|
||||
num_common_prefix_blocks = (
|
||||
self.kv_cache_manager.get_num_common_prefix_blocks(
|
||||
any_request.request_id))
|
||||
|
||||
# Construct the scheduler output.
|
||||
new_reqs_data = [
|
||||
|
||||
@@ -33,8 +33,6 @@ from vllm.v1.kv_cache_interface import KVCacheConfig
|
||||
from vllm.v1.request import Request, RequestStatus
|
||||
from vllm.v1.structured_output import StructuredOutputManager
|
||||
|
||||
from vllm_ascend.utils import vllm_version_is
|
||||
|
||||
|
||||
class BudgetRefiner:
|
||||
"""This budget refiner can make dynamic adjustment to the token budget
|
||||
@@ -130,14 +128,9 @@ class SchedulerDynamicBatch(Scheduler):
|
||||
include_finished_set: bool = False,
|
||||
log_stats: bool = False,
|
||||
) -> None:
|
||||
if vllm_version_is("0.11.0"):
|
||||
super().__init__(vllm_config, kv_cache_config,
|
||||
structured_output_manager, mm_registry,
|
||||
include_finished_set, log_stats)
|
||||
else:
|
||||
super().__init__(vllm_config, kv_cache_config,
|
||||
structured_output_manager, block_size,
|
||||
mm_registry, include_finished_set, log_stats)
|
||||
super().__init__(vllm_config, kv_cache_config,
|
||||
structured_output_manager, block_size, mm_registry,
|
||||
include_finished_set, log_stats)
|
||||
self.running: list[Request] = []
|
||||
self.budget_refiner = BudgetRefiner(
|
||||
default_budget=self.scheduler_config.max_num_batched_tokens,
|
||||
@@ -540,14 +533,9 @@ class SchedulerDynamicBatch(Scheduler):
|
||||
self.kv_cache_config.kv_cache_groups)
|
||||
if self.running:
|
||||
any_request = self.running[0]
|
||||
if vllm_version_is("0.11.0"):
|
||||
num_common_prefix_blocks = (
|
||||
self.kv_cache_manager.get_num_common_prefix_blocks(
|
||||
any_request, len(self.running)))
|
||||
else:
|
||||
num_common_prefix_blocks = (
|
||||
self.kv_cache_manager.get_num_common_prefix_blocks(
|
||||
any_request.request_id))
|
||||
num_common_prefix_blocks = (
|
||||
self.kv_cache_manager.get_num_common_prefix_blocks(
|
||||
any_request.request_id))
|
||||
# Construct the scheduler output.
|
||||
new_reqs_data = [
|
||||
NewRequestData.from_request(
|
||||
|
||||
@@ -10,17 +10,12 @@ import vllm.envs as envs
|
||||
import zmq
|
||||
from vllm.config import KVTransferConfig, VllmConfig
|
||||
from vllm.utils import logger
|
||||
from vllm.utils.network_utils import make_zmq_socket
|
||||
from vllm.utils.torch_utils import get_dtype_size
|
||||
from vllm.v1.kv_cache_interface import AttentionSpec
|
||||
|
||||
from vllm_ascend.distributed.cpu_offload_manager.cpu_kv_cache_manager import \
|
||||
CPUKVCacheManager
|
||||
from vllm_ascend.utils import vllm_version_is
|
||||
|
||||
if vllm_version_is("0.11.0"):
|
||||
from vllm.utils import get_dtype_size, make_zmq_socket
|
||||
else:
|
||||
from vllm.utils.network_utils import make_zmq_socket
|
||||
from vllm.utils.torch_utils import get_dtype_size
|
||||
|
||||
|
||||
@dataclass
|
||||
|
||||
@@ -33,17 +33,13 @@ from vllm.v1.request import Request, RequestStatus
|
||||
import vllm_ascend.envs as envs_ascend
|
||||
from vllm_ascend.distributed.utils import get_transfer_timeout_value
|
||||
from vllm_ascend.utils import (AscendSocVersion, get_ascend_soc_version,
|
||||
prefill_context_parallel_enable,
|
||||
vllm_version_is)
|
||||
prefill_context_parallel_enable)
|
||||
|
||||
if prefill_context_parallel_enable():
|
||||
from vllm.distributed.parallel_state import \
|
||||
get_prefill_context_model_parallel_rank
|
||||
|
||||
if vllm_version_is("0.11.0"):
|
||||
from vllm.utils import get_ip
|
||||
else:
|
||||
from vllm.utils.network_utils import get_ip
|
||||
from vllm.utils.network_utils import get_ip
|
||||
|
||||
TORCH_DTYPE_TO_NPU_DTYPE = {
|
||||
torch.half: llm_datadist.DataType.DT_FLOAT16,
|
||||
|
||||
@@ -10,14 +10,7 @@ import torch
|
||||
from vllm.distributed.kv_transfer.kv_connector.v1.base import \
|
||||
KVConnectorMetadata
|
||||
from vllm.utils import logger
|
||||
|
||||
from vllm_ascend.utils import vllm_version_is
|
||||
|
||||
if vllm_version_is("0.11.0"):
|
||||
from vllm.utils import cdiv
|
||||
else:
|
||||
from vllm.utils.math_utils import cdiv
|
||||
|
||||
from vllm.utils.math_utils import cdiv
|
||||
from vllm.v1.core.sched.output import NewRequestData
|
||||
|
||||
DEFAULT_GLOBAL_SEGMENT_SIZE = 3355443200 # 3.125 GiB
|
||||
|
||||
@@ -8,6 +8,7 @@ from typing import Generator, List, Optional, Union
|
||||
import torch
|
||||
from vllm.config import VllmConfig
|
||||
from vllm.utils import logger
|
||||
from vllm.utils.torch_utils import get_kv_cache_torch_dtype
|
||||
|
||||
from vllm_ascend.distributed.mooncake.config_data import (
|
||||
ChunkedTokenDatabase, LasyerMultiBlockReqMeta, MooncakeConnectorMetadata,
|
||||
@@ -16,12 +17,6 @@ from vllm_ascend.distributed.mooncake.kv_transfer import (
|
||||
KVCacheStoreLayerRecvingThread, KVCacheStoreLayerSendingThread,
|
||||
KVCacheStoreRecvingThread, KVCacheStoreSendingThread, KVTransferThread)
|
||||
from vllm_ascend.distributed.mooncake.mooncake_store import Mooncakestore
|
||||
from vllm_ascend.utils import vllm_version_is
|
||||
|
||||
if vllm_version_is("0.11.0"):
|
||||
from vllm.utils import get_kv_cache_torch_dtype
|
||||
else:
|
||||
from vllm.utils.torch_utils import get_kv_cache_torch_dtype
|
||||
|
||||
|
||||
class MooncakeEngine:
|
||||
|
||||
@@ -6,18 +6,13 @@ from mooncake.store import ReplicateConfig # type: ignore
|
||||
from vllm.config import ParallelConfig
|
||||
from vllm.distributed.parallel_state import get_tensor_model_parallel_rank
|
||||
from vllm.utils import logger
|
||||
from vllm.utils.network_utils import get_ip
|
||||
|
||||
from vllm_ascend.distributed.mooncake.config_data import MooncakeEngineKey
|
||||
from vllm_ascend.distributed.mooncake.transfer_engine import get_global_te
|
||||
from vllm_ascend.utils import vllm_version_is
|
||||
|
||||
from .config_data import MooncakeStoreConfig
|
||||
|
||||
if vllm_version_is("0.11.0"):
|
||||
from vllm.utils import get_ip
|
||||
else:
|
||||
from vllm.utils.network_utils import get_ip
|
||||
|
||||
METADATA_BYTES_LEN = 24
|
||||
BASE_PORT = int(os.getenv("VLLM_BASE_PORT", "8790"))
|
||||
|
||||
|
||||
@@ -10,6 +10,7 @@ from vllm.distributed.kv_transfer.kv_connector.v1.base import (
|
||||
KVConnectorBase_V1, KVConnectorMetadata, KVConnectorRole)
|
||||
from vllm.forward_context import ForwardContext
|
||||
from vllm.utils import logger
|
||||
from vllm.utils.network_utils import make_zmq_socket
|
||||
from vllm.v1.core.kv_cache_manager import KVCacheBlocks
|
||||
from vllm.v1.core.sched.output import SchedulerOutput
|
||||
from vllm.v1.request import Request
|
||||
@@ -18,12 +19,6 @@ from vllm.v1.serial_utils import MsgpackDecoder, MsgpackEncoder
|
||||
from vllm_ascend.distributed.mooncake.config_data import (
|
||||
LoadSpec, MooncakeConnectorMetadata, ReqMeta, RequestTracker)
|
||||
from vllm_ascend.distributed.mooncake.mooncake_engine import MooncakeEngine
|
||||
from vllm_ascend.utils import vllm_version_is
|
||||
|
||||
if vllm_version_is("0.11.0"):
|
||||
from vllm.utils import make_zmq_socket
|
||||
else:
|
||||
from vllm.utils.network_utils import make_zmq_socket
|
||||
|
||||
|
||||
class MooncakeConnectorV1(KVConnectorBase_V1):
|
||||
|
||||
@@ -37,7 +37,7 @@ import vllm_ascend.envs as envs_ascend
|
||||
from vllm_ascend.ascend_config import get_ascend_config, init_ascend_config
|
||||
from vllm_ascend.distributed.mooncake.transfer_engine import get_global_te
|
||||
from vllm_ascend.distributed.utils import get_transfer_timeout_value
|
||||
from vllm_ascend.utils import prefill_context_parallel_enable, vllm_version_is
|
||||
from vllm_ascend.utils import prefill_context_parallel_enable
|
||||
|
||||
# isort: off
|
||||
if prefill_context_parallel_enable():
|
||||
@@ -46,10 +46,7 @@ if prefill_context_parallel_enable():
|
||||
)
|
||||
# isort: on
|
||||
|
||||
if vllm_version_is("0.11.0"):
|
||||
from vllm.utils import get_ip, make_zmq_path, make_zmq_socket
|
||||
else:
|
||||
from vllm.utils.network_utils import get_ip, make_zmq_path, make_zmq_socket
|
||||
from vllm.utils.network_utils import get_ip, make_zmq_path, make_zmq_socket
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from vllm.attention.backends.abstract import AttentionMetadata
|
||||
|
||||
@@ -28,6 +28,7 @@ from vllm.distributed.kv_transfer.kv_connector.v1.base import (
|
||||
from vllm.distributed.parallel_state import (get_tensor_model_parallel_rank,
|
||||
get_tp_group, get_world_group)
|
||||
from vllm.utils import logger
|
||||
from vllm.utils.network_utils import get_ip, make_zmq_path, make_zmq_socket
|
||||
from vllm.v1.core.sched.output import SchedulerOutput
|
||||
|
||||
import vllm_ascend.envs as envs_ascend
|
||||
@@ -35,12 +36,6 @@ from vllm_ascend.ascend_config import get_ascend_config
|
||||
from vllm_ascend.distributed.utils import (align_memory,
|
||||
get_transfer_timeout_value,
|
||||
kv_alltoall_and_rearrange)
|
||||
from vllm_ascend.utils import vllm_version_is
|
||||
|
||||
if vllm_version_is("0.11.0"):
|
||||
from vllm.utils import get_ip, make_zmq_path, make_zmq_socket
|
||||
else:
|
||||
from vllm.utils.network_utils import get_ip, make_zmq_path, make_zmq_socket
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from vllm.attention.backends.abstract import AttentionMetadata
|
||||
|
||||
@@ -2,17 +2,11 @@ import numpy as np
|
||||
import torch
|
||||
from vllm.attention import AttentionBackend
|
||||
from vllm.logger import init_logger
|
||||
from vllm.utils.platform_utils import is_pin_memory_available
|
||||
from vllm.v1.kv_offload.mediums import CPULoadStoreSpec, GPULoadStoreSpec
|
||||
from vllm.v1.kv_offload.worker.worker import (OffloadingHandler,
|
||||
TransferResult, TransferSpec)
|
||||
|
||||
from vllm_ascend.utils import vllm_version_is
|
||||
|
||||
if vllm_version_is("0.11.0"):
|
||||
from vllm.utils import is_pin_memory_available
|
||||
else:
|
||||
from vllm.utils.platform_utils import is_pin_memory_available
|
||||
|
||||
logger = init_logger(__name__)
|
||||
|
||||
|
||||
|
||||
@@ -349,64 +349,3 @@ class PunicaWrapperNPU(PunicaWrapperBase):
|
||||
bgmv_expand(buffer, lora_b_stacked, y, indices, add_inputs=True)
|
||||
|
||||
y = y.view_as(y_org)
|
||||
|
||||
|
||||
class PunicaWrapperNPU0110(PunicaWrapperNPU):
|
||||
# NOTE: remove me when 0.11.0 id dropped
|
||||
def add_lora_linear( # type: ignore[override]
|
||||
self,
|
||||
y: torch.Tensor,
|
||||
x: torch.Tensor,
|
||||
lora_a_stacked: Tuple[torch.Tensor, ...],
|
||||
lora_b_stacked: Tuple[torch.Tensor, ...],
|
||||
lora_bias_stacked: Optional[Tuple[torch.Tensor, ...]],
|
||||
scale: float,
|
||||
output_slices: Tuple[int, ...],
|
||||
*,
|
||||
buffer: Optional[Tuple[torch.Tensor, ...]] = None,
|
||||
**kwargs) -> None:
|
||||
"""
|
||||
Applicable to linear-related lora.
|
||||
|
||||
Semantics:
|
||||
for i in range(len(lora_a_stacked)):
|
||||
y[i] += (
|
||||
x[i].unsqueeze(0)
|
||||
@ lora_a_stacked[indices[i], layer_idx, :, :]
|
||||
@ lora_b_stacked[indices[i], layer_idx, :, :]
|
||||
* scale
|
||||
).squeeze(0)+lora_bias_stacked[i]
|
||||
|
||||
Args:
|
||||
y (torch.Tensor): Output tensor. Will be changed in-place.
|
||||
x (torch.Tensor): Input tensor
|
||||
lora_a_stacked (Tuple[torch.Tensor, ...]): lora_a's weight.
|
||||
lora_b_stacked (Tuple[torch.Tensor, ...]): lora_b's weight.
|
||||
lora_bias_stacked (Optional[Tuple[torch.Tensor, ...]]): lora's bias.
|
||||
scale (float): Scaling factor.
|
||||
output_slices (Tuple[int, ...]): Every slice's size.
|
||||
buffer (Optional[Tuple[torch.Tensor, ...]]): Defaults to None.
|
||||
"""
|
||||
|
||||
assert len(lora_a_stacked) == len(lora_b_stacked) == len(output_slices)
|
||||
if lora_bias_stacked is not None:
|
||||
assert len(lora_bias_stacked) == len(output_slices)
|
||||
y = self._apply_bias(self.token_lora_indices, y, output_slices,
|
||||
lora_bias_stacked)
|
||||
|
||||
if buffer is None:
|
||||
r = lora_b_stacked[0].size(-1)
|
||||
# We set the buffer to be float32 by default, consistent with the
|
||||
# triton op
|
||||
buffer = tuple(
|
||||
torch.zeros(
|
||||
(x.size(0), r), dtype=torch.float32, device=x.device)
|
||||
for _ in range(len(output_slices)))
|
||||
self.add_shrink(buffer, x, lora_a_stacked, scale, **kwargs)
|
||||
self.add_expand(y,
|
||||
buffer,
|
||||
lora_b_stacked,
|
||||
None,
|
||||
output_slices,
|
||||
add_inputs=True,
|
||||
**kwargs)
|
||||
|
||||
@@ -29,18 +29,12 @@ from vllm.model_executor.model_loader.base_loader import BaseModelLoader
|
||||
from vllm.model_executor.model_loader.default_loader import DefaultModelLoader
|
||||
from vllm.model_executor.model_loader.utils import (
|
||||
initialize_model, process_weights_after_loading)
|
||||
|
||||
from vllm_ascend.utils import vllm_version_is
|
||||
from vllm.utils.torch_utils import set_default_torch_dtype
|
||||
|
||||
from .interaction.elastic import ElasticServer
|
||||
from .load import elastic_load
|
||||
from .utils import find_free_port, is_valid_path_prefix
|
||||
|
||||
if vllm_version_is("0.11.0"):
|
||||
from vllm.model_executor.model_loader.utils import set_default_torch_dtype
|
||||
else:
|
||||
from vllm.utils.torch_utils import set_default_torch_dtype
|
||||
|
||||
|
||||
@register_model_loader("netloader")
|
||||
class ModelNetLoaderElastic(BaseModelLoader):
|
||||
@@ -207,10 +201,8 @@ class ModelNetLoaderElastic(BaseModelLoader):
|
||||
if model is not None and (
|
||||
(self.listen_port and self.listen_port in range(1024, 65535)) or
|
||||
(self.listen_port is None)):
|
||||
if vllm_version_is("0.11.0"):
|
||||
from vllm.utils import get_ip
|
||||
else:
|
||||
from vllm.utils.network_utils import get_ip
|
||||
|
||||
from vllm.utils.network_utils import get_ip
|
||||
driver_ip = get_ip()
|
||||
|
||||
if driver_ip == '0.0.0.0':
|
||||
|
||||
@@ -24,32 +24,16 @@ from typing import Optional
|
||||
import torch
|
||||
from torch import nn
|
||||
from vllm.attention import AttentionMetadata
|
||||
from vllm.attention.layer import MLAAttention
|
||||
from vllm.config import CacheConfig, get_current_vllm_config
|
||||
from vllm.distributed import get_tensor_model_parallel_world_size
|
||||
from vllm.forward_context import ForwardContext, get_forward_context
|
||||
from vllm.model_executor.layers.mla import MLAModules
|
||||
from vllm.model_executor.layers.mla import (MLAModules,
|
||||
MultiHeadLatentAttentionWrapper)
|
||||
from vllm.model_executor.layers.quantization import QuantizationConfig
|
||||
from vllm.utils.torch_utils import direct_register_custom_op
|
||||
|
||||
from vllm_ascend.ascend_config import get_ascend_config
|
||||
from vllm_ascend.utils import vllm_version_is
|
||||
|
||||
if vllm_version_is("0.11.0"):
|
||||
from vllm.attention import Attention
|
||||
from vllm.model_executor.layers.mla import \
|
||||
MultiHeadLatentAttention as MultiHeadLatentAttentionWrapper
|
||||
from vllm.utils import direct_register_custom_op
|
||||
else:
|
||||
from vllm.attention.layer import MLAAttention
|
||||
from vllm.model_executor.layers.mla import MultiHeadLatentAttentionWrapper
|
||||
from vllm.utils.torch_utils import direct_register_custom_op
|
||||
|
||||
if vllm_version_is("0.11.0"):
|
||||
from vllm.attention import Attention
|
||||
from vllm.model_executor.layers.mla import \
|
||||
MultiHeadLatentAttention as MultiHeadLatentAttentionWrapper
|
||||
else:
|
||||
from vllm.attention.layer import MLAAttention
|
||||
from vllm.model_executor.layers.mla import MultiHeadLatentAttentionWrapper
|
||||
|
||||
|
||||
class IndexerWrapper(nn.Module):
|
||||
@@ -81,7 +65,6 @@ class IndexerWrapper(nn.Module):
|
||||
return
|
||||
|
||||
|
||||
# TODO(whx): adapt v0.11.0 and DSA
|
||||
class AscendMultiHeadLatentAttention(MultiHeadLatentAttentionWrapper):
|
||||
|
||||
def __init__(
|
||||
@@ -119,61 +102,30 @@ class AscendMultiHeadLatentAttention(MultiHeadLatentAttentionWrapper):
|
||||
ascend_indexer = IndexerWrapper(mla_modules.indexer)
|
||||
else:
|
||||
ascend_indexer = None
|
||||
|
||||
if vllm_version_is("0.11.0"):
|
||||
self.mla_attn = Attention(
|
||||
num_heads=num_heads,
|
||||
head_size=self.kv_lora_rank + self.qk_rope_head_dim,
|
||||
scale=scale,
|
||||
num_kv_heads=1,
|
||||
cache_config=cache_config,
|
||||
quant_config=quant_config,
|
||||
prefix=f"{prefix}.attn",
|
||||
use_mla=True,
|
||||
indexer=ascend_indexer,
|
||||
use_sparse=mla_modules.is_sparse,
|
||||
# MLA Args
|
||||
q_lora_rank=self.q_lora_rank,
|
||||
kv_lora_rank=self.kv_lora_rank,
|
||||
qk_nope_head_dim=self.qk_nope_head_dim,
|
||||
qk_rope_head_dim=self.qk_rope_head_dim,
|
||||
v_head_dim=self.v_head_dim,
|
||||
qk_head_dim=self.qk_head_dim,
|
||||
rotary_emb=mla_modules.rotary_emb,
|
||||
fused_qkv_a_proj=mla_modules.fused_qkv_a_proj,
|
||||
q_b_proj=mla_modules.q_b_proj,
|
||||
q_a_layernorm=mla_modules.q_a_layernorm,
|
||||
q_proj=mla_modules.q_proj,
|
||||
kv_a_proj_with_mqa=mla_modules.kv_a_proj_with_mqa,
|
||||
kv_a_layernorm=mla_modules.kv_a_layernorm,
|
||||
kv_b_proj=mla_modules.kv_b_proj,
|
||||
o_proj=mla_modules.o_proj,
|
||||
)
|
||||
else:
|
||||
self.mla_attn = MLAAttention(
|
||||
num_heads=num_heads,
|
||||
scale=scale,
|
||||
qk_nope_head_dim=self.qk_nope_head_dim,
|
||||
qk_rope_head_dim=self.qk_rope_head_dim,
|
||||
v_head_dim=self.v_head_dim,
|
||||
q_lora_rank=self.q_lora_rank,
|
||||
kv_lora_rank=self.kv_lora_rank,
|
||||
kv_b_proj=mla_modules.kv_b_proj,
|
||||
cache_config=cache_config,
|
||||
quant_config=quant_config,
|
||||
prefix=f"{prefix}.attn",
|
||||
use_sparse=mla_modules.is_sparse,
|
||||
indexer=ascend_indexer,
|
||||
# extra args
|
||||
rotary_emb=mla_modules.rotary_emb,
|
||||
fused_qkv_a_proj=mla_modules.fused_qkv_a_proj,
|
||||
q_b_proj=mla_modules.q_b_proj,
|
||||
q_a_layernorm=mla_modules.q_a_layernorm,
|
||||
q_proj=mla_modules.q_proj,
|
||||
kv_a_proj_with_mqa=mla_modules.kv_a_proj_with_mqa,
|
||||
kv_a_layernorm=mla_modules.kv_a_layernorm,
|
||||
o_proj=mla_modules.o_proj,
|
||||
)
|
||||
self.mla_attn = MLAAttention(
|
||||
num_heads=num_heads,
|
||||
scale=scale,
|
||||
qk_nope_head_dim=self.qk_nope_head_dim,
|
||||
qk_rope_head_dim=self.qk_rope_head_dim,
|
||||
v_head_dim=self.v_head_dim,
|
||||
q_lora_rank=self.q_lora_rank,
|
||||
kv_lora_rank=self.kv_lora_rank,
|
||||
kv_b_proj=mla_modules.kv_b_proj,
|
||||
cache_config=cache_config,
|
||||
quant_config=quant_config,
|
||||
prefix=f"{prefix}.attn",
|
||||
use_sparse=mla_modules.is_sparse,
|
||||
indexer=ascend_indexer,
|
||||
# extra args
|
||||
rotary_emb=mla_modules.rotary_emb,
|
||||
fused_qkv_a_proj=mla_modules.fused_qkv_a_proj,
|
||||
q_b_proj=mla_modules.q_b_proj,
|
||||
q_a_layernorm=mla_modules.q_a_layernorm,
|
||||
q_proj=mla_modules.q_proj,
|
||||
kv_a_proj_with_mqa=mla_modules.kv_a_proj_with_mqa,
|
||||
kv_a_layernorm=mla_modules.kv_a_layernorm,
|
||||
o_proj=mla_modules.o_proj,
|
||||
)
|
||||
|
||||
compilation_config = get_current_vllm_config().compilation_config
|
||||
if prefix in compilation_config.static_forward_context:
|
||||
|
||||
@@ -40,14 +40,11 @@ from vllm.model_executor.models.qwen2_5_vl import (
|
||||
Qwen2_5_VLDummyInputsBuilder, Qwen2_5_VLForConditionalGeneration,
|
||||
Qwen2_5_VLMultiModalProcessor, Qwen2_5_VLProcessingInfo)
|
||||
from vllm.model_executor.models.utils import maybe_prefix
|
||||
from vllm.model_executor.models.vision import conv3d_to_linear_weight
|
||||
from vllm.multimodal import MULTIMODAL_REGISTRY
|
||||
|
||||
from vllm_ascend.ascend_forward_context import set_ascend_forward_context
|
||||
from vllm_ascend.utils import (ACL_FORMAT_FRACTAL_ND, is_enable_nz,
|
||||
vllm_version_is)
|
||||
|
||||
if not vllm_version_is("0.11.0"):
|
||||
from vllm.model_executor.models.vision import conv3d_to_linear_weight
|
||||
from vllm_ascend.utils import ACL_FORMAT_FRACTAL_ND, is_enable_nz
|
||||
|
||||
MIN_PAD_SIZE = 64 # min_size to pad weight
|
||||
MAX_PAD_SIZE = 128 # max_size to pad weight
|
||||
@@ -360,9 +357,8 @@ class AscendQwen2_5_VisionTransformer(Qwen2_5_VisionTransformer):
|
||||
params_dict = dict(self.named_parameters(remove_duplicate=False))
|
||||
loaded_params: Set[str] = set()
|
||||
for name, loaded_weight in weights:
|
||||
if not vllm_version_is("0.11.0"):
|
||||
if name.endswith("patch_embed.proj.weight"):
|
||||
loaded_weight = conv3d_to_linear_weight(loaded_weight)
|
||||
if name.endswith("patch_embed.proj.weight"):
|
||||
loaded_weight = conv3d_to_linear_weight(loaded_weight)
|
||||
for (param_name, weight_name, shard_id) in stacked_params_mapping:
|
||||
if weight_name not in name:
|
||||
continue
|
||||
@@ -537,11 +533,8 @@ class AscendQwen2_5_VLForConditionalGeneration(
|
||||
image_embeds = image_input["image_embeds"].type(self.visual.dtype)
|
||||
else:
|
||||
pixel_values = image_input["pixel_values"].type(self.visual.dtype)
|
||||
if vllm_version_is("0.11.0"):
|
||||
with set_ascend_forward_context(None, self.vllm_config):
|
||||
image_embeds = self.visual(pixel_values, grid_thw=grid_thw)
|
||||
else:
|
||||
with set_ascend_forward_context(None, self.vllm_config):
|
||||
image_embeds = self.visual(pixel_values, grid_thw=grid_thw)
|
||||
|
||||
# Split concatenated embeddings for each image item.
|
||||
merge_size = self.visual.spatial_merge_size
|
||||
@@ -558,13 +551,9 @@ class AscendQwen2_5_VLForConditionalGeneration(
|
||||
else:
|
||||
pixel_values_videos = video_input["pixel_values_videos"].type(
|
||||
self.visual.dtype)
|
||||
if vllm_version_is("0.11.0"):
|
||||
with set_ascend_forward_context(None, self.vllm_config):
|
||||
video_embeds = self.visual(pixel_values_videos,
|
||||
grid_thw=grid_thw)
|
||||
else:
|
||||
with set_ascend_forward_context(None, self.vllm_config):
|
||||
video_embeds = self.visual(pixel_values_videos,
|
||||
grid_thw=grid_thw)
|
||||
|
||||
# Split concatenated embeddings for each video item.
|
||||
merge_size = self.visual.spatial_merge_size
|
||||
|
||||
@@ -38,13 +38,10 @@ from vllm.model_executor.models.qwen2_vl import (
|
||||
Qwen2VLForConditionalGeneration, Qwen2VLMultiModalProcessor,
|
||||
Qwen2VLProcessingInfo)
|
||||
from vllm.model_executor.models.utils import maybe_prefix
|
||||
from vllm.model_executor.models.vision import conv3d_to_linear_weight
|
||||
from vllm.multimodal import MULTIMODAL_REGISTRY
|
||||
|
||||
from vllm_ascend.utils import (ACL_FORMAT_FRACTAL_ND, is_enable_nz,
|
||||
vllm_version_is)
|
||||
|
||||
if not vllm_version_is("0.11.0"):
|
||||
from vllm.model_executor.models.vision import conv3d_to_linear_weight
|
||||
from vllm_ascend.utils import ACL_FORMAT_FRACTAL_ND, is_enable_nz
|
||||
|
||||
MIN_PAD_SIZE = 64 # min_size to pad weight
|
||||
MAX_PAD_SIZE = 128 # max_size to pad weight
|
||||
@@ -308,9 +305,8 @@ class AscendQwen2VisionTransformer(Qwen2VisionTransformer):
|
||||
loaded_params: Set[str] = set()
|
||||
|
||||
for name, loaded_weight in weights:
|
||||
if not vllm_version_is("0.11.0"):
|
||||
if name.endswith("patch_embed.proj.weight"):
|
||||
loaded_weight = conv3d_to_linear_weight(loaded_weight)
|
||||
if name.endswith("patch_embed.proj.weight"):
|
||||
loaded_weight = conv3d_to_linear_weight(loaded_weight)
|
||||
|
||||
for (param_name, weight_name, shard_id) in stacked_params_mapping:
|
||||
if weight_name not in name:
|
||||
|
||||
@@ -50,8 +50,6 @@ from vllm.model_executor.utils import set_weight_attrs
|
||||
from vllm.transformers_utils.configs import Qwen3NextConfig
|
||||
from vllm.v1.attention.backends.gdn_attn import GDNAttentionMetadata
|
||||
|
||||
from vllm_ascend.utils import vllm_version_is
|
||||
|
||||
from vllm.model_executor.models.qwen3_next import ( # isort: skip
|
||||
Qwen3NextAttention, Qwen3NextDecoderLayer, Qwen3NextForCausalLM,
|
||||
Qwen3NextGatedDeltaNet, Qwen3NextModel, Qwen3NextSparseMoeBlock,
|
||||
@@ -202,11 +200,8 @@ class CustomQwen3NextGatedDeltaNet(Qwen3NextGatedDeltaNet, MambaBase):
|
||||
spec_query_start_loc = attn_metadata.spec_query_start_loc
|
||||
non_spec_query_start_loc = attn_metadata.non_spec_query_start_loc
|
||||
spec_sequence_masks = attn_metadata.spec_sequence_masks
|
||||
if vllm_version_is("0.11.0"):
|
||||
spec_token_masks = attn_metadata.spec_token_masks
|
||||
else:
|
||||
spec_token_indx = attn_metadata.spec_token_indx
|
||||
non_spec_token_indx = attn_metadata.non_spec_token_indx
|
||||
spec_token_indx = attn_metadata.spec_token_indx
|
||||
non_spec_token_indx = attn_metadata.non_spec_token_indx
|
||||
spec_state_indices_tensor = attn_metadata.spec_state_indices_tensor # noqa: E501
|
||||
non_spec_state_indices_tensor = attn_metadata.non_spec_state_indices_tensor # noqa: E501
|
||||
self_kv_cache = self.kv_cache[forward_context.virtual_engine]
|
||||
@@ -221,9 +216,6 @@ class CustomQwen3NextGatedDeltaNet(Qwen3NextGatedDeltaNet, MambaBase):
|
||||
|
||||
# 1. Set up dimensions for reshapes later
|
||||
projected_states, _ = self.in_proj(hidden_states[:num_actual_tokens])
|
||||
if vllm_version_is("0.11.0"):
|
||||
if spec_token_masks is not None:
|
||||
spec_token_masks = spec_token_masks[:num_actual_tokens]
|
||||
projected_states_qkvz, projected_states_ba = torch.split(
|
||||
projected_states,
|
||||
[
|
||||
@@ -248,13 +240,9 @@ class CustomQwen3NextGatedDeltaNet(Qwen3NextGatedDeltaNet, MambaBase):
|
||||
mixed_qkv_spec = mixed_qkv
|
||||
mixed_qkv_non_spec = None
|
||||
else:
|
||||
if vllm_version_is("0.11.0"):
|
||||
mixed_qkv_spec = mixed_qkv[spec_token_masks]
|
||||
mixed_qkv_non_spec = mixed_qkv[~spec_token_masks]
|
||||
else:
|
||||
mixed_qkv_spec = mixed_qkv.index_select(0, spec_token_indx)
|
||||
mixed_qkv_non_spec = mixed_qkv.index_select(
|
||||
0, non_spec_token_indx)
|
||||
mixed_qkv_spec = mixed_qkv.index_select(0, spec_token_indx)
|
||||
mixed_qkv_non_spec = mixed_qkv.index_select(
|
||||
0, non_spec_token_indx)
|
||||
else:
|
||||
mixed_qkv_spec = None
|
||||
mixed_qkv_non_spec = mixed_qkv
|
||||
@@ -322,16 +310,10 @@ class CustomQwen3NextGatedDeltaNet(Qwen3NextGatedDeltaNet, MambaBase):
|
||||
g_non_spec = None
|
||||
beta_non_spec = None
|
||||
else:
|
||||
if vllm_version_is("0.11.0"):
|
||||
g_spec = g[:, spec_token_masks]
|
||||
beta_spec = beta[:, spec_token_masks]
|
||||
g_non_spec = g[:, ~spec_token_masks]
|
||||
beta_non_spec = beta[:, ~spec_token_masks]
|
||||
else:
|
||||
g_spec = g.index_select(1, spec_token_indx)
|
||||
beta_spec = beta.index_select(1, spec_token_indx)
|
||||
g_non_spec = g.index_select(1, non_spec_token_indx)
|
||||
beta_non_spec = beta.index_select(1, non_spec_token_indx)
|
||||
g_spec = g.index_select(1, spec_token_indx)
|
||||
beta_spec = beta.index_select(1, spec_token_indx)
|
||||
g_non_spec = g.index_select(1, non_spec_token_indx)
|
||||
beta_non_spec = beta.index_select(1, non_spec_token_indx)
|
||||
else:
|
||||
g_spec = None
|
||||
beta_spec = None
|
||||
@@ -439,14 +421,9 @@ class CustomQwen3NextGatedDeltaNet(Qwen3NextGatedDeltaNet, MambaBase):
|
||||
dtype=core_attn_out_non_spec.dtype,
|
||||
device=core_attn_out_non_spec.device,
|
||||
)
|
||||
if vllm_version_is("0.11.0"):
|
||||
core_attn_out[:, spec_token_masks] = core_attn_out_spec
|
||||
core_attn_out[:, ~spec_token_masks] = core_attn_out_non_spec
|
||||
else:
|
||||
core_attn_out.index_copy_(1, spec_token_indx,
|
||||
core_attn_out_spec)
|
||||
core_attn_out.index_copy_(1, non_spec_token_indx,
|
||||
core_attn_out_non_spec)
|
||||
core_attn_out.index_copy_(1, spec_token_indx, core_attn_out_spec)
|
||||
core_attn_out.index_copy_(1, non_spec_token_indx,
|
||||
core_attn_out_non_spec)
|
||||
elif spec_sequence_masks is not None:
|
||||
core_attn_out = core_attn_out_spec
|
||||
else:
|
||||
|
||||
@@ -19,7 +19,7 @@ from typing import Any, Callable, Optional
|
||||
|
||||
import torch
|
||||
import torch_npu
|
||||
from vllm.config import get_current_vllm_config
|
||||
from vllm.config import CompilationMode, get_current_vllm_config
|
||||
from vllm.distributed import (get_dp_group, get_ep_group, get_tp_group,
|
||||
tensor_model_parallel_all_reduce)
|
||||
from vllm.forward_context import get_forward_context
|
||||
@@ -28,6 +28,8 @@ from vllm.model_executor.layers.fused_moe.config import FusedMoEConfig
|
||||
from vllm.model_executor.layers.fused_moe.layer import (
|
||||
FusedMoE, UnquantizedFusedMoEMethod, determine_expert_map,
|
||||
get_compressed_expert_map)
|
||||
from vllm.model_executor.layers.fused_moe.shared_fused_moe import \
|
||||
SharedFusedMoE
|
||||
|
||||
from vllm_ascend.ascend_config import get_ascend_config
|
||||
from vllm_ascend.ascend_forward_context import MoECommType
|
||||
@@ -44,17 +46,7 @@ from vllm_ascend.quantization.w8a8_dynamic import \
|
||||
from vllm_ascend.utils import (ACL_FORMAT_FRACTAL_NZ, enable_sp, is_310p,
|
||||
is_enable_nz, npu_stream_switch,
|
||||
shared_expert_dp_enabled,
|
||||
shared_experts_calculation_stream,
|
||||
vllm_version_is)
|
||||
|
||||
if vllm_version_is("0.11.0"):
|
||||
from vllm.config import CompilationLevel
|
||||
|
||||
from vllm.model_executor.layers.shared_fused_moe import SharedFusedMoE # type: ignore # isort:skip
|
||||
else:
|
||||
from vllm.config import CompilationMode
|
||||
from vllm.model_executor.layers.fused_moe.shared_fused_moe import \
|
||||
SharedFusedMoE
|
||||
shared_experts_calculation_stream)
|
||||
|
||||
|
||||
class AscendUnquantizedFusedMoEMethod(UnquantizedFusedMoEMethod):
|
||||
@@ -73,16 +65,9 @@ class AscendUnquantizedFusedMoEMethod(UnquantizedFusedMoEMethod):
|
||||
if ascend_config.torchair_graph_config.enabled:
|
||||
self.use_aclgraph = False
|
||||
else:
|
||||
if vllm_version_is("0.11.0"):
|
||||
self.use_aclgraph = (
|
||||
vllm_config.compilation_config.level
|
||||
== CompilationLevel.PIECEWISE
|
||||
and not vllm_config.model_config.enforce_eager)
|
||||
else:
|
||||
self.use_aclgraph = (
|
||||
vllm_config.compilation_config.mode
|
||||
== CompilationMode.VLLM_COMPILE
|
||||
and not vllm_config.model_config.enforce_eager)
|
||||
self.use_aclgraph = (vllm_config.compilation_config.mode
|
||||
== CompilationMode.VLLM_COMPILE and
|
||||
not vllm_config.model_config.enforce_eager)
|
||||
|
||||
self.transpose = True
|
||||
|
||||
@@ -209,12 +194,8 @@ class AscendFusedMoE(FusedMoE):
|
||||
dtype=vllm_config.model_config.dtype)
|
||||
|
||||
# init moe.
|
||||
if vllm_version_is("0.11.0"):
|
||||
self.local_num_experts, self.expert_map = determine_expert_map(
|
||||
self.ep_size, self.ep_rank, self.global_num_experts)
|
||||
else:
|
||||
self.local_num_experts, self.expert_map, _ = determine_expert_map(
|
||||
self.ep_size, self.ep_rank, self.global_num_experts)
|
||||
self.local_num_experts, self.expert_map, _ = determine_expert_map(
|
||||
self.ep_size, self.ep_rank, self.global_num_experts)
|
||||
# static eplb initializing with expert_map_path
|
||||
if self.expert_map_path and os.path.exists(
|
||||
self.expert_map_path) and os.access(self.expert_map_path,
|
||||
|
||||
@@ -7,17 +7,12 @@ from vllm.distributed import (get_dp_group, get_ep_group,
|
||||
tensor_model_parallel_all_reduce,
|
||||
tensor_model_parallel_reduce_scatter)
|
||||
from vllm.forward_context import get_forward_context
|
||||
from vllm.utils.torch_utils import direct_register_custom_op
|
||||
|
||||
import vllm_ascend.envs as envs_ascend
|
||||
from vllm_ascend.ascend_forward_context import MoECommType
|
||||
from vllm_ascend.ops.weight_prefetch import maybe_npu_prefetch
|
||||
from vllm_ascend.utils import (npu_stream_switch, prefetch_stream,
|
||||
vllm_version_is)
|
||||
|
||||
if vllm_version_is("0.11.0"):
|
||||
from vllm.utils import direct_register_custom_op
|
||||
else:
|
||||
from vllm.utils.torch_utils import direct_register_custom_op
|
||||
from vllm_ascend.utils import npu_stream_switch, prefetch_stream
|
||||
|
||||
|
||||
def _maybe_all_gather_and_maybe_unpad_impl(
|
||||
|
||||
@@ -3,23 +3,10 @@ import vllm.model_executor.models.config
|
||||
from vllm.logger import init_logger
|
||||
from vllm.model_executor.models import ModelRegistry
|
||||
from vllm.model_executor.models.config import MambaModelConfig
|
||||
|
||||
from vllm_ascend.utils import vllm_version_is
|
||||
|
||||
if vllm_version_is("0.11.0"):
|
||||
from vllm.utils import cdiv
|
||||
else:
|
||||
from vllm.utils.math_utils import cdiv
|
||||
|
||||
from vllm.utils.math_utils import cdiv
|
||||
from vllm.utils.torch_utils import STR_DTYPE_TO_TORCH_DTYPE
|
||||
from vllm.v1.kv_cache_interface import FullAttentionSpec, MambaSpec
|
||||
|
||||
from vllm_ascend.utils import vllm_version_is
|
||||
|
||||
if vllm_version_is("0.11.0"):
|
||||
from vllm.utils import STR_DTYPE_TO_TORCH_DTYPE
|
||||
else:
|
||||
from vllm.utils.torch_utils import STR_DTYPE_TO_TORCH_DTYPE
|
||||
|
||||
|
||||
@classmethod
|
||||
def verify_and_update_config(cls, vllm_config) -> None:
|
||||
|
||||
@@ -8,21 +8,14 @@ import vllm.v1.executor.multiproc_executor
|
||||
from vllm import envs
|
||||
from vllm.config import VllmConfig
|
||||
from vllm.distributed.device_communicators.shm_broadcast import MessageQueue
|
||||
from vllm.utils.network_utils import (get_distributed_init_method,
|
||||
get_loopback_ip, get_open_port)
|
||||
from vllm.utils.system_utils import get_mp_context
|
||||
from vllm.v1.executor.abstract import FailureCallback
|
||||
from vllm.v1.executor.multiproc_executor import (
|
||||
MultiprocExecutor, UnreadyWorkerProcHandle, WorkerProc,
|
||||
set_multiprocessing_worker_envs)
|
||||
|
||||
from vllm_ascend.utils import vllm_version_is
|
||||
|
||||
if vllm_version_is("0.11.0"):
|
||||
from vllm.utils import (get_distributed_init_method, get_loopback_ip,
|
||||
get_mp_context, get_open_port)
|
||||
else:
|
||||
from vllm.utils.network_utils import (get_distributed_init_method,
|
||||
get_loopback_ip, get_open_port)
|
||||
from vllm.utils.system_utils import get_mp_context
|
||||
|
||||
|
||||
class AscendMultiprocExecutor(MultiprocExecutor):
|
||||
supports_pp: bool = True
|
||||
|
||||
@@ -28,9 +28,3 @@ import vllm_ascend.patch.worker.patch_roberta # noqa
|
||||
import vllm_ascend.patch.worker.patch_weight_loader # noqa
|
||||
import vllm_ascend.patch.worker.patch_multimodal_merge # noqa
|
||||
import vllm_ascend.patch.worker.patch_minicpm # noqa
|
||||
|
||||
from vllm_ascend.utils import vllm_version_is
|
||||
|
||||
if vllm_version_is("0.11.0"):
|
||||
import vllm_ascend.patch.worker.patch_deepseek_mtp # noqa
|
||||
import vllm_ascend.patch.worker.patch_deepseek_v3_2 # noqa
|
||||
|
||||
@@ -1,94 +0,0 @@
|
||||
import torch
|
||||
import torch.nn as nn
|
||||
from transformers import PretrainedConfig
|
||||
from vllm.config import VllmConfig
|
||||
from vllm.model_executor.layers.layernorm import RMSNorm
|
||||
from vllm.model_executor.layers.quantization import QuantizationConfig
|
||||
from vllm.model_executor.layers.vocab_parallel_embedding import ParallelLMHead
|
||||
from vllm.model_executor.models.deepseek_mtp import \
|
||||
DeepSeekMultiTokenPredictorLayer
|
||||
from vllm.model_executor.models.deepseek_v2 import DeepseekV2DecoderLayer
|
||||
from vllm.model_executor.models.utils import maybe_prefix
|
||||
|
||||
from vllm_ascend.utils import vllm_version_is
|
||||
|
||||
if vllm_version_is("0.11.0"):
|
||||
from vllm.compilation.decorators import support_torch_compile
|
||||
from vllm.model_executor.models.deepseek_mtp import DeepSeekMTP
|
||||
|
||||
|
||||
class SharedHead(nn.Module):
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
config: PretrainedConfig,
|
||||
prefix: str,
|
||||
quant_config: QuantizationConfig = None,
|
||||
) -> None:
|
||||
super().__init__()
|
||||
self.norm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
|
||||
self.head = ParallelLMHead(
|
||||
config.vocab_size,
|
||||
config.hidden_size,
|
||||
quant_config=quant_config,
|
||||
prefix=maybe_prefix(prefix, "head"),
|
||||
)
|
||||
|
||||
def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
|
||||
return self.norm(hidden_states)
|
||||
|
||||
|
||||
def predictor_init(self, vllm_config: VllmConfig, prefix: str) -> None:
|
||||
nn.Module.__init__(self)
|
||||
config = vllm_config.model_config.hf_config
|
||||
quant_config = vllm_config.quant_config
|
||||
|
||||
self.enorm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
|
||||
self.hnorm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
|
||||
self.eh_proj = nn.Linear(config.hidden_size * 2,
|
||||
config.hidden_size,
|
||||
bias=False)
|
||||
# We don't need topk_indices_buffer in Ascend
|
||||
topk_indices_buffer = None
|
||||
self.shared_head = SharedHead(config=config,
|
||||
prefix=prefix,
|
||||
quant_config=quant_config)
|
||||
self.mtp_block = DeepseekV2DecoderLayer(vllm_config, prefix,
|
||||
topk_indices_buffer)
|
||||
|
||||
|
||||
def predictor_forward(
|
||||
self,
|
||||
input_ids: torch.Tensor,
|
||||
positions: torch.Tensor,
|
||||
previous_hidden_states: torch.Tensor,
|
||||
inputs_embeds: torch.Tensor | None = None,
|
||||
spec_step_index: int = 0,
|
||||
) -> torch.Tensor:
|
||||
assert inputs_embeds is not None
|
||||
# masking inputs at position 0, as not needed by MTP
|
||||
inputs_embeds = torch.where(positions.unsqueeze(-1) == 0, 0, inputs_embeds)
|
||||
inputs_embeds = self.enorm(inputs_embeds)
|
||||
previous_hidden_states = self.hnorm(previous_hidden_states)
|
||||
|
||||
hidden_states = self.eh_proj(
|
||||
torch.cat([inputs_embeds, previous_hidden_states], dim=-1))
|
||||
|
||||
hidden_states, residual = self.mtp_block(positions=positions,
|
||||
hidden_states=hidden_states,
|
||||
residual=None)
|
||||
hidden_states = residual + hidden_states
|
||||
return hidden_states
|
||||
|
||||
|
||||
# Patch this only for aclgraph support, as this is not support in vLLM 0.11.0
|
||||
@support_torch_compile
|
||||
class AscendDeepSeekMTP(DeepSeekMTP):
|
||||
|
||||
def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
|
||||
super().__init__(vllm_config=vllm_config, prefix=prefix)
|
||||
|
||||
|
||||
DeepSeekMultiTokenPredictorLayer.__init__ = predictor_init
|
||||
if vllm_version_is("0.11.0"):
|
||||
DeepSeekMultiTokenPredictorLayer.forward = predictor_forward
|
||||
@@ -1,108 +0,0 @@
|
||||
#
|
||||
# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved.
|
||||
# This file is a part of the vllm-ascend project.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
#
|
||||
from itertools import islice
|
||||
from typing import Optional, Union
|
||||
|
||||
import torch
|
||||
import vllm.model_executor.models.deepseek_v2
|
||||
from torch import nn
|
||||
from vllm.compilation.decorators import support_torch_compile
|
||||
from vllm.config import VllmConfig
|
||||
from vllm.distributed import get_pp_group
|
||||
from vllm.model_executor.layers.layernorm import RMSNorm
|
||||
from vllm.model_executor.layers.vocab_parallel_embedding import \
|
||||
VocabParallelEmbedding
|
||||
from vllm.model_executor.models.deepseek_v2 import DeepseekV2DecoderLayer
|
||||
from vllm.model_executor.models.utils import (
|
||||
PPMissingLayer, make_empty_intermediate_tensors_factory, make_layers)
|
||||
from vllm.sequence import IntermediateTensors
|
||||
|
||||
|
||||
@support_torch_compile
|
||||
class DeepseekV2Model(nn.Module):
|
||||
|
||||
fall_back_to_pt_during_load = False
|
||||
|
||||
def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
|
||||
super().__init__()
|
||||
|
||||
config = vllm_config.model_config.hf_config
|
||||
quant_config = vllm_config.quant_config
|
||||
self.config = config
|
||||
|
||||
self.vocab_size = config.vocab_size
|
||||
self.is_v32 = hasattr(config, "index_topk")
|
||||
topk_indices_buffer = None
|
||||
|
||||
if get_pp_group().is_first_rank:
|
||||
self.embed_tokens = VocabParallelEmbedding(
|
||||
config.vocab_size,
|
||||
config.hidden_size,
|
||||
quant_config=quant_config,
|
||||
prefix=f"{prefix}.embed_tokens")
|
||||
else:
|
||||
self.embed_tokens = PPMissingLayer()
|
||||
|
||||
self.start_layer, self.end_layer, self.layers = make_layers(
|
||||
config.num_hidden_layers,
|
||||
lambda prefix: DeepseekV2DecoderLayer(vllm_config, prefix,
|
||||
topk_indices_buffer),
|
||||
prefix=f"{prefix}.layers")
|
||||
|
||||
if get_pp_group().is_last_rank:
|
||||
self.norm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
|
||||
else:
|
||||
self.norm = PPMissingLayer()
|
||||
self.make_empty_intermediate_tensors = (
|
||||
make_empty_intermediate_tensors_factory(
|
||||
["hidden_states", "residual"], config.hidden_size))
|
||||
|
||||
def get_input_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor:
|
||||
return self.embed_tokens(input_ids)
|
||||
|
||||
def forward(
|
||||
self,
|
||||
input_ids: torch.Tensor,
|
||||
positions: torch.Tensor,
|
||||
intermediate_tensors: Optional[IntermediateTensors],
|
||||
inputs_embeds: Optional[torch.Tensor] = None,
|
||||
) -> Union[torch.Tensor, IntermediateTensors]:
|
||||
if get_pp_group().is_first_rank:
|
||||
if inputs_embeds is not None:
|
||||
hidden_states = inputs_embeds
|
||||
else:
|
||||
hidden_states = self.get_input_embeddings(input_ids)
|
||||
residual = None
|
||||
else:
|
||||
assert intermediate_tensors is not None
|
||||
hidden_states = intermediate_tensors["hidden_states"]
|
||||
residual = intermediate_tensors["residual"]
|
||||
|
||||
for layer in islice(self.layers, self.start_layer, self.end_layer):
|
||||
hidden_states, residual = layer(positions, hidden_states, residual)
|
||||
|
||||
if not get_pp_group().is_last_rank:
|
||||
return IntermediateTensors({
|
||||
"hidden_states": hidden_states,
|
||||
"residual": residual
|
||||
})
|
||||
|
||||
hidden_states, _ = self.norm(hidden_states, residual)
|
||||
return hidden_states
|
||||
|
||||
|
||||
vllm.model_executor.models.deepseek_v2.DeepseekV2Model = DeepseekV2Model
|
||||
@@ -6,16 +6,11 @@ import vllm.model_executor.layers.mamba.ops.causal_conv1d
|
||||
from vllm_ascend.ops.casual_conv1d import (causal_conv1d_fn,
|
||||
causal_conv1d_update_npu)
|
||||
from vllm_ascend.ops.fla import LayerNormFn, torch_chunk_gated_delta_rule
|
||||
from vllm_ascend.ops.sigmoid_gating import (
|
||||
fused_recurrent_gated_delta_rule_fwd_kernel,
|
||||
fused_recurrent_gated_delta_rule_fwd_kernel_0_11_0)
|
||||
from vllm_ascend.utils import vllm_version_is
|
||||
from vllm_ascend.ops.sigmoid_gating import \
|
||||
fused_recurrent_gated_delta_rule_fwd_kernel
|
||||
|
||||
vllm.model_executor.layers.mamba.ops.causal_conv1d.causal_conv1d_update = causal_conv1d_update_npu
|
||||
vllm.model_executor.layers.mamba.ops.causal_conv1d.causal_conv1d_fn = causal_conv1d_fn
|
||||
if vllm_version_is('0.11.0'):
|
||||
vllm.model_executor.layers.fla.ops.fused_recurrent.fused_recurrent_gated_delta_rule_fwd_kernel = fused_recurrent_gated_delta_rule_fwd_kernel_0_11_0
|
||||
else:
|
||||
vllm.model_executor.layers.fla.ops.fused_recurrent.fused_recurrent_gated_delta_rule_fwd_kernel = fused_recurrent_gated_delta_rule_fwd_kernel
|
||||
vllm.model_executor.layers.fla.ops.fused_recurrent.fused_recurrent_gated_delta_rule_fwd_kernel = fused_recurrent_gated_delta_rule_fwd_kernel
|
||||
vllm.model_executor.layers.fla.ops.layernorm_guard.LayerNormFn = LayerNormFn
|
||||
vllm.model_executor.layers.fla.ops.chunk.chunk_gated_delta_rule = torch_chunk_gated_delta_rule
|
||||
|
||||
@@ -3,13 +3,7 @@ from torch.nn.parameter import Parameter
|
||||
from vllm.logger import init_logger
|
||||
from vllm.model_executor.layers.linear import UnquantizedLinearMethod
|
||||
from vllm.model_executor.utils import set_weight_attrs
|
||||
|
||||
from vllm_ascend.utils import vllm_version_is
|
||||
|
||||
if vllm_version_is("0.11.0"):
|
||||
from vllm.utils import GiB_bytes
|
||||
else:
|
||||
from vllm.utils.mem_constants import GiB_bytes
|
||||
from vllm.utils.mem_constants import GiB_bytes
|
||||
|
||||
logger = init_logger(__name__)
|
||||
|
||||
|
||||
@@ -34,7 +34,7 @@ from vllm_ascend.utils import (ASCEND_QUANTIZATION_METHOD, enable_sp, is_310p,
|
||||
is_vl_model, prefill_context_parallel_enable,
|
||||
update_aclgraph_sizes,
|
||||
update_cudagraph_capture_sizes,
|
||||
update_default_aclgraph_sizes, vllm_version_is)
|
||||
update_default_aclgraph_sizes)
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from vllm.config import ModelConfig, VllmConfig
|
||||
@@ -120,10 +120,7 @@ class NPUPlatform(Platform):
|
||||
# initialize ascend config from vllm additional_config
|
||||
ascend_config = init_ascend_config(vllm_config)
|
||||
|
||||
if vllm_version_is("0.11.0"):
|
||||
from vllm.config import CompilationLevel
|
||||
else:
|
||||
from vllm.config import CompilationMode # noqa: E402
|
||||
from vllm.config import CompilationMode # noqa: E402
|
||||
|
||||
compilation_config = vllm_config.compilation_config
|
||||
model_config = vllm_config.model_config
|
||||
@@ -149,29 +146,17 @@ class NPUPlatform(Platform):
|
||||
from vllm.config.compilation import CUDAGraphMode
|
||||
if enforce_eager:
|
||||
logger.info("Compilation disabled, using eager mode by default")
|
||||
if vllm_version_is("0.11.0"):
|
||||
compilation_config.level = CompilationLevel.NO_COMPILATION
|
||||
else:
|
||||
compilation_config.mode = CompilationMode.NONE
|
||||
compilation_config.mode = CompilationMode.NONE
|
||||
|
||||
compilation_config.cudagraph_num_of_warmups = 1
|
||||
|
||||
if vllm_version_is("0.11.0"):
|
||||
if compilation_config.level not in [
|
||||
CompilationLevel.NO_COMPILATION, CompilationLevel.PIECEWISE
|
||||
]:
|
||||
logger.warning(
|
||||
"NPU does not support %s compilation level. Setting CUDAGraphMode to NONE",
|
||||
compilation_config.level)
|
||||
compilation_config.cudagraph_mode = CUDAGraphMode.NONE
|
||||
else:
|
||||
if compilation_config.mode not in [
|
||||
CompilationMode.NONE, CompilationMode.VLLM_COMPILE
|
||||
]:
|
||||
logger.warning(
|
||||
"NPU does not support %s compilation mode. Setting CUDAGraphMode to NONE",
|
||||
compilation_config.mode)
|
||||
compilation_config.cudagraph_mode = CUDAGraphMode.NONE
|
||||
if compilation_config.mode not in [
|
||||
CompilationMode.NONE, CompilationMode.VLLM_COMPILE
|
||||
]:
|
||||
logger.warning(
|
||||
"NPU does not support %s compilation mode. Setting CUDAGraphMode to NONE",
|
||||
compilation_config.mode)
|
||||
compilation_config.cudagraph_mode = CUDAGraphMode.NONE
|
||||
|
||||
# set CUDAGraphMode to None when torchair is enabled, no mather what compilation_config.level is.
|
||||
if ascend_config.torchair_graph_config.enabled:
|
||||
@@ -211,96 +196,49 @@ class NPUPlatform(Platform):
|
||||
f"{vllm_config.parallel_config.tensor_parallel_size}")
|
||||
if len(sp_aclgraph_sizes) != len(original_sizes):
|
||||
compilation_config.cudagraph_capture_sizes = sp_aclgraph_sizes
|
||||
if vllm_version_is("0.11.0"):
|
||||
compilation_config.init_with_cudagraph_sizes(
|
||||
sp_aclgraph_sizes)
|
||||
else:
|
||||
update_cudagraph_capture_sizes(vllm_config,
|
||||
sp_aclgraph_sizes)
|
||||
update_cudagraph_capture_sizes(vllm_config, sp_aclgraph_sizes)
|
||||
|
||||
# TODO: Full graph is fully supported later, and the default value will be set to full graph.
|
||||
if compilation_config.cudagraph_mode == CUDAGraphMode.FULL_AND_PIECEWISE:
|
||||
compilation_config.cudagraph_mode = CUDAGraphMode.PIECEWISE
|
||||
|
||||
if vllm_version_is("0.11.0"):
|
||||
if compilation_config.cudagraph_mode == CUDAGraphMode.NONE:
|
||||
compilation_config.level = CompilationLevel.NO_COMPILATION
|
||||
elif compilation_config.cudagraph_mode == CUDAGraphMode.PIECEWISE:
|
||||
logger.info(
|
||||
"PIECEWISE compilation enabled on NPU. use_inductor not supported - "
|
||||
"using only ACL Graph mode")
|
||||
assert compilation_config.level == CompilationLevel.PIECEWISE, \
|
||||
"When enabling piecewise aclgraph, please make sure compilation_config.level == CompilationLevel.PIECEWISE and compilation_config.cudagraph_mode == CUDAGraphMode.PIECEWISE"
|
||||
compilation_config.set_splitting_ops_for_v1()
|
||||
compilation_config.use_inductor = False
|
||||
compilation_config.splitting_ops.extend([
|
||||
"vllm.unified_ascend_attention_with_output",
|
||||
"vllm.mla_forward"
|
||||
])
|
||||
update_aclgraph_sizes(vllm_config)
|
||||
elif compilation_config.cudagraph_mode == CUDAGraphMode.FULL_DECODE_ONLY or\
|
||||
compilation_config.cudagraph_mode == CUDAGraphMode.FULL:
|
||||
logger.info(
|
||||
"FULL_DECODE_ONLY compilation enabled on NPU. use_inductor not supported - "
|
||||
"using only ACL Graph mode")
|
||||
compilation_config.use_inductor = False
|
||||
warning_message = """\033[91m
|
||||
**********************************************************************************
|
||||
* WARNING: You have enabled the *full graph* feature.
|
||||
* This is an early experimental stage and may involve various unknown issues.
|
||||
* A known problem is that capturing too many batch sizes can lead to OOM
|
||||
* (Out of Memory) errors or inference hangs. If you encounter such issues,
|
||||
* consider reducing `gpu_memory_utilization` or manually specifying a smaller
|
||||
* batch size for graph capture.
|
||||
* For more details, please refer to:
|
||||
* https://docs.vllm.ai/en/stable/configuration/conserving_memory.html#reduce-cuda-graphs
|
||||
**********************************************************************************\033[0m
|
||||
"""
|
||||
logger.warning(warning_message)
|
||||
else:
|
||||
logger.info(
|
||||
"%s cudagraph_mode is not support on NPU. falling back to NONE",
|
||||
compilation_config.cudagraph_mode)
|
||||
compilation_config.cudagraph_mode = CUDAGraphMode.NONE
|
||||
compilation_config.level = CompilationLevel.NO_COMPILATION
|
||||
if compilation_config.cudagraph_mode == CUDAGraphMode.NONE:
|
||||
compilation_config.mode = CompilationMode.NONE
|
||||
elif compilation_config.cudagraph_mode == CUDAGraphMode.PIECEWISE:
|
||||
logger.info(
|
||||
"PIECEWISE compilation enabled on NPU. use_inductor not supported - "
|
||||
"using only ACL Graph mode")
|
||||
assert compilation_config.mode == CompilationMode.VLLM_COMPILE, \
|
||||
"When enabling VLLM_COMPILE aclgraph, please make sure compilation_config.mode == CompilationMode.VLLM_COMPILE and compilation_config.cudagraph_mode == CUDAGraphMode.VLLM_COMPILE"
|
||||
compilation_config.set_splitting_ops_for_v1()
|
||||
compilation_config.use_inductor = False
|
||||
compilation_config.splitting_ops.extend(["vllm::mla_forward"])
|
||||
update_aclgraph_sizes(vllm_config)
|
||||
elif compilation_config.cudagraph_mode == CUDAGraphMode.FULL_DECODE_ONLY or\
|
||||
compilation_config.cudagraph_mode == CUDAGraphMode.FULL:
|
||||
logger.info(
|
||||
"FULL_DECODE_ONLY compilation enabled on NPU. use_inductor not supported - "
|
||||
"using only ACL Graph mode")
|
||||
compilation_config.use_inductor = False
|
||||
warning_message = """\033[91m
|
||||
**********************************************************************************
|
||||
* WARNING: You have enabled the *full graph* feature.
|
||||
* This is an early experimental stage and may involve various unknown issues.
|
||||
* A known problem is that capturing too many batch sizes can lead to OOM
|
||||
* (Out of Memory) errors or inference hangs. If you encounter such issues,
|
||||
* consider reducing `gpu_memory_utilization` or manually specifying a smaller
|
||||
* batch size for graph capture.
|
||||
* For more details, please refer to:
|
||||
* https://docs.vllm.ai/en/stable/configuration/conserving_memory.html#reduce-cuda-graphs
|
||||
**********************************************************************************\033[0m
|
||||
"""
|
||||
logger.warning(warning_message)
|
||||
else:
|
||||
if compilation_config.cudagraph_mode == CUDAGraphMode.NONE:
|
||||
compilation_config.mode = CompilationMode.NONE
|
||||
elif compilation_config.cudagraph_mode == CUDAGraphMode.PIECEWISE:
|
||||
logger.info(
|
||||
"PIECEWISE compilation enabled on NPU. use_inductor not supported - "
|
||||
"using only ACL Graph mode")
|
||||
assert compilation_config.mode == CompilationMode.VLLM_COMPILE, \
|
||||
"When enabling VLLM_COMPILE aclgraph, please make sure compilation_config.mode == CompilationMode.VLLM_COMPILE and compilation_config.cudagraph_mode == CUDAGraphMode.VLLM_COMPILE"
|
||||
compilation_config.set_splitting_ops_for_v1()
|
||||
compilation_config.use_inductor = False
|
||||
compilation_config.splitting_ops.extend(["vllm::mla_forward"])
|
||||
update_aclgraph_sizes(vllm_config)
|
||||
elif compilation_config.cudagraph_mode == CUDAGraphMode.FULL_DECODE_ONLY or\
|
||||
compilation_config.cudagraph_mode == CUDAGraphMode.FULL:
|
||||
logger.info(
|
||||
"FULL_DECODE_ONLY compilation enabled on NPU. use_inductor not supported - "
|
||||
"using only ACL Graph mode")
|
||||
compilation_config.use_inductor = False
|
||||
warning_message = """\033[91m
|
||||
**********************************************************************************
|
||||
* WARNING: You have enabled the *full graph* feature.
|
||||
* This is an early experimental stage and may involve various unknown issues.
|
||||
* A known problem is that capturing too many batch sizes can lead to OOM
|
||||
* (Out of Memory) errors or inference hangs. If you encounter such issues,
|
||||
* consider reducing `gpu_memory_utilization` or manually specifying a smaller
|
||||
* batch size for graph capture.
|
||||
* For more details, please refer to:
|
||||
* https://docs.vllm.ai/en/stable/configuration/conserving_memory.html#reduce-cuda-graphs
|
||||
**********************************************************************************\033[0m
|
||||
"""
|
||||
logger.warning(warning_message)
|
||||
else:
|
||||
logger.info(
|
||||
"%s cudagraph_mode is not support on NPU. falling back to NONE",
|
||||
compilation_config.cudagraph_mode)
|
||||
compilation_config.cudagraph_mode = CUDAGraphMode.NONE
|
||||
compilation_config.mode = CompilationMode.NONE
|
||||
logger.info(
|
||||
"%s cudagraph_mode is not support on NPU. falling back to NONE",
|
||||
compilation_config.cudagraph_mode)
|
||||
compilation_config.cudagraph_mode = CUDAGraphMode.NONE
|
||||
compilation_config.mode = CompilationMode.NONE
|
||||
|
||||
# TODO: Remove this check when ACL Graph supports ASCEND_LAUNCH_BLOCKING=1
|
||||
# Then, we will have to discuss the error handling strategy and user experience
|
||||
@@ -315,10 +253,7 @@ class NPUPlatform(Platform):
|
||||
|
||||
if parallel_config and parallel_config.worker_cls == "auto":
|
||||
# TODO: this is a tricky way to disable `use_sequence_parallel_moe` in vllm.
|
||||
if vllm_version_is("0.11.0"):
|
||||
os.environ["VLLM_ALL2ALL_BACKEND"] = "flashinfer_all2allv"
|
||||
else:
|
||||
parallel_config.all2all_backend = "flashinfer_all2allv"
|
||||
parallel_config.all2all_backend = "flashinfer_all2allv"
|
||||
if ascend_config.torchair_graph_config.enabled or ascend_config.enable_shared_expert_dp:
|
||||
parallel_config.worker_cls = "vllm_ascend.torchair.torchair_worker.NPUTorchairWorker"
|
||||
else:
|
||||
@@ -443,10 +378,7 @@ class NPUPlatform(Platform):
|
||||
|
||||
@classmethod
|
||||
def get_punica_wrapper(cls) -> str:
|
||||
if vllm_version_is("0.11.0"):
|
||||
return "vllm_ascend.lora.punica_npu.PunicaWrapperNPU0110"
|
||||
else:
|
||||
return "vllm_ascend.lora.punica_npu.PunicaWrapperNPU"
|
||||
return "vllm_ascend.lora.punica_npu.PunicaWrapperNPU"
|
||||
|
||||
@classmethod
|
||||
def get_current_memory_usage(cls,
|
||||
|
||||
@@ -19,20 +19,14 @@ from typing import Any, Callable, Dict, Optional, Tuple, Union
|
||||
|
||||
import torch
|
||||
import torch_npu
|
||||
from vllm.config import get_current_vllm_config
|
||||
from vllm.config import CompilationMode, get_current_vllm_config
|
||||
from vllm.distributed import get_ep_group
|
||||
from vllm.forward_context import get_forward_context
|
||||
|
||||
from vllm_ascend.ascend_config import get_ascend_config
|
||||
from vllm_ascend.distributed.parallel_state import get_mc2_group
|
||||
from vllm_ascend.ops.fused_moe.experts_selector import select_experts
|
||||
from vllm_ascend.utils import (ACL_FORMAT_FRACTAL_NZ, is_enable_nz,
|
||||
vllm_version_is)
|
||||
|
||||
if vllm_version_is("0.11.0"):
|
||||
from vllm.config import CompilationLevel
|
||||
else:
|
||||
from vllm.config import CompilationMode
|
||||
from vllm_ascend.utils import ACL_FORMAT_FRACTAL_NZ, is_enable_nz
|
||||
|
||||
|
||||
class AscendW8A8DynamicLinearMethod:
|
||||
@@ -129,18 +123,10 @@ class AscendW8A8DynamicFusedMoEMethod:
|
||||
|
||||
vllm_config = get_current_vllm_config()
|
||||
ascend_config = get_ascend_config()
|
||||
if vllm_version_is("0.11.0"):
|
||||
self.use_aclgraph = (
|
||||
vllm_config.compilation_config.level
|
||||
== CompilationLevel.PIECEWISE
|
||||
and not vllm_config.model_config.enforce_eager
|
||||
and not ascend_config.torchair_graph_config.enabled)
|
||||
else:
|
||||
self.use_aclgraph = (
|
||||
vllm_config.compilation_config.mode
|
||||
== CompilationMode.VLLM_COMPILE
|
||||
and not vllm_config.model_config.enforce_eager
|
||||
and not ascend_config.torchair_graph_config.enabled)
|
||||
self.use_aclgraph = (
|
||||
vllm_config.compilation_config.mode == CompilationMode.VLLM_COMPILE
|
||||
and not vllm_config.model_config.enforce_eager
|
||||
and not ascend_config.torchair_graph_config.enabled)
|
||||
|
||||
self.dynamic_eplb = ascend_config.dynamic_eplb or ascend_config.expert_map_record_path
|
||||
self.in_dtype = vllm_config.model_config.dtype
|
||||
|
||||
@@ -6,16 +6,10 @@ import torch.nn as nn
|
||||
import vllm.v1.sample.rejection_sampler as rs
|
||||
from vllm.v1.sample.metadata import SamplingMetadata
|
||||
from vllm.v1.sample.rejection_sampler import (RejectionSampler,
|
||||
apply_sampling_constraints,
|
||||
generate_uniform_probs)
|
||||
from vllm.v1.spec_decode.metadata import SpecDecodeMetadata
|
||||
|
||||
from vllm_ascend.utils import vllm_version_is
|
||||
|
||||
if vllm_version_is("0.11.0"):
|
||||
from vllm.v1.sample.rejection_sampler import compute_probs
|
||||
else:
|
||||
from vllm.v1.sample.rejection_sampler import apply_sampling_constraints
|
||||
|
||||
PLACEHOLDER_TOKEN_ID = -1
|
||||
GREEDY_TEMPERATURE = -1
|
||||
# Maximum number of speculative draft tokens allowed per request in a single
|
||||
@@ -89,19 +83,12 @@ class AscendRejectionSampler(RejectionSampler, nn.Module):
|
||||
# [num_tokens, vocab_size]
|
||||
# NOTE(woosuk): `target_logits` can be updated in place inside the
|
||||
# `compute_probs` function.
|
||||
if vllm_version_is("0.11.0"):
|
||||
target_probs = compute_probs(
|
||||
target_logits,
|
||||
metadata.cu_num_draft_tokens,
|
||||
sampling_metadata,
|
||||
)
|
||||
else:
|
||||
target_logits = apply_sampling_constraints(
|
||||
target_logits,
|
||||
metadata.cu_num_draft_tokens,
|
||||
sampling_metadata,
|
||||
)
|
||||
target_probs = target_logits.softmax(dim=-1, dtype=torch.float32)
|
||||
target_logits = apply_sampling_constraints(
|
||||
target_logits,
|
||||
metadata.cu_num_draft_tokens,
|
||||
sampling_metadata,
|
||||
)
|
||||
target_probs = target_logits.softmax(dim=-1, dtype=torch.float32)
|
||||
|
||||
output_token_ids = rejection_sample(
|
||||
metadata.draft_token_ids,
|
||||
|
||||
@@ -5,13 +5,15 @@ import numpy as np
|
||||
import torch
|
||||
import torch.nn as nn
|
||||
from vllm.attention.layer import Attention
|
||||
from vllm.config import CUDAGraphMode, VllmConfig, get_layers_from_vllm_config
|
||||
from vllm.config import (CompilationMode, CUDAGraphMode, VllmConfig,
|
||||
get_layers_from_vllm_config)
|
||||
from vllm.distributed.parallel_state import get_pp_group
|
||||
from vllm.logger import logger
|
||||
from vllm.model_executor.layers.attention_layer_base import AttentionLayerBase
|
||||
from vllm.model_executor.model_loader import get_model
|
||||
from vllm.model_executor.models import supports_multimodal
|
||||
from vllm.model_executor.models.llama_eagle3 import Eagle3LlamaForCausalLM
|
||||
from vllm.utils.platform_utils import is_pin_memory_available
|
||||
from vllm.v1.core.sched.output import SchedulerOutput
|
||||
from vllm.v1.sample.metadata import SamplingMetadata
|
||||
from vllm.v1.spec_decode.metadata import SpecDecodeMetadata
|
||||
@@ -22,14 +24,6 @@ from vllm_ascend.attention.attention_v1 import (AscendAttentionState,
|
||||
AscendMetadata)
|
||||
from vllm_ascend.attention.utils import AscendCommonAttentionMetadata
|
||||
from vllm_ascend.spec_decode.interface import Proposer, SpecDcodeType
|
||||
from vllm_ascend.utils import vllm_version_is
|
||||
|
||||
if vllm_version_is("0.11.0"):
|
||||
from vllm.config import CompilationLevel
|
||||
from vllm.utils import is_pin_memory_available
|
||||
else:
|
||||
from vllm.config import CompilationMode
|
||||
from vllm.utils.platform_utils import is_pin_memory_available
|
||||
|
||||
PADDING_SLOT_ID = -1
|
||||
|
||||
@@ -52,16 +46,9 @@ class EagleProposer(Proposer):
|
||||
self.hidden_size = vllm_config.speculative_config.draft_model_config.get_hidden_size(
|
||||
)
|
||||
|
||||
if vllm_version_is("0.11.0"):
|
||||
self.use_cuda_graph = (
|
||||
self.vllm_config.compilation_config.level
|
||||
== CompilationLevel.PIECEWISE
|
||||
and not self.vllm_config.model_config.enforce_eager)
|
||||
else:
|
||||
self.use_cuda_graph = (
|
||||
self.vllm_config.compilation_config.mode
|
||||
== CompilationMode.VLLM_COMPILE
|
||||
and not self.vllm_config.model_config.enforce_eager)
|
||||
self.use_cuda_graph = (self.vllm_config.compilation_config.mode
|
||||
== CompilationMode.VLLM_COMPILE and
|
||||
not self.vllm_config.model_config.enforce_eager)
|
||||
|
||||
self.cudagraph_batch_sizes = list(
|
||||
reversed(
|
||||
|
||||
@@ -15,14 +15,7 @@ from vllm.model_executor.model_loader.utils import \
|
||||
process_weights_after_loading
|
||||
from vllm.model_executor.models.deepseek_v2 import DeepseekV32IndexerCache
|
||||
from vllm.model_executor.models.llama_eagle3 import Eagle3LlamaForCausalLM
|
||||
|
||||
from vllm_ascend.utils import vllm_version_is
|
||||
|
||||
if vllm_version_is("0.11.0"):
|
||||
from vllm.utils import cdiv
|
||||
else:
|
||||
from vllm.utils.math_utils import cdiv
|
||||
|
||||
from vllm.utils.math_utils import cdiv
|
||||
from vllm.v1.attention.backends.utils import (AttentionMetadataBuilder,
|
||||
CommonAttentionMetadata)
|
||||
from vllm.v1.core.sched.output import SchedulerOutput
|
||||
@@ -39,31 +32,21 @@ from vllm_ascend.compilation.acl_graph import (ACLGraphWrapper,
|
||||
update_mla_attn_params)
|
||||
from vllm_ascend.spec_decode.interface import Proposer, SpecDcodeType
|
||||
from vllm_ascend.utils import (ProfileExecuteDuration, lmhead_tp_enable,
|
||||
prefill_context_parallel_enable,
|
||||
vllm_version_is)
|
||||
prefill_context_parallel_enable)
|
||||
|
||||
if prefill_context_parallel_enable():
|
||||
from vllm.distributed import get_pcp_group
|
||||
|
||||
if vllm_version_is("0.11.0"):
|
||||
from vllm.model_executor.model_loader.utils import set_default_torch_dtype
|
||||
from vllm.utils import is_pin_memory_available
|
||||
else:
|
||||
from vllm.utils.platform_utils import is_pin_memory_available
|
||||
from vllm.utils.torch_utils import set_default_torch_dtype
|
||||
from vllm.utils.platform_utils import is_pin_memory_available
|
||||
from vllm.utils.torch_utils import set_default_torch_dtype
|
||||
|
||||
logger = init_logger(__name__)
|
||||
|
||||
PADDING_SLOT_ID = -1
|
||||
|
||||
_deepseek_mtp_path = "vllm.model_executor.models.deepseek_mtp"
|
||||
_deepseek_mtp_model = "DeepSeekMTP"
|
||||
if vllm_version_is("0.11.0"):
|
||||
_deepseek_mtp_path = "vllm_ascend.patch.worker.patch_deepseek_mtp"
|
||||
_deepseek_mtp_model = "AscendDeepSeekMTP"
|
||||
|
||||
_MTP_MODELS = {
|
||||
"DeepseekV3ForCausalLM": (_deepseek_mtp_path, _deepseek_mtp_model),
|
||||
"DeepseekV3ForCausalLM":
|
||||
("vllm.model_executor.models.deepseek_mtp", "DeepSeekMTP"),
|
||||
"Qwen3NextForCausalLM":
|
||||
("vllm_ascend.models.qwen3_next_mtp", "CustomQwen3NextMTP")
|
||||
}
|
||||
|
||||
@@ -23,7 +23,7 @@ from torch import nn
|
||||
from transformers import PretrainedConfig
|
||||
from vllm.attention import Attention, AttentionMetadata
|
||||
from vllm.compilation.decorators import support_torch_compile
|
||||
from vllm.config import CacheConfig, VllmConfig
|
||||
from vllm.config import CacheConfig, CompilationMode, VllmConfig
|
||||
from vllm.distributed import get_pp_group, get_tensor_model_parallel_world_size
|
||||
from vllm.distributed.parallel_state import (get_dp_group, get_ep_group,
|
||||
get_tp_group)
|
||||
@@ -55,12 +55,6 @@ from vllm_ascend.attention.attention_v1 import AscendAttentionState
|
||||
from vllm_ascend.torchair.ops.sequence_parallel import (MetadataForPadding,
|
||||
init_metadata_for_sp)
|
||||
from vllm_ascend.torchair.ops.torchair_fused_moe import TorchairAscendFusedMoE
|
||||
from vllm_ascend.utils import vllm_version_is
|
||||
|
||||
if vllm_version_is("0.11.0"):
|
||||
from vllm.config import CompilationLevel
|
||||
else:
|
||||
from vllm.config import CompilationMode
|
||||
|
||||
|
||||
class CustomSparseMoeBlock(Qwen3MoeSparseMoeBlock):
|
||||
@@ -299,16 +293,10 @@ class CustomQwen3MoeDecoderLayer(Qwen3MoeDecoderLayer):
|
||||
layer_idx = extract_layer_index(prefix)
|
||||
mlp_only_layers = ([] if not hasattr(config, "mlp_only_layers") else
|
||||
config.mlp_only_layers)
|
||||
if vllm_version_is("0.11.0"):
|
||||
self.use_aclgraph = (vllm_config is not None
|
||||
and vllm_config.compilation_config.level
|
||||
== CompilationLevel.PIECEWISE and
|
||||
not vllm_config.model_config.enforce_eager)
|
||||
else:
|
||||
self.use_aclgraph = (vllm_config is not None
|
||||
and vllm_config.compilation_config.mode
|
||||
== CompilationMode.VLLM_COMPILE and
|
||||
not vllm_config.model_config.enforce_eager)
|
||||
self.use_aclgraph = (vllm_config is not None
|
||||
and vllm_config.compilation_config.mode
|
||||
== CompilationMode.VLLM_COMPILE
|
||||
and not vllm_config.model_config.enforce_eager)
|
||||
if (layer_idx not in mlp_only_layers) and (
|
||||
config.num_experts > 0 and
|
||||
(layer_idx + 1) % config.decoder_sparse_step == 0):
|
||||
|
||||
@@ -32,6 +32,7 @@ import torch_npu
|
||||
from torch import nn
|
||||
from transformers import PretrainedConfig
|
||||
from vllm.attention import AttentionMetadata
|
||||
from vllm.attention.layer import MLAAttention
|
||||
from vllm.config import CacheConfig, ModelConfig, VllmConfig
|
||||
from vllm.distributed import (get_pp_group, get_tensor_model_parallel_rank,
|
||||
get_tensor_model_parallel_world_size,
|
||||
@@ -74,12 +75,7 @@ from vllm_ascend.quantization.quant_config import AscendLinearMethod
|
||||
from vllm_ascend.torchair.ops.torchair_fused_moe import TorchairAscendFusedMoE
|
||||
from vllm_ascend.torchair.quantization.torchair_w8a8_dynamic import \
|
||||
TorchairAscendW8A8DynamicLinearMethod
|
||||
from vllm_ascend.utils import dispose_tensor, oproj_tp_enable, vllm_version_is
|
||||
|
||||
if vllm_version_is("0.11.0"):
|
||||
from vllm.attention import Attention
|
||||
else:
|
||||
from vllm.attention.layer import MLAAttention
|
||||
from vllm_ascend.utils import dispose_tensor, oproj_tp_enable
|
||||
|
||||
|
||||
class Indexer(nn.Module):
|
||||
@@ -616,67 +612,31 @@ class TorchairDeepseekV2MLAAttention(DeepseekV2MLAAttention):
|
||||
# k_c.size(1) + k_pe.size(1) == kv_cache.size(2)
|
||||
# i.e.
|
||||
# kv_lora_rank + qk_rope_head_dim == head_size
|
||||
if vllm_version_is("0.11.0"):
|
||||
self.mla_attn = Attention(
|
||||
num_heads=self.num_local_heads,
|
||||
head_size=self.kv_lora_rank + self.qk_rope_head_dim,
|
||||
scale=self.scaling,
|
||||
num_kv_heads=1,
|
||||
cache_config=cache_config,
|
||||
quant_config=quant_config,
|
||||
prefix=f"{prefix}.attn",
|
||||
use_mla=True,
|
||||
use_sparse=False,
|
||||
indexer=None,
|
||||
# SFA Args
|
||||
q_lora_rank=self.q_lora_rank,
|
||||
kv_lora_rank=self.kv_lora_rank,
|
||||
qk_nope_head_dim=self.qk_nope_head_dim,
|
||||
qk_rope_head_dim=self.qk_rope_head_dim,
|
||||
qk_head_dim=self.qk_head_dim,
|
||||
v_head_dim=self.v_head_dim,
|
||||
rotary_emb=self.rotary_emb,
|
||||
q_a_proj=self.q_a_proj
|
||||
if self.q_lora_rank is not None else None,
|
||||
q_a_layernorm=self.q_a_layernorm
|
||||
if self.q_lora_rank is not None else None,
|
||||
q_proj=self.q_proj
|
||||
if self.q_lora_rank is None else self.q_b_proj,
|
||||
kv_a_proj_with_mqa=self.kv_a_proj_with_mqa,
|
||||
kv_a_layernorm=self.kv_a_layernorm,
|
||||
kv_b_proj=self.kv_b_proj,
|
||||
o_proj=self.o_proj,
|
||||
decoder_layer=decoder_layer,
|
||||
)
|
||||
else:
|
||||
self.mla_attn = MLAAttention(
|
||||
num_heads=self.num_local_heads,
|
||||
scale=self.scaling,
|
||||
qk_nope_head_dim=self.qk_nope_head_dim,
|
||||
qk_rope_head_dim=self.qk_rope_head_dim,
|
||||
v_head_dim=self.v_head_dim,
|
||||
q_lora_rank=self.q_lora_rank,
|
||||
kv_lora_rank=self.kv_lora_rank,
|
||||
cache_config=cache_config,
|
||||
quant_config=quant_config,
|
||||
prefix=f"{prefix}.attn",
|
||||
use_sparse=False,
|
||||
indexer=None,
|
||||
# MLA Args
|
||||
rotary_emb=self.rotary_emb,
|
||||
q_a_proj=self.q_a_proj
|
||||
if self.q_lora_rank is not None else None,
|
||||
q_a_layernorm=self.q_a_layernorm
|
||||
if self.q_lora_rank is not None else None,
|
||||
q_proj=self.q_proj
|
||||
if self.q_lora_rank is None else self.q_b_proj,
|
||||
q_b_proj=self.q_b_proj
|
||||
if self.q_lora_rank is not None else None,
|
||||
kv_a_proj_with_mqa=self.kv_a_proj_with_mqa,
|
||||
kv_a_layernorm=self.kv_a_layernorm,
|
||||
kv_b_proj=self.kv_b_proj,
|
||||
o_proj=self.o_proj,
|
||||
)
|
||||
self.mla_attn = MLAAttention(
|
||||
num_heads=self.num_local_heads,
|
||||
scale=self.scaling,
|
||||
qk_nope_head_dim=self.qk_nope_head_dim,
|
||||
qk_rope_head_dim=self.qk_rope_head_dim,
|
||||
v_head_dim=self.v_head_dim,
|
||||
q_lora_rank=self.q_lora_rank,
|
||||
kv_lora_rank=self.kv_lora_rank,
|
||||
cache_config=cache_config,
|
||||
quant_config=quant_config,
|
||||
prefix=f"{prefix}.attn",
|
||||
use_sparse=False,
|
||||
indexer=None,
|
||||
# MLA Args
|
||||
rotary_emb=self.rotary_emb,
|
||||
q_a_proj=self.q_a_proj if self.q_lora_rank is not None else None,
|
||||
q_a_layernorm=self.q_a_layernorm
|
||||
if self.q_lora_rank is not None else None,
|
||||
q_proj=self.q_proj if self.q_lora_rank is None else self.q_b_proj,
|
||||
q_b_proj=self.q_b_proj if self.q_lora_rank is not None else None,
|
||||
kv_a_proj_with_mqa=self.kv_a_proj_with_mqa,
|
||||
kv_a_layernorm=self.kv_a_layernorm,
|
||||
kv_b_proj=self.kv_b_proj,
|
||||
o_proj=self.o_proj,
|
||||
)
|
||||
|
||||
def forward(
|
||||
self,
|
||||
@@ -882,66 +842,30 @@ class TorchairDeepseekV2SFAAttention(DeepseekV2MLAAttention):
|
||||
index_topk=self.index_topk,
|
||||
prefix=f"{prefix}.indexer",
|
||||
)
|
||||
|
||||
if vllm_version_is("0.11.0"):
|
||||
self.sfa_attn = Attention(
|
||||
num_heads=self.num_local_heads,
|
||||
head_size=self.kv_lora_rank + self.qk_rope_head_dim,
|
||||
scale=self.scaling,
|
||||
num_kv_heads=1,
|
||||
cache_config=cache_config,
|
||||
quant_config=quant_config,
|
||||
prefix=f"{prefix}.attn",
|
||||
use_mla=True,
|
||||
use_sparse=True,
|
||||
indexer=self.indexer,
|
||||
# SFA Args
|
||||
q_lora_rank=self.q_lora_rank,
|
||||
kv_lora_rank=self.kv_lora_rank,
|
||||
qk_nope_head_dim=self.qk_nope_head_dim,
|
||||
qk_rope_head_dim=self.qk_rope_head_dim,
|
||||
qk_head_dim=self.qk_head_dim,
|
||||
v_head_dim=self.v_head_dim,
|
||||
rotary_emb=self.rotary_emb,
|
||||
q_a_proj=self.q_a_proj
|
||||
if self.q_lora_rank is not None else None,
|
||||
q_a_layernorm=self.q_a_layernorm
|
||||
if self.q_lora_rank is not None else None,
|
||||
q_proj=self.q_proj
|
||||
if self.q_lora_rank is None else self.q_b_proj,
|
||||
kv_a_proj_with_mqa=self.kv_a_proj_with_mqa,
|
||||
kv_a_layernorm=self.kv_a_layernorm,
|
||||
kv_b_proj=self.kv_b_proj,
|
||||
o_proj=self.o_proj,
|
||||
decoder_layer=decoder_layer,
|
||||
)
|
||||
else:
|
||||
self.sfa_attn = MLAAttention(
|
||||
num_heads=self.num_local_heads,
|
||||
scale=self.scaling,
|
||||
qk_nope_head_dim=self.qk_nope_head_dim,
|
||||
qk_rope_head_dim=self.qk_rope_head_dim,
|
||||
v_head_dim=self.v_head_dim,
|
||||
q_lora_rank=self.q_lora_rank,
|
||||
kv_lora_rank=self.kv_lora_rank,
|
||||
cache_config=cache_config,
|
||||
quant_config=quant_config,
|
||||
prefix=f"{prefix}.attn",
|
||||
use_sparse=True,
|
||||
indexer=self.indexer,
|
||||
# MLA Args
|
||||
rotary_emb=self.rotary_emb,
|
||||
q_a_proj=self.q_a_proj
|
||||
if self.q_lora_rank is not None else None,
|
||||
q_a_layernorm=self.q_a_layernorm
|
||||
if self.q_lora_rank is not None else None,
|
||||
q_proj=self.q_proj
|
||||
if self.q_lora_rank is None else self.q_b_proj,
|
||||
kv_a_proj_with_mqa=self.kv_a_proj_with_mqa,
|
||||
kv_a_layernorm=self.kv_a_layernorm,
|
||||
kv_b_proj=self.kv_b_proj,
|
||||
o_proj=self.o_proj,
|
||||
)
|
||||
self.sfa_attn = MLAAttention(
|
||||
num_heads=self.num_local_heads,
|
||||
scale=self.scaling,
|
||||
qk_nope_head_dim=self.qk_nope_head_dim,
|
||||
qk_rope_head_dim=self.qk_rope_head_dim,
|
||||
v_head_dim=self.v_head_dim,
|
||||
q_lora_rank=self.q_lora_rank,
|
||||
kv_lora_rank=self.kv_lora_rank,
|
||||
cache_config=cache_config,
|
||||
quant_config=quant_config,
|
||||
prefix=f"{prefix}.attn",
|
||||
use_sparse=True,
|
||||
indexer=self.indexer,
|
||||
# MLA Args
|
||||
rotary_emb=self.rotary_emb,
|
||||
q_a_proj=self.q_a_proj if self.q_lora_rank is not None else None,
|
||||
q_a_layernorm=self.q_a_layernorm
|
||||
if self.q_lora_rank is not None else None,
|
||||
q_proj=self.q_proj if self.q_lora_rank is None else self.q_b_proj,
|
||||
kv_a_proj_with_mqa=self.kv_a_proj_with_mqa,
|
||||
kv_a_layernorm=self.kv_a_layernorm,
|
||||
kv_b_proj=self.kv_b_proj,
|
||||
o_proj=self.o_proj,
|
||||
)
|
||||
|
||||
def forward(
|
||||
self,
|
||||
|
||||
@@ -53,8 +53,7 @@ from vllm_ascend.torchair.utils import (get_all_reduce_merge_state,
|
||||
super_kernel)
|
||||
from vllm_ascend.utils import (AscendSocVersion, dispose_tensor,
|
||||
get_ascend_soc_version, is_310p,
|
||||
is_hierarchical_communication_enabled,
|
||||
vllm_version_is)
|
||||
is_hierarchical_communication_enabled)
|
||||
|
||||
|
||||
def torchair_fused_experts_with_mc2(
|
||||
@@ -1069,12 +1068,8 @@ class TorchairAscendFusedMoE(FusedMoE):
|
||||
get_compressed_expert_map(self.expert_map))
|
||||
else:
|
||||
# init moe.
|
||||
if vllm_version_is("0.11.0"):
|
||||
self.local_num_experts, self.expert_map = determine_expert_map(
|
||||
self.ep_size, self.ep_rank, self.global_num_experts)
|
||||
else:
|
||||
self.local_num_experts, self.expert_map, _ = determine_expert_map(
|
||||
self.ep_size, self.ep_rank, self.global_num_experts)
|
||||
self.local_num_experts, self.expert_map, _ = determine_expert_map(
|
||||
self.ep_size, self.ep_rank, self.global_num_experts)
|
||||
# dynamic eplb initializing with not expert_map_path
|
||||
if self.dynamic_eplb:
|
||||
self.log2phy = determine_default_log2phy_map(
|
||||
|
||||
@@ -26,13 +26,7 @@ from vllm.attention.backends.abstract import (AttentionImpl, AttentionLayer,
|
||||
AttentionType)
|
||||
from vllm.attention.backends.utils import PAD_SLOT_ID
|
||||
from vllm.config import VllmConfig
|
||||
|
||||
from vllm_ascend.utils import vllm_version_is
|
||||
|
||||
if vllm_version_is("0.11.0"):
|
||||
from vllm.utils import cdiv
|
||||
else:
|
||||
from vllm.utils.math_utils import cdiv
|
||||
from vllm.utils.math_utils import cdiv
|
||||
|
||||
from vllm_ascend.attention.attention_v1 import (AscendAttentionBackend,
|
||||
AscendAttentionMetadataBuilder,
|
||||
|
||||
@@ -12,13 +12,7 @@ from vllm.config import VllmConfig, get_current_vllm_config
|
||||
from vllm.distributed import get_tensor_model_parallel_world_size
|
||||
from vllm.model_executor.layers.linear import (LinearBase,
|
||||
UnquantizedLinearMethod)
|
||||
|
||||
from vllm_ascend.utils import vllm_version_is
|
||||
|
||||
if vllm_version_is("0.11.0"):
|
||||
from vllm.utils import cdiv, round_down
|
||||
else:
|
||||
from vllm.utils.math_utils import cdiv, round_down
|
||||
from vllm.utils.math_utils import cdiv, round_down
|
||||
|
||||
import vllm_ascend.envs as envs_ascend
|
||||
from vllm_ascend.ascend_config import get_ascend_config
|
||||
|
||||
@@ -11,6 +11,7 @@ from vllm.model_executor.layers.attention_layer_base import AttentionLayerBase
|
||||
from vllm.model_executor.model_loader import get_model_loader
|
||||
from vllm.model_executor.model_loader.utils import \
|
||||
process_weights_after_loading
|
||||
from vllm.utils.torch_utils import set_default_torch_dtype
|
||||
from vllm.v1.core.sched.output import SchedulerOutput
|
||||
from vllm.v1.sample.metadata import SamplingMetadata
|
||||
from vllm.v1.spec_decode.metadata import SpecDecodeMetadata
|
||||
@@ -23,13 +24,7 @@ from vllm_ascend.torchair.models.torchair_deepseek_mtp import \
|
||||
TorchairDeepSeekMTP
|
||||
from vllm_ascend.torchair.utils import (TORCHAIR_CACHE_DIR,
|
||||
TorchairCommonAttentionMetadata)
|
||||
from vllm_ascend.utils import (ProfileExecuteDuration, lmhead_tp_enable,
|
||||
vllm_version_is)
|
||||
|
||||
if vllm_version_is("0.11.0"):
|
||||
from vllm.model_executor.model_loader.utils import set_default_torch_dtype
|
||||
else:
|
||||
from vllm.utils.torch_utils import set_default_torch_dtype
|
||||
from vllm_ascend.utils import ProfileExecuteDuration, lmhead_tp_enable
|
||||
|
||||
PADDING_SLOT_ID = -1
|
||||
|
||||
|
||||
@@ -12,13 +12,7 @@ from vllm.config import VllmConfig, get_current_vllm_config
|
||||
from vllm.distributed import get_tensor_model_parallel_world_size, get_tp_group
|
||||
from vllm.model_executor.layers.linear import (LinearBase,
|
||||
UnquantizedLinearMethod)
|
||||
|
||||
from vllm_ascend.utils import vllm_version_is
|
||||
|
||||
if vllm_version_is("0.11.0"):
|
||||
from vllm.utils import cdiv, round_down
|
||||
else:
|
||||
from vllm.utils.math_utils import cdiv, round_down
|
||||
from vllm.utils.math_utils import cdiv, round_down
|
||||
|
||||
import vllm_ascend.envs as envs_ascend
|
||||
from vllm_ascend.ascend_config import get_ascend_config
|
||||
|
||||
@@ -412,33 +412,21 @@ def _is_default_capture_sizes(vllm_config: VllmConfig) -> bool:
|
||||
Check whether it is vLLM default capture sizes.
|
||||
"""
|
||||
|
||||
if vllm_version_is("0.11.0"):
|
||||
cuda_graph_sizes = vllm_config.scheduler_config.cuda_graph_sizes
|
||||
if len(cuda_graph_sizes) == 1:
|
||||
cudagraph_capture_sizes = [1, 2, 4] + [
|
||||
i for i in range(8, cuda_graph_sizes[0] + 1, 8)
|
||||
]
|
||||
else:
|
||||
max_cudagraph_capture_size = \
|
||||
vllm_config.compilation_config.max_cudagraph_capture_size
|
||||
cudagraph_capture_sizes = [
|
||||
i for i in [1, 2, 4] if i <= max_cudagraph_capture_size
|
||||
]
|
||||
if max_cudagraph_capture_size >= 8:
|
||||
# Step size 8 for small batch sizes, up to 256(not included)
|
||||
cudagraph_capture_sizes += list(
|
||||
range(8, min(max_cudagraph_capture_size + 1, 256), 8))
|
||||
if max_cudagraph_capture_size >= 256:
|
||||
# Step size 16 for larger batch sizes
|
||||
cudagraph_capture_sizes += list(
|
||||
range(256, max_cudagraph_capture_size + 1, 16))
|
||||
|
||||
if vllm_version_is("0.11.0"):
|
||||
target_cudagraph_capture_sizes = sorted(cudagraph_capture_sizes,
|
||||
reverse=True)
|
||||
else:
|
||||
# in newer version, vVLLM use ascending order of cudagraph_capture_sizes.
|
||||
target_cudagraph_capture_sizes = sorted(cudagraph_capture_sizes)
|
||||
max_cudagraph_capture_size = \
|
||||
vllm_config.compilation_config.max_cudagraph_capture_size
|
||||
cudagraph_capture_sizes = [
|
||||
i for i in [1, 2, 4] if i <= max_cudagraph_capture_size
|
||||
]
|
||||
if max_cudagraph_capture_size >= 8:
|
||||
# Step size 8 for small batch sizes, up to 256(not included)
|
||||
cudagraph_capture_sizes += list(
|
||||
range(8, min(max_cudagraph_capture_size + 1, 256), 8))
|
||||
if max_cudagraph_capture_size >= 256:
|
||||
# Step size 16 for larger batch sizes
|
||||
cudagraph_capture_sizes += list(
|
||||
range(256, max_cudagraph_capture_size + 1, 16))
|
||||
# in newer version, vLLM use ascending order of cudagraph_capture_sizes.
|
||||
target_cudagraph_capture_sizes = sorted(cudagraph_capture_sizes)
|
||||
if target_cudagraph_capture_sizes == \
|
||||
vllm_config.compilation_config.cudagraph_capture_sizes:
|
||||
return True
|
||||
@@ -465,21 +453,13 @@ def update_default_aclgraph_sizes(vllm_config: VllmConfig) -> None:
|
||||
if vllm_config.model_config and vllm_config.model_config.hf_config.model_type == "qwen3_moe" \
|
||||
and vllm_config.parallel_config.tensor_parallel_size == 1 \
|
||||
and vllm_config.parallel_config.data_parallel_size > 1 :
|
||||
if vllm_version_is("0.11.0"):
|
||||
max_capture_size = vllm_config.scheduler_config.cuda_graph_sizes[0]
|
||||
else:
|
||||
max_capture_size = vllm_config.compilation_config.max_cudagraph_capture_size
|
||||
|
||||
max_capture_size = vllm_config.compilation_config.max_cudagraph_capture_size
|
||||
new_cudagraph_capture_sizes = [1, 2, 5, 10, 15, 20] + [
|
||||
i for i in range(24, max_capture_size + 1, 8)
|
||||
]
|
||||
|
||||
if vllm_version_is("0.11.0"):
|
||||
vllm_config.compilation_config.cudagraph_capture_sizes = new_cudagraph_capture_sizes
|
||||
vllm_config.compilation_config.init_with_cudagraph_sizes(
|
||||
new_cudagraph_capture_sizes)
|
||||
else:
|
||||
update_cudagraph_capture_sizes(vllm_config,
|
||||
new_cudagraph_capture_sizes)
|
||||
update_cudagraph_capture_sizes(vllm_config,
|
||||
new_cudagraph_capture_sizes)
|
||||
|
||||
|
||||
def update_aclgraph_sizes(vllm_config: VllmConfig) -> None:
|
||||
@@ -573,10 +553,7 @@ def update_aclgraph_sizes(vllm_config: VllmConfig) -> None:
|
||||
indices[0], indices[-1] = 0, len(original_sizes) - 1
|
||||
|
||||
sampled_sizes = [original_sizes[i] for i in indices]
|
||||
if vllm_version_is("0.11.0"):
|
||||
compilation_config.init_with_cudagraph_sizes(sampled_sizes)
|
||||
else:
|
||||
update_cudagraph_capture_sizes(vllm_config, sampled_sizes)
|
||||
update_cudagraph_capture_sizes(vllm_config, sampled_sizes)
|
||||
|
||||
logger.info(
|
||||
"Adjusted ACL graph batch sizes for %s model (layers: %d): %d → %d sizes",
|
||||
@@ -607,10 +584,7 @@ def update_aclgraph_sizes(vllm_config: VllmConfig) -> None:
|
||||
if original_sizes[0] < (num_speculative_tokens + 1) * max_num_seqs:
|
||||
enlarged_sizes = [(num_speculative_tokens + 1) * size
|
||||
for size in original_sizes]
|
||||
if vllm_version_is("0.11.0"):
|
||||
compilation_config.init_with_cudagraph_sizes(enlarged_sizes)
|
||||
else:
|
||||
update_cudagraph_capture_sizes(vllm_config, enlarged_sizes)
|
||||
update_cudagraph_capture_sizes(vllm_config, enlarged_sizes)
|
||||
logger.info(
|
||||
"Adjusted ACL graphs: %s → %s for speculative decoding",
|
||||
original_sizes, enlarged_sizes)
|
||||
@@ -719,11 +693,8 @@ def register_ascend_customop(vllm_config: Optional[VllmConfig] = None):
|
||||
"GemmaRMSNorm": AscendGemmaRMSNorm,
|
||||
"FusedMoE": AscendFusedMoE,
|
||||
"SharedFusedMoE": AscendSharedFusedMoE,
|
||||
"MultiHeadLatentAttentionWrapper": AscendMultiHeadLatentAttention,
|
||||
}
|
||||
mla_to_register = "MultiHeadLatentAttention" if vllm_version_is(
|
||||
"0.11.0") else "MultiHeadLatentAttentionWrapper"
|
||||
if vllm_config and vllm_config.model_config and vllm_config.model_config.use_mla:
|
||||
REGISTERED_ASCEND_OPS[mla_to_register] = AscendMultiHeadLatentAttention
|
||||
|
||||
for name, op_cls in REGISTERED_ASCEND_OPS.items():
|
||||
CustomOp.register_oot(_decorated_op_cls=op_cls, name=name)
|
||||
|
||||
@@ -3,13 +3,7 @@ from typing import Optional, Union
|
||||
import numpy as np
|
||||
import torch
|
||||
from vllm.distributed import get_dcp_group
|
||||
|
||||
from vllm_ascend.utils import vllm_version_is
|
||||
|
||||
if vllm_version_is("0.11.0"):
|
||||
from vllm.utils import cdiv
|
||||
else:
|
||||
from vllm.utils.math_utils import cdiv
|
||||
from vllm.utils.math_utils import cdiv
|
||||
|
||||
from vllm_ascend.utils import prefill_context_parallel_enable
|
||||
|
||||
|
||||
@@ -41,10 +41,11 @@ import torch.nn as nn
|
||||
from tqdm import tqdm # type: ignore
|
||||
from vllm.attention import AttentionType, get_attn_backend
|
||||
from vllm.attention.backends.abstract import AttentionBackend
|
||||
from vllm.attention.layer import Attention
|
||||
from vllm.attention.layer import Attention, MLAAttention
|
||||
from vllm.compilation.counter import compilation_counter
|
||||
from vllm.compilation.monitor import set_cudagraph_capturing_enabled
|
||||
from vllm.config import CUDAGraphMode, VllmConfig, get_layers_from_vllm_config
|
||||
from vllm.config import (CompilationMode, CUDAGraphMode, VllmConfig,
|
||||
get_layers_from_vllm_config)
|
||||
from vllm.distributed import tensor_model_parallel_all_gather
|
||||
from vllm.distributed.kv_transfer import (get_kv_transfer_group,
|
||||
has_kv_transfer_group)
|
||||
@@ -58,8 +59,6 @@ from vllm.model_executor.layers.attention_layer_base import AttentionLayerBase
|
||||
from vllm.model_executor.layers.mamba.abstract import MambaBase
|
||||
from vllm.model_executor.layers.rotary_embedding import MRotaryEmbedding
|
||||
from vllm.model_executor.model_loader import get_model
|
||||
# yapf conflicts with isort for this block
|
||||
# yapf: disable
|
||||
from vllm.model_executor.models.interfaces import (SupportsMultiModal,
|
||||
supports_mrope,
|
||||
supports_transcription)
|
||||
@@ -73,29 +72,23 @@ from vllm.sampling_params import SamplingType
|
||||
from vllm.sequence import IntermediateTensors
|
||||
from vllm.tasks import GenerationTask, PoolingTask, SupportedTask
|
||||
from vllm.utils import length_from_prompt_token_ids_or_embeds
|
||||
|
||||
from vllm_ascend.utils import vllm_version_is
|
||||
|
||||
if vllm_version_is("0.11.0"):
|
||||
from vllm.utils import cdiv
|
||||
else:
|
||||
from vllm.utils.math_utils import cdiv
|
||||
|
||||
from vllm.utils.import_utils import LazyLoader
|
||||
from vllm.utils.jsontree import json_map_leaves
|
||||
from vllm.utils.math_utils import cdiv
|
||||
from vllm.utils.mem_utils import DeviceMemoryProfiler
|
||||
from vllm.utils.platform_utils import is_pin_memory_available
|
||||
from vllm.utils.torch_utils import STR_DTYPE_TO_TORCH_DTYPE, get_dtype_size
|
||||
from vllm.v1.attention.backends.gdn_attn import GDNAttentionMetadataBuilder
|
||||
from vllm.v1.attention.backends.utils import (
|
||||
AttentionCGSupport, CommonAttentionMetadata,
|
||||
reorder_batch_to_split_decodes_and_prefills)
|
||||
from vllm.v1.cudagraph_dispatcher import CudagraphDispatcher
|
||||
# yapf conflicts with isort for this block
|
||||
# yapf: disable
|
||||
from vllm.v1.kv_cache_interface import (AttentionSpec,
|
||||
EncoderOnlyAttentionSpec,
|
||||
FullAttentionSpec, KVCacheConfig,
|
||||
KVCacheGroupSpec, KVCacheSpec,
|
||||
MambaSpec, MLAAttentionSpec,
|
||||
UniformTypeKVCacheSpecs)
|
||||
# yapf: enable
|
||||
from vllm.v1.outputs import (EMPTY_MODEL_RUNNER_OUTPUT, AsyncModelRunnerOutput,
|
||||
DraftTokenIds, LogprobsTensors, ModelRunnerOutput,
|
||||
PoolerOutput)
|
||||
@@ -119,6 +112,7 @@ from vllm_ascend.attention.attention_mask import AttentionMaskBuilder
|
||||
from vllm_ascend.attention.attention_v1 import AscendAttentionState
|
||||
from vllm_ascend.attention.utils import (AscendCommonAttentionMetadata,
|
||||
AscendPrefillContextParallelMetadata)
|
||||
# yapf conflicts with isort for this block
|
||||
# yapf: disable
|
||||
from vllm_ascend.compilation.acl_graph import (ACLGraphWrapper,
|
||||
set_graph_params,
|
||||
@@ -147,8 +141,7 @@ from vllm_ascend.utils import (ACL_FORMAT_FRACTAL_ND, ACL_FORMAT_FRACTAL_NZ,
|
||||
AscendSocVersion, ProfileExecuteDuration,
|
||||
enable_sp, get_ascend_soc_version, is_310p,
|
||||
is_enable_nz, is_moe_model, lmhead_tp_enable,
|
||||
prefill_context_parallel_enable,
|
||||
vllm_version_is)
|
||||
prefill_context_parallel_enable)
|
||||
from vllm_ascend.worker.npu_input_batch import CachedRequestState, InputBatch
|
||||
|
||||
if prefill_context_parallel_enable():
|
||||
@@ -157,27 +150,6 @@ if prefill_context_parallel_enable():
|
||||
get_prefill_context_model_parallel_rank,
|
||||
get_prefill_context_model_parallel_world_size)
|
||||
|
||||
if vllm_version_is("0.11.0"):
|
||||
from vllm.utils import (STR_DTYPE_TO_TORCH_DTYPE, DeviceMemoryProfiler,
|
||||
get_dtype_size)
|
||||
else:
|
||||
from vllm.utils.mem_utils import DeviceMemoryProfiler
|
||||
from vllm.utils.torch_utils import STR_DTYPE_TO_TORCH_DTYPE, get_dtype_size
|
||||
|
||||
# yapf: enable
|
||||
|
||||
if vllm_version_is("0.11.0"):
|
||||
from vllm.attention.layer import Attention
|
||||
from vllm.config import CompilationLevel
|
||||
from vllm.utils import LazyLoader, is_pin_memory_available
|
||||
|
||||
from vllm_ascend.models.layers.mla import AscendMultiHeadLatentAttention
|
||||
else:
|
||||
from vllm.attention.layer import MLAAttention
|
||||
from vllm.config import CompilationMode
|
||||
from vllm.utils.import_utils import LazyLoader
|
||||
from vllm.utils.platform_utils import is_pin_memory_available
|
||||
|
||||
if TYPE_CHECKING:
|
||||
import xgrammar as xgr # type: ignore[import-untyped]
|
||||
from vllm.v1.core.sched.output import SchedulerOutput
|
||||
@@ -637,11 +609,7 @@ class NPUModelRunner(LoRAModelRunnerMixin):
|
||||
diagonal=1).to(self.device)
|
||||
if get_pp_group().is_last_rank:
|
||||
self.drafter = self._get_drafter()
|
||||
if vllm_version_is("0.11.0"):
|
||||
self.rejection_sampler = AscendRejectionSampler()
|
||||
else:
|
||||
self.rejection_sampler = AscendRejectionSampler(
|
||||
self.sampler)
|
||||
self.rejection_sampler = AscendRejectionSampler(self.sampler)
|
||||
self.actual_seq_lengths_q = list(
|
||||
range(self.decode_token_per_req, self.max_num_tokens + 1,
|
||||
self.decode_token_per_req))
|
||||
@@ -664,11 +632,7 @@ class NPUModelRunner(LoRAModelRunnerMixin):
|
||||
# tokens is less than or equal to mc2_tokens_capacity. According to _set_cudagraph_sizes,
|
||||
# the max number of tokens in graph is min(max_num_seqs * uniform_decode_query_len, 512).
|
||||
if self.compilation_config.cudagraph_capture_sizes:
|
||||
if vllm_version_is("0.11.0"):
|
||||
max_num_tokens = self.compilation_config.cudagraph_capture_sizes[
|
||||
0]
|
||||
else:
|
||||
max_num_tokens = self.compilation_config.max_cudagraph_capture_size
|
||||
max_num_tokens = self.compilation_config.max_cudagraph_capture_size
|
||||
else:
|
||||
# NOTE: To save memory, we cap the max number of tokens to 512.
|
||||
max_num_tokens = min(
|
||||
@@ -717,10 +681,7 @@ class NPUModelRunner(LoRAModelRunnerMixin):
|
||||
self.input_batch.num_accepted_tokens_cpu[i] = num_tokens
|
||||
|
||||
def _use_aclgraph(self) -> bool:
|
||||
if vllm_version_is("0.11.0"):
|
||||
return self.compilation_config.cudagraph_mode != CUDAGraphMode.NONE and self.compilation_config.level == CompilationLevel.PIECEWISE and not self.model_config.enforce_eager
|
||||
else:
|
||||
return self.compilation_config.cudagraph_mode != CUDAGraphMode.NONE and self.compilation_config.mode == CompilationMode.VLLM_COMPILE and not self.model_config.enforce_eager
|
||||
return self.compilation_config.cudagraph_mode != CUDAGraphMode.NONE and self.compilation_config.mode == CompilationMode.VLLM_COMPILE and not self.model_config.enforce_eager
|
||||
|
||||
def _update_states(self, scheduler_output: "SchedulerOutput") -> None:
|
||||
# Remove finished requests from the cached states.
|
||||
@@ -914,9 +875,9 @@ class NPUModelRunner(LoRAModelRunnerMixin):
|
||||
if mm_input.get("use_audio_in_video") is True:
|
||||
use_audio_in_video = True
|
||||
|
||||
if vllm_version_is("0.11.0"):
|
||||
if supports_mrope(self.model):
|
||||
req_state.mrope_positions, req_state.mrope_position_delta = \
|
||||
MRotaryEmbedding.get_input_positions_tensor(
|
||||
self.model.get_mrope_input_positions(
|
||||
req_state.prompt_token_ids,
|
||||
hf_config=self.model_config.hf_config,
|
||||
image_grid_thw=image_grid_thw,
|
||||
@@ -925,18 +886,6 @@ class NPUModelRunner(LoRAModelRunnerMixin):
|
||||
audio_feature_lengths=audio_feature_lengths,
|
||||
use_audio_in_video=use_audio_in_video,
|
||||
)
|
||||
else:
|
||||
if supports_mrope(self.model):
|
||||
req_state.mrope_positions, req_state.mrope_position_delta = \
|
||||
self.model.get_mrope_input_positions(
|
||||
req_state.prompt_token_ids,
|
||||
hf_config=self.model_config.hf_config,
|
||||
image_grid_thw=image_grid_thw,
|
||||
video_grid_thw=video_grid_thw,
|
||||
second_per_grid_ts=second_per_grid_ts,
|
||||
audio_feature_lengths=audio_feature_lengths,
|
||||
use_audio_in_video=use_audio_in_video,
|
||||
)
|
||||
|
||||
def _sync_metadata_across_dp(
|
||||
self, num_tokens: int,
|
||||
@@ -1108,21 +1057,13 @@ class NPUModelRunner(LoRAModelRunnerMixin):
|
||||
mm_kwargs, mm_hashes_pos = self._batch_mm_kwargs_from_scheduler(
|
||||
scheduler_output)
|
||||
encoder_outputs = []
|
||||
|
||||
if vllm_version_is("0.11.0"):
|
||||
mm_inputs = group_mm_kwargs_by_modality(
|
||||
mm_kwargs,
|
||||
device=self.device,
|
||||
pin_memory=self.pin_memory,
|
||||
)
|
||||
else:
|
||||
model = cast(SupportsMultiModal, self.model)
|
||||
mm_inputs = group_mm_kwargs_by_modality(
|
||||
mm_kwargs,
|
||||
device=self.device,
|
||||
pin_memory=self.pin_memory,
|
||||
merge_by_field_config=model.merge_by_field_config,
|
||||
)
|
||||
model = cast(SupportsMultiModal, self.model)
|
||||
mm_inputs = group_mm_kwargs_by_modality(
|
||||
mm_kwargs,
|
||||
device=self.device,
|
||||
pin_memory=self.pin_memory,
|
||||
merge_by_field_config=model.merge_by_field_config,
|
||||
)
|
||||
for modality, num_items, mm_kwargs_group in mm_inputs:
|
||||
# Run the encoder.
|
||||
# `curr_group_outputs` is either of the following:
|
||||
@@ -1181,56 +1122,6 @@ class NPUModelRunner(LoRAModelRunnerMixin):
|
||||
|
||||
return mm_kwargs, mm_hashes_pos
|
||||
|
||||
def _gather_mm_embeddings_0110(
|
||||
self,
|
||||
scheduler_output: "SchedulerOutput",
|
||||
) -> list[torch.Tensor]:
|
||||
|
||||
def _iter_mm_features(req_state: CachedRequestState):
|
||||
assert req_state.mm_features is not None
|
||||
for mm_feature in req_state.mm_features:
|
||||
pos_info = mm_feature.mm_position
|
||||
yield mm_feature.identifier, pos_info, getattr(
|
||||
pos_info, "is_embed", None)
|
||||
|
||||
mm_embeds: list[torch.Tensor] = []
|
||||
|
||||
for req_id in self.input_batch.req_ids:
|
||||
num_scheduled_tokens = scheduler_output.num_scheduled_tokens[
|
||||
req_id]
|
||||
req_state = self.requests[req_id]
|
||||
num_computed_tokens = req_state.num_computed_tokens
|
||||
|
||||
for mm_hash, pos_info, is_embed in _iter_mm_features(req_state):
|
||||
start_pos = pos_info.offset
|
||||
num_encoder_tokens = pos_info.length
|
||||
|
||||
if start_pos >= num_computed_tokens + num_scheduled_tokens:
|
||||
break
|
||||
if start_pos + num_encoder_tokens <= num_computed_tokens:
|
||||
continue
|
||||
|
||||
start_idx = max(num_computed_tokens - start_pos, 0)
|
||||
end_idx = min(
|
||||
num_computed_tokens - start_pos + num_scheduled_tokens,
|
||||
num_encoder_tokens,
|
||||
)
|
||||
assert start_idx < end_idx
|
||||
|
||||
encoder_output = self.encoder_cache.get(mm_hash, None)
|
||||
assert encoder_output is not None, \
|
||||
f"Encoder cache miss for {mm_hash}."
|
||||
|
||||
if is_embed is not None:
|
||||
is_embed = is_embed[start_idx:end_idx]
|
||||
|
||||
mm_embeds_item = gather_mm_placeholders(
|
||||
encoder_output[start_idx:end_idx],
|
||||
is_embed=is_embed,
|
||||
)
|
||||
mm_embeds.append(mm_embeds_item)
|
||||
return mm_embeds
|
||||
|
||||
def _gather_mm_embeddings(
|
||||
self,
|
||||
scheduler_output: "SchedulerOutput",
|
||||
@@ -1730,22 +1621,14 @@ class NPUModelRunner(LoRAModelRunnerMixin):
|
||||
# embeddings), we always use embeddings (rather than token ids)
|
||||
# as input to the multimodal model, even when the input is text.
|
||||
input_ids = self.input_ids[:total_num_scheduled_tokens]
|
||||
if vllm_version_is("0.11.0"):
|
||||
mm_embeds = self._gather_mm_embeddings_0110(scheduler_output)
|
||||
if mm_embeds:
|
||||
inputs_embeds = self.model.get_input_embeddings(
|
||||
input_ids, mm_embeds)
|
||||
else:
|
||||
inputs_embeds = self.model.get_input_embeddings(input_ids)
|
||||
else:
|
||||
mm_embeds, is_mm_embed = self._gather_mm_embeddings(
|
||||
scheduler_output)
|
||||
mm_embeds, is_mm_embed = self._gather_mm_embeddings(
|
||||
scheduler_output)
|
||||
|
||||
inputs_embeds = self.model.get_input_embeddings(
|
||||
input_ids,
|
||||
multimodal_embeddings=mm_embeds,
|
||||
is_multimodal=is_mm_embed,
|
||||
)
|
||||
inputs_embeds = self.model.get_input_embeddings(
|
||||
input_ids,
|
||||
multimodal_embeddings=mm_embeds,
|
||||
is_multimodal=is_mm_embed,
|
||||
)
|
||||
|
||||
# TODO(woosuk): Avoid the copy. Optimize.
|
||||
self.inputs_embeds.gpu[:total_num_scheduled_tokens].copy_(
|
||||
@@ -2151,9 +2034,8 @@ class NPUModelRunner(LoRAModelRunnerMixin):
|
||||
# TODO: Optimize the CPU -> NPU copy.
|
||||
cu_num_draft_tokens = torch.from_numpy(cu_num_draft_tokens).to(
|
||||
self.device, non_blocking=True)
|
||||
if not vllm_version_is("0.11.0"):
|
||||
cu_num_sampled_tokens = torch.from_numpy(cu_num_sampled_tokens).to(
|
||||
self.device, non_blocking=True)
|
||||
cu_num_sampled_tokens = torch.from_numpy(cu_num_sampled_tokens).to(
|
||||
self.device, non_blocking=True)
|
||||
logits_indices = torch.from_numpy(logits_indices).to(self.device,
|
||||
non_blocking=True)
|
||||
target_logits_indices = torch.from_numpy(target_logits_indices).to(
|
||||
@@ -2167,25 +2049,15 @@ class NPUModelRunner(LoRAModelRunnerMixin):
|
||||
draft_token_ids = draft_token_ids[target_logits_indices + 1]
|
||||
if self.pcp_size > 1:
|
||||
logits_indices = logits_indices_pcp
|
||||
if vllm_version_is("0.11.0"):
|
||||
metadata = SpecDecodeMetadata(
|
||||
draft_token_ids=draft_token_ids,
|
||||
num_draft_tokens=num_draft_tokens.tolist(),
|
||||
cu_num_draft_tokens=cu_num_draft_tokens,
|
||||
target_logits_indices=target_logits_indices,
|
||||
bonus_logits_indices=bonus_logits_indices,
|
||||
logits_indices=logits_indices,
|
||||
)
|
||||
else:
|
||||
metadata = SpecDecodeMetadata(
|
||||
draft_token_ids=draft_token_ids,
|
||||
num_draft_tokens=num_draft_tokens.tolist(),
|
||||
cu_num_draft_tokens=cu_num_draft_tokens,
|
||||
cu_num_sampled_tokens=cu_num_sampled_tokens,
|
||||
target_logits_indices=target_logits_indices,
|
||||
bonus_logits_indices=bonus_logits_indices,
|
||||
logits_indices=logits_indices,
|
||||
)
|
||||
metadata = SpecDecodeMetadata(
|
||||
draft_token_ids=draft_token_ids,
|
||||
num_draft_tokens=num_draft_tokens.tolist(),
|
||||
cu_num_draft_tokens=cu_num_draft_tokens,
|
||||
cu_num_sampled_tokens=cu_num_sampled_tokens,
|
||||
target_logits_indices=target_logits_indices,
|
||||
bonus_logits_indices=bonus_logits_indices,
|
||||
logits_indices=logits_indices,
|
||||
)
|
||||
return metadata
|
||||
|
||||
def apply_grammar_bitmask(
|
||||
@@ -2222,33 +2094,16 @@ class NPUModelRunner(LoRAModelRunnerMixin):
|
||||
shape=(logits.shape[0],
|
||||
grammar_bitmask.shape[1]))
|
||||
cumulative_index = 0
|
||||
if vllm_version_is("0.11.0"):
|
||||
seq = sorted(
|
||||
scheduler_output.structured_output_request_ids.items(),
|
||||
key=lambda x: x[1])
|
||||
for req_id, _ in seq:
|
||||
for req_id in scheduler_output.structured_output_request_ids:
|
||||
num_spec_tokens = len(
|
||||
scheduler_output.scheduled_spec_decode_tokens.get(req_id, []))
|
||||
if req_id in struct_out_req_batch_indices:
|
||||
logit_index = struct_out_req_batch_indices[req_id]
|
||||
num_spec_tokens = len(
|
||||
scheduler_output.scheduled_spec_decode_tokens.get(
|
||||
req_id, []))
|
||||
for i in range(1 + num_spec_tokens):
|
||||
sorted_bitmask[logit_index + i] = \
|
||||
grammar_bitmask[cumulative_index + i]
|
||||
sorted_bitmask[logit_index +
|
||||
i] = grammar_bitmask[cumulative_index + i]
|
||||
out_indices.append(logit_index + i)
|
||||
cumulative_index += 1 + num_spec_tokens
|
||||
else:
|
||||
for req_id in scheduler_output.structured_output_request_ids:
|
||||
num_spec_tokens = len(
|
||||
scheduler_output.scheduled_spec_decode_tokens.get(
|
||||
req_id, []))
|
||||
if req_id in struct_out_req_batch_indices:
|
||||
logit_index = struct_out_req_batch_indices[req_id]
|
||||
for i in range(1 + num_spec_tokens):
|
||||
sorted_bitmask[logit_index +
|
||||
i] = grammar_bitmask[cumulative_index +
|
||||
i]
|
||||
out_indices.append(logit_index + i)
|
||||
cumulative_index += 1 + num_spec_tokens
|
||||
cumulative_index += 1 + num_spec_tokens
|
||||
grammar_bitmask = sorted_bitmask
|
||||
|
||||
# Serialization of np.ndarray is much more efficient than a tensor,
|
||||
@@ -2518,14 +2373,8 @@ class NPUModelRunner(LoRAModelRunnerMixin):
|
||||
logits = model_output_broadcast_data["logits"]
|
||||
|
||||
# Apply structured output bitmasks if present
|
||||
if vllm_version_is("0.11.0"):
|
||||
if scheduler_output.grammar_bitmask is not None:
|
||||
logits = self.apply_grammar_bitmask(
|
||||
scheduler_output, logits)
|
||||
else:
|
||||
if scheduler_output.structured_output_request_ids:
|
||||
logits = self.apply_grammar_bitmask(
|
||||
scheduler_output, logits)
|
||||
if scheduler_output.structured_output_request_ids:
|
||||
logits = self.apply_grammar_bitmask(scheduler_output, logits)
|
||||
|
||||
with ProfileExecuteDuration().capture_async("Sample"):
|
||||
# Sample the next token and get logprobs if needed.
|
||||
@@ -3837,95 +3686,6 @@ class NPUModelRunner(LoRAModelRunnerMixin):
|
||||
else:
|
||||
self.reorder_batch_threshold = reorder_batch_threshold_i
|
||||
|
||||
def get_kv_cache_spec_v0110(self) -> dict[str, KVCacheSpec]:
|
||||
"""
|
||||
Generates the KVCacheSpec by parsing the kv cache format from each
|
||||
Attention module in the static forward context.
|
||||
Returns:
|
||||
KVCacheSpec: A dictionary mapping layer names to their KV cache
|
||||
format. Layers that do not need KV cache are not included.
|
||||
"""
|
||||
|
||||
block_size = self.vllm_config.cache_config.block_size
|
||||
use_mla = self.vllm_config.model_config.use_mla
|
||||
use_sparse = self.use_sparse
|
||||
kv_cache_spec: dict[str, KVCacheSpec] = {}
|
||||
attn_layers = get_layers_from_vllm_config(self.vllm_config, Attention)
|
||||
for layer_name, attn_module in attn_layers.items():
|
||||
if (kv_tgt_layer :=
|
||||
attn_module.kv_sharing_target_layer_name) is not None:
|
||||
# The layer doesn't need its own KV cache and will use that of
|
||||
# the target layer. We skip creating a KVCacheSpec for it, so
|
||||
# that KV cache management logic will act as this layer does
|
||||
# not exist, and doesn't allocate KV cache for the layer. This
|
||||
# enables the memory saving of cross-layer kv sharing, allowing
|
||||
# a given amount of memory to accommodate longer context lengths
|
||||
# or enable more requests to be processed simultaneously.
|
||||
self.shared_kv_cache_layers[layer_name] = kv_tgt_layer
|
||||
continue
|
||||
if isinstance(attn_module, AscendMultiHeadLatentAttention):
|
||||
continue
|
||||
|
||||
# TODO: Support other attention modules, e.g., cross-attention
|
||||
# TODO(lucas): move the attention specs into the model layers like
|
||||
# the attention backends
|
||||
if attn_module.attn_type == AttentionType.DECODER:
|
||||
if use_mla and not use_sparse:
|
||||
kv_cache_spec[layer_name] = MLAAttentionSpec(
|
||||
block_size=block_size,
|
||||
num_kv_heads=attn_module.num_kv_heads,
|
||||
head_size=attn_module.head_size,
|
||||
dtype=self.kv_cache_dtype,
|
||||
cache_dtype_str=self.cache_config.cache_dtype)
|
||||
else:
|
||||
# TODO(cmq): This is a hack way to fix deepseek kvcache when
|
||||
# using DSA. Fix the spec in vLLM is a finnal way.
|
||||
kv_cache_spec[layer_name] = FullAttentionSpec(
|
||||
block_size=block_size,
|
||||
num_kv_heads=attn_module.num_kv_heads,
|
||||
head_size=attn_module.head_size,
|
||||
dtype=self.kv_cache_dtype)
|
||||
elif attn_module.attn_type in (AttentionType.ENCODER,
|
||||
AttentionType.ENCODER_ONLY):
|
||||
# encoder-only attention does not need KV cache.
|
||||
continue
|
||||
elif attn_module.attn_type == AttentionType.ENCODER_DECODER:
|
||||
raise NotImplementedError
|
||||
else:
|
||||
raise ValueError(
|
||||
f"Unknown attention type: {attn_module.attn_type}")
|
||||
|
||||
mamba_layers = get_layers_from_vllm_config(self.vllm_config, MambaBase)
|
||||
if len(mamba_layers) > 0:
|
||||
if (self.vllm_config.speculative_config is not None
|
||||
and self.vllm_config.model_config.hf_config.model_type
|
||||
not in ["qwen3_next"]):
|
||||
raise NotImplementedError(
|
||||
"Mamba with speculative decoding is not supported yet.")
|
||||
if self.vllm_config.cache_config.enable_prefix_caching:
|
||||
raise NotImplementedError(
|
||||
"Prefix caching is not supported for Mamba yet.")
|
||||
max_model_len = self.vllm_config.model_config.max_model_len
|
||||
|
||||
page_size_padded = (
|
||||
self.vllm_config.cache_config.mamba_page_size_padded)
|
||||
|
||||
# Set block_size to max_model_len, so that mamba model will always
|
||||
# have only one block in the KV cache.
|
||||
for layer_name, mamba_module in mamba_layers.items():
|
||||
kv_cache_spec[layer_name] = MambaSpec(
|
||||
shapes=mamba_module.get_state_shape(),
|
||||
dtypes=mamba_module.get_state_dtype(),
|
||||
block_size=max_model_len,
|
||||
page_size_padded=page_size_padded,
|
||||
mamba_type=mamba_module.mamba_type,
|
||||
num_speculative_blocks=(
|
||||
self.speculative_config.num_speculative_tokens
|
||||
if self.speculative_config else 0),
|
||||
)
|
||||
|
||||
return kv_cache_spec
|
||||
|
||||
def get_kv_cache_spec(self) -> dict[str, KVCacheSpec]:
|
||||
"""
|
||||
Generates the KVCacheSpec by parsing the kv cache format from each
|
||||
@@ -3934,9 +3694,6 @@ class NPUModelRunner(LoRAModelRunnerMixin):
|
||||
KVCacheSpec: A dictionary mapping layer names to their KV cache
|
||||
format. Layers that do not need KV cache are not included.
|
||||
"""
|
||||
if vllm_version_is("0.11.0"):
|
||||
return self.get_kv_cache_spec_v0110()
|
||||
|
||||
block_size = self.vllm_config.cache_config.block_size
|
||||
use_mla = self.vllm_config.model_config.use_mla
|
||||
kv_cache_spec: dict[str, KVCacheSpec] = {}
|
||||
|
||||
@@ -30,6 +30,7 @@ from vllm.multimodal.inputs import (MultiModalFeatureSpec,
|
||||
from vllm.pooling_params import PoolingParams
|
||||
from vllm.sampling_params import SamplingParams, SamplingType
|
||||
from vllm.utils import length_from_prompt_token_ids_or_embeds
|
||||
from vllm.utils.collection_utils import swap_dict_values
|
||||
from vllm.v1.outputs import LogprobsTensors
|
||||
from vllm.v1.pool.metadata import PoolingMetadata
|
||||
from vllm.v1.sample.logits_processor import (BatchUpdateBuilder,
|
||||
@@ -39,14 +40,8 @@ from vllm.v1.sample.metadata import SamplingMetadata
|
||||
from vllm.v1.spec_decode.utils import is_spec_decode_unsupported
|
||||
from vllm.v1.utils import copy_slice
|
||||
|
||||
from vllm_ascend.utils import vllm_version_is
|
||||
from vllm_ascend.worker.block_table import MultiGroupBlockTable
|
||||
|
||||
if vllm_version_is("0.11.0"):
|
||||
from vllm.utils import swap_dict_values
|
||||
else:
|
||||
from vllm.utils.collection_utils import swap_dict_values
|
||||
|
||||
|
||||
@dataclass
|
||||
class CachedRequestState:
|
||||
|
||||
@@ -35,6 +35,8 @@ from vllm.logger import logger
|
||||
from vllm.lora.request import LoRARequest
|
||||
from vllm.sequence import IntermediateTensors
|
||||
from vllm.tasks import SupportedTask
|
||||
from vllm.utils.mem_constants import GiB_bytes
|
||||
from vllm.utils.torch_utils import STR_DTYPE_TO_TORCH_DTYPE
|
||||
from vllm.v1.core.sched.output import SchedulerOutput
|
||||
from vllm.v1.kv_cache_interface import KVCacheConfig, KVCacheSpec
|
||||
from vllm.v1.outputs import (EMPTY_MODEL_RUNNER_OUTPUT, AsyncModelRunnerOutput,
|
||||
@@ -50,7 +52,7 @@ from vllm_ascend.platform import NPUPlatform
|
||||
from vllm_ascend.utils import (init_ascend_soc_version, is_enable_nz,
|
||||
prefill_context_parallel_enable,
|
||||
register_ascend_customop, sleep_mode_enabled,
|
||||
try_register_lib, vllm_version_is)
|
||||
try_register_lib)
|
||||
from vllm_ascend.worker.model_runner_v1 import NPUModelRunner
|
||||
|
||||
torch._dynamo.trace_rules.clear_lru_cache() # noqa: E402
|
||||
@@ -65,12 +67,6 @@ torch_non_c_binding_in_graph_functions_npu[
|
||||
torch._dynamo.trace_rules.torch_name_rule_map.append(
|
||||
torch_non_c_binding_in_graph_functions_npu) # noqa: E402
|
||||
|
||||
if vllm_version_is("0.11.0"):
|
||||
from vllm.utils import STR_DTYPE_TO_TORCH_DTYPE, GiB_bytes
|
||||
else:
|
||||
from vllm.utils.mem_constants import GiB_bytes
|
||||
from vllm.utils.torch_utils import STR_DTYPE_TO_TORCH_DTYPE
|
||||
|
||||
|
||||
class NPUWorker(WorkerBase):
|
||||
|
||||
@@ -141,10 +137,7 @@ class NPUWorker(WorkerBase):
|
||||
|
||||
if self.model_config.trust_remote_code:
|
||||
# note: lazy import to avoid importing torch before initializing
|
||||
if vllm_version_is("0.11.0"):
|
||||
from vllm.utils import init_cached_hf_modules
|
||||
else:
|
||||
from vllm.utils.import_utils import init_cached_hf_modules
|
||||
from vllm.utils.import_utils import init_cached_hf_modules
|
||||
|
||||
init_cached_hf_modules()
|
||||
|
||||
|
||||
Reference in New Issue
Block a user