Drop 0.11.0 support (#4377)

There is a lot of hack code for v0.11.0, which makes the code hard to
upgrade to a newer vLLM version. Since v0.11.0 will be released soon, let's
drop v0.11.0 support first. Then we'll upgrade to v0.11.2 soon.


- vLLM version: v0.11.0
- vLLM main:
2918c1b49c

Signed-off-by: wangxiyuan <wangxiyuan1007@gmail.com>
This commit is contained in:
wangxiyuan
2025-11-24 17:08:20 +08:00
committed by GitHub
parent 41ddb06554
commit a1f142b7ad
80 changed files with 467 additions and 1755 deletions

View File

@@ -32,7 +32,7 @@ on:
description: how many pods will be pulled up via lws.yaml, indicates number of nodes we need description: how many pods will be pulled up via lws.yaml, indicates number of nodes we need
vllm_version: vllm_version:
required: false required: false
default: "v0.11.0" default: "2918c1b49c88c29783c86f78d2c4221cb9622379"
type: string type: string
description: vllm version to use description: vllm version to use
vllm_ascend_remote_url: vllm_ascend_remote_url:

View File

@@ -51,7 +51,7 @@ jobs:
strategy: strategy:
matrix: matrix:
include: include:
- vllm_branch: v0.11.0 - vllm_branch: 2918c1b49c88c29783c86f78d2c4221cb9622379
vllm_ascend_branch: main vllm_ascend_branch: main
max-parallel: 1 max-parallel: 1
container: container:

View File

@@ -83,7 +83,7 @@ jobs:
VLLM_USE_MODELSCOPE: True VLLM_USE_MODELSCOPE: True
strategy: strategy:
matrix: matrix:
vllm_version: [2918c1b49c88c29783c86f78d2c4221cb9622379, v0.11.0] vllm_version: [2918c1b49c88c29783c86f78d2c4221cb9622379]
steps: steps:
- name: Install packages - name: Install packages
run: | run: |
@@ -138,7 +138,7 @@ jobs:
name: e2e-light name: e2e-light
strategy: strategy:
matrix: matrix:
vllm_version: [2918c1b49c88c29783c86f78d2c4221cb9622379, v0.11.0] vllm_version: [2918c1b49c88c29783c86f78d2c4221cb9622379]
# Note (yikun): If CI resource are limited we can split job into two chain jobs # Note (yikun): If CI resource are limited we can split job into two chain jobs
needs: [lint, changes] needs: [lint, changes]
# only trigger e2e test after lint passed and the change is e2e related with pull request. # only trigger e2e test after lint passed and the change is e2e related with pull request.

View File

@@ -69,7 +69,7 @@ jobs:
name: e2e-full name: e2e-full
strategy: strategy:
matrix: matrix:
vllm_version: [2918c1b49c88c29783c86f78d2c4221cb9622379, v0.11.0] vllm_version: [2918c1b49c88c29783c86f78d2c4221cb9622379]
needs: [changes] needs: [changes]
if: ${{ needs.changes.outputs.e2e_tracker == 'true' }} if: ${{ needs.changes.outputs.e2e_tracker == 'true' }}
uses: ./.github/workflows/_e2e_test.yaml uses: ./.github/workflows/_e2e_test.yaml

View File

@@ -86,7 +86,7 @@ jobs:
tests: tests/e2e/nightly/ops tests: tests/e2e/nightly/ops
uses: ./.github/workflows/_e2e_nightly_single_node.yaml uses: ./.github/workflows/_e2e_nightly_single_node.yaml
with: with:
vllm: v0.11.0 vllm: 2918c1b49c88c29783c86f78d2c4221cb9622379
runner: ${{ matrix.test_config.os }} runner: ${{ matrix.test_config.os }}
tests: ${{ matrix.test_config.tests }} tests: ${{ matrix.test_config.tests }}
image: 'swr.cn-southwest-2.myhuaweicloud.com/base_image/ascend-ci/vllm-ascend:nightly-a2' image: 'swr.cn-southwest-2.myhuaweicloud.com/base_image/ascend-ci/vllm-ascend:nightly-a2'
@@ -125,7 +125,7 @@ jobs:
- Qwen3-Next-80B-A3B-Instruct - Qwen3-Next-80B-A3B-Instruct
uses: ./.github/workflows/_e2e_nightly_single_node_models.yaml uses: ./.github/workflows/_e2e_nightly_single_node_models.yaml
with: with:
vllm: v0.11.0 vllm: 2918c1b49c88c29783c86f78d2c4221cb9622379
runner: ${{ matrix.test_config.os }} runner: ${{ matrix.test_config.os }}
model_list: ${{ toJson(matrix.test_config.model_list) }} model_list: ${{ toJson(matrix.test_config.model_list) }}
image: swr.cn-southwest-2.myhuaweicloud.com/base_image/ascend-ci/cann:8.2.rc1-910b-ubuntu22.04-py3.11 image: swr.cn-southwest-2.myhuaweicloud.com/base_image/ascend-ci/cann:8.2.rc1-910b-ubuntu22.04-py3.11

View File

@@ -136,7 +136,7 @@ jobs:
tests: tests/e2e/nightly/models/test_deepseek_v3_2_exp_w8a8.py tests: tests/e2e/nightly/models/test_deepseek_v3_2_exp_w8a8.py
uses: ./.github/workflows/_e2e_nightly_single_node.yaml uses: ./.github/workflows/_e2e_nightly_single_node.yaml
with: with:
vllm: v0.11.0 vllm: 2918c1b49c88c29783c86f78d2c4221cb9622379
runner: ${{ matrix.test_config.os }} runner: ${{ matrix.test_config.os }}
image: 'swr.cn-southwest-2.myhuaweicloud.com/base_image/ascend-ci/vllm-ascend:nightly-a3' image: 'swr.cn-southwest-2.myhuaweicloud.com/base_image/ascend-ci/vllm-ascend:nightly-a3'
tests: ${{ matrix.test_config.tests }} tests: ${{ matrix.test_config.tests }}

View File

@@ -72,7 +72,7 @@ jobs:
- DeepSeek-V2-Lite - DeepSeek-V2-Lite
uses: ./.github/workflows/_e2e_nightly_single_node_models.yaml uses: ./.github/workflows/_e2e_nightly_single_node_models.yaml
with: with:
vllm: v0.11.0 vllm: 2918c1b49c88c29783c86f78d2c4221cb9622379
runner: ${{ matrix.runner }} runner: ${{ matrix.runner }}
image: swr.cn-southwest-2.myhuaweicloud.com/base_image/ascend-ci/cann:8.3.rc1-910b-ubuntu22.04-py3.11 image: swr.cn-southwest-2.myhuaweicloud.com/base_image/ascend-ci/cann:8.3.rc1-910b-ubuntu22.04-py3.11
model_list: ${{ toJson(matrix.model_list) }} model_list: ${{ toJson(matrix.model_list) }}

View File

@@ -46,8 +46,10 @@ RUN pip config set global.index-url ${PIP_INDEX_URL}
# Install vLLM # Install vLLM
ARG VLLM_REPO=https://github.com/vllm-project/vllm.git ARG VLLM_REPO=https://github.com/vllm-project/vllm.git
ARG VLLM_TAG=v0.11.0 ARG VLLM_TAG=2918c1b49c88c29783c86f78d2c4221cb9622379
RUN git clone --depth 1 $VLLM_REPO --branch $VLLM_TAG /vllm-workspace/vllm # Revert this change once VLLM_TAG is specified to branch or tag
# RUN git clone --depth 1 $VLLM_REPO --branch $VLLM_TAG /vllm-workspace/vllm
RUN git clone $VLLM_REPO /vllm-workspace/vllm && (cd /vllm-workspace/vllm && git checkout $VLLM_TAG)
# In x86, triton will be installed by vllm. But in Ascend, triton doesn't work correctly. we need to uninstall it. # In x86, triton will be installed by vllm. But in Ascend, triton doesn't work correctly. we need to uninstall it.
RUN VLLM_TARGET_DEVICE="empty" python3 -m pip install -v -e /vllm-workspace/vllm/[audio] --extra-index https://download.pytorch.org/whl/cpu/ && \ RUN VLLM_TARGET_DEVICE="empty" python3 -m pip install -v -e /vllm-workspace/vllm/[audio] --extra-index https://download.pytorch.org/whl/cpu/ && \
python3 -m pip uninstall -y triton && \ python3 -m pip uninstall -y triton && \

View File

@@ -37,8 +37,10 @@ RUN pip config set global.index-url ${PIP_INDEX_URL}
# Install vLLM # Install vLLM
ARG VLLM_REPO=https://github.com/vllm-project/vllm.git ARG VLLM_REPO=https://github.com/vllm-project/vllm.git
ARG VLLM_TAG=v0.11.0 ARG VLLM_TAG=2918c1b49c88c29783c86f78d2c4221cb9622379
RUN git clone --depth 1 $VLLM_REPO --branch $VLLM_TAG /vllm-workspace/vllm # Revert this change once VLLM_TAG is specified to branch or tag
# RUN git clone --depth 1 $VLLM_REPO --branch $VLLM_TAG /vllm-workspace/vllm
RUN git clone $VLLM_REPO /vllm-workspace/vllm && (cd /vllm-workspace/vllm && git checkout $VLLM_TAG)
# In x86, triton will be installed by vllm. But in Ascend, triton doesn't work correctly. we need to uninstall it. # In x86, triton will be installed by vllm. But in Ascend, triton doesn't work correctly. we need to uninstall it.
RUN VLLM_TARGET_DEVICE="empty" python3 -m pip install -v -e /vllm-workspace/vllm/[audio] --extra-index https://download.pytorch.org/whl/cpu/ && \ RUN VLLM_TARGET_DEVICE="empty" python3 -m pip install -v -e /vllm-workspace/vllm/[audio] --extra-index https://download.pytorch.org/whl/cpu/ && \
python3 -m pip uninstall -y triton && \ python3 -m pip uninstall -y triton && \

View File

@@ -34,9 +34,10 @@ COPY . /vllm-workspace/vllm-ascend/
# Install vLLM # Install vLLM
ARG VLLM_REPO=https://github.com/vllm-project/vllm.git ARG VLLM_REPO=https://github.com/vllm-project/vllm.git
ARG VLLM_TAG=v0.11.0 ARG VLLM_TAG=2918c1b49c88c29783c86f78d2c4221cb9622379
# Revert this change once VLLM_TAG is specified to branch or tag
RUN git clone --depth 1 $VLLM_REPO --branch $VLLM_TAG /vllm-workspace/vllm # RUN git clone --depth 1 $VLLM_REPO --branch $VLLM_TAG /vllm-workspace/vllm
RUN git clone $VLLM_REPO /vllm-workspace/vllm && (cd /vllm-workspace/vllm && git checkout $VLLM_TAG)
# In x86, triton will be installed by vllm. But in Ascend, triton doesn't work correctly. we need to uninstall it. # In x86, triton will be installed by vllm. But in Ascend, triton doesn't work correctly. we need to uninstall it.
RUN VLLM_TARGET_DEVICE="empty" python3 -m pip install -e /vllm-workspace/vllm/[audio] --extra-index https://download.pytorch.org/whl/cpu/ && \ RUN VLLM_TARGET_DEVICE="empty" python3 -m pip install -e /vllm-workspace/vllm/[audio] --extra-index https://download.pytorch.org/whl/cpu/ && \
python3 -m pip uninstall -y triton && \ python3 -m pip uninstall -y triton && \

View File

@@ -45,8 +45,10 @@ RUN apt-get update -y && \
# Install vLLM # Install vLLM
ARG VLLM_REPO=https://github.com/vllm-project/vllm.git ARG VLLM_REPO=https://github.com/vllm-project/vllm.git
ARG VLLM_TAG=v0.11.0 ARG VLLM_TAG=2918c1b49c88c29783c86f78d2c4221cb9622379
RUN git clone --depth 1 $VLLM_REPO --branch $VLLM_TAG /vllm-workspace/vllm # Revert this change once VLLM_TAG is specified to branch or tag
# RUN git clone --depth 1 $VLLM_REPO --branch $VLLM_TAG /vllm-workspace/vllm
RUN git clone $VLLM_REPO /vllm-workspace/vllm && (cd /vllm-workspace/vllm && git checkout $VLLM_TAG)
# In x86, triton will be installed by vllm. But in Ascend, triton doesn't work correctly. we need to uninstall it. # In x86, triton will be installed by vllm. But in Ascend, triton doesn't work correctly. we need to uninstall it.
RUN VLLM_TARGET_DEVICE="empty" python3 -m pip install -v -e /vllm-workspace/vllm/[audio] --extra-index https://download.pytorch.org/whl/cpu/ && \ RUN VLLM_TARGET_DEVICE="empty" python3 -m pip install -v -e /vllm-workspace/vllm/[audio] --extra-index https://download.pytorch.org/whl/cpu/ && \
python3 -m pip uninstall -y triton && \ python3 -m pip uninstall -y triton && \

View File

@@ -48,9 +48,10 @@ RUN yum update -y && \
# Install vLLM # Install vLLM
ARG VLLM_REPO=https://github.com/vllm-project/vllm.git ARG VLLM_REPO=https://github.com/vllm-project/vllm.git
ARG VLLM_TAG=v0.11.0 ARG VLLM_TAG=2918c1b49c88c29783c86f78d2c4221cb9622379
# Revert this change once VLLM_TAG is specified to branch or tag
RUN git clone --depth 1 $VLLM_REPO --branch $VLLM_TAG /vllm-workspace/vllm # RUN git clone --depth 1 $VLLM_REPO --branch $VLLM_TAG /vllm-workspace/vllm
RUN git clone $VLLM_REPO /vllm-workspace/vllm && (cd /vllm-workspace/vllm && git checkout $VLLM_TAG)
# In x86, triton will be installed by vllm. But in Ascend, triton doesn't work correctly. we need to uninstall it. # In x86, triton will be installed by vllm. But in Ascend, triton doesn't work correctly. we need to uninstall it.
RUN VLLM_TARGET_DEVICE="empty" python3 -m pip install -e /vllm-workspace/vllm/[audio] --extra-index https://download.pytorch.org/whl/cpu/ && \ RUN VLLM_TARGET_DEVICE="empty" python3 -m pip install -e /vllm-workspace/vllm/[audio] --extra-index https://download.pytorch.org/whl/cpu/ && \
python3 -m pip uninstall -y triton && \ python3 -m pip uninstall -y triton && \

View File

@@ -48,9 +48,10 @@ RUN yum update -y && \
# Install vLLM # Install vLLM
ARG VLLM_REPO=https://github.com/vllm-project/vllm.git ARG VLLM_REPO=https://github.com/vllm-project/vllm.git
ARG VLLM_TAG=v0.11.0 ARG VLLM_TAG=2918c1b49c88c29783c86f78d2c4221cb9622379
# Revert this change once VLLM_TAG is specified to branch or tag
RUN git clone --depth 1 $VLLM_REPO --branch $VLLM_TAG /vllm-workspace/vllm # RUN git clone --depth 1 $VLLM_REPO --branch $VLLM_TAG /vllm-workspace/vllm
RUN git clone $VLLM_REPO /vllm-workspace/vllm && (cd /vllm-workspace/vllm && git checkout $VLLM_TAG)
# In x86, triton will be installed by vllm. But in Ascend, triton doesn't work correctly. we need to uninstall it. # In x86, triton will be installed by vllm. But in Ascend, triton doesn't work correctly. we need to uninstall it.
RUN VLLM_TARGET_DEVICE="empty" python3 -m pip install -e /vllm-workspace/vllm/[audio] --extra-index https://download.pytorch.org/whl/cpu/ && \ RUN VLLM_TARGET_DEVICE="empty" python3 -m pip install -e /vllm-workspace/vllm/[audio] --extra-index https://download.pytorch.org/whl/cpu/ && \
python3 -m pip uninstall -y triton && \ python3 -m pip uninstall -y triton && \

View File

@@ -63,10 +63,6 @@ import torch
from vllm import LLM, SamplingParams from vllm import LLM, SamplingParams
from vllm.distributed.parallel_state import ( # noqa E402 from vllm.distributed.parallel_state import ( # noqa E402
destroy_distributed_environment, destroy_model_parallel) destroy_distributed_environment, destroy_model_parallel)
from vllm_ascend.utils import vllm_version_is
if vllm_version_is("0.11.0"):
from vllm.utils import get_open_port
else:
from vllm.utils.network_utils import get_open_port from vllm.utils.network_utils import get_open_port
os.environ["VLLM_USE_MODELSCOPE"] = "True" os.environ["VLLM_USE_MODELSCOPE"] = "True"

View File

@@ -67,11 +67,6 @@ from vllm import LLM, SamplingParams
from vllm.distributed.parallel_state import ( # noqa E402 from vllm.distributed.parallel_state import ( # noqa E402
destroy_distributed_environment, destroy_model_parallel, get_tp_group) destroy_distributed_environment, destroy_model_parallel, get_tp_group)
from safetensors.torch import load_file from safetensors.torch import load_file
from vllm_ascend.utils import vllm_version_is
if vllm_version_is("0.11.0"):
from vllm.utils import GiB_bytes, get_open_port
else:
from vllm.utils.mem_constants import GiB_bytes from vllm.utils.mem_constants import GiB_bytes
from vllm.utils.network_utils import get_open_port from vllm.utils.network_utils import get_open_port

View File

@@ -20,10 +20,6 @@ import os
import torch import torch
from vllm import LLM, SamplingParams from vllm import LLM, SamplingParams
from vllm_ascend.utils import vllm_version_is
if vllm_version_is("0.11.0"):
from vllm.utils import GiB_bytes
else:
from vllm.utils.mem_constants import GiB_bytes from vllm.utils.mem_constants import GiB_bytes
os.environ["VLLM_USE_MODELSCOPE"] = "True" os.environ["VLLM_USE_MODELSCOPE"] = "True"

View File

@@ -67,11 +67,6 @@ from vllm import LLM, SamplingParams
from vllm.distributed.parallel_state import ( # noqa E402 from vllm.distributed.parallel_state import ( # noqa E402
destroy_distributed_environment, destroy_model_parallel, get_tp_group) destroy_distributed_environment, destroy_model_parallel, get_tp_group)
from safetensors.torch import load_file from safetensors.torch import load_file
from vllm_ascend.utils import vllm_version_is
if vllm_version_is("0.11.0"):
from vllm.utils import GiB_bytes, get_open_port
else:
from vllm.utils.mem_constants import GiB_bytes from vllm.utils.mem_constants import GiB_bytes
from vllm.utils.network_utils import get_open_port from vllm.utils.network_utils import get_open_port

View File

@@ -45,6 +45,7 @@ from vllm.inputs import TextPrompt
from vllm.outputs import RequestOutput from vllm.outputs import RequestOutput
from vllm.platforms import current_platform from vllm.platforms import current_platform
from vllm.transformers_utils.utils import maybe_model_redirect from vllm.transformers_utils.utils import maybe_model_redirect
from vllm.utils.network_utils import get_open_port
from tests.e2e.model_utils import (TokensTextLogprobs, from tests.e2e.model_utils import (TokensTextLogprobs,
TokensTextLogprobsPromptLogprobs) TokensTextLogprobsPromptLogprobs)
@@ -54,12 +55,6 @@ from vllm_ascend.ascend_config import clear_ascend_config
# we not explicitly patch here, some of them might be effectiveless # we not explicitly patch here, some of them might be effectiveless
# in pytest scenario # in pytest scenario
from vllm_ascend.utils import adapt_patch # noqa E402 from vllm_ascend.utils import adapt_patch # noqa E402
from vllm_ascend.utils import vllm_version_is
if vllm_version_is("0.11.0"):
from vllm.utils import get_open_port
else:
from vllm.utils.network_utils import get_open_port
adapt_patch(True) adapt_patch(True)
adapt_patch(False) adapt_patch(False)

View File

@@ -23,12 +23,6 @@ from unittest.mock import patch
import pytest import pytest
import torch import torch
from vllm_ascend.utils import vllm_version_is
if vllm_version_is("0.11.0"):
from vllm.utils import get_open_port
else:
from vllm.utils.network_utils import get_open_port from vllm.utils.network_utils import get_open_port
MODELS = [ MODELS = [

View File

@@ -19,14 +19,9 @@ from typing import Any
import openai import openai
import pytest import pytest
from vllm.utils.network_utils import get_open_port
from tests.e2e.conftest import RemoteOpenAIServer from tests.e2e.conftest import RemoteOpenAIServer
from vllm_ascend.utils import vllm_version_is
if vllm_version_is("0.11.0"):
from vllm.utils import get_open_port
else:
from vllm.utils.network_utils import get_open_port
MODELS = ["Qwen/Qwen3-30B-A3B", "vllm-ascend/DeepSeek-V2-Lite-W8A8"] MODELS = ["Qwen/Qwen3-30B-A3B", "vllm-ascend/DeepSeek-V2-Lite-W8A8"]

View File

@@ -18,15 +18,10 @@ from typing import Any
import openai import openai
import pytest import pytest
from vllm.utils.network_utils import get_open_port
from tests.e2e.conftest import RemoteOpenAIServer from tests.e2e.conftest import RemoteOpenAIServer
from tools.aisbench import run_aisbench_cases from tools.aisbench import run_aisbench_cases
from vllm_ascend.utils import vllm_version_is
if vllm_version_is("0.11.0"):
from vllm.utils import get_open_port
else:
from vllm.utils.network_utils import get_open_port
MODELS = [ MODELS = [
"Qwen/Qwen3-32B", "Qwen/Qwen3-32B",

View File

@@ -25,7 +25,6 @@ from vllm.assets.image import ImageAsset
from tests.e2e.conftest import VllmRunner from tests.e2e.conftest import VllmRunner
from tests.e2e.model_utils import check_outputs_equal from tests.e2e.model_utils import check_outputs_equal
from vllm_ascend.utils import vllm_version_is
MODELS = [ MODELS = [
"OpenGVLab/InternVL2-8B", "OpenGVLab/InternVL2-8B",
@@ -34,13 +33,6 @@ MODELS = [
"OpenGVLab/InternVL3_5-8B", "OpenGVLab/InternVL3_5-8B",
] ]
# skip testing InternVL3-8B and InternVL3_5-8B on 0.11.0 due to https://github.com/vllm-project/vllm-ascend/issues/3925.
if vllm_version_is("0.11.0"):
MODELS = [
"OpenGVLab/InternVL2-8B",
"OpenGVLab/InternVL2_5-8B",
]
@pytest.mark.parametrize("model", MODELS) @pytest.mark.parametrize("model", MODELS)
def test_internvl_basic(model: str): def test_internvl_basic(model: str):

View File

@@ -23,16 +23,11 @@ from unittest.mock import patch
import torch import torch
from vllm import SamplingParams from vllm import SamplingParams
from vllm.utils.mem_constants import GiB_bytes
from tests.e2e.conftest import VllmRunner from tests.e2e.conftest import VllmRunner
from tests.e2e.utils import fork_new_process_for_each_test from tests.e2e.utils import fork_new_process_for_each_test
from vllm_ascend.device_allocator.camem import CaMemAllocator from vllm_ascend.device_allocator.camem import CaMemAllocator
from vllm_ascend.utils import vllm_version_is
if vllm_version_is("0.11.0"):
from vllm.utils import GiB_bytes
else:
from vllm.utils.mem_constants import GiB_bytes
@fork_new_process_for_each_test @fork_new_process_for_each_test

View File

@@ -9,6 +9,7 @@ from vllm.config import (CacheConfig, KVTransferConfig, ModelConfig,
from vllm.multimodal.inputs import (MultiModalFeatureSpec, from vllm.multimodal.inputs import (MultiModalFeatureSpec,
MultiModalKwargsItem, PlaceholderRange) MultiModalKwargsItem, PlaceholderRange)
from vllm.sampling_params import SamplingParams from vllm.sampling_params import SamplingParams
from vllm.utils.hashing import sha256
from vllm.v1.core.kv_cache_utils import (get_request_block_hasher, from vllm.v1.core.kv_cache_utils import (get_request_block_hasher,
init_none_hash) init_none_hash)
from vllm.v1.core.sched.output import SchedulerOutput from vllm.v1.core.sched.output import SchedulerOutput
@@ -21,12 +22,6 @@ from vllm.v1.structured_output import StructuredOutputManager
from tests.ut.base import TestBase from tests.ut.base import TestBase
from vllm_ascend.core.scheduler import AscendScheduler from vllm_ascend.core.scheduler import AscendScheduler
from vllm_ascend.core.scheduler_dynamic_batch import SchedulerDynamicBatch from vllm_ascend.core.scheduler_dynamic_batch import SchedulerDynamicBatch
from vllm_ascend.utils import vllm_version_is
if vllm_version_is("0.11.0"):
from vllm.utils import sha256
else:
from vllm.utils.hashing import sha256
EOS_TOKEN_ID = 50256 EOS_TOKEN_ID = 50256
MODEL = "Qwen3-0.6B" MODEL = "Qwen3-0.6B"
@@ -181,22 +176,12 @@ class TestAscendScheduler(TestBase):
) )
cache_config.num_gpu_blocks = 10000 cache_config.num_gpu_blocks = 10000
if vllm_version_is("0.11.0"):
scheduler = AscendScheduler(
vllm_config=vllm_config,
kv_cache_config=kv_cache_config,
log_stats=True,
structured_output_manager=MagicMock(
spec=StructuredOutputManager),
)
else:
scheduler = AscendScheduler( scheduler = AscendScheduler(
vllm_config=vllm_config, vllm_config=vllm_config,
kv_cache_config=kv_cache_config, kv_cache_config=kv_cache_config,
log_stats=True, log_stats=True,
block_size=block_size, block_size=block_size,
structured_output_manager=MagicMock( structured_output_manager=MagicMock(spec=StructuredOutputManager),
spec=StructuredOutputManager),
) )
should_advance = MagicMock() should_advance = MagicMock()

View File

@@ -13,12 +13,6 @@ from unittest.mock import MagicMock, patch
import msgspec import msgspec
import zmq import zmq
from vllm.distributed.parallel_state import GroupCoordinator from vllm.distributed.parallel_state import GroupCoordinator
from vllm_ascend.utils import vllm_version_is
if vllm_version_is("0.11.0"):
from vllm.utils import make_zmq_path
else:
from vllm.utils.network_utils import make_zmq_path from vllm.utils.network_utils import make_zmq_path
fake_engine = types.ModuleType("mooncake.engine") fake_engine = types.ModuleType("mooncake.engine")

View File

@@ -10,6 +10,7 @@ import torch
from vllm import SamplingParams from vllm import SamplingParams
from vllm.config import (CacheConfig, DeviceConfig, KVTransferConfig, from vllm.config import (CacheConfig, DeviceConfig, KVTransferConfig,
ModelConfig, SchedulerConfig, VllmConfig) ModelConfig, SchedulerConfig, VllmConfig)
from vllm.utils.hashing import sha256
from vllm.v1.core.kv_cache_utils import (get_request_block_hasher, from vllm.v1.core.kv_cache_utils import (get_request_block_hasher,
init_none_hash) init_none_hash)
from vllm.v1.core.sched.scheduler import Scheduler from vllm.v1.core.sched.scheduler import Scheduler
@@ -19,13 +20,6 @@ from vllm.v1.outputs import ModelRunnerOutput
from vllm.v1.request import Request from vllm.v1.request import Request
from vllm.v1.structured_output import StructuredOutputManager from vllm.v1.structured_output import StructuredOutputManager
from vllm_ascend.utils import vllm_version_is
if vllm_version_is("0.11.0"):
from vllm.utils import sha256
else:
from vllm.utils.hashing import sha256
EOS_TOKEN_ID = 50256 EOS_TOKEN_ID = 50256
@@ -111,14 +105,7 @@ def create_scheduler(
], ],
) )
vllm_config.cache_config.num_gpu_blocks = num_blocks vllm_config.cache_config.num_gpu_blocks = num_blocks
if vllm_version_is("0.11.0"):
return Scheduler(
vllm_config=vllm_config,
kv_cache_config=kv_cache_config,
log_stats=True,
structured_output_manager=StructuredOutputManager(vllm_config),
)
else:
return Scheduler( return Scheduler(
vllm_config=vllm_config, vllm_config=vllm_config,
kv_cache_config=kv_cache_config, kv_cache_config=kv_cache_config,

View File

@@ -22,7 +22,6 @@ import torch
from torch import nn from torch import nn
from vllm_ascend.model_loader.netloader.netloader import ModelNetLoaderElastic from vllm_ascend.model_loader.netloader.netloader import ModelNetLoaderElastic
from vllm_ascend.utils import vllm_version_is
class DummyDeviceConfig: class DummyDeviceConfig:
@@ -174,11 +173,7 @@ def test_load_model_elastic_success(mock_logger, monkeypatch, tmp_path):
"vllm_ascend.model_loader.netloader.netloader.process_weights_after_loading", "vllm_ascend.model_loader.netloader.netloader.process_weights_after_loading",
lambda *a, **k: None) lambda *a, **k: None)
# patch get_ip # patch get_ip
if vllm_version_is("0.11.0"): monkeypatch.setattr("vllm.utils.network_utils.get_ip", lambda: "127.0.0.1")
monkeypatch.setattr("vllm.utils.get_ip", lambda: "127.0.0.1")
else:
monkeypatch.setattr("vllm.utils.network_utils.get_ip",
lambda: "127.0.0.1")
# patch find_free_port # patch find_free_port
monkeypatch.setattr( monkeypatch.setattr(
"vllm_ascend.model_loader.netloader.netloader.find_free_port", "vllm_ascend.model_loader.netloader.netloader.find_free_port",

View File

@@ -9,7 +9,6 @@ from vllm.model_executor.layers.mla import MLAModules
from tests.ut.base import TestBase from tests.ut.base import TestBase
from vllm_ascend.models.layers.mla import (AscendMultiHeadLatentAttention, from vllm_ascend.models.layers.mla import (AscendMultiHeadLatentAttention,
IndexerWrapper) IndexerWrapper)
from vllm_ascend.utils import vllm_version_is
class TestIndexerWrapper(TestBase): class TestIndexerWrapper(TestBase):
@@ -85,40 +84,7 @@ class TestAscendMultiHeadLatentAttention(TestBase):
"vllm_ascend.models.layers.mla.get_tensor_model_parallel_world_size") "vllm_ascend.models.layers.mla.get_tensor_model_parallel_world_size")
def test_initialization(self, mock_tp_size, mock_ascend_config, def test_initialization(self, mock_tp_size, mock_ascend_config,
mock_get_vllm_config): mock_get_vllm_config):
if vllm_version_is("0.11.0"):
with patch("vllm_ascend.models.layers.mla.Attention",
return_value=True):
mock_tp_size.return_value = 1
mock_ascend_config.return_value.enable_shared_expert_dp = False
mock_vllm_config = MagicMock(spec=VllmConfig)
mock_vllm_config.model_config.hf_config = MagicMock(
num_hidden_layers=32, first_k_dense_replace=False)
mock_get_vllm_config.return_value = mock_vllm_config
mock_vllm_config.compilation_config = CompilationConfig()
attn = AscendMultiHeadLatentAttention(
hidden_size=self.hidden_size,
num_heads=self.num_heads,
scale=self.scale,
qk_nope_head_dim=self.qk_nope_head_dim,
qk_rope_head_dim=self.qk_rope_head_dim,
v_head_dim=self.v_head_dim,
q_lora_rank=self.q_lora_rank,
kv_lora_rank=self.kv_lora_rank,
mla_modules=self.mock_mla_modules,
cache_config=self.mock_cache_config,
quant_config=self.mock_quant_config,
prefix=self.prefix,
)
self.assertEqual(attn.hidden_size, self.hidden_size)
self.assertEqual(attn.kv_lora_rank, self.kv_lora_rank)
self.assertEqual(attn.debug_layer_idx, 0)
self.assertIsNotNone(attn.mla_attn)
self.assertIn(
self.prefix,
mock_vllm_config.compilation_config.static_forward_context)
else:
with patch("vllm_ascend.models.layers.mla.MLAAttention", with patch("vllm_ascend.models.layers.mla.MLAAttention",
return_value=True): return_value=True):
mock_tp_size.return_value = 2 mock_tp_size.return_value = 2
@@ -164,25 +130,6 @@ class TestAscendMultiHeadLatentAttention(TestBase):
num_hidden_layers=32, first_k_dense_replace=False) num_hidden_layers=32, first_k_dense_replace=False)
mock_get_vllm_config.return_value = mock_vllm_config mock_get_vllm_config.return_value = mock_vllm_config
mock_vllm_config.compilation_config = CompilationConfig() mock_vllm_config.compilation_config = CompilationConfig()
if vllm_version_is("0.11.0"):
with patch("vllm_ascend.models.layers.mla.Attention",
return_value=True):
attn = AscendMultiHeadLatentAttention(
hidden_size=self.hidden_size,
num_heads=self.num_heads,
scale=self.scale,
qk_nope_head_dim=self.qk_nope_head_dim,
qk_rope_head_dim=self.qk_rope_head_dim,
v_head_dim=self.v_head_dim,
q_lora_rank=self.q_lora_rank,
kv_lora_rank=self.kv_lora_rank,
mla_modules=self.mock_mla_modules,
cache_config=self.mock_cache_config,
quant_config=self.mock_quant_config,
prefix=self.prefix,
)
else:
with patch("vllm_ascend.models.layers.mla.MLAAttention", with patch("vllm_ascend.models.layers.mla.MLAAttention",
return_value=True): return_value=True):
attn = AscendMultiHeadLatentAttention( attn = AscendMultiHeadLatentAttention(

View File

@@ -3,18 +3,13 @@ from unittest.mock import MagicMock, patch
import pytest import pytest
import torch import torch
from vllm.config.compilation import CUDAGraphMode from vllm.config.compilation import CompilationMode, CUDAGraphMode
from vllm.engine.arg_utils import EngineArgs from vllm.engine.arg_utils import EngineArgs
from vllm.platforms import PlatformEnum from vllm.platforms import PlatformEnum
from tests.ut.base import TestBase from tests.ut.base import TestBase
from vllm_ascend.platform import NPUPlatform from vllm_ascend.platform import NPUPlatform
from vllm_ascend.utils import ASCEND_QUANTIZATION_METHOD, vllm_version_is from vllm_ascend.utils import ASCEND_QUANTIZATION_METHOD
if vllm_version_is("0.11.0"):
from vllm.config.compilation import CompilationLevel
else:
from vllm.config.compilation import CompilationMode
class TestNPUPlatform(TestBase): class TestNPUPlatform(TestBase):
@@ -313,12 +308,6 @@ class TestNPUPlatform(TestBase):
self.assertTrue("Compilation disabled, using eager mode by default" in self.assertTrue("Compilation disabled, using eager mode by default" in
cm.output[0]) cm.output[0])
if vllm_version_is("0.11.0"):
self.assertEqual(
vllm_config.compilation_config.level,
CompilationLevel.NO_COMPILATION,
)
else:
self.assertEqual( self.assertEqual(
vllm_config.compilation_config.mode, vllm_config.compilation_config.mode,
CompilationMode.NONE, CompilationMode.NONE,
@@ -348,9 +337,6 @@ class TestNPUPlatform(TestBase):
mock_init_recompute.return_value = MagicMock() mock_init_recompute.return_value = MagicMock()
vllm_config.scheduler_config = MagicMock() vllm_config.scheduler_config = MagicMock()
if vllm_version_is("0.11.0"):
vllm_config.compilation_config.level = CompilationLevel.DYNAMO_ONCE
else:
vllm_config.compilation_config.mode = CompilationMode.DYNAMO_TRACE_ONCE vllm_config.compilation_config.mode = CompilationMode.DYNAMO_TRACE_ONCE
with self.assertLogs(logger="vllm", level="WARNING") as cm: with self.assertLogs(logger="vllm", level="WARNING") as cm:
@@ -359,12 +345,7 @@ class TestNPUPlatform(TestBase):
importlib.reload(platform) importlib.reload(platform)
self.platform.check_and_update_config(vllm_config) self.platform.check_and_update_config(vllm_config)
self.assertTrue("NPU does not support" in cm.output[0]) self.assertTrue("NPU does not support" in cm.output[0])
if vllm_version_is("0.11.0"):
self.assertEqual(
vllm_config.compilation_config.level,
CompilationLevel.NO_COMPILATION,
)
else:
self.assertEqual( self.assertEqual(
vllm_config.compilation_config.mode, vllm_config.compilation_config.mode,
CompilationMode.NONE, CompilationMode.NONE,
@@ -396,12 +377,6 @@ class TestNPUPlatform(TestBase):
"cudagraph_mode is not support on NPU. falling back to NONE" in "cudagraph_mode is not support on NPU. falling back to NONE" in
cm.output[0]) cm.output[0])
if vllm_version_is("0.11.0"):
self.assertEqual(
vllm_config.compilation_config.level,
CompilationLevel.NO_COMPILATION,
)
else:
self.assertEqual( self.assertEqual(
vllm_config.compilation_config.mode, vllm_config.compilation_config.mode,
CompilationMode.NONE, CompilationMode.NONE,
@@ -431,9 +406,6 @@ class TestNPUPlatform(TestBase):
mock_init_recompute.return_value = MagicMock() mock_init_recompute.return_value = MagicMock()
vllm_config.scheduler_config = MagicMock() vllm_config.scheduler_config = MagicMock()
if vllm_version_is("0.11.0"):
vllm_config.compilation_config.level = CompilationLevel.PIECEWISE
else:
vllm_config.compilation_config.mode = CompilationMode.VLLM_COMPILE vllm_config.compilation_config.mode = CompilationMode.VLLM_COMPILE
with self.assertLogs(logger="vllm", level="INFO") as cm: with self.assertLogs(logger="vllm", level="INFO") as cm:
@@ -443,12 +415,6 @@ class TestNPUPlatform(TestBase):
self.platform.check_and_update_config(vllm_config) self.platform.check_and_update_config(vllm_config)
self.assertTrue("Torchair compilation enabled" in cm.output[0]) self.assertTrue("Torchair compilation enabled" in cm.output[0])
if vllm_version_is("0.11.0"):
self.assertEqual(
vllm_config.compilation_config.level,
CompilationLevel.NO_COMPILATION,
)
else:
self.assertEqual( self.assertEqual(
vllm_config.compilation_config.mode, vllm_config.compilation_config.mode,
CompilationMode.NONE, CompilationMode.NONE,
@@ -658,10 +624,7 @@ class TestNPUPlatform(TestBase):
def test_get_punica_wrapper(self): def test_get_punica_wrapper(self):
result = self.platform.get_punica_wrapper() result = self.platform.get_punica_wrapper()
if vllm_version_is("0.11.0"):
self.assertEqual(
result, "vllm_ascend.lora.punica_npu.PunicaWrapperNPU0110")
else:
self.assertEqual(result, self.assertEqual(result,
"vllm_ascend.lora.punica_npu.PunicaWrapperNPU") "vllm_ascend.lora.punica_npu.PunicaWrapperNPU")
@@ -742,12 +705,7 @@ class TestNPUPlatform(TestBase):
self.assertTrue( self.assertTrue(
"PIECEWISE compilation enabled on NPU. use_inductor not supported - " "PIECEWISE compilation enabled on NPU. use_inductor not supported - "
"using only ACL Graph mode" in cm.output[0]) "using only ACL Graph mode" in cm.output[0])
if vllm_version_is("0.11.0"):
self.assertEqual(
VllmConfig.compilation_config.level,
CompilationLevel.PIECEWISE,
)
else:
self.assertEqual( self.assertEqual(
VllmConfig.compilation_config.mode, VllmConfig.compilation_config.mode,
CompilationMode.VLLM_COMPILE, CompilationMode.VLLM_COMPILE,

View File

@@ -274,46 +274,8 @@ class TestUtils(TestBase):
utils.update_aclgraph_sizes(test_vllm_config) utils.update_aclgraph_sizes(test_vllm_config)
del os.environ['HCCL_OP_EXPANSION_MODE'] del os.environ['HCCL_OP_EXPANSION_MODE']
if utils.vllm_version_is("0.11.0"):
self.assertEqual(
137,
len(test_vllm_config.compilation_config.cudagraph_capture_sizes
))
else:
self.assertEqual( self.assertEqual(
0, 0,
len(test_vllm_config.compilation_config.cudagraph_capture_sizes
))
return
test_vllm_config.speculative_config = mock.MagicMock()
test_vllm_config.speculative_config.num_speculative_tokens = 2
test_vllm_config.speculative_config.draft_model_config = mock.MagicMock(
)
test_vllm_config.speculative_config.draft_model_config.hf_config = mock.MagicMock(
)
test_vllm_config.speculative_config.draft_model_config.hf_config.num_hidden_layers = 2
os.environ['HCCL_OP_EXPANSION_MODE'] = 'AIV'
utils.update_aclgraph_sizes(test_vllm_config)
del os.environ['HCCL_OP_EXPANSION_MODE']
self.assertEqual(
111,
len(test_vllm_config.compilation_config.cudagraph_capture_sizes))
# max_num_batch_sizes >= len(original_sizes)
test_compilation_config = CompilationConfig(
cudagraph_capture_sizes=[1, 2, 3])
test_vllm_config = VllmConfig(
model_config=test_model_config,
compilation_config=test_compilation_config,
parallel_config=test_parallel_config,
)
utils.update_aclgraph_sizes(test_vllm_config)
os.environ['HCCL_OP_EXPANSION_MODE'] = 'AIV'
utils.update_aclgraph_sizes(test_vllm_config)
del os.environ['HCCL_OP_EXPANSION_MODE']
self.assertEqual(
3,
len(test_vllm_config.compilation_config.cudagraph_capture_sizes)) len(test_vllm_config.compilation_config.cudagraph_capture_sizes))
@mock.patch("vllm.model_executor.custom_op.CustomOp") @mock.patch("vllm.model_executor.custom_op.CustomOp")

View File

@@ -7,7 +7,6 @@ from vllm.config import CacheConfig, VllmConfig
from tests.ut.base import PytestBase from tests.ut.base import PytestBase
from vllm_ascend.torchair.torchair_mtp_proposer import TorchairMtpProposer from vllm_ascend.torchair.torchair_mtp_proposer import TorchairMtpProposer
from vllm_ascend.utils import vllm_version_is
class TestTorchairMtpProposer(PytestBase): class TestTorchairMtpProposer(PytestBase):
@@ -40,12 +39,6 @@ class TestTorchairMtpProposer(PytestBase):
mocker.patch( mocker.patch(
"vllm_ascend.torchair.torchair_mtp_proposer.MtpProposer.__init__", "vllm_ascend.torchair.torchair_mtp_proposer.MtpProposer.__init__",
return_value=None) return_value=None)
if vllm_version_is("0.11.0"):
mock_set_default_dtype = mocker.patch(
'vllm.model_executor.model_loader.utils.set_default_torch_dtype'
)
else:
mock_set_default_dtype = mocker.patch( mock_set_default_dtype = mocker.patch(
'vllm.utils.torch_utils.set_default_torch_dtype') 'vllm.utils.torch_utils.set_default_torch_dtype')
mock_set_default_dtype.return_value.__enter__.return_value = None mock_set_default_dtype.return_value.__enter__.return_value = None

View File

@@ -4,10 +4,8 @@ import torch
from vllm.config import CacheConfig, ModelConfig, ParallelConfig, VllmConfig from vllm.config import CacheConfig, ModelConfig, ParallelConfig, VllmConfig
from tests.ut.base import TestBase from tests.ut.base import TestBase
from vllm_ascend.utils import vllm_version_is
init_cache_hf_modules_path = "vllm.utils.init_cached_hf_modules" if vllm_version_is( init_cache_hf_modules_path = "vllm.utils.import_utils.init_cached_hf_modules"
"0.11.0") else "vllm.utils.import_utils.init_cached_hf_modules"
class TestNPUTorchairWorker(TestBase): class TestNPUTorchairWorker(TestBase):

View File

@@ -20,19 +20,14 @@ import numpy as np
import pytest import pytest
import torch import torch
from vllm.sampling_params import SamplingParams from vllm.sampling_params import SamplingParams
from vllm.utils.torch_utils import make_tensor_with_pad
from vllm.v1.pool.metadata import PoolingMetadata from vllm.v1.pool.metadata import PoolingMetadata
from vllm.v1.sample.logits_processor import LogitsProcessors from vllm.v1.sample.logits_processor import LogitsProcessors
from vllm.v1.sample.metadata import SamplingMetadata from vllm.v1.sample.metadata import SamplingMetadata
from vllm_ascend.utils import vllm_version_is
from vllm_ascend.worker.block_table import BlockTable, MultiGroupBlockTable from vllm_ascend.worker.block_table import BlockTable, MultiGroupBlockTable
from vllm_ascend.worker.npu_input_batch import CachedRequestState, InputBatch from vllm_ascend.worker.npu_input_batch import CachedRequestState, InputBatch
if vllm_version_is("0.11.0"):
from vllm.utils import make_tensor_with_pad
else:
from vllm.utils.torch_utils import make_tensor_with_pad
VOCAB_SIZE = 1024 VOCAB_SIZE = 1024
NUM_OUTPUT_TOKENS = 20 NUM_OUTPUT_TOKENS = 20
MAX_PROMPT_SIZE = 100 MAX_PROMPT_SIZE = 100

View File

@@ -6,10 +6,8 @@ import torch
from vllm.config import CacheConfig, ModelConfig, ParallelConfig, VllmConfig from vllm.config import CacheConfig, ModelConfig, ParallelConfig, VllmConfig
from tests.ut.base import TestBase from tests.ut.base import TestBase
from vllm_ascend.utils import vllm_version_is
init_cached_hf_modules_path = "vllm.utils.init_cached_hf_modules" if vllm_version_is( init_cached_hf_modules_path = "vllm.utils.import_utils.init_cached_hf_modules"
"0.11.0") else "vllm.utils.import_utils.init_cached_hf_modules"
class TestNPUWorker(TestBase): class TestNPUWorker(TestBase):
@@ -189,17 +187,6 @@ class TestNPUWorker(TestBase):
# Create NPUWorker instance # Create NPUWorker instance
from vllm_ascend.worker.worker_v1 import NPUWorker from vllm_ascend.worker.worker_v1 import NPUWorker
if vllm_version_is("0.11.0"):
with patch("vllm.utils.STR_DTYPE_TO_TORCH_DTYPE",
{"float32": torch.float32}):
worker = NPUWorker(
vllm_config=self.vllm_config_mock,
local_rank=self.local_rank,
rank=self.rank,
distributed_init_method=self.distributed_init_method,
is_driver_worker=self.is_driver_worker,
)
else:
with patch("vllm.utils.torch_utils.STR_DTYPE_TO_TORCH_DTYPE", with patch("vllm.utils.torch_utils.STR_DTYPE_TO_TORCH_DTYPE",
{"float32": torch.float32}): {"float32": torch.float32}):
worker = NPUWorker( worker = NPUWorker(

View File

@@ -31,14 +31,7 @@ from vllm.distributed import (get_dcp_group,
get_decode_context_model_parallel_rank, get_decode_context_model_parallel_rank,
get_decode_context_model_parallel_world_size) get_decode_context_model_parallel_world_size)
from vllm.forward_context import ForwardContext, get_forward_context from vllm.forward_context import ForwardContext, get_forward_context
from vllm_ascend.utils import vllm_version_is
if vllm_version_is("0.11.0"):
from vllm.utils import cdiv
else:
from vllm.utils.math_utils import cdiv from vllm.utils.math_utils import cdiv
from vllm.v1.attention.backends.utils import AttentionCGSupport from vllm.v1.attention.backends.utils import AttentionCGSupport
from vllm.v1.core.sched.output import SchedulerOutput from vllm.v1.core.sched.output import SchedulerOutput
from vllm.v1.kv_cache_interface import AttentionSpec from vllm.v1.kv_cache_interface import AttentionSpec

View File

@@ -20,14 +20,7 @@ from vllm.forward_context import ForwardContext, get_forward_context
from vllm.logger import logger from vllm.logger import logger
from vllm.model_executor.layers.linear import (LinearBase, from vllm.model_executor.layers.linear import (LinearBase,
UnquantizedLinearMethod) UnquantizedLinearMethod)
from vllm_ascend.utils import vllm_version_is
if vllm_version_is("0.11.0"):
from vllm.utils import cdiv, round_down
else:
from vllm.utils.math_utils import cdiv, round_down from vllm.utils.math_utils import cdiv, round_down
from vllm.v1.attention.backends.utils import AttentionCGSupport from vllm.v1.attention.backends.utils import AttentionCGSupport
from vllm_ascend import envs from vllm_ascend import envs

View File

@@ -55,8 +55,6 @@ from vllm.v1.spec_decode.metrics import SpecDecodingStats
from vllm.v1.structured_output import StructuredOutputManager from vllm.v1.structured_output import StructuredOutputManager
from vllm.v1.utils import ConstantList from vllm.v1.utils import ConstantList
from vllm_ascend.utils import vllm_version_is
class RecomputeScheduler(SchedulerInterface): class RecomputeScheduler(SchedulerInterface):
"""This Scheduler extends vllm's original v1 scheduler of version 0.11 """This Scheduler extends vllm's original v1 scheduler of version 0.11
@@ -587,11 +585,6 @@ class RecomputeScheduler(SchedulerInterface):
self.kv_cache_config.kv_cache_groups) self.kv_cache_config.kv_cache_groups)
if self.running: if self.running:
any_request = self.running[0] any_request = self.running[0]
if vllm_version_is("0.11.0"):
num_common_prefix_blocks = (
self.kv_cache_manager.get_num_common_prefix_blocks(
any_request, len(self.running)))
else:
num_common_prefix_blocks = ( num_common_prefix_blocks = (
self.kv_cache_manager.get_num_common_prefix_blocks( self.kv_cache_manager.get_num_common_prefix_blocks(
any_request.request_id)) any_request.request_id))

View File

@@ -22,14 +22,7 @@ from vllm.config import VllmConfig
from vllm.distributed.kv_events import KVEventBatch from vllm.distributed.kv_events import KVEventBatch
from vllm.logger import logger from vllm.logger import logger
from vllm.multimodal import MULTIMODAL_REGISTRY, MultiModalRegistry from vllm.multimodal import MULTIMODAL_REGISTRY, MultiModalRegistry
from vllm_ascend.utils import vllm_version_is
if vllm_version_is("0.11.0"):
from vllm.utils import cdiv
else:
from vllm.utils.math_utils import cdiv from vllm.utils.math_utils import cdiv
from vllm.v1.core.kv_cache_manager import KVCacheBlocks from vllm.v1.core.kv_cache_manager import KVCacheBlocks
from vllm.v1.core.sched.output import NewRequestData, SchedulerOutput from vllm.v1.core.sched.output import NewRequestData, SchedulerOutput
from vllm.v1.core.sched.scheduler import Scheduler from vllm.v1.core.sched.scheduler import Scheduler
@@ -39,8 +32,6 @@ from vllm.v1.outputs import ModelRunnerOutput
from vllm.v1.request import Request, RequestStatus from vllm.v1.request import Request, RequestStatus
from vllm.v1.structured_output import StructuredOutputManager from vllm.v1.structured_output import StructuredOutputManager
from vllm_ascend.utils import vllm_version_is
class AscendScheduler(Scheduler): class AscendScheduler(Scheduler):
"""This Scheduler extends vllm's original v1 scheduler """This Scheduler extends vllm's original v1 scheduler
@@ -71,14 +62,9 @@ class AscendScheduler(Scheduler):
log_stats: bool = False, log_stats: bool = False,
) -> None: ) -> None:
# Call the parent class's __init__ method # Call the parent class's __init__ method
if vllm_version_is("0.11.0"):
super().__init__(vllm_config, kv_cache_config, super().__init__(vllm_config, kv_cache_config,
structured_output_manager, mm_registry, structured_output_manager, block_size, mm_registry,
include_finished_set, log_stats) include_finished_set, log_stats)
else:
super().__init__(vllm_config, kv_cache_config,
structured_output_manager, block_size,
mm_registry, include_finished_set, log_stats)
# Initialize common attributes # Initialize common attributes
self._initialize_common() self._initialize_common()
@@ -462,11 +448,6 @@ class AscendScheduler(Scheduler):
self.kv_cache_config.kv_cache_groups) self.kv_cache_config.kv_cache_groups)
if self.running: if self.running:
any_request = self.running[0] any_request = self.running[0]
if vllm_version_is("0.11.0"):
num_common_prefix_blocks = (
self.kv_cache_manager.get_num_common_prefix_blocks(
any_request, len(self.running)))
else:
num_common_prefix_blocks = ( num_common_prefix_blocks = (
self.kv_cache_manager.get_num_common_prefix_blocks( self.kv_cache_manager.get_num_common_prefix_blocks(
any_request.request_id)) any_request.request_id))

View File

@@ -33,8 +33,6 @@ from vllm.v1.kv_cache_interface import KVCacheConfig
from vllm.v1.request import Request, RequestStatus from vllm.v1.request import Request, RequestStatus
from vllm.v1.structured_output import StructuredOutputManager from vllm.v1.structured_output import StructuredOutputManager
from vllm_ascend.utils import vllm_version_is
class BudgetRefiner: class BudgetRefiner:
"""This budget refiner can make dynamic adjustment to the token budget """This budget refiner can make dynamic adjustment to the token budget
@@ -130,14 +128,9 @@ class SchedulerDynamicBatch(Scheduler):
include_finished_set: bool = False, include_finished_set: bool = False,
log_stats: bool = False, log_stats: bool = False,
) -> None: ) -> None:
if vllm_version_is("0.11.0"):
super().__init__(vllm_config, kv_cache_config, super().__init__(vllm_config, kv_cache_config,
structured_output_manager, mm_registry, structured_output_manager, block_size, mm_registry,
include_finished_set, log_stats) include_finished_set, log_stats)
else:
super().__init__(vllm_config, kv_cache_config,
structured_output_manager, block_size,
mm_registry, include_finished_set, log_stats)
self.running: list[Request] = [] self.running: list[Request] = []
self.budget_refiner = BudgetRefiner( self.budget_refiner = BudgetRefiner(
default_budget=self.scheduler_config.max_num_batched_tokens, default_budget=self.scheduler_config.max_num_batched_tokens,
@@ -540,11 +533,6 @@ class SchedulerDynamicBatch(Scheduler):
self.kv_cache_config.kv_cache_groups) self.kv_cache_config.kv_cache_groups)
if self.running: if self.running:
any_request = self.running[0] any_request = self.running[0]
if vllm_version_is("0.11.0"):
num_common_prefix_blocks = (
self.kv_cache_manager.get_num_common_prefix_blocks(
any_request, len(self.running)))
else:
num_common_prefix_blocks = ( num_common_prefix_blocks = (
self.kv_cache_manager.get_num_common_prefix_blocks( self.kv_cache_manager.get_num_common_prefix_blocks(
any_request.request_id)) any_request.request_id))

View File

@@ -10,17 +10,12 @@ import vllm.envs as envs
import zmq import zmq
from vllm.config import KVTransferConfig, VllmConfig from vllm.config import KVTransferConfig, VllmConfig
from vllm.utils import logger from vllm.utils import logger
from vllm.utils.network_utils import make_zmq_socket
from vllm.utils.torch_utils import get_dtype_size
from vllm.v1.kv_cache_interface import AttentionSpec from vllm.v1.kv_cache_interface import AttentionSpec
from vllm_ascend.distributed.cpu_offload_manager.cpu_kv_cache_manager import \ from vllm_ascend.distributed.cpu_offload_manager.cpu_kv_cache_manager import \
CPUKVCacheManager CPUKVCacheManager
from vllm_ascend.utils import vllm_version_is
if vllm_version_is("0.11.0"):
from vllm.utils import get_dtype_size, make_zmq_socket
else:
from vllm.utils.network_utils import make_zmq_socket
from vllm.utils.torch_utils import get_dtype_size
@dataclass @dataclass

View File

@@ -33,16 +33,12 @@ from vllm.v1.request import Request, RequestStatus
import vllm_ascend.envs as envs_ascend import vllm_ascend.envs as envs_ascend
from vllm_ascend.distributed.utils import get_transfer_timeout_value from vllm_ascend.distributed.utils import get_transfer_timeout_value
from vllm_ascend.utils import (AscendSocVersion, get_ascend_soc_version, from vllm_ascend.utils import (AscendSocVersion, get_ascend_soc_version,
prefill_context_parallel_enable, prefill_context_parallel_enable)
vllm_version_is)
if prefill_context_parallel_enable(): if prefill_context_parallel_enable():
from vllm.distributed.parallel_state import \ from vllm.distributed.parallel_state import \
get_prefill_context_model_parallel_rank get_prefill_context_model_parallel_rank
if vllm_version_is("0.11.0"):
from vllm.utils import get_ip
else:
from vllm.utils.network_utils import get_ip from vllm.utils.network_utils import get_ip
TORCH_DTYPE_TO_NPU_DTYPE = { TORCH_DTYPE_TO_NPU_DTYPE = {

View File

@@ -10,14 +10,7 @@ import torch
from vllm.distributed.kv_transfer.kv_connector.v1.base import \ from vllm.distributed.kv_transfer.kv_connector.v1.base import \
KVConnectorMetadata KVConnectorMetadata
from vllm.utils import logger from vllm.utils import logger
from vllm_ascend.utils import vllm_version_is
if vllm_version_is("0.11.0"):
from vllm.utils import cdiv
else:
from vllm.utils.math_utils import cdiv from vllm.utils.math_utils import cdiv
from vllm.v1.core.sched.output import NewRequestData from vllm.v1.core.sched.output import NewRequestData
DEFAULT_GLOBAL_SEGMENT_SIZE = 3355443200 # 3.125 GiB DEFAULT_GLOBAL_SEGMENT_SIZE = 3355443200 # 3.125 GiB

View File

@@ -8,6 +8,7 @@ from typing import Generator, List, Optional, Union
import torch import torch
from vllm.config import VllmConfig from vllm.config import VllmConfig
from vllm.utils import logger from vllm.utils import logger
from vllm.utils.torch_utils import get_kv_cache_torch_dtype
from vllm_ascend.distributed.mooncake.config_data import ( from vllm_ascend.distributed.mooncake.config_data import (
ChunkedTokenDatabase, LasyerMultiBlockReqMeta, MooncakeConnectorMetadata, ChunkedTokenDatabase, LasyerMultiBlockReqMeta, MooncakeConnectorMetadata,
@@ -16,12 +17,6 @@ from vllm_ascend.distributed.mooncake.kv_transfer import (
KVCacheStoreLayerRecvingThread, KVCacheStoreLayerSendingThread, KVCacheStoreLayerRecvingThread, KVCacheStoreLayerSendingThread,
KVCacheStoreRecvingThread, KVCacheStoreSendingThread, KVTransferThread) KVCacheStoreRecvingThread, KVCacheStoreSendingThread, KVTransferThread)
from vllm_ascend.distributed.mooncake.mooncake_store import Mooncakestore from vllm_ascend.distributed.mooncake.mooncake_store import Mooncakestore
from vllm_ascend.utils import vllm_version_is
if vllm_version_is("0.11.0"):
from vllm.utils import get_kv_cache_torch_dtype
else:
from vllm.utils.torch_utils import get_kv_cache_torch_dtype
class MooncakeEngine: class MooncakeEngine:

View File

@@ -6,18 +6,13 @@ from mooncake.store import ReplicateConfig # type: ignore
from vllm.config import ParallelConfig from vllm.config import ParallelConfig
from vllm.distributed.parallel_state import get_tensor_model_parallel_rank from vllm.distributed.parallel_state import get_tensor_model_parallel_rank
from vllm.utils import logger from vllm.utils import logger
from vllm.utils.network_utils import get_ip
from vllm_ascend.distributed.mooncake.config_data import MooncakeEngineKey from vllm_ascend.distributed.mooncake.config_data import MooncakeEngineKey
from vllm_ascend.distributed.mooncake.transfer_engine import get_global_te from vllm_ascend.distributed.mooncake.transfer_engine import get_global_te
from vllm_ascend.utils import vllm_version_is
from .config_data import MooncakeStoreConfig from .config_data import MooncakeStoreConfig
if vllm_version_is("0.11.0"):
from vllm.utils import get_ip
else:
from vllm.utils.network_utils import get_ip
METADATA_BYTES_LEN = 24 METADATA_BYTES_LEN = 24
BASE_PORT = int(os.getenv("VLLM_BASE_PORT", "8790")) BASE_PORT = int(os.getenv("VLLM_BASE_PORT", "8790"))

View File

@@ -10,6 +10,7 @@ from vllm.distributed.kv_transfer.kv_connector.v1.base import (
KVConnectorBase_V1, KVConnectorMetadata, KVConnectorRole) KVConnectorBase_V1, KVConnectorMetadata, KVConnectorRole)
from vllm.forward_context import ForwardContext from vllm.forward_context import ForwardContext
from vllm.utils import logger from vllm.utils import logger
from vllm.utils.network_utils import make_zmq_socket
from vllm.v1.core.kv_cache_manager import KVCacheBlocks from vllm.v1.core.kv_cache_manager import KVCacheBlocks
from vllm.v1.core.sched.output import SchedulerOutput from vllm.v1.core.sched.output import SchedulerOutput
from vllm.v1.request import Request from vllm.v1.request import Request
@@ -18,12 +19,6 @@ from vllm.v1.serial_utils import MsgpackDecoder, MsgpackEncoder
from vllm_ascend.distributed.mooncake.config_data import ( from vllm_ascend.distributed.mooncake.config_data import (
LoadSpec, MooncakeConnectorMetadata, ReqMeta, RequestTracker) LoadSpec, MooncakeConnectorMetadata, ReqMeta, RequestTracker)
from vllm_ascend.distributed.mooncake.mooncake_engine import MooncakeEngine from vllm_ascend.distributed.mooncake.mooncake_engine import MooncakeEngine
from vllm_ascend.utils import vllm_version_is
if vllm_version_is("0.11.0"):
from vllm.utils import make_zmq_socket
else:
from vllm.utils.network_utils import make_zmq_socket
class MooncakeConnectorV1(KVConnectorBase_V1): class MooncakeConnectorV1(KVConnectorBase_V1):

View File

@@ -37,7 +37,7 @@ import vllm_ascend.envs as envs_ascend
from vllm_ascend.ascend_config import get_ascend_config, init_ascend_config from vllm_ascend.ascend_config import get_ascend_config, init_ascend_config
from vllm_ascend.distributed.mooncake.transfer_engine import get_global_te from vllm_ascend.distributed.mooncake.transfer_engine import get_global_te
from vllm_ascend.distributed.utils import get_transfer_timeout_value from vllm_ascend.distributed.utils import get_transfer_timeout_value
from vllm_ascend.utils import prefill_context_parallel_enable, vllm_version_is from vllm_ascend.utils import prefill_context_parallel_enable
# isort: off # isort: off
if prefill_context_parallel_enable(): if prefill_context_parallel_enable():
@@ -46,9 +46,6 @@ if prefill_context_parallel_enable():
) )
# isort: on # isort: on
if vllm_version_is("0.11.0"):
from vllm.utils import get_ip, make_zmq_path, make_zmq_socket
else:
from vllm.utils.network_utils import get_ip, make_zmq_path, make_zmq_socket from vllm.utils.network_utils import get_ip, make_zmq_path, make_zmq_socket
if TYPE_CHECKING: if TYPE_CHECKING:

View File

@@ -28,6 +28,7 @@ from vllm.distributed.kv_transfer.kv_connector.v1.base import (
from vllm.distributed.parallel_state import (get_tensor_model_parallel_rank, from vllm.distributed.parallel_state import (get_tensor_model_parallel_rank,
get_tp_group, get_world_group) get_tp_group, get_world_group)
from vllm.utils import logger from vllm.utils import logger
from vllm.utils.network_utils import get_ip, make_zmq_path, make_zmq_socket
from vllm.v1.core.sched.output import SchedulerOutput from vllm.v1.core.sched.output import SchedulerOutput
import vllm_ascend.envs as envs_ascend import vllm_ascend.envs as envs_ascend
@@ -35,12 +36,6 @@ from vllm_ascend.ascend_config import get_ascend_config
from vllm_ascend.distributed.utils import (align_memory, from vllm_ascend.distributed.utils import (align_memory,
get_transfer_timeout_value, get_transfer_timeout_value,
kv_alltoall_and_rearrange) kv_alltoall_and_rearrange)
from vllm_ascend.utils import vllm_version_is
if vllm_version_is("0.11.0"):
from vllm.utils import get_ip, make_zmq_path, make_zmq_socket
else:
from vllm.utils.network_utils import get_ip, make_zmq_path, make_zmq_socket
if TYPE_CHECKING: if TYPE_CHECKING:
from vllm.attention.backends.abstract import AttentionMetadata from vllm.attention.backends.abstract import AttentionMetadata

View File

@@ -2,17 +2,11 @@ import numpy as np
import torch import torch
from vllm.attention import AttentionBackend from vllm.attention import AttentionBackend
from vllm.logger import init_logger from vllm.logger import init_logger
from vllm.utils.platform_utils import is_pin_memory_available
from vllm.v1.kv_offload.mediums import CPULoadStoreSpec, GPULoadStoreSpec from vllm.v1.kv_offload.mediums import CPULoadStoreSpec, GPULoadStoreSpec
from vllm.v1.kv_offload.worker.worker import (OffloadingHandler, from vllm.v1.kv_offload.worker.worker import (OffloadingHandler,
TransferResult, TransferSpec) TransferResult, TransferSpec)
from vllm_ascend.utils import vllm_version_is
if vllm_version_is("0.11.0"):
from vllm.utils import is_pin_memory_available
else:
from vllm.utils.platform_utils import is_pin_memory_available
logger = init_logger(__name__) logger = init_logger(__name__)

View File

@@ -349,64 +349,3 @@ class PunicaWrapperNPU(PunicaWrapperBase):
bgmv_expand(buffer, lora_b_stacked, y, indices, add_inputs=True) bgmv_expand(buffer, lora_b_stacked, y, indices, add_inputs=True)
y = y.view_as(y_org) y = y.view_as(y_org)
class PunicaWrapperNPU0110(PunicaWrapperNPU):
# NOTE: remove me when 0.11.0 id dropped
def add_lora_linear( # type: ignore[override]
self,
y: torch.Tensor,
x: torch.Tensor,
lora_a_stacked: Tuple[torch.Tensor, ...],
lora_b_stacked: Tuple[torch.Tensor, ...],
lora_bias_stacked: Optional[Tuple[torch.Tensor, ...]],
scale: float,
output_slices: Tuple[int, ...],
*,
buffer: Optional[Tuple[torch.Tensor, ...]] = None,
**kwargs) -> None:
"""
Applicable to linear-related lora.
Semantics:
for i in range(len(lora_a_stacked)):
y[i] += (
x[i].unsqueeze(0)
@ lora_a_stacked[indices[i], layer_idx, :, :]
@ lora_b_stacked[indices[i], layer_idx, :, :]
* scale
).squeeze(0)+lora_bias_stacked[i]
Args:
y (torch.Tensor): Output tensor. Will be changed in-place.
x (torch.Tensor): Input tensor
lora_a_stacked (Tuple[torch.Tensor, ...]): lora_a's weight.
lora_b_stacked (Tuple[torch.Tensor, ...]): lora_b's weight.
lora_bias_stacked (Optional[Tuple[torch.Tensor, ...]]): lora's bias.
scale (float): Scaling factor.
output_slices (Tuple[int, ...]): Every slice's size.
buffer (Optional[Tuple[torch.Tensor, ...]]): Defaults to None.
"""
assert len(lora_a_stacked) == len(lora_b_stacked) == len(output_slices)
if lora_bias_stacked is not None:
assert len(lora_bias_stacked) == len(output_slices)
y = self._apply_bias(self.token_lora_indices, y, output_slices,
lora_bias_stacked)
if buffer is None:
r = lora_b_stacked[0].size(-1)
# We set the buffer to be float32 by default, consistent with the
# triton op
buffer = tuple(
torch.zeros(
(x.size(0), r), dtype=torch.float32, device=x.device)
for _ in range(len(output_slices)))
self.add_shrink(buffer, x, lora_a_stacked, scale, **kwargs)
self.add_expand(y,
buffer,
lora_b_stacked,
None,
output_slices,
add_inputs=True,
**kwargs)

View File

@@ -29,18 +29,12 @@ from vllm.model_executor.model_loader.base_loader import BaseModelLoader
from vllm.model_executor.model_loader.default_loader import DefaultModelLoader from vllm.model_executor.model_loader.default_loader import DefaultModelLoader
from vllm.model_executor.model_loader.utils import ( from vllm.model_executor.model_loader.utils import (
initialize_model, process_weights_after_loading) initialize_model, process_weights_after_loading)
from vllm.utils.torch_utils import set_default_torch_dtype
from vllm_ascend.utils import vllm_version_is
from .interaction.elastic import ElasticServer from .interaction.elastic import ElasticServer
from .load import elastic_load from .load import elastic_load
from .utils import find_free_port, is_valid_path_prefix from .utils import find_free_port, is_valid_path_prefix
if vllm_version_is("0.11.0"):
from vllm.model_executor.model_loader.utils import set_default_torch_dtype
else:
from vllm.utils.torch_utils import set_default_torch_dtype
@register_model_loader("netloader") @register_model_loader("netloader")
class ModelNetLoaderElastic(BaseModelLoader): class ModelNetLoaderElastic(BaseModelLoader):
@@ -207,9 +201,7 @@ class ModelNetLoaderElastic(BaseModelLoader):
if model is not None and ( if model is not None and (
(self.listen_port and self.listen_port in range(1024, 65535)) or (self.listen_port and self.listen_port in range(1024, 65535)) or
(self.listen_port is None)): (self.listen_port is None)):
if vllm_version_is("0.11.0"):
from vllm.utils import get_ip
else:
from vllm.utils.network_utils import get_ip from vllm.utils.network_utils import get_ip
driver_ip = get_ip() driver_ip = get_ip()

View File

@@ -24,32 +24,16 @@ from typing import Optional
import torch import torch
from torch import nn from torch import nn
from vllm.attention import AttentionMetadata from vllm.attention import AttentionMetadata
from vllm.attention.layer import MLAAttention
from vllm.config import CacheConfig, get_current_vllm_config from vllm.config import CacheConfig, get_current_vllm_config
from vllm.distributed import get_tensor_model_parallel_world_size from vllm.distributed import get_tensor_model_parallel_world_size
from vllm.forward_context import ForwardContext, get_forward_context from vllm.forward_context import ForwardContext, get_forward_context
from vllm.model_executor.layers.mla import MLAModules from vllm.model_executor.layers.mla import (MLAModules,
MultiHeadLatentAttentionWrapper)
from vllm.model_executor.layers.quantization import QuantizationConfig from vllm.model_executor.layers.quantization import QuantizationConfig
from vllm_ascend.ascend_config import get_ascend_config
from vllm_ascend.utils import vllm_version_is
if vllm_version_is("0.11.0"):
from vllm.attention import Attention
from vllm.model_executor.layers.mla import \
MultiHeadLatentAttention as MultiHeadLatentAttentionWrapper
from vllm.utils import direct_register_custom_op
else:
from vllm.attention.layer import MLAAttention
from vllm.model_executor.layers.mla import MultiHeadLatentAttentionWrapper
from vllm.utils.torch_utils import direct_register_custom_op from vllm.utils.torch_utils import direct_register_custom_op
if vllm_version_is("0.11.0"): from vllm_ascend.ascend_config import get_ascend_config
from vllm.attention import Attention
from vllm.model_executor.layers.mla import \
MultiHeadLatentAttention as MultiHeadLatentAttentionWrapper
else:
from vllm.attention.layer import MLAAttention
from vllm.model_executor.layers.mla import MultiHeadLatentAttentionWrapper
class IndexerWrapper(nn.Module): class IndexerWrapper(nn.Module):
@@ -81,7 +65,6 @@ class IndexerWrapper(nn.Module):
return return
# TODO(whx): adapt v0.11.0 and DSA
class AscendMultiHeadLatentAttention(MultiHeadLatentAttentionWrapper): class AscendMultiHeadLatentAttention(MultiHeadLatentAttentionWrapper):
def __init__( def __init__(
@@ -119,37 +102,6 @@ class AscendMultiHeadLatentAttention(MultiHeadLatentAttentionWrapper):
ascend_indexer = IndexerWrapper(mla_modules.indexer) ascend_indexer = IndexerWrapper(mla_modules.indexer)
else: else:
ascend_indexer = None ascend_indexer = None
if vllm_version_is("0.11.0"):
self.mla_attn = Attention(
num_heads=num_heads,
head_size=self.kv_lora_rank + self.qk_rope_head_dim,
scale=scale,
num_kv_heads=1,
cache_config=cache_config,
quant_config=quant_config,
prefix=f"{prefix}.attn",
use_mla=True,
indexer=ascend_indexer,
use_sparse=mla_modules.is_sparse,
# MLA Args
q_lora_rank=self.q_lora_rank,
kv_lora_rank=self.kv_lora_rank,
qk_nope_head_dim=self.qk_nope_head_dim,
qk_rope_head_dim=self.qk_rope_head_dim,
v_head_dim=self.v_head_dim,
qk_head_dim=self.qk_head_dim,
rotary_emb=mla_modules.rotary_emb,
fused_qkv_a_proj=mla_modules.fused_qkv_a_proj,
q_b_proj=mla_modules.q_b_proj,
q_a_layernorm=mla_modules.q_a_layernorm,
q_proj=mla_modules.q_proj,
kv_a_proj_with_mqa=mla_modules.kv_a_proj_with_mqa,
kv_a_layernorm=mla_modules.kv_a_layernorm,
kv_b_proj=mla_modules.kv_b_proj,
o_proj=mla_modules.o_proj,
)
else:
self.mla_attn = MLAAttention( self.mla_attn = MLAAttention(
num_heads=num_heads, num_heads=num_heads,
scale=scale, scale=scale,

View File

@@ -40,14 +40,11 @@ from vllm.model_executor.models.qwen2_5_vl import (
Qwen2_5_VLDummyInputsBuilder, Qwen2_5_VLForConditionalGeneration, Qwen2_5_VLDummyInputsBuilder, Qwen2_5_VLForConditionalGeneration,
Qwen2_5_VLMultiModalProcessor, Qwen2_5_VLProcessingInfo) Qwen2_5_VLMultiModalProcessor, Qwen2_5_VLProcessingInfo)
from vllm.model_executor.models.utils import maybe_prefix from vllm.model_executor.models.utils import maybe_prefix
from vllm.model_executor.models.vision import conv3d_to_linear_weight
from vllm.multimodal import MULTIMODAL_REGISTRY from vllm.multimodal import MULTIMODAL_REGISTRY
from vllm_ascend.ascend_forward_context import set_ascend_forward_context from vllm_ascend.ascend_forward_context import set_ascend_forward_context
from vllm_ascend.utils import (ACL_FORMAT_FRACTAL_ND, is_enable_nz, from vllm_ascend.utils import ACL_FORMAT_FRACTAL_ND, is_enable_nz
vllm_version_is)
if not vllm_version_is("0.11.0"):
from vllm.model_executor.models.vision import conv3d_to_linear_weight
MIN_PAD_SIZE = 64 # min_size to pad weight MIN_PAD_SIZE = 64 # min_size to pad weight
MAX_PAD_SIZE = 128 # max_size to pad weight MAX_PAD_SIZE = 128 # max_size to pad weight
@@ -360,7 +357,6 @@ class AscendQwen2_5_VisionTransformer(Qwen2_5_VisionTransformer):
params_dict = dict(self.named_parameters(remove_duplicate=False)) params_dict = dict(self.named_parameters(remove_duplicate=False))
loaded_params: Set[str] = set() loaded_params: Set[str] = set()
for name, loaded_weight in weights: for name, loaded_weight in weights:
if not vllm_version_is("0.11.0"):
if name.endswith("patch_embed.proj.weight"): if name.endswith("patch_embed.proj.weight"):
loaded_weight = conv3d_to_linear_weight(loaded_weight) loaded_weight = conv3d_to_linear_weight(loaded_weight)
for (param_name, weight_name, shard_id) in stacked_params_mapping: for (param_name, weight_name, shard_id) in stacked_params_mapping:
@@ -537,9 +533,6 @@ class AscendQwen2_5_VLForConditionalGeneration(
image_embeds = image_input["image_embeds"].type(self.visual.dtype) image_embeds = image_input["image_embeds"].type(self.visual.dtype)
else: else:
pixel_values = image_input["pixel_values"].type(self.visual.dtype) pixel_values = image_input["pixel_values"].type(self.visual.dtype)
if vllm_version_is("0.11.0"):
image_embeds = self.visual(pixel_values, grid_thw=grid_thw)
else:
with set_ascend_forward_context(None, self.vllm_config): with set_ascend_forward_context(None, self.vllm_config):
image_embeds = self.visual(pixel_values, grid_thw=grid_thw) image_embeds = self.visual(pixel_values, grid_thw=grid_thw)
@@ -558,10 +551,6 @@ class AscendQwen2_5_VLForConditionalGeneration(
else: else:
pixel_values_videos = video_input["pixel_values_videos"].type( pixel_values_videos = video_input["pixel_values_videos"].type(
self.visual.dtype) self.visual.dtype)
if vllm_version_is("0.11.0"):
video_embeds = self.visual(pixel_values_videos,
grid_thw=grid_thw)
else:
with set_ascend_forward_context(None, self.vllm_config): with set_ascend_forward_context(None, self.vllm_config):
video_embeds = self.visual(pixel_values_videos, video_embeds = self.visual(pixel_values_videos,
grid_thw=grid_thw) grid_thw=grid_thw)

View File

@@ -38,13 +38,10 @@ from vllm.model_executor.models.qwen2_vl import (
Qwen2VLForConditionalGeneration, Qwen2VLMultiModalProcessor, Qwen2VLForConditionalGeneration, Qwen2VLMultiModalProcessor,
Qwen2VLProcessingInfo) Qwen2VLProcessingInfo)
from vllm.model_executor.models.utils import maybe_prefix from vllm.model_executor.models.utils import maybe_prefix
from vllm.model_executor.models.vision import conv3d_to_linear_weight
from vllm.multimodal import MULTIMODAL_REGISTRY from vllm.multimodal import MULTIMODAL_REGISTRY
from vllm_ascend.utils import (ACL_FORMAT_FRACTAL_ND, is_enable_nz, from vllm_ascend.utils import ACL_FORMAT_FRACTAL_ND, is_enable_nz
vllm_version_is)
if not vllm_version_is("0.11.0"):
from vllm.model_executor.models.vision import conv3d_to_linear_weight
MIN_PAD_SIZE = 64 # min_size to pad weight MIN_PAD_SIZE = 64 # min_size to pad weight
MAX_PAD_SIZE = 128 # max_size to pad weight MAX_PAD_SIZE = 128 # max_size to pad weight
@@ -308,7 +305,6 @@ class AscendQwen2VisionTransformer(Qwen2VisionTransformer):
loaded_params: Set[str] = set() loaded_params: Set[str] = set()
for name, loaded_weight in weights: for name, loaded_weight in weights:
if not vllm_version_is("0.11.0"):
if name.endswith("patch_embed.proj.weight"): if name.endswith("patch_embed.proj.weight"):
loaded_weight = conv3d_to_linear_weight(loaded_weight) loaded_weight = conv3d_to_linear_weight(loaded_weight)

View File

@@ -50,8 +50,6 @@ from vllm.model_executor.utils import set_weight_attrs
from vllm.transformers_utils.configs import Qwen3NextConfig from vllm.transformers_utils.configs import Qwen3NextConfig
from vllm.v1.attention.backends.gdn_attn import GDNAttentionMetadata from vllm.v1.attention.backends.gdn_attn import GDNAttentionMetadata
from vllm_ascend.utils import vllm_version_is
from vllm.model_executor.models.qwen3_next import ( # isort: skip from vllm.model_executor.models.qwen3_next import ( # isort: skip
Qwen3NextAttention, Qwen3NextDecoderLayer, Qwen3NextForCausalLM, Qwen3NextAttention, Qwen3NextDecoderLayer, Qwen3NextForCausalLM,
Qwen3NextGatedDeltaNet, Qwen3NextModel, Qwen3NextSparseMoeBlock, Qwen3NextGatedDeltaNet, Qwen3NextModel, Qwen3NextSparseMoeBlock,
@@ -202,9 +200,6 @@ class CustomQwen3NextGatedDeltaNet(Qwen3NextGatedDeltaNet, MambaBase):
spec_query_start_loc = attn_metadata.spec_query_start_loc spec_query_start_loc = attn_metadata.spec_query_start_loc
non_spec_query_start_loc = attn_metadata.non_spec_query_start_loc non_spec_query_start_loc = attn_metadata.non_spec_query_start_loc
spec_sequence_masks = attn_metadata.spec_sequence_masks spec_sequence_masks = attn_metadata.spec_sequence_masks
if vllm_version_is("0.11.0"):
spec_token_masks = attn_metadata.spec_token_masks
else:
spec_token_indx = attn_metadata.spec_token_indx spec_token_indx = attn_metadata.spec_token_indx
non_spec_token_indx = attn_metadata.non_spec_token_indx non_spec_token_indx = attn_metadata.non_spec_token_indx
spec_state_indices_tensor = attn_metadata.spec_state_indices_tensor # noqa: E501 spec_state_indices_tensor = attn_metadata.spec_state_indices_tensor # noqa: E501
@@ -221,9 +216,6 @@ class CustomQwen3NextGatedDeltaNet(Qwen3NextGatedDeltaNet, MambaBase):
# 1. Set up dimensions for reshapes later # 1. Set up dimensions for reshapes later
projected_states, _ = self.in_proj(hidden_states[:num_actual_tokens]) projected_states, _ = self.in_proj(hidden_states[:num_actual_tokens])
if vllm_version_is("0.11.0"):
if spec_token_masks is not None:
spec_token_masks = spec_token_masks[:num_actual_tokens]
projected_states_qkvz, projected_states_ba = torch.split( projected_states_qkvz, projected_states_ba = torch.split(
projected_states, projected_states,
[ [
@@ -247,10 +239,6 @@ class CustomQwen3NextGatedDeltaNet(Qwen3NextGatedDeltaNet, MambaBase):
and attn_metadata.num_decodes == 0): and attn_metadata.num_decodes == 0):
mixed_qkv_spec = mixed_qkv mixed_qkv_spec = mixed_qkv
mixed_qkv_non_spec = None mixed_qkv_non_spec = None
else:
if vllm_version_is("0.11.0"):
mixed_qkv_spec = mixed_qkv[spec_token_masks]
mixed_qkv_non_spec = mixed_qkv[~spec_token_masks]
else: else:
mixed_qkv_spec = mixed_qkv.index_select(0, spec_token_indx) mixed_qkv_spec = mixed_qkv.index_select(0, spec_token_indx)
mixed_qkv_non_spec = mixed_qkv.index_select( mixed_qkv_non_spec = mixed_qkv.index_select(
@@ -321,12 +309,6 @@ class CustomQwen3NextGatedDeltaNet(Qwen3NextGatedDeltaNet, MambaBase):
beta_spec = beta beta_spec = beta
g_non_spec = None g_non_spec = None
beta_non_spec = None beta_non_spec = None
else:
if vllm_version_is("0.11.0"):
g_spec = g[:, spec_token_masks]
beta_spec = beta[:, spec_token_masks]
g_non_spec = g[:, ~spec_token_masks]
beta_non_spec = beta[:, ~spec_token_masks]
else: else:
g_spec = g.index_select(1, spec_token_indx) g_spec = g.index_select(1, spec_token_indx)
beta_spec = beta.index_select(1, spec_token_indx) beta_spec = beta.index_select(1, spec_token_indx)
@@ -439,12 +421,7 @@ class CustomQwen3NextGatedDeltaNet(Qwen3NextGatedDeltaNet, MambaBase):
dtype=core_attn_out_non_spec.dtype, dtype=core_attn_out_non_spec.dtype,
device=core_attn_out_non_spec.device, device=core_attn_out_non_spec.device,
) )
if vllm_version_is("0.11.0"): core_attn_out.index_copy_(1, spec_token_indx, core_attn_out_spec)
core_attn_out[:, spec_token_masks] = core_attn_out_spec
core_attn_out[:, ~spec_token_masks] = core_attn_out_non_spec
else:
core_attn_out.index_copy_(1, spec_token_indx,
core_attn_out_spec)
core_attn_out.index_copy_(1, non_spec_token_indx, core_attn_out.index_copy_(1, non_spec_token_indx,
core_attn_out_non_spec) core_attn_out_non_spec)
elif spec_sequence_masks is not None: elif spec_sequence_masks is not None:

View File

@@ -19,7 +19,7 @@ from typing import Any, Callable, Optional
import torch import torch
import torch_npu import torch_npu
from vllm.config import get_current_vllm_config from vllm.config import CompilationMode, get_current_vllm_config
from vllm.distributed import (get_dp_group, get_ep_group, get_tp_group, from vllm.distributed import (get_dp_group, get_ep_group, get_tp_group,
tensor_model_parallel_all_reduce) tensor_model_parallel_all_reduce)
from vllm.forward_context import get_forward_context from vllm.forward_context import get_forward_context
@@ -28,6 +28,8 @@ from vllm.model_executor.layers.fused_moe.config import FusedMoEConfig
from vllm.model_executor.layers.fused_moe.layer import ( from vllm.model_executor.layers.fused_moe.layer import (
FusedMoE, UnquantizedFusedMoEMethod, determine_expert_map, FusedMoE, UnquantizedFusedMoEMethod, determine_expert_map,
get_compressed_expert_map) get_compressed_expert_map)
from vllm.model_executor.layers.fused_moe.shared_fused_moe import \
SharedFusedMoE
from vllm_ascend.ascend_config import get_ascend_config from vllm_ascend.ascend_config import get_ascend_config
from vllm_ascend.ascend_forward_context import MoECommType from vllm_ascend.ascend_forward_context import MoECommType
@@ -44,17 +46,7 @@ from vllm_ascend.quantization.w8a8_dynamic import \
from vllm_ascend.utils import (ACL_FORMAT_FRACTAL_NZ, enable_sp, is_310p, from vllm_ascend.utils import (ACL_FORMAT_FRACTAL_NZ, enable_sp, is_310p,
is_enable_nz, npu_stream_switch, is_enable_nz, npu_stream_switch,
shared_expert_dp_enabled, shared_expert_dp_enabled,
shared_experts_calculation_stream, shared_experts_calculation_stream)
vllm_version_is)
if vllm_version_is("0.11.0"):
from vllm.config import CompilationLevel
from vllm.model_executor.layers.shared_fused_moe import SharedFusedMoE # type: ignore # isort:skip
else:
from vllm.config import CompilationMode
from vllm.model_executor.layers.fused_moe.shared_fused_moe import \
SharedFusedMoE
class AscendUnquantizedFusedMoEMethod(UnquantizedFusedMoEMethod): class AscendUnquantizedFusedMoEMethod(UnquantizedFusedMoEMethod):
@@ -73,16 +65,9 @@ class AscendUnquantizedFusedMoEMethod(UnquantizedFusedMoEMethod):
if ascend_config.torchair_graph_config.enabled: if ascend_config.torchair_graph_config.enabled:
self.use_aclgraph = False self.use_aclgraph = False
else: else:
if vllm_version_is("0.11.0"): self.use_aclgraph = (vllm_config.compilation_config.mode
self.use_aclgraph = ( == CompilationMode.VLLM_COMPILE and
vllm_config.compilation_config.level not vllm_config.model_config.enforce_eager)
== CompilationLevel.PIECEWISE
and not vllm_config.model_config.enforce_eager)
else:
self.use_aclgraph = (
vllm_config.compilation_config.mode
== CompilationMode.VLLM_COMPILE
and not vllm_config.model_config.enforce_eager)
self.transpose = True self.transpose = True
@@ -209,10 +194,6 @@ class AscendFusedMoE(FusedMoE):
dtype=vllm_config.model_config.dtype) dtype=vllm_config.model_config.dtype)
# init moe. # init moe.
if vllm_version_is("0.11.0"):
self.local_num_experts, self.expert_map = determine_expert_map(
self.ep_size, self.ep_rank, self.global_num_experts)
else:
self.local_num_experts, self.expert_map, _ = determine_expert_map( self.local_num_experts, self.expert_map, _ = determine_expert_map(
self.ep_size, self.ep_rank, self.global_num_experts) self.ep_size, self.ep_rank, self.global_num_experts)
# static eplb initializing with expert_map_path # static eplb initializing with expert_map_path

View File

@@ -7,17 +7,12 @@ from vllm.distributed import (get_dp_group, get_ep_group,
tensor_model_parallel_all_reduce, tensor_model_parallel_all_reduce,
tensor_model_parallel_reduce_scatter) tensor_model_parallel_reduce_scatter)
from vllm.forward_context import get_forward_context from vllm.forward_context import get_forward_context
from vllm.utils.torch_utils import direct_register_custom_op
import vllm_ascend.envs as envs_ascend import vllm_ascend.envs as envs_ascend
from vllm_ascend.ascend_forward_context import MoECommType from vllm_ascend.ascend_forward_context import MoECommType
from vllm_ascend.ops.weight_prefetch import maybe_npu_prefetch from vllm_ascend.ops.weight_prefetch import maybe_npu_prefetch
from vllm_ascend.utils import (npu_stream_switch, prefetch_stream, from vllm_ascend.utils import npu_stream_switch, prefetch_stream
vllm_version_is)
if vllm_version_is("0.11.0"):
from vllm.utils import direct_register_custom_op
else:
from vllm.utils.torch_utils import direct_register_custom_op
def _maybe_all_gather_and_maybe_unpad_impl( def _maybe_all_gather_and_maybe_unpad_impl(

View File

@@ -3,22 +3,9 @@ import vllm.model_executor.models.config
from vllm.logger import init_logger from vllm.logger import init_logger
from vllm.model_executor.models import ModelRegistry from vllm.model_executor.models import ModelRegistry
from vllm.model_executor.models.config import MambaModelConfig from vllm.model_executor.models.config import MambaModelConfig
from vllm_ascend.utils import vllm_version_is
if vllm_version_is("0.11.0"):
from vllm.utils import cdiv
else:
from vllm.utils.math_utils import cdiv from vllm.utils.math_utils import cdiv
from vllm.v1.kv_cache_interface import FullAttentionSpec, MambaSpec
from vllm_ascend.utils import vllm_version_is
if vllm_version_is("0.11.0"):
from vllm.utils import STR_DTYPE_TO_TORCH_DTYPE
else:
from vllm.utils.torch_utils import STR_DTYPE_TO_TORCH_DTYPE from vllm.utils.torch_utils import STR_DTYPE_TO_TORCH_DTYPE
from vllm.v1.kv_cache_interface import FullAttentionSpec, MambaSpec
@classmethod @classmethod

View File

@@ -8,21 +8,14 @@ import vllm.v1.executor.multiproc_executor
from vllm import envs from vllm import envs
from vllm.config import VllmConfig from vllm.config import VllmConfig
from vllm.distributed.device_communicators.shm_broadcast import MessageQueue from vllm.distributed.device_communicators.shm_broadcast import MessageQueue
from vllm.utils.network_utils import (get_distributed_init_method,
get_loopback_ip, get_open_port)
from vllm.utils.system_utils import get_mp_context
from vllm.v1.executor.abstract import FailureCallback from vllm.v1.executor.abstract import FailureCallback
from vllm.v1.executor.multiproc_executor import ( from vllm.v1.executor.multiproc_executor import (
MultiprocExecutor, UnreadyWorkerProcHandle, WorkerProc, MultiprocExecutor, UnreadyWorkerProcHandle, WorkerProc,
set_multiprocessing_worker_envs) set_multiprocessing_worker_envs)
from vllm_ascend.utils import vllm_version_is
if vllm_version_is("0.11.0"):
from vllm.utils import (get_distributed_init_method, get_loopback_ip,
get_mp_context, get_open_port)
else:
from vllm.utils.network_utils import (get_distributed_init_method,
get_loopback_ip, get_open_port)
from vllm.utils.system_utils import get_mp_context
class AscendMultiprocExecutor(MultiprocExecutor): class AscendMultiprocExecutor(MultiprocExecutor):
supports_pp: bool = True supports_pp: bool = True

View File

@@ -28,9 +28,3 @@ import vllm_ascend.patch.worker.patch_roberta # noqa
import vllm_ascend.patch.worker.patch_weight_loader # noqa import vllm_ascend.patch.worker.patch_weight_loader # noqa
import vllm_ascend.patch.worker.patch_multimodal_merge # noqa import vllm_ascend.patch.worker.patch_multimodal_merge # noqa
import vllm_ascend.patch.worker.patch_minicpm # noqa import vllm_ascend.patch.worker.patch_minicpm # noqa
from vllm_ascend.utils import vllm_version_is
if vllm_version_is("0.11.0"):
import vllm_ascend.patch.worker.patch_deepseek_mtp # noqa
import vllm_ascend.patch.worker.patch_deepseek_v3_2 # noqa

View File

@@ -1,94 +0,0 @@
import torch
import torch.nn as nn
from transformers import PretrainedConfig
from vllm.config import VllmConfig
from vllm.model_executor.layers.layernorm import RMSNorm
from vllm.model_executor.layers.quantization import QuantizationConfig
from vllm.model_executor.layers.vocab_parallel_embedding import ParallelLMHead
from vllm.model_executor.models.deepseek_mtp import \
DeepSeekMultiTokenPredictorLayer
from vllm.model_executor.models.deepseek_v2 import DeepseekV2DecoderLayer
from vllm.model_executor.models.utils import maybe_prefix
from vllm_ascend.utils import vllm_version_is
if vllm_version_is("0.11.0"):
from vllm.compilation.decorators import support_torch_compile
from vllm.model_executor.models.deepseek_mtp import DeepSeekMTP
class SharedHead(nn.Module):
    """Final RMS norm plus LM head owned by an MTP predictor layer.

    ``forward`` applies only the normalisation. The ``head`` module is
    constructed here but is never called by this class.
    """

    def __init__(
        self,
        config: PretrainedConfig,
        prefix: str,
        quant_config: QuantizationConfig | None = None,
    ) -> None:
        """Create the norm and the ``ParallelLMHead``.

        Args:
            config: Model config; ``hidden_size``, ``rms_norm_eps`` and
                ``vocab_size`` are read from it.
            prefix: Parent module prefix. The head is created with
                ``maybe_prefix(prefix, "head")``.
            quant_config: Quantization config forwarded to the head, or
                ``None`` when no config is supplied.
        """
        super().__init__()
        self.norm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
        self.head = ParallelLMHead(
            config.vocab_size,
            config.hidden_size,
            quant_config=quant_config,
            prefix=maybe_prefix(prefix, "head"),
        )

    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
        """Return ``hidden_states`` after RMS normalisation (no projection)."""
        return self.norm(hidden_states)
def predictor_init(self, vllm_config: VllmConfig, prefix: str) -> None:
    """Constructor installed on ``DeepSeekMultiTokenPredictorLayer``.

    Builds the two input norms, the projection that fuses them, the shared
    head and the decoder block. The decoder block is always given ``None``
    as its top-k indices buffer.

    Args:
        vllm_config: Supplies the HF model config and the quantization config.
        prefix: Module prefix passed to the shared head and the decoder block.
    """
    nn.Module.__init__(self)

    hf_config = vllm_config.model_config.hf_config
    quantization = vllm_config.quant_config
    hidden_size = hf_config.hidden_size
    norm_eps = hf_config.rms_norm_eps

    self.enorm = RMSNorm(hidden_size, eps=norm_eps)
    self.hnorm = RMSNorm(hidden_size, eps=norm_eps)
    # Input is twice the hidden size; output is the hidden size.
    self.eh_proj = nn.Linear(hidden_size * 2, hidden_size, bias=False)
    self.shared_head = SharedHead(
        config=hf_config,
        prefix=prefix,
        quant_config=quantization,
    )
    # We don't need topk_indices_buffer in Ascend, so pass None for it.
    self.mtp_block = DeepseekV2DecoderLayer(vllm_config, prefix, None)
def predictor_forward(
self,
input_ids: torch.Tensor,
positions: torch.Tensor,
previous_hidden_states: torch.Tensor,
inputs_embeds: torch.Tensor | None = None,
spec_step_index: int = 0,
) -> torch.Tensor:
assert inputs_embeds is not None
# masking inputs at position 0, as not needed by MTP
inputs_embeds = torch.where(positions.unsqueeze(-1) == 0, 0, inputs_embeds)
inputs_embeds = self.enorm(inputs_embeds)
previous_hidden_states = self.hnorm(previous_hidden_states)
hidden_states = self.eh_proj(
torch.cat([inputs_embeds, previous_hidden_states], dim=-1))
hidden_states, residual = self.mtp_block(positions=positions,
hidden_states=hidden_states,
residual=None)
hidden_states = residual + hidden_states
return hidden_states
# This subclass exists only for aclgraph support, which is not supported in
# vLLM 0.11.0.
@support_torch_compile
class AscendDeepSeekMTP(DeepSeekMTP):
    """``DeepSeekMTP`` subclass decorated with ``support_torch_compile``."""

    def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
        """Forward both keyword-only arguments unchanged to ``DeepSeekMTP``."""
        super().__init__(prefix=prefix, vllm_config=vllm_config)
# Replace the predictor layer's constructor with predictor_init.
DeepSeekMultiTokenPredictorLayer.__init__ = predictor_init
# The forward override is installed only when vllm_version_is("0.11.0") holds.
if vllm_version_is("0.11.0"):
    DeepSeekMultiTokenPredictorLayer.forward = predictor_forward

View File

@@ -1,108 +0,0 @@
#
# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved.
# This file is a part of the vllm-ascend project.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
from itertools import islice
from typing import Optional, Union
import torch
import vllm.model_executor.models.deepseek_v2
from torch import nn
from vllm.compilation.decorators import support_torch_compile
from vllm.config import VllmConfig
from vllm.distributed import get_pp_group
from vllm.model_executor.layers.layernorm import RMSNorm
from vllm.model_executor.layers.vocab_parallel_embedding import \
VocabParallelEmbedding
from vllm.model_executor.models.deepseek_v2 import DeepseekV2DecoderLayer
from vllm.model_executor.models.utils import (
PPMissingLayer, make_empty_intermediate_tensors_factory, make_layers)
from vllm.sequence import IntermediateTensors
@support_torch_compile
class DeepseekV2Model(nn.Module):
    """DeepSeek-V2 decoder stack whose layers get no top-k indices buffer.

    The token embedding is built only on the first pipeline-parallel rank and
    the final norm only on the last one. Other ranks hold ``PPMissingLayer``
    placeholders in those slots.
    """

    fall_back_to_pt_during_load = False

    def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
        """Build the embedding, decoder layers and final norm for this rank.

        Args:
            vllm_config: Supplies the HF model config and quantization config.
            prefix: Module prefix used to name the embedding and the layers.
        """
        super().__init__()

        hf_config = vllm_config.model_config.hf_config
        quant_config = vllm_config.quant_config

        self.config = hf_config
        self.vocab_size = hf_config.vocab_size
        self.is_v32 = hasattr(hf_config, "index_topk")

        def build_layer(prefix: str):
            # Every decoder layer is given None as its topk_indices_buffer.
            return DeepseekV2DecoderLayer(vllm_config, prefix, None)

        if not get_pp_group().is_first_rank:
            self.embed_tokens = PPMissingLayer()
        else:
            self.embed_tokens = VocabParallelEmbedding(
                hf_config.vocab_size,
                hf_config.hidden_size,
                quant_config=quant_config,
                prefix=f"{prefix}.embed_tokens",
            )

        self.start_layer, self.end_layer, self.layers = make_layers(
            hf_config.num_hidden_layers,
            build_layer,
            prefix=f"{prefix}.layers",
        )

        if not get_pp_group().is_last_rank:
            self.norm = PPMissingLayer()
        else:
            self.norm = RMSNorm(hf_config.hidden_size,
                                eps=hf_config.rms_norm_eps)

        self.make_empty_intermediate_tensors = (
            make_empty_intermediate_tensors_factory(
                ["hidden_states", "residual"], hf_config.hidden_size))

    def get_input_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor:
        """Return the result of calling ``embed_tokens`` on ``input_ids``."""
        return self.embed_tokens(input_ids)

    def forward(
        self,
        input_ids: torch.Tensor,
        positions: torch.Tensor,
        intermediate_tensors: Optional[IntermediateTensors],
        inputs_embeds: Optional[torch.Tensor] = None,
    ) -> Union[torch.Tensor, IntermediateTensors]:
        """Run the decoder layers from ``start_layer`` to ``end_layer``.

        On the first pipeline rank the input is ``inputs_embeds`` when given,
        otherwise the embedding of ``input_ids``. On other ranks the hidden
        states and residual are read from ``intermediate_tensors``, which must
        not be ``None``.

        Returns:
            The normalised hidden states on the last pipeline rank, otherwise
            an ``IntermediateTensors`` holding ``hidden_states`` and
            ``residual``.
        """
        if not get_pp_group().is_first_rank:
            assert intermediate_tensors is not None
            hidden_states = intermediate_tensors["hidden_states"]
            residual = intermediate_tensors["residual"]
        else:
            if inputs_embeds is None:
                hidden_states = self.get_input_embeddings(input_ids)
            else:
                hidden_states = inputs_embeds
            residual = None

        for decoder_layer in islice(self.layers, self.start_layer,
                                    self.end_layer):
            hidden_states, residual = decoder_layer(positions, hidden_states,
                                                    residual)

        if get_pp_group().is_last_rank:
            hidden_states, _ = self.norm(hidden_states, residual)
            return hidden_states

        return IntermediateTensors({
            "hidden_states": hidden_states,
            "residual": residual
        })
# Rebind vLLM's deepseek_v2.DeepseekV2Model to the class defined in this file.
vllm.model_executor.models.deepseek_v2.DeepseekV2Model = DeepseekV2Model

View File

@@ -6,16 +6,11 @@ import vllm.model_executor.layers.mamba.ops.causal_conv1d
from vllm_ascend.ops.casual_conv1d import (causal_conv1d_fn, from vllm_ascend.ops.casual_conv1d import (causal_conv1d_fn,
causal_conv1d_update_npu) causal_conv1d_update_npu)
from vllm_ascend.ops.fla import LayerNormFn, torch_chunk_gated_delta_rule from vllm_ascend.ops.fla import LayerNormFn, torch_chunk_gated_delta_rule
from vllm_ascend.ops.sigmoid_gating import ( from vllm_ascend.ops.sigmoid_gating import \
fused_recurrent_gated_delta_rule_fwd_kernel, fused_recurrent_gated_delta_rule_fwd_kernel
fused_recurrent_gated_delta_rule_fwd_kernel_0_11_0)
from vllm_ascend.utils import vllm_version_is
vllm.model_executor.layers.mamba.ops.causal_conv1d.causal_conv1d_update = causal_conv1d_update_npu vllm.model_executor.layers.mamba.ops.causal_conv1d.causal_conv1d_update = causal_conv1d_update_npu
vllm.model_executor.layers.mamba.ops.causal_conv1d.causal_conv1d_fn = causal_conv1d_fn vllm.model_executor.layers.mamba.ops.causal_conv1d.causal_conv1d_fn = causal_conv1d_fn
if vllm_version_is('0.11.0'):
vllm.model_executor.layers.fla.ops.fused_recurrent.fused_recurrent_gated_delta_rule_fwd_kernel = fused_recurrent_gated_delta_rule_fwd_kernel_0_11_0
else:
vllm.model_executor.layers.fla.ops.fused_recurrent.fused_recurrent_gated_delta_rule_fwd_kernel = fused_recurrent_gated_delta_rule_fwd_kernel vllm.model_executor.layers.fla.ops.fused_recurrent.fused_recurrent_gated_delta_rule_fwd_kernel = fused_recurrent_gated_delta_rule_fwd_kernel
vllm.model_executor.layers.fla.ops.layernorm_guard.LayerNormFn = LayerNormFn vllm.model_executor.layers.fla.ops.layernorm_guard.LayerNormFn = LayerNormFn
vllm.model_executor.layers.fla.ops.chunk.chunk_gated_delta_rule = torch_chunk_gated_delta_rule vllm.model_executor.layers.fla.ops.chunk.chunk_gated_delta_rule = torch_chunk_gated_delta_rule

View File

@@ -3,12 +3,6 @@ from torch.nn.parameter import Parameter
from vllm.logger import init_logger from vllm.logger import init_logger
from vllm.model_executor.layers.linear import UnquantizedLinearMethod from vllm.model_executor.layers.linear import UnquantizedLinearMethod
from vllm.model_executor.utils import set_weight_attrs from vllm.model_executor.utils import set_weight_attrs
from vllm_ascend.utils import vllm_version_is
if vllm_version_is("0.11.0"):
from vllm.utils import GiB_bytes
else:
from vllm.utils.mem_constants import GiB_bytes from vllm.utils.mem_constants import GiB_bytes
logger = init_logger(__name__) logger = init_logger(__name__)

View File

@@ -34,7 +34,7 @@ from vllm_ascend.utils import (ASCEND_QUANTIZATION_METHOD, enable_sp, is_310p,
is_vl_model, prefill_context_parallel_enable, is_vl_model, prefill_context_parallel_enable,
update_aclgraph_sizes, update_aclgraph_sizes,
update_cudagraph_capture_sizes, update_cudagraph_capture_sizes,
update_default_aclgraph_sizes, vllm_version_is) update_default_aclgraph_sizes)
if TYPE_CHECKING: if TYPE_CHECKING:
from vllm.config import ModelConfig, VllmConfig from vllm.config import ModelConfig, VllmConfig
@@ -120,9 +120,6 @@ class NPUPlatform(Platform):
# initialize ascend config from vllm additional_config # initialize ascend config from vllm additional_config
ascend_config = init_ascend_config(vllm_config) ascend_config = init_ascend_config(vllm_config)
if vllm_version_is("0.11.0"):
from vllm.config import CompilationLevel
else:
from vllm.config import CompilationMode # noqa: E402 from vllm.config import CompilationMode # noqa: E402
compilation_config = vllm_config.compilation_config compilation_config = vllm_config.compilation_config
@@ -149,22 +146,10 @@ class NPUPlatform(Platform):
from vllm.config.compilation import CUDAGraphMode from vllm.config.compilation import CUDAGraphMode
if enforce_eager: if enforce_eager:
logger.info("Compilation disabled, using eager mode by default") logger.info("Compilation disabled, using eager mode by default")
if vllm_version_is("0.11.0"):
compilation_config.level = CompilationLevel.NO_COMPILATION
else:
compilation_config.mode = CompilationMode.NONE compilation_config.mode = CompilationMode.NONE
compilation_config.cudagraph_num_of_warmups = 1 compilation_config.cudagraph_num_of_warmups = 1
if vllm_version_is("0.11.0"):
if compilation_config.level not in [
CompilationLevel.NO_COMPILATION, CompilationLevel.PIECEWISE
]:
logger.warning(
"NPU does not support %s compilation level. Setting CUDAGraphMode to NONE",
compilation_config.level)
compilation_config.cudagraph_mode = CUDAGraphMode.NONE
else:
if compilation_config.mode not in [ if compilation_config.mode not in [
CompilationMode.NONE, CompilationMode.VLLM_COMPILE CompilationMode.NONE, CompilationMode.VLLM_COMPILE
]: ]:
@@ -211,59 +196,12 @@ class NPUPlatform(Platform):
f"{vllm_config.parallel_config.tensor_parallel_size}") f"{vllm_config.parallel_config.tensor_parallel_size}")
if len(sp_aclgraph_sizes) != len(original_sizes): if len(sp_aclgraph_sizes) != len(original_sizes):
compilation_config.cudagraph_capture_sizes = sp_aclgraph_sizes compilation_config.cudagraph_capture_sizes = sp_aclgraph_sizes
if vllm_version_is("0.11.0"): update_cudagraph_capture_sizes(vllm_config, sp_aclgraph_sizes)
compilation_config.init_with_cudagraph_sizes(
sp_aclgraph_sizes)
else:
update_cudagraph_capture_sizes(vllm_config,
sp_aclgraph_sizes)
# TODO: Full graph is fully supported later, and the default value will be set to full graph. # TODO: Full graph is fully supported later, and the default value will be set to full graph.
if compilation_config.cudagraph_mode == CUDAGraphMode.FULL_AND_PIECEWISE: if compilation_config.cudagraph_mode == CUDAGraphMode.FULL_AND_PIECEWISE:
compilation_config.cudagraph_mode = CUDAGraphMode.PIECEWISE compilation_config.cudagraph_mode = CUDAGraphMode.PIECEWISE
if vllm_version_is("0.11.0"):
if compilation_config.cudagraph_mode == CUDAGraphMode.NONE:
compilation_config.level = CompilationLevel.NO_COMPILATION
elif compilation_config.cudagraph_mode == CUDAGraphMode.PIECEWISE:
logger.info(
"PIECEWISE compilation enabled on NPU. use_inductor not supported - "
"using only ACL Graph mode")
assert compilation_config.level == CompilationLevel.PIECEWISE, \
"When enabling piecewise aclgraph, please make sure compilation_config.level == CompilationLevel.PIECEWISE and compilation_config.cudagraph_mode == CUDAGraphMode.PIECEWISE"
compilation_config.set_splitting_ops_for_v1()
compilation_config.use_inductor = False
compilation_config.splitting_ops.extend([
"vllm.unified_ascend_attention_with_output",
"vllm.mla_forward"
])
update_aclgraph_sizes(vllm_config)
elif compilation_config.cudagraph_mode == CUDAGraphMode.FULL_DECODE_ONLY or\
compilation_config.cudagraph_mode == CUDAGraphMode.FULL:
logger.info(
"FULL_DECODE_ONLY compilation enabled on NPU. use_inductor not supported - "
"using only ACL Graph mode")
compilation_config.use_inductor = False
warning_message = """\033[91m
**********************************************************************************
* WARNING: You have enabled the *full graph* feature.
* This is an early experimental stage and may involve various unknown issues.
* A known problem is that capturing too many batch sizes can lead to OOM
* (Out of Memory) errors or inference hangs. If you encounter such issues,
* consider reducing `gpu_memory_utilization` or manually specifying a smaller
* batch size for graph capture.
* For more details, please refer to:
* https://docs.vllm.ai/en/stable/configuration/conserving_memory.html#reduce-cuda-graphs
**********************************************************************************\033[0m
"""
logger.warning(warning_message)
else:
logger.info(
"%s cudagraph_mode is not support on NPU. falling back to NONE",
compilation_config.cudagraph_mode)
compilation_config.cudagraph_mode = CUDAGraphMode.NONE
compilation_config.level = CompilationLevel.NO_COMPILATION
else:
if compilation_config.cudagraph_mode == CUDAGraphMode.NONE: if compilation_config.cudagraph_mode == CUDAGraphMode.NONE:
compilation_config.mode = CompilationMode.NONE compilation_config.mode = CompilationMode.NONE
elif compilation_config.cudagraph_mode == CUDAGraphMode.PIECEWISE: elif compilation_config.cudagraph_mode == CUDAGraphMode.PIECEWISE:
@@ -315,9 +253,6 @@ class NPUPlatform(Platform):
if parallel_config and parallel_config.worker_cls == "auto": if parallel_config and parallel_config.worker_cls == "auto":
# TODO: this is a tricky way to disable `use_sequence_parallel_moe` in vllm. # TODO: this is a tricky way to disable `use_sequence_parallel_moe` in vllm.
if vllm_version_is("0.11.0"):
os.environ["VLLM_ALL2ALL_BACKEND"] = "flashinfer_all2allv"
else:
parallel_config.all2all_backend = "flashinfer_all2allv" parallel_config.all2all_backend = "flashinfer_all2allv"
if ascend_config.torchair_graph_config.enabled or ascend_config.enable_shared_expert_dp: if ascend_config.torchair_graph_config.enabled or ascend_config.enable_shared_expert_dp:
parallel_config.worker_cls = "vllm_ascend.torchair.torchair_worker.NPUTorchairWorker" parallel_config.worker_cls = "vllm_ascend.torchair.torchair_worker.NPUTorchairWorker"
@@ -443,9 +378,6 @@ class NPUPlatform(Platform):
@classmethod @classmethod
def get_punica_wrapper(cls) -> str: def get_punica_wrapper(cls) -> str:
if vllm_version_is("0.11.0"):
return "vllm_ascend.lora.punica_npu.PunicaWrapperNPU0110"
else:
return "vllm_ascend.lora.punica_npu.PunicaWrapperNPU" return "vllm_ascend.lora.punica_npu.PunicaWrapperNPU"
@classmethod @classmethod

View File

@@ -19,20 +19,14 @@ from typing import Any, Callable, Dict, Optional, Tuple, Union
import torch import torch
import torch_npu import torch_npu
from vllm.config import get_current_vllm_config from vllm.config import CompilationMode, get_current_vllm_config
from vllm.distributed import get_ep_group from vllm.distributed import get_ep_group
from vllm.forward_context import get_forward_context from vllm.forward_context import get_forward_context
from vllm_ascend.ascend_config import get_ascend_config from vllm_ascend.ascend_config import get_ascend_config
from vllm_ascend.distributed.parallel_state import get_mc2_group from vllm_ascend.distributed.parallel_state import get_mc2_group
from vllm_ascend.ops.fused_moe.experts_selector import select_experts from vllm_ascend.ops.fused_moe.experts_selector import select_experts
from vllm_ascend.utils import (ACL_FORMAT_FRACTAL_NZ, is_enable_nz, from vllm_ascend.utils import ACL_FORMAT_FRACTAL_NZ, is_enable_nz
vllm_version_is)
if vllm_version_is("0.11.0"):
from vllm.config import CompilationLevel
else:
from vllm.config import CompilationMode
class AscendW8A8DynamicLinearMethod: class AscendW8A8DynamicLinearMethod:
@@ -129,16 +123,8 @@ class AscendW8A8DynamicFusedMoEMethod:
vllm_config = get_current_vllm_config() vllm_config = get_current_vllm_config()
ascend_config = get_ascend_config() ascend_config = get_ascend_config()
if vllm_version_is("0.11.0"):
self.use_aclgraph = ( self.use_aclgraph = (
vllm_config.compilation_config.level vllm_config.compilation_config.mode == CompilationMode.VLLM_COMPILE
== CompilationLevel.PIECEWISE
and not vllm_config.model_config.enforce_eager
and not ascend_config.torchair_graph_config.enabled)
else:
self.use_aclgraph = (
vllm_config.compilation_config.mode
== CompilationMode.VLLM_COMPILE
and not vllm_config.model_config.enforce_eager and not vllm_config.model_config.enforce_eager
and not ascend_config.torchair_graph_config.enabled) and not ascend_config.torchair_graph_config.enabled)

View File

@@ -6,16 +6,10 @@ import torch.nn as nn
import vllm.v1.sample.rejection_sampler as rs import vllm.v1.sample.rejection_sampler as rs
from vllm.v1.sample.metadata import SamplingMetadata from vllm.v1.sample.metadata import SamplingMetadata
from vllm.v1.sample.rejection_sampler import (RejectionSampler, from vllm.v1.sample.rejection_sampler import (RejectionSampler,
apply_sampling_constraints,
generate_uniform_probs) generate_uniform_probs)
from vllm.v1.spec_decode.metadata import SpecDecodeMetadata from vllm.v1.spec_decode.metadata import SpecDecodeMetadata
from vllm_ascend.utils import vllm_version_is
if vllm_version_is("0.11.0"):
from vllm.v1.sample.rejection_sampler import compute_probs
else:
from vllm.v1.sample.rejection_sampler import apply_sampling_constraints
PLACEHOLDER_TOKEN_ID = -1 PLACEHOLDER_TOKEN_ID = -1
GREEDY_TEMPERATURE = -1 GREEDY_TEMPERATURE = -1
# Maximum number of speculative draft tokens allowed per request in a single # Maximum number of speculative draft tokens allowed per request in a single
@@ -89,13 +83,6 @@ class AscendRejectionSampler(RejectionSampler, nn.Module):
# [num_tokens, vocab_size] # [num_tokens, vocab_size]
# NOTE(woosuk): `target_logits` can be updated in place inside the # NOTE(woosuk): `target_logits` can be updated in place inside the
# `compute_probs` function. # `compute_probs` function.
if vllm_version_is("0.11.0"):
target_probs = compute_probs(
target_logits,
metadata.cu_num_draft_tokens,
sampling_metadata,
)
else:
target_logits = apply_sampling_constraints( target_logits = apply_sampling_constraints(
target_logits, target_logits,
metadata.cu_num_draft_tokens, metadata.cu_num_draft_tokens,

View File

@@ -5,13 +5,15 @@ import numpy as np
import torch import torch
import torch.nn as nn import torch.nn as nn
from vllm.attention.layer import Attention from vllm.attention.layer import Attention
from vllm.config import CUDAGraphMode, VllmConfig, get_layers_from_vllm_config from vllm.config import (CompilationMode, CUDAGraphMode, VllmConfig,
get_layers_from_vllm_config)
from vllm.distributed.parallel_state import get_pp_group from vllm.distributed.parallel_state import get_pp_group
from vllm.logger import logger from vllm.logger import logger
from vllm.model_executor.layers.attention_layer_base import AttentionLayerBase from vllm.model_executor.layers.attention_layer_base import AttentionLayerBase
from vllm.model_executor.model_loader import get_model from vllm.model_executor.model_loader import get_model
from vllm.model_executor.models import supports_multimodal from vllm.model_executor.models import supports_multimodal
from vllm.model_executor.models.llama_eagle3 import Eagle3LlamaForCausalLM from vllm.model_executor.models.llama_eagle3 import Eagle3LlamaForCausalLM
from vllm.utils.platform_utils import is_pin_memory_available
from vllm.v1.core.sched.output import SchedulerOutput from vllm.v1.core.sched.output import SchedulerOutput
from vllm.v1.sample.metadata import SamplingMetadata from vllm.v1.sample.metadata import SamplingMetadata
from vllm.v1.spec_decode.metadata import SpecDecodeMetadata from vllm.v1.spec_decode.metadata import SpecDecodeMetadata
@@ -22,14 +24,6 @@ from vllm_ascend.attention.attention_v1 import (AscendAttentionState,
AscendMetadata) AscendMetadata)
from vllm_ascend.attention.utils import AscendCommonAttentionMetadata from vllm_ascend.attention.utils import AscendCommonAttentionMetadata
from vllm_ascend.spec_decode.interface import Proposer, SpecDcodeType from vllm_ascend.spec_decode.interface import Proposer, SpecDcodeType
from vllm_ascend.utils import vllm_version_is
if vllm_version_is("0.11.0"):
from vllm.config import CompilationLevel
from vllm.utils import is_pin_memory_available
else:
from vllm.config import CompilationMode
from vllm.utils.platform_utils import is_pin_memory_available
PADDING_SLOT_ID = -1 PADDING_SLOT_ID = -1
@@ -52,16 +46,9 @@ class EagleProposer(Proposer):
self.hidden_size = vllm_config.speculative_config.draft_model_config.get_hidden_size( self.hidden_size = vllm_config.speculative_config.draft_model_config.get_hidden_size(
) )
if vllm_version_is("0.11.0"): self.use_cuda_graph = (self.vllm_config.compilation_config.mode
self.use_cuda_graph = ( == CompilationMode.VLLM_COMPILE and
self.vllm_config.compilation_config.level not self.vllm_config.model_config.enforce_eager)
== CompilationLevel.PIECEWISE
and not self.vllm_config.model_config.enforce_eager)
else:
self.use_cuda_graph = (
self.vllm_config.compilation_config.mode
== CompilationMode.VLLM_COMPILE
and not self.vllm_config.model_config.enforce_eager)
self.cudagraph_batch_sizes = list( self.cudagraph_batch_sizes = list(
reversed( reversed(

View File

@@ -15,14 +15,7 @@ from vllm.model_executor.model_loader.utils import \
process_weights_after_loading process_weights_after_loading
from vllm.model_executor.models.deepseek_v2 import DeepseekV32IndexerCache from vllm.model_executor.models.deepseek_v2 import DeepseekV32IndexerCache
from vllm.model_executor.models.llama_eagle3 import Eagle3LlamaForCausalLM from vllm.model_executor.models.llama_eagle3 import Eagle3LlamaForCausalLM
from vllm_ascend.utils import vllm_version_is
if vllm_version_is("0.11.0"):
from vllm.utils import cdiv
else:
from vllm.utils.math_utils import cdiv from vllm.utils.math_utils import cdiv
from vllm.v1.attention.backends.utils import (AttentionMetadataBuilder, from vllm.v1.attention.backends.utils import (AttentionMetadataBuilder,
CommonAttentionMetadata) CommonAttentionMetadata)
from vllm.v1.core.sched.output import SchedulerOutput from vllm.v1.core.sched.output import SchedulerOutput
@@ -39,16 +32,11 @@ from vllm_ascend.compilation.acl_graph import (ACLGraphWrapper,
update_mla_attn_params) update_mla_attn_params)
from vllm_ascend.spec_decode.interface import Proposer, SpecDcodeType from vllm_ascend.spec_decode.interface import Proposer, SpecDcodeType
from vllm_ascend.utils import (ProfileExecuteDuration, lmhead_tp_enable, from vllm_ascend.utils import (ProfileExecuteDuration, lmhead_tp_enable,
prefill_context_parallel_enable, prefill_context_parallel_enable)
vllm_version_is)
if prefill_context_parallel_enable(): if prefill_context_parallel_enable():
from vllm.distributed import get_pcp_group from vllm.distributed import get_pcp_group
if vllm_version_is("0.11.0"):
from vllm.model_executor.model_loader.utils import set_default_torch_dtype
from vllm.utils import is_pin_memory_available
else:
from vllm.utils.platform_utils import is_pin_memory_available from vllm.utils.platform_utils import is_pin_memory_available
from vllm.utils.torch_utils import set_default_torch_dtype from vllm.utils.torch_utils import set_default_torch_dtype
@@ -56,14 +44,9 @@ logger = init_logger(__name__)
PADDING_SLOT_ID = -1 PADDING_SLOT_ID = -1
_deepseek_mtp_path = "vllm.model_executor.models.deepseek_mtp"
_deepseek_mtp_model = "DeepSeekMTP"
if vllm_version_is("0.11.0"):
_deepseek_mtp_path = "vllm_ascend.patch.worker.patch_deepseek_mtp"
_deepseek_mtp_model = "AscendDeepSeekMTP"
_MTP_MODELS = { _MTP_MODELS = {
"DeepseekV3ForCausalLM": (_deepseek_mtp_path, _deepseek_mtp_model), "DeepseekV3ForCausalLM":
("vllm.model_executor.models.deepseek_mtp", "DeepSeekMTP"),
"Qwen3NextForCausalLM": "Qwen3NextForCausalLM":
("vllm_ascend.models.qwen3_next_mtp", "CustomQwen3NextMTP") ("vllm_ascend.models.qwen3_next_mtp", "CustomQwen3NextMTP")
} }

View File

@@ -23,7 +23,7 @@ from torch import nn
from transformers import PretrainedConfig from transformers import PretrainedConfig
from vllm.attention import Attention, AttentionMetadata from vllm.attention import Attention, AttentionMetadata
from vllm.compilation.decorators import support_torch_compile from vllm.compilation.decorators import support_torch_compile
from vllm.config import CacheConfig, VllmConfig from vllm.config import CacheConfig, CompilationMode, VllmConfig
from vllm.distributed import get_pp_group, get_tensor_model_parallel_world_size from vllm.distributed import get_pp_group, get_tensor_model_parallel_world_size
from vllm.distributed.parallel_state import (get_dp_group, get_ep_group, from vllm.distributed.parallel_state import (get_dp_group, get_ep_group,
get_tp_group) get_tp_group)
@@ -55,12 +55,6 @@ from vllm_ascend.attention.attention_v1 import AscendAttentionState
from vllm_ascend.torchair.ops.sequence_parallel import (MetadataForPadding, from vllm_ascend.torchair.ops.sequence_parallel import (MetadataForPadding,
init_metadata_for_sp) init_metadata_for_sp)
from vllm_ascend.torchair.ops.torchair_fused_moe import TorchairAscendFusedMoE from vllm_ascend.torchair.ops.torchair_fused_moe import TorchairAscendFusedMoE
from vllm_ascend.utils import vllm_version_is
if vllm_version_is("0.11.0"):
from vllm.config import CompilationLevel
else:
from vllm.config import CompilationMode
class CustomSparseMoeBlock(Qwen3MoeSparseMoeBlock): class CustomSparseMoeBlock(Qwen3MoeSparseMoeBlock):
@@ -299,16 +293,10 @@ class CustomQwen3MoeDecoderLayer(Qwen3MoeDecoderLayer):
layer_idx = extract_layer_index(prefix) layer_idx = extract_layer_index(prefix)
mlp_only_layers = ([] if not hasattr(config, "mlp_only_layers") else mlp_only_layers = ([] if not hasattr(config, "mlp_only_layers") else
config.mlp_only_layers) config.mlp_only_layers)
if vllm_version_is("0.11.0"):
self.use_aclgraph = (vllm_config is not None
and vllm_config.compilation_config.level
== CompilationLevel.PIECEWISE and
not vllm_config.model_config.enforce_eager)
else:
self.use_aclgraph = (vllm_config is not None self.use_aclgraph = (vllm_config is not None
and vllm_config.compilation_config.mode and vllm_config.compilation_config.mode
== CompilationMode.VLLM_COMPILE and == CompilationMode.VLLM_COMPILE
not vllm_config.model_config.enforce_eager) and not vllm_config.model_config.enforce_eager)
if (layer_idx not in mlp_only_layers) and ( if (layer_idx not in mlp_only_layers) and (
config.num_experts > 0 and config.num_experts > 0 and
(layer_idx + 1) % config.decoder_sparse_step == 0): (layer_idx + 1) % config.decoder_sparse_step == 0):

View File

@@ -32,6 +32,7 @@ import torch_npu
from torch import nn from torch import nn
from transformers import PretrainedConfig from transformers import PretrainedConfig
from vllm.attention import AttentionMetadata from vllm.attention import AttentionMetadata
from vllm.attention.layer import MLAAttention
from vllm.config import CacheConfig, ModelConfig, VllmConfig from vllm.config import CacheConfig, ModelConfig, VllmConfig
from vllm.distributed import (get_pp_group, get_tensor_model_parallel_rank, from vllm.distributed import (get_pp_group, get_tensor_model_parallel_rank,
get_tensor_model_parallel_world_size, get_tensor_model_parallel_world_size,
@@ -74,12 +75,7 @@ from vllm_ascend.quantization.quant_config import AscendLinearMethod
from vllm_ascend.torchair.ops.torchair_fused_moe import TorchairAscendFusedMoE from vllm_ascend.torchair.ops.torchair_fused_moe import TorchairAscendFusedMoE
from vllm_ascend.torchair.quantization.torchair_w8a8_dynamic import \ from vllm_ascend.torchair.quantization.torchair_w8a8_dynamic import \
TorchairAscendW8A8DynamicLinearMethod TorchairAscendW8A8DynamicLinearMethod
from vllm_ascend.utils import dispose_tensor, oproj_tp_enable, vllm_version_is from vllm_ascend.utils import dispose_tensor, oproj_tp_enable
if vllm_version_is("0.11.0"):
from vllm.attention import Attention
else:
from vllm.attention.layer import MLAAttention
class Indexer(nn.Module): class Indexer(nn.Module):
@@ -616,39 +612,6 @@ class TorchairDeepseekV2MLAAttention(DeepseekV2MLAAttention):
# k_c.size(1) + k_pe.size(1) == kv_cache.size(2) # k_c.size(1) + k_pe.size(1) == kv_cache.size(2)
# i.e. # i.e.
# kv_lora_rank + qk_rope_head_dim == head_size # kv_lora_rank + qk_rope_head_dim == head_size
if vllm_version_is("0.11.0"):
self.mla_attn = Attention(
num_heads=self.num_local_heads,
head_size=self.kv_lora_rank + self.qk_rope_head_dim,
scale=self.scaling,
num_kv_heads=1,
cache_config=cache_config,
quant_config=quant_config,
prefix=f"{prefix}.attn",
use_mla=True,
use_sparse=False,
indexer=None,
# SFA Args
q_lora_rank=self.q_lora_rank,
kv_lora_rank=self.kv_lora_rank,
qk_nope_head_dim=self.qk_nope_head_dim,
qk_rope_head_dim=self.qk_rope_head_dim,
qk_head_dim=self.qk_head_dim,
v_head_dim=self.v_head_dim,
rotary_emb=self.rotary_emb,
q_a_proj=self.q_a_proj
if self.q_lora_rank is not None else None,
q_a_layernorm=self.q_a_layernorm
if self.q_lora_rank is not None else None,
q_proj=self.q_proj
if self.q_lora_rank is None else self.q_b_proj,
kv_a_proj_with_mqa=self.kv_a_proj_with_mqa,
kv_a_layernorm=self.kv_a_layernorm,
kv_b_proj=self.kv_b_proj,
o_proj=self.o_proj,
decoder_layer=decoder_layer,
)
else:
self.mla_attn = MLAAttention( self.mla_attn = MLAAttention(
num_heads=self.num_local_heads, num_heads=self.num_local_heads,
scale=self.scaling, scale=self.scaling,
@@ -664,14 +627,11 @@ class TorchairDeepseekV2MLAAttention(DeepseekV2MLAAttention):
indexer=None, indexer=None,
# MLA Args # MLA Args
rotary_emb=self.rotary_emb, rotary_emb=self.rotary_emb,
q_a_proj=self.q_a_proj q_a_proj=self.q_a_proj if self.q_lora_rank is not None else None,
if self.q_lora_rank is not None else None,
q_a_layernorm=self.q_a_layernorm q_a_layernorm=self.q_a_layernorm
if self.q_lora_rank is not None else None, if self.q_lora_rank is not None else None,
q_proj=self.q_proj q_proj=self.q_proj if self.q_lora_rank is None else self.q_b_proj,
if self.q_lora_rank is None else self.q_b_proj, q_b_proj=self.q_b_proj if self.q_lora_rank is not None else None,
q_b_proj=self.q_b_proj
if self.q_lora_rank is not None else None,
kv_a_proj_with_mqa=self.kv_a_proj_with_mqa, kv_a_proj_with_mqa=self.kv_a_proj_with_mqa,
kv_a_layernorm=self.kv_a_layernorm, kv_a_layernorm=self.kv_a_layernorm,
kv_b_proj=self.kv_b_proj, kv_b_proj=self.kv_b_proj,
@@ -882,40 +842,6 @@ class TorchairDeepseekV2SFAAttention(DeepseekV2MLAAttention):
index_topk=self.index_topk, index_topk=self.index_topk,
prefix=f"{prefix}.indexer", prefix=f"{prefix}.indexer",
) )
if vllm_version_is("0.11.0"):
self.sfa_attn = Attention(
num_heads=self.num_local_heads,
head_size=self.kv_lora_rank + self.qk_rope_head_dim,
scale=self.scaling,
num_kv_heads=1,
cache_config=cache_config,
quant_config=quant_config,
prefix=f"{prefix}.attn",
use_mla=True,
use_sparse=True,
indexer=self.indexer,
# SFA Args
q_lora_rank=self.q_lora_rank,
kv_lora_rank=self.kv_lora_rank,
qk_nope_head_dim=self.qk_nope_head_dim,
qk_rope_head_dim=self.qk_rope_head_dim,
qk_head_dim=self.qk_head_dim,
v_head_dim=self.v_head_dim,
rotary_emb=self.rotary_emb,
q_a_proj=self.q_a_proj
if self.q_lora_rank is not None else None,
q_a_layernorm=self.q_a_layernorm
if self.q_lora_rank is not None else None,
q_proj=self.q_proj
if self.q_lora_rank is None else self.q_b_proj,
kv_a_proj_with_mqa=self.kv_a_proj_with_mqa,
kv_a_layernorm=self.kv_a_layernorm,
kv_b_proj=self.kv_b_proj,
o_proj=self.o_proj,
decoder_layer=decoder_layer,
)
else:
self.sfa_attn = MLAAttention( self.sfa_attn = MLAAttention(
num_heads=self.num_local_heads, num_heads=self.num_local_heads,
scale=self.scaling, scale=self.scaling,
@@ -931,12 +857,10 @@ class TorchairDeepseekV2SFAAttention(DeepseekV2MLAAttention):
indexer=self.indexer, indexer=self.indexer,
# MLA Args # MLA Args
rotary_emb=self.rotary_emb, rotary_emb=self.rotary_emb,
q_a_proj=self.q_a_proj q_a_proj=self.q_a_proj if self.q_lora_rank is not None else None,
if self.q_lora_rank is not None else None,
q_a_layernorm=self.q_a_layernorm q_a_layernorm=self.q_a_layernorm
if self.q_lora_rank is not None else None, if self.q_lora_rank is not None else None,
q_proj=self.q_proj q_proj=self.q_proj if self.q_lora_rank is None else self.q_b_proj,
if self.q_lora_rank is None else self.q_b_proj,
kv_a_proj_with_mqa=self.kv_a_proj_with_mqa, kv_a_proj_with_mqa=self.kv_a_proj_with_mqa,
kv_a_layernorm=self.kv_a_layernorm, kv_a_layernorm=self.kv_a_layernorm,
kv_b_proj=self.kv_b_proj, kv_b_proj=self.kv_b_proj,

View File

@@ -53,8 +53,7 @@ from vllm_ascend.torchair.utils import (get_all_reduce_merge_state,
super_kernel) super_kernel)
from vllm_ascend.utils import (AscendSocVersion, dispose_tensor, from vllm_ascend.utils import (AscendSocVersion, dispose_tensor,
get_ascend_soc_version, is_310p, get_ascend_soc_version, is_310p,
is_hierarchical_communication_enabled, is_hierarchical_communication_enabled)
vllm_version_is)
def torchair_fused_experts_with_mc2( def torchair_fused_experts_with_mc2(
@@ -1069,10 +1068,6 @@ class TorchairAscendFusedMoE(FusedMoE):
get_compressed_expert_map(self.expert_map)) get_compressed_expert_map(self.expert_map))
else: else:
# init moe. # init moe.
if vllm_version_is("0.11.0"):
self.local_num_experts, self.expert_map = determine_expert_map(
self.ep_size, self.ep_rank, self.global_num_experts)
else:
self.local_num_experts, self.expert_map, _ = determine_expert_map( self.local_num_experts, self.expert_map, _ = determine_expert_map(
self.ep_size, self.ep_rank, self.global_num_experts) self.ep_size, self.ep_rank, self.global_num_experts)
# dynamic eplb initializing with not expert_map_path # dynamic eplb initializing with not expert_map_path

View File

@@ -26,12 +26,6 @@ from vllm.attention.backends.abstract import (AttentionImpl, AttentionLayer,
AttentionType) AttentionType)
from vllm.attention.backends.utils import PAD_SLOT_ID from vllm.attention.backends.utils import PAD_SLOT_ID
from vllm.config import VllmConfig from vllm.config import VllmConfig
from vllm_ascend.utils import vllm_version_is
if vllm_version_is("0.11.0"):
from vllm.utils import cdiv
else:
from vllm.utils.math_utils import cdiv from vllm.utils.math_utils import cdiv
from vllm_ascend.attention.attention_v1 import (AscendAttentionBackend, from vllm_ascend.attention.attention_v1 import (AscendAttentionBackend,

View File

@@ -12,12 +12,6 @@ from vllm.config import VllmConfig, get_current_vllm_config
from vllm.distributed import get_tensor_model_parallel_world_size from vllm.distributed import get_tensor_model_parallel_world_size
from vllm.model_executor.layers.linear import (LinearBase, from vllm.model_executor.layers.linear import (LinearBase,
UnquantizedLinearMethod) UnquantizedLinearMethod)
from vllm_ascend.utils import vllm_version_is
if vllm_version_is("0.11.0"):
from vllm.utils import cdiv, round_down
else:
from vllm.utils.math_utils import cdiv, round_down from vllm.utils.math_utils import cdiv, round_down
import vllm_ascend.envs as envs_ascend import vllm_ascend.envs as envs_ascend

View File

@@ -11,6 +11,7 @@ from vllm.model_executor.layers.attention_layer_base import AttentionLayerBase
from vllm.model_executor.model_loader import get_model_loader from vllm.model_executor.model_loader import get_model_loader
from vllm.model_executor.model_loader.utils import \ from vllm.model_executor.model_loader.utils import \
process_weights_after_loading process_weights_after_loading
from vllm.utils.torch_utils import set_default_torch_dtype
from vllm.v1.core.sched.output import SchedulerOutput from vllm.v1.core.sched.output import SchedulerOutput
from vllm.v1.sample.metadata import SamplingMetadata from vllm.v1.sample.metadata import SamplingMetadata
from vllm.v1.spec_decode.metadata import SpecDecodeMetadata from vllm.v1.spec_decode.metadata import SpecDecodeMetadata
@@ -23,13 +24,7 @@ from vllm_ascend.torchair.models.torchair_deepseek_mtp import \
TorchairDeepSeekMTP TorchairDeepSeekMTP
from vllm_ascend.torchair.utils import (TORCHAIR_CACHE_DIR, from vllm_ascend.torchair.utils import (TORCHAIR_CACHE_DIR,
TorchairCommonAttentionMetadata) TorchairCommonAttentionMetadata)
from vllm_ascend.utils import (ProfileExecuteDuration, lmhead_tp_enable, from vllm_ascend.utils import ProfileExecuteDuration, lmhead_tp_enable
vllm_version_is)
if vllm_version_is("0.11.0"):
from vllm.model_executor.model_loader.utils import set_default_torch_dtype
else:
from vllm.utils.torch_utils import set_default_torch_dtype
PADDING_SLOT_ID = -1 PADDING_SLOT_ID = -1

View File

@@ -12,12 +12,6 @@ from vllm.config import VllmConfig, get_current_vllm_config
from vllm.distributed import get_tensor_model_parallel_world_size, get_tp_group from vllm.distributed import get_tensor_model_parallel_world_size, get_tp_group
from vllm.model_executor.layers.linear import (LinearBase, from vllm.model_executor.layers.linear import (LinearBase,
UnquantizedLinearMethod) UnquantizedLinearMethod)
from vllm_ascend.utils import vllm_version_is
if vllm_version_is("0.11.0"):
from vllm.utils import cdiv, round_down
else:
from vllm.utils.math_utils import cdiv, round_down from vllm.utils.math_utils import cdiv, round_down
import vllm_ascend.envs as envs_ascend import vllm_ascend.envs as envs_ascend

View File

@@ -412,13 +412,6 @@ def _is_default_capture_sizes(vllm_config: VllmConfig) -> bool:
Check whether it is vLLM default capture sizes. Check whether it is vLLM default capture sizes.
""" """
if vllm_version_is("0.11.0"):
cuda_graph_sizes = vllm_config.scheduler_config.cuda_graph_sizes
if len(cuda_graph_sizes) == 1:
cudagraph_capture_sizes = [1, 2, 4] + [
i for i in range(8, cuda_graph_sizes[0] + 1, 8)
]
else:
max_cudagraph_capture_size = \ max_cudagraph_capture_size = \
vllm_config.compilation_config.max_cudagraph_capture_size vllm_config.compilation_config.max_cudagraph_capture_size
cudagraph_capture_sizes = [ cudagraph_capture_sizes = [
@@ -432,12 +425,7 @@ def _is_default_capture_sizes(vllm_config: VllmConfig) -> bool:
# Step size 16 for larger batch sizes # Step size 16 for larger batch sizes
cudagraph_capture_sizes += list( cudagraph_capture_sizes += list(
range(256, max_cudagraph_capture_size + 1, 16)) range(256, max_cudagraph_capture_size + 1, 16))
# in newer version, vLLM use ascending order of cudagraph_capture_sizes.
if vllm_version_is("0.11.0"):
target_cudagraph_capture_sizes = sorted(cudagraph_capture_sizes,
reverse=True)
else:
# in newer version, vVLLM use ascending order of cudagraph_capture_sizes.
target_cudagraph_capture_sizes = sorted(cudagraph_capture_sizes) target_cudagraph_capture_sizes = sorted(cudagraph_capture_sizes)
if target_cudagraph_capture_sizes == \ if target_cudagraph_capture_sizes == \
vllm_config.compilation_config.cudagraph_capture_sizes: vllm_config.compilation_config.cudagraph_capture_sizes:
@@ -465,19 +453,11 @@ def update_default_aclgraph_sizes(vllm_config: VllmConfig) -> None:
if vllm_config.model_config and vllm_config.model_config.hf_config.model_type == "qwen3_moe" \ if vllm_config.model_config and vllm_config.model_config.hf_config.model_type == "qwen3_moe" \
and vllm_config.parallel_config.tensor_parallel_size == 1 \ and vllm_config.parallel_config.tensor_parallel_size == 1 \
and vllm_config.parallel_config.data_parallel_size > 1 : and vllm_config.parallel_config.data_parallel_size > 1 :
if vllm_version_is("0.11.0"):
max_capture_size = vllm_config.scheduler_config.cuda_graph_sizes[0]
else:
max_capture_size = vllm_config.compilation_config.max_cudagraph_capture_size max_capture_size = vllm_config.compilation_config.max_cudagraph_capture_size
new_cudagraph_capture_sizes = [1, 2, 5, 10, 15, 20] + [ new_cudagraph_capture_sizes = [1, 2, 5, 10, 15, 20] + [
i for i in range(24, max_capture_size + 1, 8) i for i in range(24, max_capture_size + 1, 8)
] ]
if vllm_version_is("0.11.0"):
vllm_config.compilation_config.cudagraph_capture_sizes = new_cudagraph_capture_sizes
vllm_config.compilation_config.init_with_cudagraph_sizes(
new_cudagraph_capture_sizes)
else:
update_cudagraph_capture_sizes(vllm_config, update_cudagraph_capture_sizes(vllm_config,
new_cudagraph_capture_sizes) new_cudagraph_capture_sizes)
@@ -573,9 +553,6 @@ def update_aclgraph_sizes(vllm_config: VllmConfig) -> None:
indices[0], indices[-1] = 0, len(original_sizes) - 1 indices[0], indices[-1] = 0, len(original_sizes) - 1
sampled_sizes = [original_sizes[i] for i in indices] sampled_sizes = [original_sizes[i] for i in indices]
if vllm_version_is("0.11.0"):
compilation_config.init_with_cudagraph_sizes(sampled_sizes)
else:
update_cudagraph_capture_sizes(vllm_config, sampled_sizes) update_cudagraph_capture_sizes(vllm_config, sampled_sizes)
logger.info( logger.info(
@@ -607,9 +584,6 @@ def update_aclgraph_sizes(vllm_config: VllmConfig) -> None:
if original_sizes[0] < (num_speculative_tokens + 1) * max_num_seqs: if original_sizes[0] < (num_speculative_tokens + 1) * max_num_seqs:
enlarged_sizes = [(num_speculative_tokens + 1) * size enlarged_sizes = [(num_speculative_tokens + 1) * size
for size in original_sizes] for size in original_sizes]
if vllm_version_is("0.11.0"):
compilation_config.init_with_cudagraph_sizes(enlarged_sizes)
else:
update_cudagraph_capture_sizes(vllm_config, enlarged_sizes) update_cudagraph_capture_sizes(vllm_config, enlarged_sizes)
logger.info( logger.info(
"Adjusted ACL graphs: %s%s for speculative decoding", "Adjusted ACL graphs: %s%s for speculative decoding",
@@ -719,11 +693,8 @@ def register_ascend_customop(vllm_config: Optional[VllmConfig] = None):
"GemmaRMSNorm": AscendGemmaRMSNorm, "GemmaRMSNorm": AscendGemmaRMSNorm,
"FusedMoE": AscendFusedMoE, "FusedMoE": AscendFusedMoE,
"SharedFusedMoE": AscendSharedFusedMoE, "SharedFusedMoE": AscendSharedFusedMoE,
"MultiHeadLatentAttentionWrapper": AscendMultiHeadLatentAttention,
} }
mla_to_register = "MultiHeadLatentAttention" if vllm_version_is(
"0.11.0") else "MultiHeadLatentAttentionWrapper"
if vllm_config and vllm_config.model_config and vllm_config.model_config.use_mla:
REGISTERED_ASCEND_OPS[mla_to_register] = AscendMultiHeadLatentAttention
for name, op_cls in REGISTERED_ASCEND_OPS.items(): for name, op_cls in REGISTERED_ASCEND_OPS.items():
CustomOp.register_oot(_decorated_op_cls=op_cls, name=name) CustomOp.register_oot(_decorated_op_cls=op_cls, name=name)

View File

@@ -3,12 +3,6 @@ from typing import Optional, Union
import numpy as np import numpy as np
import torch import torch
from vllm.distributed import get_dcp_group from vllm.distributed import get_dcp_group
from vllm_ascend.utils import vllm_version_is
if vllm_version_is("0.11.0"):
from vllm.utils import cdiv
else:
from vllm.utils.math_utils import cdiv from vllm.utils.math_utils import cdiv
from vllm_ascend.utils import prefill_context_parallel_enable from vllm_ascend.utils import prefill_context_parallel_enable

View File

@@ -41,10 +41,11 @@ import torch.nn as nn
from tqdm import tqdm # type: ignore from tqdm import tqdm # type: ignore
from vllm.attention import AttentionType, get_attn_backend from vllm.attention import AttentionType, get_attn_backend
from vllm.attention.backends.abstract import AttentionBackend from vllm.attention.backends.abstract import AttentionBackend
from vllm.attention.layer import Attention from vllm.attention.layer import Attention, MLAAttention
from vllm.compilation.counter import compilation_counter from vllm.compilation.counter import compilation_counter
from vllm.compilation.monitor import set_cudagraph_capturing_enabled from vllm.compilation.monitor import set_cudagraph_capturing_enabled
from vllm.config import CUDAGraphMode, VllmConfig, get_layers_from_vllm_config from vllm.config import (CompilationMode, CUDAGraphMode, VllmConfig,
get_layers_from_vllm_config)
from vllm.distributed import tensor_model_parallel_all_gather from vllm.distributed import tensor_model_parallel_all_gather
from vllm.distributed.kv_transfer import (get_kv_transfer_group, from vllm.distributed.kv_transfer import (get_kv_transfer_group,
has_kv_transfer_group) has_kv_transfer_group)
@@ -58,8 +59,6 @@ from vllm.model_executor.layers.attention_layer_base import AttentionLayerBase
from vllm.model_executor.layers.mamba.abstract import MambaBase from vllm.model_executor.layers.mamba.abstract import MambaBase
from vllm.model_executor.layers.rotary_embedding import MRotaryEmbedding from vllm.model_executor.layers.rotary_embedding import MRotaryEmbedding
from vllm.model_executor.model_loader import get_model from vllm.model_executor.model_loader import get_model
# yapf conflicts with isort for this block
# yapf: disable
from vllm.model_executor.models.interfaces import (SupportsMultiModal, from vllm.model_executor.models.interfaces import (SupportsMultiModal,
supports_mrope, supports_mrope,
supports_transcription) supports_transcription)
@@ -73,29 +72,23 @@ from vllm.sampling_params import SamplingType
from vllm.sequence import IntermediateTensors from vllm.sequence import IntermediateTensors
from vllm.tasks import GenerationTask, PoolingTask, SupportedTask from vllm.tasks import GenerationTask, PoolingTask, SupportedTask
from vllm.utils import length_from_prompt_token_ids_or_embeds from vllm.utils import length_from_prompt_token_ids_or_embeds
from vllm.utils.import_utils import LazyLoader
from vllm_ascend.utils import vllm_version_is
if vllm_version_is("0.11.0"):
from vllm.utils import cdiv
else:
from vllm.utils.math_utils import cdiv
from vllm.utils.jsontree import json_map_leaves from vllm.utils.jsontree import json_map_leaves
from vllm.utils.math_utils import cdiv
from vllm.utils.mem_utils import DeviceMemoryProfiler
from vllm.utils.platform_utils import is_pin_memory_available
from vllm.utils.torch_utils import STR_DTYPE_TO_TORCH_DTYPE, get_dtype_size
from vllm.v1.attention.backends.gdn_attn import GDNAttentionMetadataBuilder from vllm.v1.attention.backends.gdn_attn import GDNAttentionMetadataBuilder
from vllm.v1.attention.backends.utils import ( from vllm.v1.attention.backends.utils import (
AttentionCGSupport, CommonAttentionMetadata, AttentionCGSupport, CommonAttentionMetadata,
reorder_batch_to_split_decodes_and_prefills) reorder_batch_to_split_decodes_and_prefills)
from vllm.v1.cudagraph_dispatcher import CudagraphDispatcher from vllm.v1.cudagraph_dispatcher import CudagraphDispatcher
# yapf conflicts with isort for this block
# yapf: disable
from vllm.v1.kv_cache_interface import (AttentionSpec, from vllm.v1.kv_cache_interface import (AttentionSpec,
EncoderOnlyAttentionSpec, EncoderOnlyAttentionSpec,
FullAttentionSpec, KVCacheConfig, FullAttentionSpec, KVCacheConfig,
KVCacheGroupSpec, KVCacheSpec, KVCacheGroupSpec, KVCacheSpec,
MambaSpec, MLAAttentionSpec, MambaSpec, MLAAttentionSpec,
UniformTypeKVCacheSpecs) UniformTypeKVCacheSpecs)
# yapf: enable
from vllm.v1.outputs import (EMPTY_MODEL_RUNNER_OUTPUT, AsyncModelRunnerOutput, from vllm.v1.outputs import (EMPTY_MODEL_RUNNER_OUTPUT, AsyncModelRunnerOutput,
DraftTokenIds, LogprobsTensors, ModelRunnerOutput, DraftTokenIds, LogprobsTensors, ModelRunnerOutput,
PoolerOutput) PoolerOutput)
@@ -119,6 +112,7 @@ from vllm_ascend.attention.attention_mask import AttentionMaskBuilder
from vllm_ascend.attention.attention_v1 import AscendAttentionState from vllm_ascend.attention.attention_v1 import AscendAttentionState
from vllm_ascend.attention.utils import (AscendCommonAttentionMetadata, from vllm_ascend.attention.utils import (AscendCommonAttentionMetadata,
AscendPrefillContextParallelMetadata) AscendPrefillContextParallelMetadata)
# yapf conflicts with isort for this block
# yapf: disable # yapf: disable
from vllm_ascend.compilation.acl_graph import (ACLGraphWrapper, from vllm_ascend.compilation.acl_graph import (ACLGraphWrapper,
set_graph_params, set_graph_params,
@@ -147,8 +141,7 @@ from vllm_ascend.utils import (ACL_FORMAT_FRACTAL_ND, ACL_FORMAT_FRACTAL_NZ,
AscendSocVersion, ProfileExecuteDuration, AscendSocVersion, ProfileExecuteDuration,
enable_sp, get_ascend_soc_version, is_310p, enable_sp, get_ascend_soc_version, is_310p,
is_enable_nz, is_moe_model, lmhead_tp_enable, is_enable_nz, is_moe_model, lmhead_tp_enable,
prefill_context_parallel_enable, prefill_context_parallel_enable)
vllm_version_is)
from vllm_ascend.worker.npu_input_batch import CachedRequestState, InputBatch from vllm_ascend.worker.npu_input_batch import CachedRequestState, InputBatch
if prefill_context_parallel_enable(): if prefill_context_parallel_enable():
@@ -157,27 +150,6 @@ if prefill_context_parallel_enable():
get_prefill_context_model_parallel_rank, get_prefill_context_model_parallel_rank,
get_prefill_context_model_parallel_world_size) get_prefill_context_model_parallel_world_size)
if vllm_version_is("0.11.0"):
from vllm.utils import (STR_DTYPE_TO_TORCH_DTYPE, DeviceMemoryProfiler,
get_dtype_size)
else:
from vllm.utils.mem_utils import DeviceMemoryProfiler
from vllm.utils.torch_utils import STR_DTYPE_TO_TORCH_DTYPE, get_dtype_size
# yapf: enable
if vllm_version_is("0.11.0"):
from vllm.attention.layer import Attention
from vllm.config import CompilationLevel
from vllm.utils import LazyLoader, is_pin_memory_available
from vllm_ascend.models.layers.mla import AscendMultiHeadLatentAttention
else:
from vllm.attention.layer import MLAAttention
from vllm.config import CompilationMode
from vllm.utils.import_utils import LazyLoader
from vllm.utils.platform_utils import is_pin_memory_available
if TYPE_CHECKING: if TYPE_CHECKING:
import xgrammar as xgr # type: ignore[import-untyped] import xgrammar as xgr # type: ignore[import-untyped]
from vllm.v1.core.sched.output import SchedulerOutput from vllm.v1.core.sched.output import SchedulerOutput
@@ -637,11 +609,7 @@ class NPUModelRunner(LoRAModelRunnerMixin):
diagonal=1).to(self.device) diagonal=1).to(self.device)
if get_pp_group().is_last_rank: if get_pp_group().is_last_rank:
self.drafter = self._get_drafter() self.drafter = self._get_drafter()
if vllm_version_is("0.11.0"): self.rejection_sampler = AscendRejectionSampler(self.sampler)
self.rejection_sampler = AscendRejectionSampler()
else:
self.rejection_sampler = AscendRejectionSampler(
self.sampler)
self.actual_seq_lengths_q = list( self.actual_seq_lengths_q = list(
range(self.decode_token_per_req, self.max_num_tokens + 1, range(self.decode_token_per_req, self.max_num_tokens + 1,
self.decode_token_per_req)) self.decode_token_per_req))
@@ -664,10 +632,6 @@ class NPUModelRunner(LoRAModelRunnerMixin):
# tokens is less than or equal to mc2_tokens_capacity. According to _set_cudagraph_sizes, # tokens is less than or equal to mc2_tokens_capacity. According to _set_cudagraph_sizes,
# the max number of tokens in graph is min(max_num_seqs * uniform_decode_query_len, 512). # the max number of tokens in graph is min(max_num_seqs * uniform_decode_query_len, 512).
if self.compilation_config.cudagraph_capture_sizes: if self.compilation_config.cudagraph_capture_sizes:
if vllm_version_is("0.11.0"):
max_num_tokens = self.compilation_config.cudagraph_capture_sizes[
0]
else:
max_num_tokens = self.compilation_config.max_cudagraph_capture_size max_num_tokens = self.compilation_config.max_cudagraph_capture_size
else: else:
# NOTE: To save memory, we cap the max number of tokens to 512. # NOTE: To save memory, we cap the max number of tokens to 512.
@@ -717,9 +681,6 @@ class NPUModelRunner(LoRAModelRunnerMixin):
self.input_batch.num_accepted_tokens_cpu[i] = num_tokens self.input_batch.num_accepted_tokens_cpu[i] = num_tokens
def _use_aclgraph(self) -> bool: def _use_aclgraph(self) -> bool:
if vllm_version_is("0.11.0"):
return self.compilation_config.cudagraph_mode != CUDAGraphMode.NONE and self.compilation_config.level == CompilationLevel.PIECEWISE and not self.model_config.enforce_eager
else:
return self.compilation_config.cudagraph_mode != CUDAGraphMode.NONE and self.compilation_config.mode == CompilationMode.VLLM_COMPILE and not self.model_config.enforce_eager return self.compilation_config.cudagraph_mode != CUDAGraphMode.NONE and self.compilation_config.mode == CompilationMode.VLLM_COMPILE and not self.model_config.enforce_eager
def _update_states(self, scheduler_output: "SchedulerOutput") -> None: def _update_states(self, scheduler_output: "SchedulerOutput") -> None:
@@ -914,18 +875,6 @@ class NPUModelRunner(LoRAModelRunnerMixin):
if mm_input.get("use_audio_in_video") is True: if mm_input.get("use_audio_in_video") is True:
use_audio_in_video = True use_audio_in_video = True
if vllm_version_is("0.11.0"):
req_state.mrope_positions, req_state.mrope_position_delta = \
MRotaryEmbedding.get_input_positions_tensor(
req_state.prompt_token_ids,
hf_config=self.model_config.hf_config,
image_grid_thw=image_grid_thw,
video_grid_thw=video_grid_thw,
second_per_grid_ts=second_per_grid_ts,
audio_feature_lengths=audio_feature_lengths,
use_audio_in_video=use_audio_in_video,
)
else:
if supports_mrope(self.model): if supports_mrope(self.model):
req_state.mrope_positions, req_state.mrope_position_delta = \ req_state.mrope_positions, req_state.mrope_position_delta = \
self.model.get_mrope_input_positions( self.model.get_mrope_input_positions(
@@ -1108,14 +1057,6 @@ class NPUModelRunner(LoRAModelRunnerMixin):
mm_kwargs, mm_hashes_pos = self._batch_mm_kwargs_from_scheduler( mm_kwargs, mm_hashes_pos = self._batch_mm_kwargs_from_scheduler(
scheduler_output) scheduler_output)
encoder_outputs = [] encoder_outputs = []
if vllm_version_is("0.11.0"):
mm_inputs = group_mm_kwargs_by_modality(
mm_kwargs,
device=self.device,
pin_memory=self.pin_memory,
)
else:
model = cast(SupportsMultiModal, self.model) model = cast(SupportsMultiModal, self.model)
mm_inputs = group_mm_kwargs_by_modality( mm_inputs = group_mm_kwargs_by_modality(
mm_kwargs, mm_kwargs,
@@ -1181,56 +1122,6 @@ class NPUModelRunner(LoRAModelRunnerMixin):
return mm_kwargs, mm_hashes_pos return mm_kwargs, mm_hashes_pos
def _gather_mm_embeddings_0110(
self,
scheduler_output: "SchedulerOutput",
) -> list[torch.Tensor]:
def _iter_mm_features(req_state: CachedRequestState):
assert req_state.mm_features is not None
for mm_feature in req_state.mm_features:
pos_info = mm_feature.mm_position
yield mm_feature.identifier, pos_info, getattr(
pos_info, "is_embed", None)
mm_embeds: list[torch.Tensor] = []
for req_id in self.input_batch.req_ids:
num_scheduled_tokens = scheduler_output.num_scheduled_tokens[
req_id]
req_state = self.requests[req_id]
num_computed_tokens = req_state.num_computed_tokens
for mm_hash, pos_info, is_embed in _iter_mm_features(req_state):
start_pos = pos_info.offset
num_encoder_tokens = pos_info.length
if start_pos >= num_computed_tokens + num_scheduled_tokens:
break
if start_pos + num_encoder_tokens <= num_computed_tokens:
continue
start_idx = max(num_computed_tokens - start_pos, 0)
end_idx = min(
num_computed_tokens - start_pos + num_scheduled_tokens,
num_encoder_tokens,
)
assert start_idx < end_idx
encoder_output = self.encoder_cache.get(mm_hash, None)
assert encoder_output is not None, \
f"Encoder cache miss for {mm_hash}."
if is_embed is not None:
is_embed = is_embed[start_idx:end_idx]
mm_embeds_item = gather_mm_placeholders(
encoder_output[start_idx:end_idx],
is_embed=is_embed,
)
mm_embeds.append(mm_embeds_item)
return mm_embeds
def _gather_mm_embeddings( def _gather_mm_embeddings(
self, self,
scheduler_output: "SchedulerOutput", scheduler_output: "SchedulerOutput",
@@ -1730,14 +1621,6 @@ class NPUModelRunner(LoRAModelRunnerMixin):
# embeddings), we always use embeddings (rather than token ids) # embeddings), we always use embeddings (rather than token ids)
# as input to the multimodal model, even when the input is text. # as input to the multimodal model, even when the input is text.
input_ids = self.input_ids[:total_num_scheduled_tokens] input_ids = self.input_ids[:total_num_scheduled_tokens]
if vllm_version_is("0.11.0"):
mm_embeds = self._gather_mm_embeddings_0110(scheduler_output)
if mm_embeds:
inputs_embeds = self.model.get_input_embeddings(
input_ids, mm_embeds)
else:
inputs_embeds = self.model.get_input_embeddings(input_ids)
else:
mm_embeds, is_mm_embed = self._gather_mm_embeddings( mm_embeds, is_mm_embed = self._gather_mm_embeddings(
scheduler_output) scheduler_output)
@@ -2151,7 +2034,6 @@ class NPUModelRunner(LoRAModelRunnerMixin):
# TODO: Optimize the CPU -> NPU copy. # TODO: Optimize the CPU -> NPU copy.
cu_num_draft_tokens = torch.from_numpy(cu_num_draft_tokens).to( cu_num_draft_tokens = torch.from_numpy(cu_num_draft_tokens).to(
self.device, non_blocking=True) self.device, non_blocking=True)
if not vllm_version_is("0.11.0"):
cu_num_sampled_tokens = torch.from_numpy(cu_num_sampled_tokens).to( cu_num_sampled_tokens = torch.from_numpy(cu_num_sampled_tokens).to(
self.device, non_blocking=True) self.device, non_blocking=True)
logits_indices = torch.from_numpy(logits_indices).to(self.device, logits_indices = torch.from_numpy(logits_indices).to(self.device,
@@ -2167,16 +2049,6 @@ class NPUModelRunner(LoRAModelRunnerMixin):
draft_token_ids = draft_token_ids[target_logits_indices + 1] draft_token_ids = draft_token_ids[target_logits_indices + 1]
if self.pcp_size > 1: if self.pcp_size > 1:
logits_indices = logits_indices_pcp logits_indices = logits_indices_pcp
if vllm_version_is("0.11.0"):
metadata = SpecDecodeMetadata(
draft_token_ids=draft_token_ids,
num_draft_tokens=num_draft_tokens.tolist(),
cu_num_draft_tokens=cu_num_draft_tokens,
target_logits_indices=target_logits_indices,
bonus_logits_indices=bonus_logits_indices,
logits_indices=logits_indices,
)
else:
metadata = SpecDecodeMetadata( metadata = SpecDecodeMetadata(
draft_token_ids=draft_token_ids, draft_token_ids=draft_token_ids,
num_draft_tokens=num_draft_tokens.tolist(), num_draft_tokens=num_draft_tokens.tolist(),
@@ -2222,31 +2094,14 @@ class NPUModelRunner(LoRAModelRunnerMixin):
shape=(logits.shape[0], shape=(logits.shape[0],
grammar_bitmask.shape[1])) grammar_bitmask.shape[1]))
cumulative_index = 0 cumulative_index = 0
if vllm_version_is("0.11.0"):
seq = sorted(
scheduler_output.structured_output_request_ids.items(),
key=lambda x: x[1])
for req_id, _ in seq:
logit_index = struct_out_req_batch_indices[req_id]
num_spec_tokens = len(
scheduler_output.scheduled_spec_decode_tokens.get(
req_id, []))
for i in range(1 + num_spec_tokens):
sorted_bitmask[logit_index + i] = \
grammar_bitmask[cumulative_index + i]
out_indices.append(logit_index + i)
cumulative_index += 1 + num_spec_tokens
else:
for req_id in scheduler_output.structured_output_request_ids: for req_id in scheduler_output.structured_output_request_ids:
num_spec_tokens = len( num_spec_tokens = len(
scheduler_output.scheduled_spec_decode_tokens.get( scheduler_output.scheduled_spec_decode_tokens.get(req_id, []))
req_id, []))
if req_id in struct_out_req_batch_indices: if req_id in struct_out_req_batch_indices:
logit_index = struct_out_req_batch_indices[req_id] logit_index = struct_out_req_batch_indices[req_id]
for i in range(1 + num_spec_tokens): for i in range(1 + num_spec_tokens):
sorted_bitmask[logit_index + sorted_bitmask[logit_index +
i] = grammar_bitmask[cumulative_index + i] = grammar_bitmask[cumulative_index + i]
i]
out_indices.append(logit_index + i) out_indices.append(logit_index + i)
cumulative_index += 1 + num_spec_tokens cumulative_index += 1 + num_spec_tokens
grammar_bitmask = sorted_bitmask grammar_bitmask = sorted_bitmask
@@ -2518,14 +2373,8 @@ class NPUModelRunner(LoRAModelRunnerMixin):
logits = model_output_broadcast_data["logits"] logits = model_output_broadcast_data["logits"]
# Apply structured output bitmasks if present # Apply structured output bitmasks if present
if vllm_version_is("0.11.0"):
if scheduler_output.grammar_bitmask is not None:
logits = self.apply_grammar_bitmask(
scheduler_output, logits)
else:
if scheduler_output.structured_output_request_ids: if scheduler_output.structured_output_request_ids:
logits = self.apply_grammar_bitmask( logits = self.apply_grammar_bitmask(scheduler_output, logits)
scheduler_output, logits)
with ProfileExecuteDuration().capture_async("Sample"): with ProfileExecuteDuration().capture_async("Sample"):
# Sample the next token and get logprobs if needed. # Sample the next token and get logprobs if needed.
@@ -3837,95 +3686,6 @@ class NPUModelRunner(LoRAModelRunnerMixin):
else: else:
self.reorder_batch_threshold = reorder_batch_threshold_i self.reorder_batch_threshold = reorder_batch_threshold_i
def get_kv_cache_spec_v0110(self) -> dict[str, KVCacheSpec]:
"""
Generates the KVCacheSpec by parsing the kv cache format from each
Attention module in the static forward context.
Returns:
KVCacheSpec: A dictionary mapping layer names to their KV cache
format. Layers that do not need KV cache are not included.
"""
block_size = self.vllm_config.cache_config.block_size
use_mla = self.vllm_config.model_config.use_mla
use_sparse = self.use_sparse
kv_cache_spec: dict[str, KVCacheSpec] = {}
attn_layers = get_layers_from_vllm_config(self.vllm_config, Attention)
for layer_name, attn_module in attn_layers.items():
if (kv_tgt_layer :=
attn_module.kv_sharing_target_layer_name) is not None:
# The layer doesn't need its own KV cache and will use that of
# the target layer. We skip creating a KVCacheSpec for it, so
# that KV cache management logic will act as this layer does
# not exist, and doesn't allocate KV cache for the layer. This
# enables the memory saving of cross-layer kv sharing, allowing
# a given amount of memory to accommodate longer context lengths
# or enable more requests to be processed simultaneously.
self.shared_kv_cache_layers[layer_name] = kv_tgt_layer
continue
if isinstance(attn_module, AscendMultiHeadLatentAttention):
continue
# TODO: Support other attention modules, e.g., cross-attention
# TODO(lucas): move the attention specs into the model layers like
# the attention backends
if attn_module.attn_type == AttentionType.DECODER:
if use_mla and not use_sparse:
kv_cache_spec[layer_name] = MLAAttentionSpec(
block_size=block_size,
num_kv_heads=attn_module.num_kv_heads,
head_size=attn_module.head_size,
dtype=self.kv_cache_dtype,
cache_dtype_str=self.cache_config.cache_dtype)
else:
# TODO(cmq): This is a hack way to fix deepseek kvcache when
# using DSA. Fix the spec in vLLM is a finnal way.
kv_cache_spec[layer_name] = FullAttentionSpec(
block_size=block_size,
num_kv_heads=attn_module.num_kv_heads,
head_size=attn_module.head_size,
dtype=self.kv_cache_dtype)
elif attn_module.attn_type in (AttentionType.ENCODER,
AttentionType.ENCODER_ONLY):
# encoder-only attention does not need KV cache.
continue
elif attn_module.attn_type == AttentionType.ENCODER_DECODER:
raise NotImplementedError
else:
raise ValueError(
f"Unknown attention type: {attn_module.attn_type}")
mamba_layers = get_layers_from_vllm_config(self.vllm_config, MambaBase)
if len(mamba_layers) > 0:
if (self.vllm_config.speculative_config is not None
and self.vllm_config.model_config.hf_config.model_type
not in ["qwen3_next"]):
raise NotImplementedError(
"Mamba with speculative decoding is not supported yet.")
if self.vllm_config.cache_config.enable_prefix_caching:
raise NotImplementedError(
"Prefix caching is not supported for Mamba yet.")
max_model_len = self.vllm_config.model_config.max_model_len
page_size_padded = (
self.vllm_config.cache_config.mamba_page_size_padded)
# Set block_size to max_model_len, so that mamba model will always
# have only one block in the KV cache.
for layer_name, mamba_module in mamba_layers.items():
kv_cache_spec[layer_name] = MambaSpec(
shapes=mamba_module.get_state_shape(),
dtypes=mamba_module.get_state_dtype(),
block_size=max_model_len,
page_size_padded=page_size_padded,
mamba_type=mamba_module.mamba_type,
num_speculative_blocks=(
self.speculative_config.num_speculative_tokens
if self.speculative_config else 0),
)
return kv_cache_spec
def get_kv_cache_spec(self) -> dict[str, KVCacheSpec]: def get_kv_cache_spec(self) -> dict[str, KVCacheSpec]:
""" """
Generates the KVCacheSpec by parsing the kv cache format from each Generates the KVCacheSpec by parsing the kv cache format from each
@@ -3934,9 +3694,6 @@ class NPUModelRunner(LoRAModelRunnerMixin):
KVCacheSpec: A dictionary mapping layer names to their KV cache KVCacheSpec: A dictionary mapping layer names to their KV cache
format. Layers that do not need KV cache are not included. format. Layers that do not need KV cache are not included.
""" """
if vllm_version_is("0.11.0"):
return self.get_kv_cache_spec_v0110()
block_size = self.vllm_config.cache_config.block_size block_size = self.vllm_config.cache_config.block_size
use_mla = self.vllm_config.model_config.use_mla use_mla = self.vllm_config.model_config.use_mla
kv_cache_spec: dict[str, KVCacheSpec] = {} kv_cache_spec: dict[str, KVCacheSpec] = {}

View File

@@ -30,6 +30,7 @@ from vllm.multimodal.inputs import (MultiModalFeatureSpec,
from vllm.pooling_params import PoolingParams from vllm.pooling_params import PoolingParams
from vllm.sampling_params import SamplingParams, SamplingType from vllm.sampling_params import SamplingParams, SamplingType
from vllm.utils import length_from_prompt_token_ids_or_embeds from vllm.utils import length_from_prompt_token_ids_or_embeds
from vllm.utils.collection_utils import swap_dict_values
from vllm.v1.outputs import LogprobsTensors from vllm.v1.outputs import LogprobsTensors
from vllm.v1.pool.metadata import PoolingMetadata from vllm.v1.pool.metadata import PoolingMetadata
from vllm.v1.sample.logits_processor import (BatchUpdateBuilder, from vllm.v1.sample.logits_processor import (BatchUpdateBuilder,
@@ -39,14 +40,8 @@ from vllm.v1.sample.metadata import SamplingMetadata
from vllm.v1.spec_decode.utils import is_spec_decode_unsupported from vllm.v1.spec_decode.utils import is_spec_decode_unsupported
from vllm.v1.utils import copy_slice from vllm.v1.utils import copy_slice
from vllm_ascend.utils import vllm_version_is
from vllm_ascend.worker.block_table import MultiGroupBlockTable from vllm_ascend.worker.block_table import MultiGroupBlockTable
if vllm_version_is("0.11.0"):
from vllm.utils import swap_dict_values
else:
from vllm.utils.collection_utils import swap_dict_values
@dataclass @dataclass
class CachedRequestState: class CachedRequestState:

View File

@@ -35,6 +35,8 @@ from vllm.logger import logger
from vllm.lora.request import LoRARequest from vllm.lora.request import LoRARequest
from vllm.sequence import IntermediateTensors from vllm.sequence import IntermediateTensors
from vllm.tasks import SupportedTask from vllm.tasks import SupportedTask
from vllm.utils.mem_constants import GiB_bytes
from vllm.utils.torch_utils import STR_DTYPE_TO_TORCH_DTYPE
from vllm.v1.core.sched.output import SchedulerOutput from vllm.v1.core.sched.output import SchedulerOutput
from vllm.v1.kv_cache_interface import KVCacheConfig, KVCacheSpec from vllm.v1.kv_cache_interface import KVCacheConfig, KVCacheSpec
from vllm.v1.outputs import (EMPTY_MODEL_RUNNER_OUTPUT, AsyncModelRunnerOutput, from vllm.v1.outputs import (EMPTY_MODEL_RUNNER_OUTPUT, AsyncModelRunnerOutput,
@@ -50,7 +52,7 @@ from vllm_ascend.platform import NPUPlatform
from vllm_ascend.utils import (init_ascend_soc_version, is_enable_nz, from vllm_ascend.utils import (init_ascend_soc_version, is_enable_nz,
prefill_context_parallel_enable, prefill_context_parallel_enable,
register_ascend_customop, sleep_mode_enabled, register_ascend_customop, sleep_mode_enabled,
try_register_lib, vllm_version_is) try_register_lib)
from vllm_ascend.worker.model_runner_v1 import NPUModelRunner from vllm_ascend.worker.model_runner_v1 import NPUModelRunner
torch._dynamo.trace_rules.clear_lru_cache() # noqa: E402 torch._dynamo.trace_rules.clear_lru_cache() # noqa: E402
@@ -65,12 +67,6 @@ torch_non_c_binding_in_graph_functions_npu[
torch._dynamo.trace_rules.torch_name_rule_map.append( torch._dynamo.trace_rules.torch_name_rule_map.append(
torch_non_c_binding_in_graph_functions_npu) # noqa: E402 torch_non_c_binding_in_graph_functions_npu) # noqa: E402
if vllm_version_is("0.11.0"):
from vllm.utils import STR_DTYPE_TO_TORCH_DTYPE, GiB_bytes
else:
from vllm.utils.mem_constants import GiB_bytes
from vllm.utils.torch_utils import STR_DTYPE_TO_TORCH_DTYPE
class NPUWorker(WorkerBase): class NPUWorker(WorkerBase):
@@ -141,9 +137,6 @@ class NPUWorker(WorkerBase):
if self.model_config.trust_remote_code: if self.model_config.trust_remote_code:
# note: lazy import to avoid importing torch before initializing # note: lazy import to avoid importing torch before initializing
if vllm_version_is("0.11.0"):
from vllm.utils import init_cached_hf_modules
else:
from vllm.utils.import_utils import init_cached_hf_modules from vllm.utils.import_utils import init_cached_hf_modules
init_cached_hf_modules() init_cached_hf_modules()