From b0403f8d8a5ced0ddb86722754bc515f4df9d1f1 Mon Sep 17 00:00:00 2001 From: Mengqing Cao Date: Fri, 22 Aug 2025 07:30:48 +0800 Subject: [PATCH] [CI] fix ci (#2464) ### What this PR does / why we need it? 1. use actions/checkout@v5 instead of v4 2. remove dbo test case because there is issue with it and will be refactored later 3. make vllm-ascend compatible with vllm v0.10.1.1 and add CI for it 4. fix sampler api changes introduced by https://github.com/vllm-project/vllm/pull/22387 5. fix qwen3 moe config changes introduced by https://github.com/vllm-project/vllm/pull/20562 6. fix kvcache block changes introduced by https://github.com/vllm-project/vllm/pull/23262 ### Does this PR introduce _any_ user-facing change? N/A ### How was this patch tested? CI passed with existing test. - vLLM version: v0.10.0 - vLLM main: https://github.com/vllm-project/vllm/commit/0c6e40bbaa4707528286a1e7bf17c90c88a1d920 --------- Signed-off-by: MengqingCao --- .github/workflows/accuracy_test.yaml | 8 +- .github/workflows/format_pr_body.yaml | 2 +- .github/workflows/image_310p_openeuler.yml | 2 +- .github/workflows/image_310p_ubuntu.yml | 2 +- .github/workflows/image_a3_openeuler.yml | 2 +- .github/workflows/image_a3_ubuntu.yml | 2 +- .github/workflows/image_openeuler.yml | 2 +- .github/workflows/image_ubuntu.yml | 2 +- .github/workflows/nightly_benchmarks.yaml | 4 +- .github/workflows/pre-commit.yml | 4 +- .github/workflows/vllm_ascend_doctest.yaml | 2 +- .github/workflows/vllm_ascend_test.yaml | 11 +- .github/workflows/vllm_ascend_test_310p.yaml | 6 +- .github/workflows/vllm_ascend_test_pd.yaml | 4 +- Dockerfile | 2 +- Dockerfile.310p | 2 +- Dockerfile.310p.openEuler | 2 +- Dockerfile.a3 | 2 +- Dockerfile.a3.openEuler | 2 +- Dockerfile.openEuler | 2 +- .../test_offline_inference_distributed.py | 20 -- tests/ut/core/test_scheduler.py | 294 ++++++++++++------ tests/ut/kv_connector/utils.py | 32 +- vllm_ascend/core/scheduler.py | 55 +++- vllm_ascend/models/qwen3_moe.py | 7 +- 
vllm_ascend/sample/sampler.py | 27 +- vllm_ascend/worker/model_runner_v1.py | 88 ++++-- 27 files changed, 389 insertions(+), 199 deletions(-) diff --git a/.github/workflows/accuracy_test.yaml b/.github/workflows/accuracy_test.yaml index 1f0350d..044c5dc 100644 --- a/.github/workflows/accuracy_test.yaml +++ b/.github/workflows/accuracy_test.yaml @@ -88,7 +88,7 @@ jobs: steps: - name: Checkout repository - uses: actions/checkout@v5 + uses: actions/checkout@v4 - name: Set model name as output id: set_output @@ -109,7 +109,7 @@ jobs: apt-get -y install gcc g++ cmake libnuma-dev - name: Checkout vllm-project/vllm repo - uses: actions/checkout@v5 + uses: actions/checkout@v4 with: repository: vllm-project/vllm ref: v0.10.0 @@ -138,7 +138,7 @@ jobs: echo "GHA_VLLM_ASCEND_VERSION=$RESOLVED_VERSION" >> $GITHUB_ENV - name: Checkout vllm-project/vllm-ascend repo - uses: actions/checkout@v5 + uses: actions/checkout@v4 with: repository: vllm-project/vllm-ascend path: ./vllm-ascend @@ -236,7 +236,7 @@ jobs: UPSTREAM_REPO: vllm-project/vllm-ascend steps: - name: Checkout repository - uses: actions/checkout@v5 + uses: actions/checkout@v4 with: repository: vllm-ascend-ci/vllm-ascend token: ${{ secrets.PAT_TOKEN }} diff --git a/.github/workflows/format_pr_body.yaml b/.github/workflows/format_pr_body.yaml index e50395c..dedf7a4 100644 --- a/.github/workflows/format_pr_body.yaml +++ b/.github/workflows/format_pr_body.yaml @@ -34,7 +34,7 @@ jobs: steps: - name: Checkout vllm-project/vllm repo - uses: actions/checkout@v5 + uses: actions/checkout@v4 with: repository: vllm-project/vllm path: ./vllm-empty diff --git a/.github/workflows/image_310p_openeuler.yml b/.github/workflows/image_310p_openeuler.yml index 9339c9e..e6062a8 100644 --- a/.github/workflows/image_310p_openeuler.yml +++ b/.github/workflows/image_310p_openeuler.yml @@ -53,7 +53,7 @@ jobs: 'ubuntu-24.04-arm' }} steps: - - uses: actions/checkout@v5 + - uses: actions/checkout@v4 - name: Print run: | diff --git 
a/.github/workflows/image_310p_ubuntu.yml b/.github/workflows/image_310p_ubuntu.yml index 86ca73f..0e9444f 100644 --- a/.github/workflows/image_310p_ubuntu.yml +++ b/.github/workflows/image_310p_ubuntu.yml @@ -49,7 +49,7 @@ jobs: runs-on: ubuntu-latest steps: - - uses: actions/checkout@v5 + - uses: actions/checkout@v4 - name: Print run: | diff --git a/.github/workflows/image_a3_openeuler.yml b/.github/workflows/image_a3_openeuler.yml index 3eda8dd..a10ad1c 100644 --- a/.github/workflows/image_a3_openeuler.yml +++ b/.github/workflows/image_a3_openeuler.yml @@ -53,7 +53,7 @@ jobs: 'ubuntu-24.04-arm' }} steps: - - uses: actions/checkout@v5 + - uses: actions/checkout@v4 - name: Print run: | diff --git a/.github/workflows/image_a3_ubuntu.yml b/.github/workflows/image_a3_ubuntu.yml index 7a6506c..6116015 100644 --- a/.github/workflows/image_a3_ubuntu.yml +++ b/.github/workflows/image_a3_ubuntu.yml @@ -49,7 +49,7 @@ jobs: runs-on: ubuntu-latest steps: - - uses: actions/checkout@v5 + - uses: actions/checkout@v4 - name: Print run: | diff --git a/.github/workflows/image_openeuler.yml b/.github/workflows/image_openeuler.yml index 22ea1dc..d4e69a5 100644 --- a/.github/workflows/image_openeuler.yml +++ b/.github/workflows/image_openeuler.yml @@ -52,7 +52,7 @@ jobs: 'ubuntu-24.04-arm' }} steps: - - uses: actions/checkout@v5 + - uses: actions/checkout@v4 - name: Print run: | diff --git a/.github/workflows/image_ubuntu.yml b/.github/workflows/image_ubuntu.yml index b708750..1c2ddcd 100644 --- a/.github/workflows/image_ubuntu.yml +++ b/.github/workflows/image_ubuntu.yml @@ -49,7 +49,7 @@ jobs: runs-on: ubuntu-latest steps: - - uses: actions/checkout@v5 + - uses: actions/checkout@v4 - name: Print run: | diff --git a/.github/workflows/nightly_benchmarks.yaml b/.github/workflows/nightly_benchmarks.yaml index 64dadf2..8a43481 100644 --- a/.github/workflows/nightly_benchmarks.yaml +++ b/.github/workflows/nightly_benchmarks.yaml @@ -97,12 +97,12 @@ jobs: git config --global 
url."https://gh-proxy.test.osinfra.cn/https://github.com/".insteadOf https://github.com/ - name: Checkout vllm-project/vllm-ascend repo - uses: actions/checkout@v5 + uses: actions/checkout@v4 with: fetch-depth: 0 - name: Checkout vllm-project/vllm repo - uses: actions/checkout@v5 + uses: actions/checkout@v4 with: repository: vllm-project/vllm path: ./vllm-empty diff --git a/.github/workflows/pre-commit.yml b/.github/workflows/pre-commit.yml index 1564bd7..e41dd6e 100644 --- a/.github/workflows/pre-commit.yml +++ b/.github/workflows/pre-commit.yml @@ -11,14 +11,14 @@ jobs: runs-on: ubuntu-latest steps: - name: Checkout vllm-project/vllm-ascend repo - uses: actions/checkout@v5 + uses: actions/checkout@v4 - uses: actions/setup-python@42375524e23c412d93fb67b49958b491fce71c38 # v5.4.0 with: python-version: "3.11" - run: echo "::add-matcher::.github/workflows/matchers/actionlint.json" - run: echo "::add-matcher::.github/workflows/matchers/mypy.json" - name: Checkout vllm-project/vllm repo - uses: actions/checkout@v5 + uses: actions/checkout@v4 with: repository: vllm-project/vllm path: ./vllm-empty diff --git a/.github/workflows/vllm_ascend_doctest.yaml b/.github/workflows/vllm_ascend_doctest.yaml index ffb552f..1b4faea 100644 --- a/.github/workflows/vllm_ascend_doctest.yaml +++ b/.github/workflows/vllm_ascend_doctest.yaml @@ -66,7 +66,7 @@ jobs: git --no-pager log -1 || true - name: Checkout vllm-project/vllm-ascend repo - uses: actions/checkout@v5 + uses: actions/checkout@v4 - name: Run vllm-ascend/tests/e2e/run_doctests.sh run: | diff --git a/.github/workflows/vllm_ascend_test.yaml b/.github/workflows/vllm_ascend_test.yaml index 7dbcb18..78cfefa 100644 --- a/.github/workflows/vllm_ascend_test.yaml +++ b/.github/workflows/vllm_ascend_test.yaml @@ -81,7 +81,7 @@ jobs: VLLM_USE_MODELSCOPE: True strategy: matrix: - vllm_version: [main] + vllm_version: [v0.10.1.1, main] steps: - name: Install packages run: | @@ -89,7 +89,7 @@ jobs: apt-get install -y python3-pip git vim 
wget net-tools gcc g++ cmake libnuma-dev curl gnupg2 - name: Checkout vllm-project/vllm repo - uses: actions/checkout@v5 + uses: actions/checkout@v4 with: repository: vllm-project/vllm ref: ${{ matrix.vllm_version }} @@ -102,7 +102,7 @@ jobs: python3 -m pip uninstall -y triton - name: Checkout vllm-project/vllm-ascend repo - uses: actions/checkout@v5 + uses: actions/checkout@v4 - name: Install vllm-project/vllm-ascend run: | @@ -137,7 +137,7 @@ jobs: max-parallel: 2 matrix: os: [linux-aarch64-a2-1] - vllm_version: [main] + vllm_version: [v0.10.1.1, main] name: singlecard e2e test runs-on: ${{ matrix.os }} container: @@ -219,7 +219,7 @@ jobs: max-parallel: 2 matrix: os: [linux-aarch64-a2-2] - vllm_version: [main] + vllm_version: [v0.10.1.1, main] name: multicard e2e test runs-on: ${{ matrix.os }} container: @@ -278,7 +278,6 @@ jobs: # To avoid oom, we need to run the test in a single process. pytest -sv tests/e2e/multicard/test_offline_inference_distributed.py::test_models_distributed_DeepSeek_multistream_moe pytest -sv tests/e2e/multicard/test_offline_inference_distributed.py::test_models_distributed_QwQ - pytest -sv tests/e2e/multicard/test_offline_inference_distributed.py::test_models_distributed_DeepSeek_dbo pytest -sv tests/e2e/multicard/test_offline_inference_distributed.py::test_models_distributed_DeepSeekV3_dbo pytest -sv tests/e2e/multicard/test_offline_inference_distributed.py::test_models_distributed_alltoallv pytest -sv tests/e2e/multicard/test_offline_inference_distributed.py::test_models_distributed_Qwen3_W4A8DYNAMIC diff --git a/.github/workflows/vllm_ascend_test_310p.yaml b/.github/workflows/vllm_ascend_test_310p.yaml index 52d6dde..9d4a970 100644 --- a/.github/workflows/vllm_ascend_test_310p.yaml +++ b/.github/workflows/vllm_ascend_test_310p.yaml @@ -53,7 +53,7 @@ jobs: max-parallel: 2 matrix: os: [linux-aarch64-310p-1, linux-aarch64-310p-4] - vllm_version: [main] + vllm_version: [v0.10.1.1, main] name: 310p e2e test runs-on: ${{ matrix.os }} 
container: @@ -77,7 +77,7 @@ jobs: apt install git -y - name: Checkout vllm-project/vllm-ascend repo - uses: actions/checkout@v5 + uses: actions/checkout@v4 - name: Install system dependencies run: | @@ -85,7 +85,7 @@ jobs: apt-get -y install git vim wget net-tools gcc g++ cmake libnuma-dev curl gnupg2 - name: Checkout vllm-project/vllm repo - uses: actions/checkout@v5 + uses: actions/checkout@v4 with: repository: vllm-project/vllm ref: ${{ matrix.vllm_version }} diff --git a/.github/workflows/vllm_ascend_test_pd.yaml b/.github/workflows/vllm_ascend_test_pd.yaml index 2f21365..a86ba60 100644 --- a/.github/workflows/vllm_ascend_test_pd.yaml +++ b/.github/workflows/vllm_ascend_test_pd.yaml @@ -80,7 +80,7 @@ jobs: git config --global url."https://gh-proxy.test.osinfra.cn/https://github.com/".insteadOf https://github.com/ - name: Checkout vllm-project/vllm-ascend repo - uses: actions/checkout@v5 + uses: actions/checkout@v4 - name: Install system dependencies run: | @@ -88,7 +88,7 @@ jobs: apt-get -y install gcc g++ cmake libnuma-dev - name: Checkout vllm-project/vllm repo - uses: actions/checkout@v5 + uses: actions/checkout@v4 with: repository: vllm-project/vllm ref: ${{ matrix.vllm_verison }} diff --git a/Dockerfile b/Dockerfile index a12df1e..29d6445 100644 --- a/Dockerfile +++ b/Dockerfile @@ -37,7 +37,7 @@ RUN pip config set global.index-url ${PIP_INDEX_URL} # Install vLLM ARG VLLM_REPO=https://github.com/vllm-project/vllm.git -ARG VLLM_TAG=v0.10.0 +ARG VLLM_TAG=v0.10.1.1 RUN git clone --depth 1 $VLLM_REPO --branch $VLLM_TAG /vllm-workspace/vllm # In x86, triton will be installed by vllm. But in Ascend, triton doesn't work correctly. we need to uninstall it. 
RUN VLLM_TARGET_DEVICE="empty" python3 -m pip install -v -e /vllm-workspace/vllm/ --extra-index https://download.pytorch.org/whl/cpu/ && \ diff --git a/Dockerfile.310p b/Dockerfile.310p index 299624c..4eb3c63 100644 --- a/Dockerfile.310p +++ b/Dockerfile.310p @@ -37,7 +37,7 @@ RUN pip config set global.index-url ${PIP_INDEX_URL} # Install vLLM ARG VLLM_REPO=https://github.com/vllm-project/vllm.git -ARG VLLM_TAG=v0.10.0 +ARG VLLM_TAG=v0.10.1.1 RUN git clone --depth 1 $VLLM_REPO --branch $VLLM_TAG /vllm-workspace/vllm # In x86, triton will be installed by vllm. But in Ascend, triton doesn't work correctly. we need to uninstall it. RUN VLLM_TARGET_DEVICE="empty" python3 -m pip install -v -e /vllm-workspace/vllm/ --extra-index https://download.pytorch.org/whl/cpu/ && \ diff --git a/Dockerfile.310p.openEuler b/Dockerfile.310p.openEuler index ff7ec05..0e76ba3 100644 --- a/Dockerfile.310p.openEuler +++ b/Dockerfile.310p.openEuler @@ -34,7 +34,7 @@ COPY . /vllm-workspace/vllm-ascend/ # Install vLLM ARG VLLM_REPO=https://github.com/vllm-project/vllm.git -ARG VLLM_TAG=v0.10.0 +ARG VLLM_TAG=v0.10.1.1 RUN git clone --depth 1 $VLLM_REPO --branch $VLLM_TAG /vllm-workspace/vllm # In x86, triton will be installed by vllm. But in Ascend, triton doesn't work correctly. we need to uninstall it. diff --git a/Dockerfile.a3 b/Dockerfile.a3 index da1efcc..8bdfb0e 100644 --- a/Dockerfile.a3 +++ b/Dockerfile.a3 @@ -37,7 +37,7 @@ RUN pip config set global.index-url ${PIP_INDEX_URL} # Install vLLM ARG VLLM_REPO=https://github.com/vllm-project/vllm.git -ARG VLLM_TAG=v0.10.0 +ARG VLLM_TAG=v0.10.1.1 RUN git clone --depth 1 $VLLM_REPO --branch $VLLM_TAG /vllm-workspace/vllm # In x86, triton will be installed by vllm. But in Ascend, triton doesn't work correctly. we need to uninstall it. 
RUN VLLM_TARGET_DEVICE="empty" python3 -m pip install -v -e /vllm-workspace/vllm/ --extra-index https://download.pytorch.org/whl/cpu/ && \ diff --git a/Dockerfile.a3.openEuler b/Dockerfile.a3.openEuler index b03851c..afaf11d 100644 --- a/Dockerfile.a3.openEuler +++ b/Dockerfile.a3.openEuler @@ -34,7 +34,7 @@ COPY . /vllm-workspace/vllm-ascend/ # Install vLLM ARG VLLM_REPO=https://github.com/vllm-project/vllm.git -ARG VLLM_TAG=v0.10.0 +ARG VLLM_TAG=v0.10.1.1 RUN git clone --depth 1 $VLLM_REPO --branch $VLLM_TAG /vllm-workspace/vllm # In x86, triton will be installed by vllm. But in Ascend, triton doesn't work correctly. we need to uninstall it. diff --git a/Dockerfile.openEuler b/Dockerfile.openEuler index 1146d0a..b744b33 100644 --- a/Dockerfile.openEuler +++ b/Dockerfile.openEuler @@ -34,7 +34,7 @@ COPY . /vllm-workspace/vllm-ascend/ # Install vLLM ARG VLLM_REPO=https://github.com/vllm-project/vllm.git -ARG VLLM_TAG=v0.10.0 +ARG VLLM_TAG=v0.10.1.1 RUN git clone --depth 1 $VLLM_REPO --branch $VLLM_TAG /vllm-workspace/vllm # In x86, triton will be installed by vllm. But in Ascend, triton doesn't work correctly. we need to uninstall it. 
diff --git a/tests/e2e/multicard/test_offline_inference_distributed.py b/tests/e2e/multicard/test_offline_inference_distributed.py index e869c2d..f7354ab 100644 --- a/tests/e2e/multicard/test_offline_inference_distributed.py +++ b/tests/e2e/multicard/test_offline_inference_distributed.py @@ -78,26 +78,6 @@ def test_models_distributed_DeepSeek_multistream_moe(): vllm_model.generate_greedy(example_prompts, max_tokens) -@patch.dict(os.environ, {"VLLM_ASCEND_ENABLE_DBO": "1"}) -def test_models_distributed_DeepSeek_dbo(): - example_prompts = ["The president of the United States is"] * 41 - dtype = "half" - sampling_params = SamplingParams(max_tokens=100, temperature=0.0) - with VllmRunner( - "deepseek-ai/DeepSeek-V2-Lite", - dtype=dtype, - tensor_parallel_size=2, - distributed_executor_backend="mp", - ) as vllm_model: - model_arch = 'DeepseekV2ForCausalLM' - registed_models = ModelRegistry.models - assert registed_models[ - model_arch].module_name == "vllm_ascend.models.deepseek_dbo" - assert registed_models[ - model_arch].class_name == "CustomDeepseekDBOForCausalLM" - vllm_model.generate(example_prompts, sampling_params) - - @pytest.mark.skip( reason= "deepseek dbo dose not consider the support on half precision float, will enable this ut after we actually support it" diff --git a/tests/ut/core/test_scheduler.py b/tests/ut/core/test_scheduler.py index 78b0c65..6680a25 100644 --- a/tests/ut/core/test_scheduler.py +++ b/tests/ut/core/test_scheduler.py @@ -13,7 +13,7 @@ from vllm.v1.core.kv_cache_utils import (get_request_block_hasher, from vllm.v1.core.sched.output import SchedulerOutput from vllm.v1.kv_cache_interface import (FullAttentionSpec, KVCacheConfig, KVCacheGroupSpec) -from vllm.v1.outputs import DraftTokenIds, ModelRunnerOutput +from vllm.v1.outputs import ModelRunnerOutput from vllm.v1.request import Request, RequestStatus from vllm.v1.structured_output import StructuredOutputManager @@ -21,6 +21,11 @@ from tests.ut.base import TestBase from 
vllm_ascend.core.scheduler import AscendScheduler from vllm_ascend.utils import vllm_version_is +if not vllm_version_is("0.10.1.1"): + from vllm.v1.outputs import DraftTokenIds +else: + DraftTokenIds = None + EOS_TOKEN_ID = 50256 MODEL = "Qwen3-0.6B" ENABLE_PREFIX_CACHING = None @@ -66,16 +71,33 @@ def create_requests( def make_output(scheduler): - return ModelRunnerOutput( - req_ids=[req.request_id for req in scheduler.running], - req_id_to_index={ - req.request_id: i - for i, req in enumerate(scheduler.running) - }, - sampled_token_ids=[[1000]] * len(scheduler.running), - logprobs=None, - prompt_logprobs_dict={}, - pooler_output=[]) + req_ids = [req.request_id for req in scheduler.running] + req_id_to_index = { + req.request_id: i + for i, req in enumerate(scheduler.running) + } + sampled_token_ids = [[1000]] * len(scheduler.running) + logprobs = None + if vllm_version_is("0.10.1.1"): + modelrunner_output = ModelRunnerOutput( + req_ids=req_ids, + req_id_to_index=req_id_to_index, + sampled_token_ids=sampled_token_ids, + spec_token_ids=None, + logprobs=logprobs, + prompt_logprobs_dict={}, + pooler_output=[], + ) + else: + modelrunner_output = ModelRunnerOutput( + req_ids=req_ids, + req_id_to_index=req_id_to_index, + sampled_token_ids=sampled_token_ids, + logprobs=logprobs, + prompt_logprobs_dict={}, + pooler_output=[], + ) + return modelrunner_output class TestAscendScheduler(TestBase): @@ -271,8 +293,7 @@ class TestAscendScheduler(TestBase): req.num_computed_tokens = req.num_tokens scheduler.requests[req.request_id] = req scheduler.running.append(req) - if not vllm_version_is("0.9.2"): - req.status = RequestStatus.RUNNING + req.status = RequestStatus.RUNNING scheduler_output = SchedulerOutput(scheduled_new_reqs=[], scheduled_cached_reqs=[], @@ -291,18 +312,33 @@ class TestAscendScheduler(TestBase): free_encoder_input_ids=[], structured_output_request_ids={}, grammar_bitmask=None) - - model_output = ModelRunnerOutput( - req_ids=[req.request_id for req in requests], 
- req_id_to_index={ - req.request_id: i - for i, req in enumerate(requests) - }, - sampled_token_ids=[[EOS_TOKEN_ID], [10, 11] - ], # First request hits EOS, second continues - logprobs=None, - prompt_logprobs_dict={}, - pooler_output=[]) + if vllm_version_is("0.10.1.1"): + model_output = ModelRunnerOutput( + req_ids=[req.request_id for req in requests], + req_id_to_index={ + req.request_id: i + for i, req in enumerate(requests) + }, + sampled_token_ids=[[EOS_TOKEN_ID], [ + 10, 11 + ]], # First request hits EOS, second continues + spec_token_ids=None, + logprobs=None, + prompt_logprobs_dict={}, + pooler_output=[]) + else: + model_output = ModelRunnerOutput( + req_ids=[req.request_id for req in requests], + req_id_to_index={ + req.request_id: i + for i, req in enumerate(requests) + }, + sampled_token_ids=[[EOS_TOKEN_ID], [ + 10, 11 + ]], # First request hits EOS, second continues + logprobs=None, + prompt_logprobs_dict={}, + pooler_output=[]) scheduler.update_from_output(scheduler_output, model_output) @@ -325,8 +361,7 @@ class TestAscendScheduler(TestBase): req.num_computed_tokens = req.num_tokens scheduler.requests[req.request_id] = req scheduler.running.append(req) - if not vllm_version_is("0.9.2"): - req.status = RequestStatus.RUNNING + req.status = RequestStatus.RUNNING scheduler_output = SchedulerOutput(scheduled_new_reqs=[], scheduled_cached_reqs=[], @@ -346,18 +381,31 @@ class TestAscendScheduler(TestBase): free_encoder_input_ids=[], structured_output_request_ids={}, grammar_bitmask=None) - - model_output = ModelRunnerOutput( - req_ids=[req.request_id for req in requests], - req_id_to_index={ - req.request_id: i - for i, req in enumerate(requests) - }, - sampled_token_ids=[[10, 42, 12], - [13, 14]], # First request hits stop token - logprobs=None, - prompt_logprobs_dict={}, - pooler_output=[]) + if vllm_version_is("0.10.1.1"): + model_output = ModelRunnerOutput( + req_ids=[req.request_id for req in requests], + req_id_to_index={ + req.request_id: i + for i, 
req in enumerate(requests) + }, + sampled_token_ids=[[10, 42, 12], + [13, 14]], # First request hits stop token + spec_token_ids=None, + logprobs=None, + prompt_logprobs_dict={}, + pooler_output=[]) + else: + model_output = ModelRunnerOutput( + req_ids=[req.request_id for req in requests], + req_id_to_index={ + req.request_id: i + for i, req in enumerate(requests) + }, + sampled_token_ids=[[10, 42, 12], + [13, 14]], # First request hits stop token + logprobs=None, + prompt_logprobs_dict={}, + pooler_output=[]) scheduler.update_from_output(scheduler_output, model_output) @@ -379,8 +427,7 @@ class TestAscendScheduler(TestBase): req.num_computed_tokens = req.num_tokens scheduler.requests[req.request_id] = req scheduler.running.append(req) - if not vllm_version_is("0.9.2"): - req.status = RequestStatus.RUNNING + req.status = RequestStatus.RUNNING scheduler_output = SchedulerOutput(scheduled_new_reqs=[], scheduled_cached_reqs=[], @@ -401,18 +448,31 @@ class TestAscendScheduler(TestBase): structured_output_request_ids={}, grammar_bitmask=None) - model_output = ModelRunnerOutput( - req_ids=[req.request_id for req in requests], - req_id_to_index={ - req.request_id: i - for i, req in enumerate(requests) - }, - sampled_token_ids=[[10, 11, 12], - [13]], # First request exceeds max_tokens - logprobs=None, - prompt_logprobs_dict={}, - pooler_output=[]) - + if vllm_version_is("0.10.1.1"): + model_output = ModelRunnerOutput( + req_ids=[req.request_id for req in requests], + req_id_to_index={ + req.request_id: i + for i, req in enumerate(requests) + }, + sampled_token_ids=[[10, 11, 12], + [13]], # First request exceeds max_tokens + spec_token_ids=None, + logprobs=None, + prompt_logprobs_dict={}, + pooler_output=[]) + else: + model_output = ModelRunnerOutput( + req_ids=[req.request_id for req in requests], + req_id_to_index={ + req.request_id: i + for i, req in enumerate(requests) + }, + sampled_token_ids=[[10, 11, 12], + [13]], # First request exceeds max_tokens + logprobs=None, + 
prompt_logprobs_dict={}, + pooler_output=[]) scheduler.update_from_output(scheduler_output, model_output) # Verify first request stopped due to length @@ -448,13 +508,24 @@ class TestAscendScheduler(TestBase): structured_output_request_ids={}, grammar_bitmask=None) - model_output = ModelRunnerOutput( - req_ids=[requests[0].request_id], - req_id_to_index={requests[0].request_id: 0}, - sampled_token_ids=[[EOS_TOKEN_ID, 10, 11]], - logprobs=None, - prompt_logprobs_dict={}, - pooler_output=[]) + if vllm_version_is("0.10.1.1"): + model_output = ModelRunnerOutput( + req_ids=[requests[0].request_id], + req_id_to_index={requests[0].request_id: 0}, + sampled_token_ids=[[EOS_TOKEN_ID, 10, 11]], + spec_token_ids=None, + logprobs=None, + prompt_logprobs_dict={}, + pooler_output=[]) + + else: + model_output = ModelRunnerOutput( + req_ids=[requests[0].request_id], + req_id_to_index={requests[0].request_id: 0}, + sampled_token_ids=[[EOS_TOKEN_ID, 10, 11]], + logprobs=None, + prompt_logprobs_dict={}, + pooler_output=[]) scheduler.update_from_output(scheduler_output, model_output) @@ -505,13 +576,23 @@ class TestAscendScheduler(TestBase): 512) # Model output of the first request. 
- model_runner_output = ModelRunnerOutput( - req_ids=[requests[0].request_id], - req_id_to_index={requests[0].request_id: 0}, - sampled_token_ids=[[0]], - logprobs=None, - prompt_logprobs_dict={}, - pooler_output=[]) + if vllm_version_is("0.10.1.1"): + model_runner_output = ModelRunnerOutput( + req_ids=[requests[0].request_id], + req_id_to_index={requests[0].request_id: 0}, + sampled_token_ids=[[0]], + spec_token_ids=None, + logprobs=None, + prompt_logprobs_dict={}, + pooler_output=[]) + else: + model_runner_output = ModelRunnerOutput( + req_ids=[requests[0].request_id], + req_id_to_index={requests[0].request_id: 0}, + sampled_token_ids=[[0]], + logprobs=None, + prompt_logprobs_dict={}, + pooler_output=[]) scheduler.update_from_output(scheduler_output0, model_runner_output) @@ -521,13 +602,23 @@ class TestAscendScheduler(TestBase): # request is still running. scheduler.schedule() # Model output of the second request. - model_runner_output = ModelRunnerOutput( - req_ids=[requests[1].request_id], - req_id_to_index={requests[1].request_id: 0}, - sampled_token_ids=[[0]], - logprobs=None, - prompt_logprobs_dict={}, - pooler_output=[]) + if vllm_version_is("0.10.1.1"): + model_runner_output = ModelRunnerOutput( + req_ids=[requests[1].request_id], + req_id_to_index={requests[1].request_id: 0}, + sampled_token_ids=[[0]], + spec_token_ids=None, + logprobs=None, + prompt_logprobs_dict={}, + pooler_output=[]) + else: + model_runner_output = ModelRunnerOutput( + req_ids=[requests[1].request_id], + req_id_to_index={requests[1].request_id: 0}, + sampled_token_ids=[[0]], + logprobs=None, + prompt_logprobs_dict={}, + pooler_output=[]) scheduler.update_from_output(scheduler_output1, model_runner_output) @@ -579,19 +670,29 @@ class TestAscendScheduler(TestBase): req_id = requests[i].request_id self.assertEqual(output.num_scheduled_tokens[req_id], 1) self.assertNotIn(req_id, output.scheduled_spec_decode_tokens) - - model_runner_output = ModelRunnerOutput( - req_ids=req_ids, - 
req_id_to_index=req_to_index, - sampled_token_ids=[[0] for _ in range(len(requests))], - logprobs=None, - prompt_logprobs_dict={}, - pooler_output=[]) - draft_token_ids = DraftTokenIds(req_ids, spec_tokens) + if vllm_version_is("0.10.1.1"): + model_runner_output = ModelRunnerOutput( + req_ids=req_ids, + req_id_to_index=req_to_index, + sampled_token_ids=[[0] for _ in range(len(requests))], + logprobs=None, + prompt_logprobs_dict={}, + spec_token_ids=spec_tokens, + pooler_output=[]) + else: + model_runner_output = ModelRunnerOutput( + req_ids=req_ids, + req_id_to_index=req_to_index, + sampled_token_ids=[[0] for _ in range(len(requests))], + logprobs=None, + prompt_logprobs_dict={}, + pooler_output=[]) + draft_token_ids = DraftTokenIds(req_ids, spec_tokens) engine_core_outputs = scheduler.update_from_output( output, model_runner_output) - scheduler.update_draft_token_ids(draft_token_ids) + if not vllm_version_is("0.10.1.1"): + scheduler.update_draft_token_ids(draft_token_ids) for i in range(len(requests)): running_req = scheduler.running[i] @@ -627,14 +728,23 @@ class TestAscendScheduler(TestBase): else: self.assertNotIn(req_id, output.scheduled_spec_decode_tokens) - - model_runner_output = ModelRunnerOutput( - req_ids=req_ids, - req_id_to_index=req_to_index, - sampled_token_ids=output_tokens, - logprobs=None, - prompt_logprobs_dict={}, - pooler_output=[]) + if vllm_version_is("0.10.1.1"): + model_runner_output = ModelRunnerOutput( + req_ids=req_ids, + req_id_to_index=req_to_index, + sampled_token_ids=output_tokens, + spec_token_ids=None, + logprobs=None, + prompt_logprobs_dict={}, + pooler_output=[]) + else: + model_runner_output = ModelRunnerOutput( + req_ids=req_ids, + req_id_to_index=req_to_index, + sampled_token_ids=output_tokens, + logprobs=None, + prompt_logprobs_dict={}, + pooler_output=[]) engine_core_outputs = scheduler.update_from_output( output, model_runner_output) diff --git a/tests/ut/kv_connector/utils.py b/tests/ut/kv_connector/utils.py index 
9dc6dfc..c2e0a1f 100644 --- a/tests/ut/kv_connector/utils.py +++ b/tests/ut/kv_connector/utils.py @@ -200,12 +200,26 @@ def create_model_runner_output( kv_connector_output = KVConnectorOutput(finished_sending=finished_sending, finished_recving=finished_recving) extra_args = {"kv_connector_output": kv_connector_output} - return ModelRunnerOutput( - req_ids=req_ids, - req_id_to_index=req_id_to_index, - sampled_token_ids=sampled_token_ids, - logprobs=None, - prompt_logprobs_dict={}, - pooler_output=[], - **extra_args, - ) + if vllm_version_is("0.10.1.1"): + model_runner_output = ModelRunnerOutput( + req_ids=req_ids, + req_id_to_index=req_id_to_index, + sampled_token_ids=sampled_token_ids, + spec_token_ids=None, + logprobs=None, + prompt_logprobs_dict={}, + pooler_output=[], + **extra_args, + ) + else: + model_runner_output = ModelRunnerOutput( + req_ids=req_ids, + req_id_to_index=req_id_to_index, + sampled_token_ids=sampled_token_ids, + logprobs=None, + prompt_logprobs_dict={}, + pooler_output=[], + **extra_args, + ) + + return model_runner_output diff --git a/vllm_ascend/core/scheduler.py b/vllm_ascend/core/scheduler.py index dfdc9aa..627d5ea 100644 --- a/vllm_ascend/core/scheduler.py +++ b/vllm_ascend/core/scheduler.py @@ -31,6 +31,13 @@ from vllm.v1.outputs import ModelRunnerOutput from vllm.v1.request import Request, RequestStatus from vllm.v1.structured_output import StructuredOutputManager +from vllm_ascend.utils import vllm_version_is + +if vllm_version_is("0.10.1.1"): + from vllm.v1.core.kv_cache_manager import KVCacheBlocks +else: + KVCacheBlocks = None + class AscendScheduler(Scheduler): """This Scheduler extends vllm's original v1 scheduler @@ -59,7 +66,10 @@ class AscendScheduler(Scheduler): scheduled_running_reqs: list[Request] = [] preempted_reqs: list[Request] = [] - req_to_new_block_ids: dict[str, list[int]] = {} + if vllm_version_is("0.10.1.1"): + req_to_new_block_ids: dict[str, list[int]] = {} + else: + req_to_new_blocks: dict[str, KVCacheBlocks] = 
{} num_scheduled_tokens: dict[str, int] = {} token_budget = self.max_num_scheduled_tokens # Spec decode-related. @@ -217,8 +227,11 @@ class AscendScheduler(Scheduler): if self.lora_config and request.lora_request: scheduled_loras.add(request.lora_request.lora_int_id) - req_to_new_block_ids[request.request_id] = ( - self.kv_cache_manager.get_block_ids(request.request_id)) + if vllm_version_is("0.10.1.1"): + req_to_new_block_ids[request.request_id] = ( + self.kv_cache_manager.get_block_ids(request.request_id)) + else: + req_to_new_blocks[request.request_id] = new_blocks # Update request info. num_scheduled_tokens[request.request_id] = num_new_tokens token_budget -= num_new_tokens @@ -307,8 +320,11 @@ class AscendScheduler(Scheduler): # Schedule the request. scheduled_running_reqs.append(request) self.scheduled_req_ids.add(request.request_id) - req_to_new_block_ids[request.request_id] = ( - new_blocks.get_block_ids()) + if vllm_version_is("0.10.1.1"): + req_to_new_block_ids[request.request_id] = ( + new_blocks.get_block_ids()) + else: + req_to_new_blocks[request.request_id] = new_blocks num_scheduled_tokens[request.request_id] = num_new_tokens token_budget -= num_new_tokens req_index += 1 @@ -346,16 +362,27 @@ class AscendScheduler(Scheduler): any_request, len(self.running))) # Construct the scheduler output. 
- new_reqs_data = [ - NewRequestData.from_request(req, - req_to_new_block_ids[req.request_id]) - for req in scheduled_new_reqs - ] + if vllm_version_is("0.10.1.1"): + new_reqs_data = [ + NewRequestData.from_request( + req, req_to_new_block_ids[req.request_id]) + for req in scheduled_new_reqs + ] + cached_reqs_data = self._make_cached_request_data( + scheduled_running_reqs, scheduled_resumed_reqs, + num_scheduled_tokens, scheduled_spec_decode_tokens, + req_to_new_block_ids) + else: + new_reqs_data = [ + NewRequestData.from_request( + req, req_to_new_blocks[req.request_id].get_block_ids()) + for req in scheduled_new_reqs + ] - cached_reqs_data = self._make_cached_request_data( - scheduled_running_reqs, scheduled_resumed_reqs, - num_scheduled_tokens, scheduled_spec_decode_tokens, - req_to_new_block_ids) + cached_reqs_data = self._make_cached_request_data( + scheduled_running_reqs, scheduled_resumed_reqs, + num_scheduled_tokens, scheduled_spec_decode_tokens, + req_to_new_blocks) scheduled_cached_reqs = cached_reqs_data scheduler_output = SchedulerOutput( diff --git a/vllm_ascend/models/qwen3_moe.py b/vllm_ascend/models/qwen3_moe.py index 29ab675..0df8377 100644 --- a/vllm_ascend/models/qwen3_moe.py +++ b/vllm_ascend/models/qwen3_moe.py @@ -50,6 +50,7 @@ from vllm.sequence import IntermediateTensors from vllm_ascend.ops.fused_moe import AscendFusedMoE from vllm_ascend.ops.sequence_parallel import (MetadataForPadding, init_metadata_for_sp) +from vllm_ascend.utils import vllm_version_is class CustomSparseMoeBlock(Qwen3MoeSparseMoeBlock): @@ -253,7 +254,11 @@ class CustomQwen3MoeModel(Qwen3MoeModel): quant_config = vllm_config.quant_config parallel_config = vllm_config.parallel_config - self.num_redundant_experts = parallel_config.num_redundant_experts + if vllm_version_is("0.10.1.1"): + self.num_redundant_experts = parallel_config.num_redundant_experts + else: + eplb_config = parallel_config.eplb_config + self.num_redundant_experts = eplb_config.num_redundant_experts 
self.padding_idx = config.pad_token_id self.vocab_size = config.vocab_size self.config = config diff --git a/vllm_ascend/sample/sampler.py b/vllm_ascend/sample/sampler.py index c082f98..d3f1ae9 100644 --- a/vllm_ascend/sample/sampler.py +++ b/vllm_ascend/sample/sampler.py @@ -3,12 +3,19 @@ import torch_npu from vllm.v1.sample.ops.topk_topp_sampler import TopKTopPSampler, random_sample from vllm.v1.sample.sampler import Sampler -from vllm_ascend.utils import is_310p +from vllm_ascend.utils import is_310p, vllm_version_is + +if not vllm_version_is("0.10.1.1"): + from vllm.config import LogprobsMode + DEFAULT_LOGPROBS_MODE = LogprobsMode.RAW_LOGPROBS +else: + LogprobsMode = None + DEFAULT_LOGPROBS_MODE = "raw_logprobs" class AscendSampler(Sampler): - def __init__(self, logprobs_mode="raw_logprobs"): + def __init__(self, logprobs_mode=DEFAULT_LOGPROBS_MODE): # TODO: support logprobs_mode in vllm-ascend super().__init__(logprobs_mode=logprobs_mode) self.topk_topp_sampler = AscendTopKTopPSampler() @@ -61,5 +68,19 @@ class AscendTopKTopPSampler(TopKTopPSampler): def forward_native(self, logits, generators, k, p): """Override pytorch native implementation to torch_npu""" logits = self._apply_top_k_top_p(logits, k, p) + if not vllm_version_is("0.10.1.1"): + + logits_to_return = None + if self.logprobs_mode == LogprobsMode.PROCESSED_LOGITS: + logits_to_return = logits + elif self.logprobs_mode == LogprobsMode.PROCESSED_LOGPROBS: + logits_to_return = logits.log_softmax(dim=-1, + dtype=torch.float32) + probs = logits.softmax(dim=-1, dtype=torch.float32) - return random_sample(probs, generators) + output = None + if vllm_version_is("0.10.1.1"): + output = random_sample(probs, generators) + else: + output = (random_sample(probs, generators), logits_to_return) + return output diff --git a/vllm_ascend/worker/model_runner_v1.py b/vllm_ascend/worker/model_runner_v1.py index d250055..859f21e 100644 --- a/vllm_ascend/worker/model_runner_v1.py +++ 
b/vllm_ascend/worker/model_runner_v1.py @@ -64,8 +64,8 @@ from vllm.utils import (STR_DTYPE_TO_TORCH_DTYPE, DeviceMemoryProfiler, from vllm.v1.cudagraph_dispatcher import CudagraphDispatcher from vllm.v1.kv_cache_interface import (FullAttentionSpec, KVCacheConfig, KVCacheSpec) -from vllm.v1.outputs import (EMPTY_MODEL_RUNNER_OUTPUT, DraftTokenIds, - LogprobsTensors, ModelRunnerOutput) +from vllm.v1.outputs import (EMPTY_MODEL_RUNNER_OUTPUT, LogprobsTensors, + ModelRunnerOutput) from vllm.v1.pool.metadata import PoolingMetadata from vllm.v1.sample.logits_processor import build_logitsprocs from vllm.v1.sample.metadata import SamplingMetadata @@ -95,11 +95,17 @@ from vllm_ascend.torchair.torchair_attention import AscendTorchairMetadata from vllm_ascend.torchair.torchair_mla import AscendMLATorchairMetadata from vllm_ascend.utils import (ACL_FORMAT_FRACTAL_ND, ACL_FORMAT_FRACTAL_NZ, ProfileExecuteDuration, is_310p, - maybe_converting_weight_acl_format) + maybe_converting_weight_acl_format, + vllm_version_is) from vllm_ascend.worker.eagle_proposer_v1 import EagleProposer from vllm_ascend.worker.mtp_proposer_v1 import MtpProposer from vllm_ascend.worker.npu_input_batch import CachedRequestState, InputBatch +if not vllm_version_is("0.10.1.1"): + from vllm.v1.outputs import DraftTokenIds +else: + DraftTokenIds = None + if TYPE_CHECKING: import xgrammar as xgr # type: ignore[import-untyped] from vllm.v1.core.sched.output import SchedulerOutput @@ -514,11 +520,13 @@ class NPUModelRunner(LoRAModelRunnerMixin): # Update the block IDs. if not resumed_from_preemption: - # Append the new blocks to the existing block IDs. - for block_ids, new_ids in zip(req_state.block_ids, - new_block_ids): - block_ids.extend(new_ids) + if new_block_ids is not None: + # Append the new blocks to the existing block IDs. + for block_ids, new_ids in zip(req_state.block_ids, + new_block_ids): + block_ids.extend(new_ids) else: + assert new_block_ids is not None # The request is resumed from preemption. 
# Replace the existing block IDs with the new ones. req_state.block_ids = new_block_ids @@ -534,7 +542,9 @@ class NPUModelRunner(LoRAModelRunnerMixin): # Update the persistent batch. self.input_batch.num_computed_tokens_cpu[req_index] = ( num_computed_tokens) - self.input_batch.block_table.append_row(new_block_ids, req_index) + if new_block_ids is not None: + self.input_batch.block_table.append_row( + new_block_ids, req_index) # For the last rank, we don't need to update the token_ids_cpu # because the sampled tokens are already cached. @@ -1526,16 +1536,28 @@ class NPUModelRunner(LoRAModelRunnerMixin): else: pooler_output.append(None) extra_args = ({"kv_connector_output": kv_connector_output}) - - return ModelRunnerOutput( - req_ids=self.input_batch.req_ids, - req_id_to_index=self.input_batch.req_id_to_index, - sampled_token_ids=[], - logprobs=None, - prompt_logprobs_dict={}, - pooler_output=pooler_output, - **extra_args, - ) + if vllm_version_is("0.10.1.1"): + modelrunner_output = ModelRunnerOutput( + req_ids=self.input_batch.req_ids, + req_id_to_index=self.input_batch.req_id_to_index, + sampled_token_ids=[], + spec_token_ids=None, + logprobs=None, + prompt_logprobs_dict={}, + pooler_output=pooler_output, + **extra_args, + ) + else: + modelrunner_output = ModelRunnerOutput( + req_ids=self.input_batch.req_ids, + req_id_to_index=self.input_batch.req_id_to_index, + sampled_token_ids=[], + logprobs=None, + prompt_logprobs_dict={}, + pooler_output=pooler_output, + **extra_args, + ) + return modelrunner_output @torch.inference_mode() def execute_model( @@ -1757,15 +1779,27 @@ class NPUModelRunner(LoRAModelRunnerMixin): extra_args = ({"kv_connector_output": kv_connector_output}) - model_runner_output = ModelRunnerOutput( - req_ids=self.input_batch.req_ids, - req_id_to_index=self.input_batch.req_id_to_index, - sampled_token_ids=valid_sampled_token_ids, - logprobs=logprobs_lists, - prompt_logprobs_dict=prompt_logprobs_dict, - pooler_output=[], - **extra_args, - ) + if 
vllm_version_is("0.10.1.1"): + model_runner_output = ModelRunnerOutput( + req_ids=self.input_batch.req_ids, + req_id_to_index=self.input_batch.req_id_to_index, + sampled_token_ids=valid_sampled_token_ids, + logprobs=logprobs_lists, + spec_token_ids=self._draft_token_ids, + prompt_logprobs_dict=prompt_logprobs_dict, + pooler_output=[], + **extra_args, + ) + else: + model_runner_output = ModelRunnerOutput( + req_ids=self.input_batch.req_ids, + req_id_to_index=self.input_batch.req_id_to_index, + sampled_token_ids=valid_sampled_token_ids, + logprobs=logprobs_lists, + prompt_logprobs_dict=prompt_logprobs_dict, + pooler_output=[], + **extra_args, + ) durations = ProfileExecuteDuration().pop_captured_sync() if durations: