[CI] fix ci (#2464)

### What this PR does / why we need it?
1. use action/checkout@v5 instead of v4
2. remove dbo test case because there is issue with it and will be
refactored later
3. make vllm-ascend compatible with vllm v0.10.1.1 and add CI for it
4. fix sampler api changes introduced by
https://github.com/vllm-project/vllm/pull/22387
5. fix qwen3 moe config changes introduced by
https://github.com/vllm-project/vllm/pull/20562
6. fix kvcache block changes introduced by
https://github.com/vllm-project/vllm/pull/23262

### Does this PR introduce _any_ user-facing change?
N/A

### How was this patch tested?
CI passed with existing test.


- vLLM version: v0.10.0
- vLLM main:
0c6e40bbaa

---------

Signed-off-by: MengqingCao <cmq0113@163.com>
This commit is contained in:
Mengqing Cao
2025-08-22 07:30:48 +08:00
committed by GitHub
parent 0ca3f48c90
commit b0403f8d8a
27 changed files with 389 additions and 199 deletions

View File

@@ -88,7 +88,7 @@ jobs:
steps: steps:
- name: Checkout repository - name: Checkout repository
uses: actions/checkout@v5 uses: actions/checkout@v4
- name: Set model name as output - name: Set model name as output
id: set_output id: set_output
@@ -109,7 +109,7 @@ jobs:
apt-get -y install gcc g++ cmake libnuma-dev apt-get -y install gcc g++ cmake libnuma-dev
- name: Checkout vllm-project/vllm repo - name: Checkout vllm-project/vllm repo
uses: actions/checkout@v5 uses: actions/checkout@v4
with: with:
repository: vllm-project/vllm repository: vllm-project/vllm
ref: v0.10.0 ref: v0.10.0
@@ -138,7 +138,7 @@ jobs:
echo "GHA_VLLM_ASCEND_VERSION=$RESOLVED_VERSION" >> $GITHUB_ENV echo "GHA_VLLM_ASCEND_VERSION=$RESOLVED_VERSION" >> $GITHUB_ENV
- name: Checkout vllm-project/vllm-ascend repo - name: Checkout vllm-project/vllm-ascend repo
uses: actions/checkout@v5 uses: actions/checkout@v4
with: with:
repository: vllm-project/vllm-ascend repository: vllm-project/vllm-ascend
path: ./vllm-ascend path: ./vllm-ascend
@@ -236,7 +236,7 @@ jobs:
UPSTREAM_REPO: vllm-project/vllm-ascend UPSTREAM_REPO: vllm-project/vllm-ascend
steps: steps:
- name: Checkout repository - name: Checkout repository
uses: actions/checkout@v5 uses: actions/checkout@v4
with: with:
repository: vllm-ascend-ci/vllm-ascend repository: vllm-ascend-ci/vllm-ascend
token: ${{ secrets.PAT_TOKEN }} token: ${{ secrets.PAT_TOKEN }}

View File

@@ -34,7 +34,7 @@ jobs:
steps: steps:
- name: Checkout vllm-project/vllm repo - name: Checkout vllm-project/vllm repo
uses: actions/checkout@v5 uses: actions/checkout@v4
with: with:
repository: vllm-project/vllm repository: vllm-project/vllm
path: ./vllm-empty path: ./vllm-empty

View File

@@ -53,7 +53,7 @@ jobs:
'ubuntu-24.04-arm' 'ubuntu-24.04-arm'
}} }}
steps: steps:
- uses: actions/checkout@v5 - uses: actions/checkout@v4
- name: Print - name: Print
run: | run: |

View File

@@ -49,7 +49,7 @@ jobs:
runs-on: ubuntu-latest runs-on: ubuntu-latest
steps: steps:
- uses: actions/checkout@v5 - uses: actions/checkout@v4
- name: Print - name: Print
run: | run: |

View File

@@ -53,7 +53,7 @@ jobs:
'ubuntu-24.04-arm' 'ubuntu-24.04-arm'
}} }}
steps: steps:
- uses: actions/checkout@v5 - uses: actions/checkout@v4
- name: Print - name: Print
run: | run: |

View File

@@ -49,7 +49,7 @@ jobs:
runs-on: ubuntu-latest runs-on: ubuntu-latest
steps: steps:
- uses: actions/checkout@v5 - uses: actions/checkout@v4
- name: Print - name: Print
run: | run: |

View File

@@ -52,7 +52,7 @@ jobs:
'ubuntu-24.04-arm' 'ubuntu-24.04-arm'
}} }}
steps: steps:
- uses: actions/checkout@v5 - uses: actions/checkout@v4
- name: Print - name: Print
run: | run: |

View File

@@ -49,7 +49,7 @@ jobs:
runs-on: ubuntu-latest runs-on: ubuntu-latest
steps: steps:
- uses: actions/checkout@v5 - uses: actions/checkout@v4
- name: Print - name: Print
run: | run: |

View File

@@ -97,12 +97,12 @@ jobs:
git config --global url."https://gh-proxy.test.osinfra.cn/https://github.com/".insteadOf https://github.com/ git config --global url."https://gh-proxy.test.osinfra.cn/https://github.com/".insteadOf https://github.com/
- name: Checkout vllm-project/vllm-ascend repo - name: Checkout vllm-project/vllm-ascend repo
uses: actions/checkout@v5 uses: actions/checkout@v4
with: with:
fetch-depth: 0 fetch-depth: 0
- name: Checkout vllm-project/vllm repo - name: Checkout vllm-project/vllm repo
uses: actions/checkout@v5 uses: actions/checkout@v4
with: with:
repository: vllm-project/vllm repository: vllm-project/vllm
path: ./vllm-empty path: ./vllm-empty

View File

@@ -11,14 +11,14 @@ jobs:
runs-on: ubuntu-latest runs-on: ubuntu-latest
steps: steps:
- name: Checkout vllm-project/vllm-ascend repo - name: Checkout vllm-project/vllm-ascend repo
uses: actions/checkout@v5 uses: actions/checkout@v4
- uses: actions/setup-python@42375524e23c412d93fb67b49958b491fce71c38 # v5.4.0 - uses: actions/setup-python@42375524e23c412d93fb67b49958b491fce71c38 # v5.4.0
with: with:
python-version: "3.11" python-version: "3.11"
- run: echo "::add-matcher::.github/workflows/matchers/actionlint.json" - run: echo "::add-matcher::.github/workflows/matchers/actionlint.json"
- run: echo "::add-matcher::.github/workflows/matchers/mypy.json" - run: echo "::add-matcher::.github/workflows/matchers/mypy.json"
- name: Checkout vllm-project/vllm repo - name: Checkout vllm-project/vllm repo
uses: actions/checkout@v5 uses: actions/checkout@v4
with: with:
repository: vllm-project/vllm repository: vllm-project/vllm
path: ./vllm-empty path: ./vllm-empty

View File

@@ -66,7 +66,7 @@ jobs:
git --no-pager log -1 || true git --no-pager log -1 || true
- name: Checkout vllm-project/vllm-ascend repo - name: Checkout vllm-project/vllm-ascend repo
uses: actions/checkout@v5 uses: actions/checkout@v4
- name: Run vllm-ascend/tests/e2e/run_doctests.sh - name: Run vllm-ascend/tests/e2e/run_doctests.sh
run: | run: |

View File

@@ -81,7 +81,7 @@ jobs:
VLLM_USE_MODELSCOPE: True VLLM_USE_MODELSCOPE: True
strategy: strategy:
matrix: matrix:
vllm_version: [main] vllm_version: [v0.10.1.1, main]
steps: steps:
- name: Install packages - name: Install packages
run: | run: |
@@ -89,7 +89,7 @@ jobs:
apt-get install -y python3-pip git vim wget net-tools gcc g++ cmake libnuma-dev curl gnupg2 apt-get install -y python3-pip git vim wget net-tools gcc g++ cmake libnuma-dev curl gnupg2
- name: Checkout vllm-project/vllm repo - name: Checkout vllm-project/vllm repo
uses: actions/checkout@v5 uses: actions/checkout@v4
with: with:
repository: vllm-project/vllm repository: vllm-project/vllm
ref: ${{ matrix.vllm_version }} ref: ${{ matrix.vllm_version }}
@@ -102,7 +102,7 @@ jobs:
python3 -m pip uninstall -y triton python3 -m pip uninstall -y triton
- name: Checkout vllm-project/vllm-ascend repo - name: Checkout vllm-project/vllm-ascend repo
uses: actions/checkout@v5 uses: actions/checkout@v4
- name: Install vllm-project/vllm-ascend - name: Install vllm-project/vllm-ascend
run: | run: |
@@ -137,7 +137,7 @@ jobs:
max-parallel: 2 max-parallel: 2
matrix: matrix:
os: [linux-aarch64-a2-1] os: [linux-aarch64-a2-1]
vllm_version: [main] vllm_version: [v0.10.1.1, main]
name: singlecard e2e test name: singlecard e2e test
runs-on: ${{ matrix.os }} runs-on: ${{ matrix.os }}
container: container:
@@ -219,7 +219,7 @@ jobs:
max-parallel: 2 max-parallel: 2
matrix: matrix:
os: [linux-aarch64-a2-2] os: [linux-aarch64-a2-2]
vllm_version: [main] vllm_version: [v0.10.1.1, main]
name: multicard e2e test name: multicard e2e test
runs-on: ${{ matrix.os }} runs-on: ${{ matrix.os }}
container: container:
@@ -278,7 +278,6 @@ jobs:
# To avoid oom, we need to run the test in a single process. # To avoid oom, we need to run the test in a single process.
pytest -sv tests/e2e/multicard/test_offline_inference_distributed.py::test_models_distributed_DeepSeek_multistream_moe pytest -sv tests/e2e/multicard/test_offline_inference_distributed.py::test_models_distributed_DeepSeek_multistream_moe
pytest -sv tests/e2e/multicard/test_offline_inference_distributed.py::test_models_distributed_QwQ pytest -sv tests/e2e/multicard/test_offline_inference_distributed.py::test_models_distributed_QwQ
pytest -sv tests/e2e/multicard/test_offline_inference_distributed.py::test_models_distributed_DeepSeek_dbo
pytest -sv tests/e2e/multicard/test_offline_inference_distributed.py::test_models_distributed_DeepSeekV3_dbo pytest -sv tests/e2e/multicard/test_offline_inference_distributed.py::test_models_distributed_DeepSeekV3_dbo
pytest -sv tests/e2e/multicard/test_offline_inference_distributed.py::test_models_distributed_alltoallv pytest -sv tests/e2e/multicard/test_offline_inference_distributed.py::test_models_distributed_alltoallv
pytest -sv tests/e2e/multicard/test_offline_inference_distributed.py::test_models_distributed_Qwen3_W4A8DYNAMIC pytest -sv tests/e2e/multicard/test_offline_inference_distributed.py::test_models_distributed_Qwen3_W4A8DYNAMIC

View File

@@ -53,7 +53,7 @@ jobs:
max-parallel: 2 max-parallel: 2
matrix: matrix:
os: [linux-aarch64-310p-1, linux-aarch64-310p-4] os: [linux-aarch64-310p-1, linux-aarch64-310p-4]
vllm_version: [main] vllm_version: [v0.10.1.1, main]
name: 310p e2e test name: 310p e2e test
runs-on: ${{ matrix.os }} runs-on: ${{ matrix.os }}
container: container:
@@ -77,7 +77,7 @@ jobs:
apt install git -y apt install git -y
- name: Checkout vllm-project/vllm-ascend repo - name: Checkout vllm-project/vllm-ascend repo
uses: actions/checkout@v5 uses: actions/checkout@v4
- name: Install system dependencies - name: Install system dependencies
run: | run: |
@@ -85,7 +85,7 @@ jobs:
apt-get -y install git vim wget net-tools gcc g++ cmake libnuma-dev curl gnupg2 apt-get -y install git vim wget net-tools gcc g++ cmake libnuma-dev curl gnupg2
- name: Checkout vllm-project/vllm repo - name: Checkout vllm-project/vllm repo
uses: actions/checkout@v5 uses: actions/checkout@v4
with: with:
repository: vllm-project/vllm repository: vllm-project/vllm
ref: ${{ matrix.vllm_version }} ref: ${{ matrix.vllm_version }}

View File

@@ -80,7 +80,7 @@ jobs:
git config --global url."https://gh-proxy.test.osinfra.cn/https://github.com/".insteadOf https://github.com/ git config --global url."https://gh-proxy.test.osinfra.cn/https://github.com/".insteadOf https://github.com/
- name: Checkout vllm-project/vllm-ascend repo - name: Checkout vllm-project/vllm-ascend repo
uses: actions/checkout@v5 uses: actions/checkout@v4
- name: Install system dependencies - name: Install system dependencies
run: | run: |
@@ -88,7 +88,7 @@ jobs:
apt-get -y install gcc g++ cmake libnuma-dev apt-get -y install gcc g++ cmake libnuma-dev
- name: Checkout vllm-project/vllm repo - name: Checkout vllm-project/vllm repo
uses: actions/checkout@v5 uses: actions/checkout@v4
with: with:
repository: vllm-project/vllm repository: vllm-project/vllm
ref: ${{ matrix.vllm_verison }} ref: ${{ matrix.vllm_verison }}

View File

@@ -37,7 +37,7 @@ RUN pip config set global.index-url ${PIP_INDEX_URL}
# Install vLLM # Install vLLM
ARG VLLM_REPO=https://github.com/vllm-project/vllm.git ARG VLLM_REPO=https://github.com/vllm-project/vllm.git
ARG VLLM_TAG=v0.10.0 ARG VLLM_TAG=v0.10.1.1
RUN git clone --depth 1 $VLLM_REPO --branch $VLLM_TAG /vllm-workspace/vllm RUN git clone --depth 1 $VLLM_REPO --branch $VLLM_TAG /vllm-workspace/vllm
# In x86, triton will be installed by vllm. But in Ascend, triton doesn't work correctly. we need to uninstall it. # In x86, triton will be installed by vllm. But in Ascend, triton doesn't work correctly. we need to uninstall it.
RUN VLLM_TARGET_DEVICE="empty" python3 -m pip install -v -e /vllm-workspace/vllm/ --extra-index https://download.pytorch.org/whl/cpu/ && \ RUN VLLM_TARGET_DEVICE="empty" python3 -m pip install -v -e /vllm-workspace/vllm/ --extra-index https://download.pytorch.org/whl/cpu/ && \

View File

@@ -37,7 +37,7 @@ RUN pip config set global.index-url ${PIP_INDEX_URL}
# Install vLLM # Install vLLM
ARG VLLM_REPO=https://github.com/vllm-project/vllm.git ARG VLLM_REPO=https://github.com/vllm-project/vllm.git
ARG VLLM_TAG=v0.10.0 ARG VLLM_TAG=v0.10.1.1
RUN git clone --depth 1 $VLLM_REPO --branch $VLLM_TAG /vllm-workspace/vllm RUN git clone --depth 1 $VLLM_REPO --branch $VLLM_TAG /vllm-workspace/vllm
# In x86, triton will be installed by vllm. But in Ascend, triton doesn't work correctly. we need to uninstall it. # In x86, triton will be installed by vllm. But in Ascend, triton doesn't work correctly. we need to uninstall it.
RUN VLLM_TARGET_DEVICE="empty" python3 -m pip install -v -e /vllm-workspace/vllm/ --extra-index https://download.pytorch.org/whl/cpu/ && \ RUN VLLM_TARGET_DEVICE="empty" python3 -m pip install -v -e /vllm-workspace/vllm/ --extra-index https://download.pytorch.org/whl/cpu/ && \

View File

@@ -34,7 +34,7 @@ COPY . /vllm-workspace/vllm-ascend/
# Install vLLM # Install vLLM
ARG VLLM_REPO=https://github.com/vllm-project/vllm.git ARG VLLM_REPO=https://github.com/vllm-project/vllm.git
ARG VLLM_TAG=v0.10.0 ARG VLLM_TAG=v0.10.1.1
RUN git clone --depth 1 $VLLM_REPO --branch $VLLM_TAG /vllm-workspace/vllm RUN git clone --depth 1 $VLLM_REPO --branch $VLLM_TAG /vllm-workspace/vllm
# In x86, triton will be installed by vllm. But in Ascend, triton doesn't work correctly. we need to uninstall it. # In x86, triton will be installed by vllm. But in Ascend, triton doesn't work correctly. we need to uninstall it.

View File

@@ -37,7 +37,7 @@ RUN pip config set global.index-url ${PIP_INDEX_URL}
# Install vLLM # Install vLLM
ARG VLLM_REPO=https://github.com/vllm-project/vllm.git ARG VLLM_REPO=https://github.com/vllm-project/vllm.git
ARG VLLM_TAG=v0.10.0 ARG VLLM_TAG=v0.10.1.1
RUN git clone --depth 1 $VLLM_REPO --branch $VLLM_TAG /vllm-workspace/vllm RUN git clone --depth 1 $VLLM_REPO --branch $VLLM_TAG /vllm-workspace/vllm
# In x86, triton will be installed by vllm. But in Ascend, triton doesn't work correctly. we need to uninstall it. # In x86, triton will be installed by vllm. But in Ascend, triton doesn't work correctly. we need to uninstall it.
RUN VLLM_TARGET_DEVICE="empty" python3 -m pip install -v -e /vllm-workspace/vllm/ --extra-index https://download.pytorch.org/whl/cpu/ && \ RUN VLLM_TARGET_DEVICE="empty" python3 -m pip install -v -e /vllm-workspace/vllm/ --extra-index https://download.pytorch.org/whl/cpu/ && \

View File

@@ -34,7 +34,7 @@ COPY . /vllm-workspace/vllm-ascend/
# Install vLLM # Install vLLM
ARG VLLM_REPO=https://github.com/vllm-project/vllm.git ARG VLLM_REPO=https://github.com/vllm-project/vllm.git
ARG VLLM_TAG=v0.10.0 ARG VLLM_TAG=v0.10.1.1
RUN git clone --depth 1 $VLLM_REPO --branch $VLLM_TAG /vllm-workspace/vllm RUN git clone --depth 1 $VLLM_REPO --branch $VLLM_TAG /vllm-workspace/vllm
# In x86, triton will be installed by vllm. But in Ascend, triton doesn't work correctly. we need to uninstall it. # In x86, triton will be installed by vllm. But in Ascend, triton doesn't work correctly. we need to uninstall it.

View File

@@ -34,7 +34,7 @@ COPY . /vllm-workspace/vllm-ascend/
# Install vLLM # Install vLLM
ARG VLLM_REPO=https://github.com/vllm-project/vllm.git ARG VLLM_REPO=https://github.com/vllm-project/vllm.git
ARG VLLM_TAG=v0.10.0 ARG VLLM_TAG=v0.10.1.1
RUN git clone --depth 1 $VLLM_REPO --branch $VLLM_TAG /vllm-workspace/vllm RUN git clone --depth 1 $VLLM_REPO --branch $VLLM_TAG /vllm-workspace/vllm
# In x86, triton will be installed by vllm. But in Ascend, triton doesn't work correctly. we need to uninstall it. # In x86, triton will be installed by vllm. But in Ascend, triton doesn't work correctly. we need to uninstall it.

View File

@@ -78,26 +78,6 @@ def test_models_distributed_DeepSeek_multistream_moe():
vllm_model.generate_greedy(example_prompts, max_tokens) vllm_model.generate_greedy(example_prompts, max_tokens)
@patch.dict(os.environ, {"VLLM_ASCEND_ENABLE_DBO": "1"})
def test_models_distributed_DeepSeek_dbo():
example_prompts = ["The president of the United States is"] * 41
dtype = "half"
sampling_params = SamplingParams(max_tokens=100, temperature=0.0)
with VllmRunner(
"deepseek-ai/DeepSeek-V2-Lite",
dtype=dtype,
tensor_parallel_size=2,
distributed_executor_backend="mp",
) as vllm_model:
model_arch = 'DeepseekV2ForCausalLM'
registed_models = ModelRegistry.models
assert registed_models[
model_arch].module_name == "vllm_ascend.models.deepseek_dbo"
assert registed_models[
model_arch].class_name == "CustomDeepseekDBOForCausalLM"
vllm_model.generate(example_prompts, sampling_params)
@pytest.mark.skip( @pytest.mark.skip(
reason= reason=
"deepseek dbo dose not consider the support on half precision float, will enable this ut after we actually support it" "deepseek dbo dose not consider the support on half precision float, will enable this ut after we actually support it"

View File

@@ -13,7 +13,7 @@ from vllm.v1.core.kv_cache_utils import (get_request_block_hasher,
from vllm.v1.core.sched.output import SchedulerOutput from vllm.v1.core.sched.output import SchedulerOutput
from vllm.v1.kv_cache_interface import (FullAttentionSpec, KVCacheConfig, from vllm.v1.kv_cache_interface import (FullAttentionSpec, KVCacheConfig,
KVCacheGroupSpec) KVCacheGroupSpec)
from vllm.v1.outputs import DraftTokenIds, ModelRunnerOutput from vllm.v1.outputs import ModelRunnerOutput
from vllm.v1.request import Request, RequestStatus from vllm.v1.request import Request, RequestStatus
from vllm.v1.structured_output import StructuredOutputManager from vllm.v1.structured_output import StructuredOutputManager
@@ -21,6 +21,11 @@ from tests.ut.base import TestBase
from vllm_ascend.core.scheduler import AscendScheduler from vllm_ascend.core.scheduler import AscendScheduler
from vllm_ascend.utils import vllm_version_is from vllm_ascend.utils import vllm_version_is
if not vllm_version_is("0.10.1.1"):
from vllm.v1.outputs import DraftTokenIds
else:
DraftTokenIds = None
EOS_TOKEN_ID = 50256 EOS_TOKEN_ID = 50256
MODEL = "Qwen3-0.6B" MODEL = "Qwen3-0.6B"
ENABLE_PREFIX_CACHING = None ENABLE_PREFIX_CACHING = None
@@ -66,16 +71,33 @@ def create_requests(
def make_output(scheduler): def make_output(scheduler):
return ModelRunnerOutput( req_ids = [req.request_id for req in scheduler.running]
req_ids=[req.request_id for req in scheduler.running], req_id_to_index = {
req_id_to_index={ req.request_id: i
req.request_id: i for i, req in enumerate(scheduler.running)
for i, req in enumerate(scheduler.running) }
}, sampled_token_ids = [[1000]] * len(scheduler.running)
sampled_token_ids=[[1000]] * len(scheduler.running), logprobs = None
logprobs=None, if vllm_version_is("0.10.1.1"):
prompt_logprobs_dict={}, modelrunner_output = ModelRunnerOutput(
pooler_output=[]) req_ids=req_ids,
req_id_to_index=req_id_to_index,
sampled_token_ids=sampled_token_ids,
spec_token_ids=None,
logprobs=logprobs,
prompt_logprobs_dict={},
pooler_output=[],
)
else:
modelrunner_output = ModelRunnerOutput(
req_ids=req_ids,
req_id_to_index=req_id_to_index,
sampled_token_ids=sampled_token_ids,
logprobs=logprobs,
prompt_logprobs_dict={},
pooler_output=[],
)
return modelrunner_output
class TestAscendScheduler(TestBase): class TestAscendScheduler(TestBase):
@@ -271,8 +293,7 @@ class TestAscendScheduler(TestBase):
req.num_computed_tokens = req.num_tokens req.num_computed_tokens = req.num_tokens
scheduler.requests[req.request_id] = req scheduler.requests[req.request_id] = req
scheduler.running.append(req) scheduler.running.append(req)
if not vllm_version_is("0.9.2"): req.status = RequestStatus.RUNNING
req.status = RequestStatus.RUNNING
scheduler_output = SchedulerOutput(scheduled_new_reqs=[], scheduler_output = SchedulerOutput(scheduled_new_reqs=[],
scheduled_cached_reqs=[], scheduled_cached_reqs=[],
@@ -291,18 +312,33 @@ class TestAscendScheduler(TestBase):
free_encoder_input_ids=[], free_encoder_input_ids=[],
structured_output_request_ids={}, structured_output_request_ids={},
grammar_bitmask=None) grammar_bitmask=None)
if vllm_version_is("0.10.1.1"):
model_output = ModelRunnerOutput( model_output = ModelRunnerOutput(
req_ids=[req.request_id for req in requests], req_ids=[req.request_id for req in requests],
req_id_to_index={ req_id_to_index={
req.request_id: i req.request_id: i
for i, req in enumerate(requests) for i, req in enumerate(requests)
}, },
sampled_token_ids=[[EOS_TOKEN_ID], [10, 11] sampled_token_ids=[[EOS_TOKEN_ID], [
], # First request hits EOS, second continues 10, 11
logprobs=None, ]], # First request hits EOS, second continues
prompt_logprobs_dict={}, spec_token_ids=None,
pooler_output=[]) logprobs=None,
prompt_logprobs_dict={},
pooler_output=[])
else:
model_output = ModelRunnerOutput(
req_ids=[req.request_id for req in requests],
req_id_to_index={
req.request_id: i
for i, req in enumerate(requests)
},
sampled_token_ids=[[EOS_TOKEN_ID], [
10, 11
]], # First request hits EOS, second continues
logprobs=None,
prompt_logprobs_dict={},
pooler_output=[])
scheduler.update_from_output(scheduler_output, model_output) scheduler.update_from_output(scheduler_output, model_output)
@@ -325,8 +361,7 @@ class TestAscendScheduler(TestBase):
req.num_computed_tokens = req.num_tokens req.num_computed_tokens = req.num_tokens
scheduler.requests[req.request_id] = req scheduler.requests[req.request_id] = req
scheduler.running.append(req) scheduler.running.append(req)
if not vllm_version_is("0.9.2"): req.status = RequestStatus.RUNNING
req.status = RequestStatus.RUNNING
scheduler_output = SchedulerOutput(scheduled_new_reqs=[], scheduler_output = SchedulerOutput(scheduled_new_reqs=[],
scheduled_cached_reqs=[], scheduled_cached_reqs=[],
@@ -346,18 +381,31 @@ class TestAscendScheduler(TestBase):
free_encoder_input_ids=[], free_encoder_input_ids=[],
structured_output_request_ids={}, structured_output_request_ids={},
grammar_bitmask=None) grammar_bitmask=None)
if vllm_version_is("0.10.1.1"):
model_output = ModelRunnerOutput( model_output = ModelRunnerOutput(
req_ids=[req.request_id for req in requests], req_ids=[req.request_id for req in requests],
req_id_to_index={ req_id_to_index={
req.request_id: i req.request_id: i
for i, req in enumerate(requests) for i, req in enumerate(requests)
}, },
sampled_token_ids=[[10, 42, 12], sampled_token_ids=[[10, 42, 12],
[13, 14]], # First request hits stop token [13, 14]], # First request hits stop token
logprobs=None, spec_token_ids=None,
prompt_logprobs_dict={}, logprobs=None,
pooler_output=[]) prompt_logprobs_dict={},
pooler_output=[])
else:
model_output = ModelRunnerOutput(
req_ids=[req.request_id for req in requests],
req_id_to_index={
req.request_id: i
for i, req in enumerate(requests)
},
sampled_token_ids=[[10, 42, 12],
[13, 14]], # First request hits stop token
logprobs=None,
prompt_logprobs_dict={},
pooler_output=[])
scheduler.update_from_output(scheduler_output, model_output) scheduler.update_from_output(scheduler_output, model_output)
@@ -379,8 +427,7 @@ class TestAscendScheduler(TestBase):
req.num_computed_tokens = req.num_tokens req.num_computed_tokens = req.num_tokens
scheduler.requests[req.request_id] = req scheduler.requests[req.request_id] = req
scheduler.running.append(req) scheduler.running.append(req)
if not vllm_version_is("0.9.2"): req.status = RequestStatus.RUNNING
req.status = RequestStatus.RUNNING
scheduler_output = SchedulerOutput(scheduled_new_reqs=[], scheduler_output = SchedulerOutput(scheduled_new_reqs=[],
scheduled_cached_reqs=[], scheduled_cached_reqs=[],
@@ -401,18 +448,31 @@ class TestAscendScheduler(TestBase):
structured_output_request_ids={}, structured_output_request_ids={},
grammar_bitmask=None) grammar_bitmask=None)
model_output = ModelRunnerOutput( if vllm_version_is("0.10.1.1"):
req_ids=[req.request_id for req in requests], model_output = ModelRunnerOutput(
req_id_to_index={ req_ids=[req.request_id for req in requests],
req.request_id: i req_id_to_index={
for i, req in enumerate(requests) req.request_id: i
}, for i, req in enumerate(requests)
sampled_token_ids=[[10, 11, 12], },
[13]], # First request exceeds max_tokens sampled_token_ids=[[10, 11, 12],
logprobs=None, [13]], # First request exceeds max_tokens
prompt_logprobs_dict={}, spec_token_ids=None,
pooler_output=[]) logprobs=None,
prompt_logprobs_dict={},
pooler_output=[])
else:
model_output = ModelRunnerOutput(
req_ids=[req.request_id for req in requests],
req_id_to_index={
req.request_id: i
for i, req in enumerate(requests)
},
sampled_token_ids=[[10, 11, 12],
[13]], # First request exceeds max_tokens
logprobs=None,
prompt_logprobs_dict={},
pooler_output=[])
scheduler.update_from_output(scheduler_output, model_output) scheduler.update_from_output(scheduler_output, model_output)
# Verify first request stopped due to length # Verify first request stopped due to length
@@ -448,13 +508,24 @@ class TestAscendScheduler(TestBase):
structured_output_request_ids={}, structured_output_request_ids={},
grammar_bitmask=None) grammar_bitmask=None)
model_output = ModelRunnerOutput( if vllm_version_is("0.10.1.1"):
req_ids=[requests[0].request_id], model_output = ModelRunnerOutput(
req_id_to_index={requests[0].request_id: 0}, req_ids=[requests[0].request_id],
sampled_token_ids=[[EOS_TOKEN_ID, 10, 11]], req_id_to_index={requests[0].request_id: 0},
logprobs=None, sampled_token_ids=[[EOS_TOKEN_ID, 10, 11]],
prompt_logprobs_dict={}, spec_token_ids=None,
pooler_output=[]) logprobs=None,
prompt_logprobs_dict={},
pooler_output=[])
else:
model_output = ModelRunnerOutput(
req_ids=[requests[0].request_id],
req_id_to_index={requests[0].request_id: 0},
sampled_token_ids=[[EOS_TOKEN_ID, 10, 11]],
logprobs=None,
prompt_logprobs_dict={},
pooler_output=[])
scheduler.update_from_output(scheduler_output, model_output) scheduler.update_from_output(scheduler_output, model_output)
@@ -505,13 +576,23 @@ class TestAscendScheduler(TestBase):
512) 512)
# Model output of the first request. # Model output of the first request.
model_runner_output = ModelRunnerOutput( if vllm_version_is("0.10.1.1"):
req_ids=[requests[0].request_id], model_runner_output = ModelRunnerOutput(
req_id_to_index={requests[0].request_id: 0}, req_ids=[requests[0].request_id],
sampled_token_ids=[[0]], req_id_to_index={requests[0].request_id: 0},
logprobs=None, sampled_token_ids=[[0]],
prompt_logprobs_dict={}, spec_token_ids=None,
pooler_output=[]) logprobs=None,
prompt_logprobs_dict={},
pooler_output=[])
else:
model_runner_output = ModelRunnerOutput(
req_ids=[requests[0].request_id],
req_id_to_index={requests[0].request_id: 0},
sampled_token_ids=[[0]],
logprobs=None,
prompt_logprobs_dict={},
pooler_output=[])
scheduler.update_from_output(scheduler_output0, scheduler.update_from_output(scheduler_output0,
model_runner_output) model_runner_output)
@@ -521,13 +602,23 @@ class TestAscendScheduler(TestBase):
# request is still running. # request is still running.
scheduler.schedule() scheduler.schedule()
# Model output of the second request. # Model output of the second request.
model_runner_output = ModelRunnerOutput( if vllm_version_is("0.10.1.1"):
req_ids=[requests[1].request_id], model_runner_output = ModelRunnerOutput(
req_id_to_index={requests[1].request_id: 0}, req_ids=[requests[1].request_id],
sampled_token_ids=[[0]], req_id_to_index={requests[1].request_id: 0},
logprobs=None, sampled_token_ids=[[0]],
prompt_logprobs_dict={}, spec_token_ids=None,
pooler_output=[]) logprobs=None,
prompt_logprobs_dict={},
pooler_output=[])
else:
model_runner_output = ModelRunnerOutput(
req_ids=[requests[1].request_id],
req_id_to_index={requests[1].request_id: 0},
sampled_token_ids=[[0]],
logprobs=None,
prompt_logprobs_dict={},
pooler_output=[])
scheduler.update_from_output(scheduler_output1, scheduler.update_from_output(scheduler_output1,
model_runner_output) model_runner_output)
@@ -579,19 +670,29 @@ class TestAscendScheduler(TestBase):
req_id = requests[i].request_id req_id = requests[i].request_id
self.assertEqual(output.num_scheduled_tokens[req_id], 1) self.assertEqual(output.num_scheduled_tokens[req_id], 1)
self.assertNotIn(req_id, output.scheduled_spec_decode_tokens) self.assertNotIn(req_id, output.scheduled_spec_decode_tokens)
if vllm_version_is("0.10.1.1"):
model_runner_output = ModelRunnerOutput( model_runner_output = ModelRunnerOutput(
req_ids=req_ids, req_ids=req_ids,
req_id_to_index=req_to_index, req_id_to_index=req_to_index,
sampled_token_ids=[[0] for _ in range(len(requests))], sampled_token_ids=[[0] for _ in range(len(requests))],
logprobs=None, logprobs=None,
prompt_logprobs_dict={}, prompt_logprobs_dict={},
pooler_output=[]) spec_token_ids=spec_tokens,
draft_token_ids = DraftTokenIds(req_ids, spec_tokens) pooler_output=[])
else:
model_runner_output = ModelRunnerOutput(
req_ids=req_ids,
req_id_to_index=req_to_index,
sampled_token_ids=[[0] for _ in range(len(requests))],
logprobs=None,
prompt_logprobs_dict={},
pooler_output=[])
draft_token_ids = DraftTokenIds(req_ids, spec_tokens)
engine_core_outputs = scheduler.update_from_output( engine_core_outputs = scheduler.update_from_output(
output, model_runner_output) output, model_runner_output)
scheduler.update_draft_token_ids(draft_token_ids) if not vllm_version_is("0.10.1.1"):
scheduler.update_draft_token_ids(draft_token_ids)
for i in range(len(requests)): for i in range(len(requests)):
running_req = scheduler.running[i] running_req = scheduler.running[i]
@@ -627,14 +728,23 @@ class TestAscendScheduler(TestBase):
else: else:
self.assertNotIn(req_id, self.assertNotIn(req_id,
output.scheduled_spec_decode_tokens) output.scheduled_spec_decode_tokens)
if vllm_version_is("0.10.1.1"):
model_runner_output = ModelRunnerOutput( model_runner_output = ModelRunnerOutput(
req_ids=req_ids, req_ids=req_ids,
req_id_to_index=req_to_index, req_id_to_index=req_to_index,
sampled_token_ids=output_tokens, sampled_token_ids=output_tokens,
logprobs=None, spec_token_ids=None,
prompt_logprobs_dict={}, logprobs=None,
pooler_output=[]) prompt_logprobs_dict={},
pooler_output=[])
else:
model_runner_output = ModelRunnerOutput(
req_ids=req_ids,
req_id_to_index=req_to_index,
sampled_token_ids=output_tokens,
logprobs=None,
prompt_logprobs_dict={},
pooler_output=[])
engine_core_outputs = scheduler.update_from_output( engine_core_outputs = scheduler.update_from_output(
output, model_runner_output) output, model_runner_output)

View File

@@ -200,12 +200,26 @@ def create_model_runner_output(
kv_connector_output = KVConnectorOutput(finished_sending=finished_sending, kv_connector_output = KVConnectorOutput(finished_sending=finished_sending,
finished_recving=finished_recving) finished_recving=finished_recving)
extra_args = {"kv_connector_output": kv_connector_output} extra_args = {"kv_connector_output": kv_connector_output}
return ModelRunnerOutput( if vllm_version_is("0.10.1.1"):
req_ids=req_ids, model_runner_output = ModelRunnerOutput(
req_id_to_index=req_id_to_index, req_ids=req_ids,
sampled_token_ids=sampled_token_ids, req_id_to_index=req_id_to_index,
logprobs=None, sampled_token_ids=sampled_token_ids,
prompt_logprobs_dict={}, spec_token_ids=None,
pooler_output=[], logprobs=None,
**extra_args, prompt_logprobs_dict={},
) pooler_output=[],
**extra_args,
)
else:
model_runner_output = ModelRunnerOutput(
req_ids=req_ids,
req_id_to_index=req_id_to_index,
sampled_token_ids=sampled_token_ids,
logprobs=None,
prompt_logprobs_dict={},
pooler_output=[],
**extra_args,
)
return model_runner_output

View File

@@ -31,6 +31,13 @@ from vllm.v1.outputs import ModelRunnerOutput
from vllm.v1.request import Request, RequestStatus from vllm.v1.request import Request, RequestStatus
from vllm.v1.structured_output import StructuredOutputManager from vllm.v1.structured_output import StructuredOutputManager
from vllm_ascend.utils import vllm_version_is
if vllm_version_is("0.10.1.1"):
from vllm.v1.core.kv_cache_manager import KVCacheBlocks
else:
KVCacheBlocks = None
class AscendScheduler(Scheduler): class AscendScheduler(Scheduler):
"""This Scheduler extends vllm's original v1 scheduler """This Scheduler extends vllm's original v1 scheduler
@@ -59,7 +66,10 @@ class AscendScheduler(Scheduler):
scheduled_running_reqs: list[Request] = [] scheduled_running_reqs: list[Request] = []
preempted_reqs: list[Request] = [] preempted_reqs: list[Request] = []
req_to_new_block_ids: dict[str, list[int]] = {} if vllm_version_is("0.10.1.1"):
req_to_new_block_ids: dict[str, list[int]] = {}
else:
req_to_new_blocks: dict[str, KVCacheBlocks] = {}
num_scheduled_tokens: dict[str, int] = {} num_scheduled_tokens: dict[str, int] = {}
token_budget = self.max_num_scheduled_tokens token_budget = self.max_num_scheduled_tokens
# Spec decode-related. # Spec decode-related.
@@ -217,8 +227,11 @@ class AscendScheduler(Scheduler):
if self.lora_config and request.lora_request: if self.lora_config and request.lora_request:
scheduled_loras.add(request.lora_request.lora_int_id) scheduled_loras.add(request.lora_request.lora_int_id)
req_to_new_block_ids[request.request_id] = ( if vllm_version_is("0.10.1.1"):
self.kv_cache_manager.get_block_ids(request.request_id)) req_to_new_block_ids[request.request_id] = (
self.kv_cache_manager.get_block_ids(request.request_id))
else:
req_to_new_blocks[request.request_id] = new_blocks
# Update request info. # Update request info.
num_scheduled_tokens[request.request_id] = num_new_tokens num_scheduled_tokens[request.request_id] = num_new_tokens
token_budget -= num_new_tokens token_budget -= num_new_tokens
@@ -307,8 +320,11 @@ class AscendScheduler(Scheduler):
# Schedule the request. # Schedule the request.
scheduled_running_reqs.append(request) scheduled_running_reqs.append(request)
self.scheduled_req_ids.add(request.request_id) self.scheduled_req_ids.add(request.request_id)
req_to_new_block_ids[request.request_id] = ( if vllm_version_is("0.10.1.1"):
new_blocks.get_block_ids()) req_to_new_block_ids[request.request_id] = (
new_blocks.get_block_ids())
else:
req_to_new_blocks[request.request_id] = new_blocks
num_scheduled_tokens[request.request_id] = num_new_tokens num_scheduled_tokens[request.request_id] = num_new_tokens
token_budget -= num_new_tokens token_budget -= num_new_tokens
req_index += 1 req_index += 1
@@ -346,16 +362,27 @@ class AscendScheduler(Scheduler):
any_request, len(self.running))) any_request, len(self.running)))
# Construct the scheduler output. # Construct the scheduler output.
new_reqs_data = [ if vllm_version_is("0.10.1.1"):
NewRequestData.from_request(req, new_reqs_data = [
req_to_new_block_ids[req.request_id]) NewRequestData.from_request(
for req in scheduled_new_reqs req, req_to_new_block_ids[req.request_id])
] for req in scheduled_new_reqs
]
cached_reqs_data = self._make_cached_request_data(
scheduled_running_reqs, scheduled_resumed_reqs,
num_scheduled_tokens, scheduled_spec_decode_tokens,
req_to_new_block_ids)
else:
new_reqs_data = [
NewRequestData.from_request(
req, req_to_new_blocks[req.request_id].get_block_ids())
for req in scheduled_new_reqs
]
cached_reqs_data = self._make_cached_request_data( cached_reqs_data = self._make_cached_request_data(
scheduled_running_reqs, scheduled_resumed_reqs, scheduled_running_reqs, scheduled_resumed_reqs,
num_scheduled_tokens, scheduled_spec_decode_tokens, num_scheduled_tokens, scheduled_spec_decode_tokens,
req_to_new_block_ids) req_to_new_blocks)
scheduled_cached_reqs = cached_reqs_data scheduled_cached_reqs = cached_reqs_data
scheduler_output = SchedulerOutput( scheduler_output = SchedulerOutput(

View File

@@ -50,6 +50,7 @@ from vllm.sequence import IntermediateTensors
from vllm_ascend.ops.fused_moe import AscendFusedMoE from vllm_ascend.ops.fused_moe import AscendFusedMoE
from vllm_ascend.ops.sequence_parallel import (MetadataForPadding, from vllm_ascend.ops.sequence_parallel import (MetadataForPadding,
init_metadata_for_sp) init_metadata_for_sp)
from vllm_ascend.utils import vllm_version_is
class CustomSparseMoeBlock(Qwen3MoeSparseMoeBlock): class CustomSparseMoeBlock(Qwen3MoeSparseMoeBlock):
@@ -253,7 +254,11 @@ class CustomQwen3MoeModel(Qwen3MoeModel):
quant_config = vllm_config.quant_config quant_config = vllm_config.quant_config
parallel_config = vllm_config.parallel_config parallel_config = vllm_config.parallel_config
self.num_redundant_experts = parallel_config.num_redundant_experts if vllm_version_is("0.10.1.1"):
self.num_redundant_experts = parallel_config.num_redundant_experts
else:
eplb_config = parallel_config.eplb_config
self.num_redundant_experts = eplb_config.num_redundant_experts
self.padding_idx = config.pad_token_id self.padding_idx = config.pad_token_id
self.vocab_size = config.vocab_size self.vocab_size = config.vocab_size
self.config = config self.config = config

View File

@@ -3,12 +3,19 @@ import torch_npu
from vllm.v1.sample.ops.topk_topp_sampler import TopKTopPSampler, random_sample from vllm.v1.sample.ops.topk_topp_sampler import TopKTopPSampler, random_sample
from vllm.v1.sample.sampler import Sampler from vllm.v1.sample.sampler import Sampler
from vllm_ascend.utils import is_310p from vllm_ascend.utils import is_310p, vllm_version_is
if not vllm_version_is("0.10.1.1"):
from vllm.config import LogprobsMode
DEFAULT_LOGPROBS_MODE = LogprobsMode.RAW_LOGPROBS
else:
LogprobsMode = None
DEFAULT_LOGPROBS_MODE = "raw_logprobs"
class AscendSampler(Sampler): class AscendSampler(Sampler):
def __init__(self, logprobs_mode="raw_logprobs"): def __init__(self, logprobs_mode=DEFAULT_LOGPROBS_MODE):
# TODO: support logprobs_mode in vllm-ascend # TODO: support logprobs_mode in vllm-ascend
super().__init__(logprobs_mode=logprobs_mode) super().__init__(logprobs_mode=logprobs_mode)
self.topk_topp_sampler = AscendTopKTopPSampler() self.topk_topp_sampler = AscendTopKTopPSampler()
@@ -61,5 +68,19 @@ class AscendTopKTopPSampler(TopKTopPSampler):
def forward_native(self, logits, generators, k, p): def forward_native(self, logits, generators, k, p):
"""Override pytorch native implementation to torch_npu""" """Override pytorch native implementation to torch_npu"""
logits = self._apply_top_k_top_p(logits, k, p) logits = self._apply_top_k_top_p(logits, k, p)
if not vllm_version_is("0.10.1.1"):
logits_to_return = None
if self.logprobs_mode == LogprobsMode.PROCESSED_LOGITS:
logits_to_return = logits
elif self.logprobs_mode == LogprobsMode.PROCESSED_LOGPROBS:
logits_to_return = logits.log_softmax(dim=-1,
dtype=torch.float32)
probs = logits.softmax(dim=-1, dtype=torch.float32) probs = logits.softmax(dim=-1, dtype=torch.float32)
return random_sample(probs, generators) output = None
if vllm_version_is("0.10.1.1"):
output = random_sample(probs, generators)
else:
output = (random_sample(probs, generators), logits_to_return)
return output

View File

@@ -64,8 +64,8 @@ from vllm.utils import (STR_DTYPE_TO_TORCH_DTYPE, DeviceMemoryProfiler,
from vllm.v1.cudagraph_dispatcher import CudagraphDispatcher from vllm.v1.cudagraph_dispatcher import CudagraphDispatcher
from vllm.v1.kv_cache_interface import (FullAttentionSpec, KVCacheConfig, from vllm.v1.kv_cache_interface import (FullAttentionSpec, KVCacheConfig,
KVCacheSpec) KVCacheSpec)
from vllm.v1.outputs import (EMPTY_MODEL_RUNNER_OUTPUT, DraftTokenIds, from vllm.v1.outputs import (EMPTY_MODEL_RUNNER_OUTPUT, LogprobsTensors,
LogprobsTensors, ModelRunnerOutput) ModelRunnerOutput)
from vllm.v1.pool.metadata import PoolingMetadata from vllm.v1.pool.metadata import PoolingMetadata
from vllm.v1.sample.logits_processor import build_logitsprocs from vllm.v1.sample.logits_processor import build_logitsprocs
from vllm.v1.sample.metadata import SamplingMetadata from vllm.v1.sample.metadata import SamplingMetadata
@@ -95,11 +95,17 @@ from vllm_ascend.torchair.torchair_attention import AscendTorchairMetadata
from vllm_ascend.torchair.torchair_mla import AscendMLATorchairMetadata from vllm_ascend.torchair.torchair_mla import AscendMLATorchairMetadata
from vllm_ascend.utils import (ACL_FORMAT_FRACTAL_ND, ACL_FORMAT_FRACTAL_NZ, from vllm_ascend.utils import (ACL_FORMAT_FRACTAL_ND, ACL_FORMAT_FRACTAL_NZ,
ProfileExecuteDuration, is_310p, ProfileExecuteDuration, is_310p,
maybe_converting_weight_acl_format) maybe_converting_weight_acl_format,
vllm_version_is)
from vllm_ascend.worker.eagle_proposer_v1 import EagleProposer from vllm_ascend.worker.eagle_proposer_v1 import EagleProposer
from vllm_ascend.worker.mtp_proposer_v1 import MtpProposer from vllm_ascend.worker.mtp_proposer_v1 import MtpProposer
from vllm_ascend.worker.npu_input_batch import CachedRequestState, InputBatch from vllm_ascend.worker.npu_input_batch import CachedRequestState, InputBatch
if not vllm_version_is("0.10.1.1"):
from vllm.v1.outputs import DraftTokenIds
else:
DraftTokenIds = None
if TYPE_CHECKING: if TYPE_CHECKING:
import xgrammar as xgr # type: ignore[import-untyped] import xgrammar as xgr # type: ignore[import-untyped]
from vllm.v1.core.sched.output import SchedulerOutput from vllm.v1.core.sched.output import SchedulerOutput
@@ -514,11 +520,13 @@ class NPUModelRunner(LoRAModelRunnerMixin):
# Update the block IDs. # Update the block IDs.
if not resumed_from_preemption: if not resumed_from_preemption:
# Append the new blocks to the existing block IDs. if new_block_ids is not None:
for block_ids, new_ids in zip(req_state.block_ids, # Append the new blocks to the existing block IDs.
new_block_ids): for block_ids, new_ids in zip(req_state.block_ids,
block_ids.extend(new_ids) new_block_ids):
block_ids.extend(new_ids)
else: else:
assert new_block_ids is not None
# The request is resumed from preemption. # The request is resumed from preemption.
# Replace the existing block IDs with the new ones. # Replace the existing block IDs with the new ones.
req_state.block_ids = new_block_ids req_state.block_ids = new_block_ids
@@ -534,7 +542,9 @@ class NPUModelRunner(LoRAModelRunnerMixin):
# Update the persistent batch. # Update the persistent batch.
self.input_batch.num_computed_tokens_cpu[req_index] = ( self.input_batch.num_computed_tokens_cpu[req_index] = (
num_computed_tokens) num_computed_tokens)
self.input_batch.block_table.append_row(new_block_ids, req_index) if new_block_ids is not None:
self.input_batch.block_table.append_row(
new_block_ids, req_index)
# For the last rank, we don't need to update the token_ids_cpu # For the last rank, we don't need to update the token_ids_cpu
# because the sampled tokens are already cached. # because the sampled tokens are already cached.
@@ -1526,16 +1536,28 @@ class NPUModelRunner(LoRAModelRunnerMixin):
else: else:
pooler_output.append(None) pooler_output.append(None)
extra_args = ({"kv_connector_output": kv_connector_output}) extra_args = ({"kv_connector_output": kv_connector_output})
if vllm_version_is("0.10.1.1"):
return ModelRunnerOutput( modelrunner_output = ModelRunnerOutput(
req_ids=self.input_batch.req_ids, req_ids=self.input_batch.req_ids,
req_id_to_index=self.input_batch.req_id_to_index, req_id_to_index=self.input_batch.req_id_to_index,
sampled_token_ids=[], sampled_token_ids=[],
logprobs=None, spec_token_ids=None,
prompt_logprobs_dict={}, logprobs=None,
pooler_output=pooler_output, prompt_logprobs_dict={},
**extra_args, pooler_output=pooler_output,
) **extra_args,
)
else:
modelrunner_output = ModelRunnerOutput(
req_ids=self.input_batch.req_ids,
req_id_to_index=self.input_batch.req_id_to_index,
sampled_token_ids=[],
logprobs=None,
prompt_logprobs_dict={},
pooler_output=pooler_output,
**extra_args,
)
return modelrunner_output
@torch.inference_mode() @torch.inference_mode()
def execute_model( def execute_model(
@@ -1757,15 +1779,27 @@ class NPUModelRunner(LoRAModelRunnerMixin):
extra_args = ({"kv_connector_output": kv_connector_output}) extra_args = ({"kv_connector_output": kv_connector_output})
model_runner_output = ModelRunnerOutput( if vllm_version_is("0.10.1.1"):
req_ids=self.input_batch.req_ids, model_runner_output = ModelRunnerOutput(
req_id_to_index=self.input_batch.req_id_to_index, req_ids=self.input_batch.req_ids,
sampled_token_ids=valid_sampled_token_ids, req_id_to_index=self.input_batch.req_id_to_index,
logprobs=logprobs_lists, sampled_token_ids=valid_sampled_token_ids,
prompt_logprobs_dict=prompt_logprobs_dict, logprobs=logprobs_lists,
pooler_output=[], spec_token_ids=self._draft_token_ids,
**extra_args, prompt_logprobs_dict=prompt_logprobs_dict,
) pooler_output=[],
**extra_args,
)
else:
model_runner_output = ModelRunnerOutput(
req_ids=self.input_batch.req_ids,
req_id_to_index=self.input_batch.req_id_to_index,
sampled_token_ids=valid_sampled_token_ids,
logprobs=logprobs_lists,
prompt_logprobs_dict=prompt_logprobs_dict,
pooler_output=[],
**extra_args,
)
durations = ProfileExecuteDuration().pop_captured_sync() durations = ProfileExecuteDuration().pop_captured_sync()
if durations: if durations: