[CI] Remove compatibility maintenance for vllm v0.10.1 and v0.10.1.1 (#2840)

### What this PR does / why we need it?
Remove compatibility maintenance for vllm v0.10.1 and v0.10.1.1

### Does this PR introduce _any_ user-facing change?
Yes. The main branch of vllm-ascend will no longer be compatible with vLLM
v0.10.1 or v0.10.1.1.

### How was this patch tested?
CI passed with existing tests.

- vLLM version: v0.10.1.1
- vLLM main:
6fb2788163

---------

Signed-off-by: MengqingCao <cmq0113@163.com>
This commit is contained in:
Mengqing Cao
2025-09-10 08:43:10 +08:00
committed by GitHub
parent 93e28e6862
commit edf1f600ad
22 changed files with 340 additions and 876 deletions

View File

@@ -112,7 +112,7 @@ jobs:
uses: actions/checkout@v4 uses: actions/checkout@v4
with: with:
repository: vllm-project/vllm repository: vllm-project/vllm
ref: v0.10.1.1 ref: main
path: ./vllm-empty path: ./vllm-empty
- name: Install vllm-project/vllm from source - name: Install vllm-project/vllm from source

View File

@@ -51,7 +51,7 @@ jobs:
strategy: strategy:
matrix: matrix:
include: include:
- vllm_branch: v0.10.1.1 - vllm_branch: main
vllm_ascend_branch: main vllm_ascend_branch: main
vllm_use_v1: 1 vllm_use_v1: 1
max-parallel: 1 max-parallel: 1

View File

@@ -43,7 +43,7 @@ jobs:
strategy: strategy:
matrix: matrix:
os: [linux-aarch64-a3-8] os: [linux-aarch64-a3-8]
vllm_version: [v0.10.1.1, main] vllm_version: [main]
name: vLLM Ascend test name: vLLM Ascend test
runs-on: ${{ matrix.os }} runs-on: ${{ matrix.os }}
container: container:

View File

@@ -83,7 +83,7 @@ jobs:
VLLM_USE_MODELSCOPE: True VLLM_USE_MODELSCOPE: True
strategy: strategy:
matrix: matrix:
vllm_version: [v0.10.1.1, main] vllm_version: [main]
steps: steps:
- name: Install packages - name: Install packages
run: | run: |
@@ -139,7 +139,7 @@ jobs:
max-parallel: 2 max-parallel: 2
matrix: matrix:
os: [linux-aarch64-a2-1] os: [linux-aarch64-a2-1]
vllm_version: [v0.10.1.1, main] vllm_version: [main]
name: singlecard e2e test - light name: singlecard e2e test - light
runs-on: ${{ matrix.os }} runs-on: ${{ matrix.os }}
container: container:
@@ -204,7 +204,7 @@ jobs:
max-parallel: 2 max-parallel: 2
matrix: matrix:
os: [linux-aarch64-a2-2] os: [linux-aarch64-a2-2]
vllm_version: [v0.10.1.1, main] vllm_version: [main]
name: multicard e2e test - light name: multicard e2e test - light
runs-on: ${{ matrix.os }} runs-on: ${{ matrix.os }}
container: container:

View File

@@ -53,7 +53,7 @@ jobs:
max-parallel: 2 max-parallel: 2
matrix: matrix:
os: [linux-aarch64-310p-1, linux-aarch64-310p-4] os: [linux-aarch64-310p-1, linux-aarch64-310p-4]
vllm_version: [v0.10.1.1, main] vllm_version: [main]
name: 310p e2e test name: 310p e2e test
runs-on: ${{ matrix.os }} runs-on: ${{ matrix.os }}
container: container:

View File

@@ -74,7 +74,7 @@ jobs:
max-parallel: 2 max-parallel: 2
matrix: matrix:
os: [linux-aarch64-a2-1] os: [linux-aarch64-a2-1]
vllm_version: [v0.10.1.1, main] vllm_version: [main]
name: singlecard e2e test - full name: singlecard e2e test - full
runs-on: ${{ matrix.os }} runs-on: ${{ matrix.os }}
container: container:
@@ -158,7 +158,7 @@ jobs:
max-parallel: 2 max-parallel: 2
matrix: matrix:
os: [linux-aarch64-a2-2] os: [linux-aarch64-a2-2]
vllm_version: [v0.10.1.1, main] vllm_version: [main]
name: multicard e2e test - full name: multicard e2e test - full
runs-on: ${{ matrix.os }} runs-on: ${{ matrix.os }}
container: container:

View File

@@ -37,7 +37,7 @@ RUN pip config set global.index-url ${PIP_INDEX_URL}
# Install vLLM # Install vLLM
ARG VLLM_REPO=https://github.com/vllm-project/vllm.git ARG VLLM_REPO=https://github.com/vllm-project/vllm.git
ARG VLLM_TAG=v0.10.1.1 ARG VLLM_TAG=main
RUN git clone --depth 1 $VLLM_REPO --branch $VLLM_TAG /vllm-workspace/vllm RUN git clone --depth 1 $VLLM_REPO --branch $VLLM_TAG /vllm-workspace/vllm
# In x86, triton will be installed by vllm. But in Ascend, triton doesn't work correctly. we need to uninstall it. # In x86, triton will be installed by vllm. But in Ascend, triton doesn't work correctly. we need to uninstall it.
RUN VLLM_TARGET_DEVICE="empty" python3 -m pip install -v -e /vllm-workspace/vllm/ --extra-index https://download.pytorch.org/whl/cpu/ && \ RUN VLLM_TARGET_DEVICE="empty" python3 -m pip install -v -e /vllm-workspace/vllm/ --extra-index https://download.pytorch.org/whl/cpu/ && \

View File

@@ -37,7 +37,7 @@ RUN pip config set global.index-url ${PIP_INDEX_URL}
# Install vLLM # Install vLLM
ARG VLLM_REPO=https://github.com/vllm-project/vllm.git ARG VLLM_REPO=https://github.com/vllm-project/vllm.git
ARG VLLM_TAG=v0.10.1.1 ARG VLLM_TAG=main
RUN git clone --depth 1 $VLLM_REPO --branch $VLLM_TAG /vllm-workspace/vllm RUN git clone --depth 1 $VLLM_REPO --branch $VLLM_TAG /vllm-workspace/vllm
# In x86, triton will be installed by vllm. But in Ascend, triton doesn't work correctly. we need to uninstall it. # In x86, triton will be installed by vllm. But in Ascend, triton doesn't work correctly. we need to uninstall it.
RUN VLLM_TARGET_DEVICE="empty" python3 -m pip install -v -e /vllm-workspace/vllm/ --extra-index https://download.pytorch.org/whl/cpu/ && \ RUN VLLM_TARGET_DEVICE="empty" python3 -m pip install -v -e /vllm-workspace/vllm/ --extra-index https://download.pytorch.org/whl/cpu/ && \

View File

@@ -34,7 +34,7 @@ COPY . /vllm-workspace/vllm-ascend/
# Install vLLM # Install vLLM
ARG VLLM_REPO=https://github.com/vllm-project/vllm.git ARG VLLM_REPO=https://github.com/vllm-project/vllm.git
ARG VLLM_TAG=v0.10.1.1 ARG VLLM_TAG=main
RUN git clone --depth 1 $VLLM_REPO --branch $VLLM_TAG /vllm-workspace/vllm RUN git clone --depth 1 $VLLM_REPO --branch $VLLM_TAG /vllm-workspace/vllm
# In x86, triton will be installed by vllm. But in Ascend, triton doesn't work correctly. we need to uninstall it. # In x86, triton will be installed by vllm. But in Ascend, triton doesn't work correctly. we need to uninstall it.

View File

@@ -37,7 +37,7 @@ RUN pip config set global.index-url ${PIP_INDEX_URL}
# Install vLLM # Install vLLM
ARG VLLM_REPO=https://github.com/vllm-project/vllm.git ARG VLLM_REPO=https://github.com/vllm-project/vllm.git
ARG VLLM_TAG=v0.10.1.1 ARG VLLM_TAG=main
RUN git clone --depth 1 $VLLM_REPO --branch $VLLM_TAG /vllm-workspace/vllm RUN git clone --depth 1 $VLLM_REPO --branch $VLLM_TAG /vllm-workspace/vllm
# In x86, triton will be installed by vllm. But in Ascend, triton doesn't work correctly. we need to uninstall it. # In x86, triton will be installed by vllm. But in Ascend, triton doesn't work correctly. we need to uninstall it.
RUN VLLM_TARGET_DEVICE="empty" python3 -m pip install -v -e /vllm-workspace/vllm/ --extra-index https://download.pytorch.org/whl/cpu/ && \ RUN VLLM_TARGET_DEVICE="empty" python3 -m pip install -v -e /vllm-workspace/vllm/ --extra-index https://download.pytorch.org/whl/cpu/ && \

View File

@@ -34,7 +34,7 @@ COPY . /vllm-workspace/vllm-ascend/
# Install vLLM # Install vLLM
ARG VLLM_REPO=https://github.com/vllm-project/vllm.git ARG VLLM_REPO=https://github.com/vllm-project/vllm.git
ARG VLLM_TAG=v0.10.1.1 ARG VLLM_TAG=main
RUN git clone --depth 1 $VLLM_REPO --branch $VLLM_TAG /vllm-workspace/vllm RUN git clone --depth 1 $VLLM_REPO --branch $VLLM_TAG /vllm-workspace/vllm
# In x86, triton will be installed by vllm. But in Ascend, triton doesn't work correctly. we need to uninstall it. # In x86, triton will be installed by vllm. But in Ascend, triton doesn't work correctly. we need to uninstall it.

View File

@@ -34,7 +34,7 @@ COPY . /vllm-workspace/vllm-ascend/
# Install vLLM # Install vLLM
ARG VLLM_REPO=https://github.com/vllm-project/vllm.git ARG VLLM_REPO=https://github.com/vllm-project/vllm.git
ARG VLLM_TAG=v0.10.1.1 ARG VLLM_TAG=main
RUN git clone --depth 1 $VLLM_REPO --branch $VLLM_TAG /vllm-workspace/vllm RUN git clone --depth 1 $VLLM_REPO --branch $VLLM_TAG /vllm-workspace/vllm
# In x86, triton will be installed by vllm. But in Ascend, triton doesn't work correctly. we need to uninstall it. # In x86, triton will be installed by vllm. But in Ascend, triton doesn't work correctly. we need to uninstall it.

View File

@@ -77,7 +77,7 @@ myst_substitutions = {
# CANN image tag # CANN image tag
'cann_image_tag': "8.2.rc1-910b-ubuntu22.04-py3.11", 'cann_image_tag': "8.2.rc1-910b-ubuntu22.04-py3.11",
# vllm version in ci # vllm version in ci
'ci_vllm_version': 'v0.10.1.1', 'ci_vllm_version': 'main',
} }
# Add any paths that contain templates here, relative to this directory. # Add any paths that contain templates here, relative to this directory.

View File

@@ -13,18 +13,12 @@ from vllm.v1.core.kv_cache_utils import (get_request_block_hasher,
from vllm.v1.core.sched.output import SchedulerOutput from vllm.v1.core.sched.output import SchedulerOutput
from vllm.v1.kv_cache_interface import (FullAttentionSpec, KVCacheConfig, from vllm.v1.kv_cache_interface import (FullAttentionSpec, KVCacheConfig,
KVCacheGroupSpec) KVCacheGroupSpec)
from vllm.v1.outputs import ModelRunnerOutput from vllm.v1.outputs import DraftTokenIds, ModelRunnerOutput
from vllm.v1.request import Request, RequestStatus from vllm.v1.request import Request, RequestStatus
from vllm.v1.structured_output import StructuredOutputManager from vllm.v1.structured_output import StructuredOutputManager
from tests.ut.base import TestBase from tests.ut.base import TestBase
from vllm_ascend.core.scheduler import AscendScheduler from vllm_ascend.core.scheduler import AscendScheduler
from vllm_ascend.utils import vllm_version_is
if not (vllm_version_is("0.10.1.1") or vllm_version_is("0.10.1")):
from vllm.v1.outputs import DraftTokenIds
else:
DraftTokenIds = None
EOS_TOKEN_ID = 50256 EOS_TOKEN_ID = 50256
MODEL = "Qwen3-0.6B" MODEL = "Qwen3-0.6B"
@@ -54,25 +48,13 @@ def create_requests(
prompt_logprobs=prompt_logprobs) prompt_logprobs=prompt_logprobs)
requests = [] requests = []
for i in range(num_requests): for i in range(num_requests):
if vllm_version_is("0.10.1.1") or vllm_version_is("0.10.1"): request = Request(request_id=f"{i}",
request = Request(request_id=f"{i}", prompt_token_ids=[i] * num_tokens,
prompt_token_ids=[i] * num_tokens, sampling_params=sampling_params,
sampling_params=sampling_params, eos_token_id=EOS_TOKEN_ID,
multi_modal_kwargs=None, pooling_params=None,
multi_modal_placeholders=None, block_hasher=get_request_block_hasher(
multi_modal_hashes=None, block_size, hash_fn))
eos_token_id=EOS_TOKEN_ID,
pooling_params=None,
block_hasher=get_request_block_hasher(
block_size, hash_fn))
else:
request = Request(request_id=f"{i}",
prompt_token_ids=[i] * num_tokens,
sampling_params=sampling_params,
eos_token_id=EOS_TOKEN_ID,
pooling_params=None,
block_hasher=get_request_block_hasher(
block_size, hash_fn))
requests.append(request) requests.append(request)
return requests return requests
@@ -85,25 +67,15 @@ def make_output(scheduler):
} }
sampled_token_ids = [[1000]] * len(scheduler.running) sampled_token_ids = [[1000]] * len(scheduler.running)
logprobs = None logprobs = None
if vllm_version_is("0.10.1.1") or vllm_version_is("0.10.1"):
modelrunner_output = ModelRunnerOutput( modelrunner_output = ModelRunnerOutput(
req_ids=req_ids, req_ids=req_ids,
req_id_to_index=req_id_to_index, req_id_to_index=req_id_to_index,
sampled_token_ids=sampled_token_ids, sampled_token_ids=sampled_token_ids,
spec_token_ids=None, logprobs=logprobs,
logprobs=logprobs, prompt_logprobs_dict={},
prompt_logprobs_dict={}, pooler_output=[],
pooler_output=[], )
)
else:
modelrunner_output = ModelRunnerOutput(
req_ids=req_ids,
req_id_to_index=req_id_to_index,
sampled_token_ids=sampled_token_ids,
logprobs=logprobs,
prompt_logprobs_dict={},
pooler_output=[],
)
return modelrunner_output return modelrunner_output
@@ -304,69 +276,34 @@ class TestAscendScheduler(TestBase):
scheduler.running.append(req) scheduler.running.append(req)
req.status = RequestStatus.RUNNING req.status = RequestStatus.RUNNING
if vllm_version_is("0.10.1.1") or vllm_version_is("0.10.1"): scheduler_output = SchedulerOutput(scheduled_new_reqs=[],
scheduler_output = SchedulerOutput( scheduled_cached_reqs=[],
scheduled_new_reqs=[], num_scheduled_tokens={
scheduled_cached_reqs=[], requests[0].request_id: 1,
num_scheduled_tokens={ requests[1].request_id: 2
requests[0].request_id: 1, },
requests[1].request_id: 2 total_num_scheduled_tokens=3,
}, scheduled_encoder_inputs={},
total_num_scheduled_tokens=3, scheduled_spec_decode_tokens={
scheduled_encoder_inputs={}, requests[0].request_id: [],
scheduled_spec_decode_tokens={ requests[1].request_id: [10]
requests[0].request_id: [], },
requests[1].request_id: [10] num_common_prefix_blocks=0,
}, finished_req_ids=set(),
num_common_prefix_blocks=0, free_encoder_mm_hashes=[],
finished_req_ids=set(), structured_output_request_ids={},
free_encoder_input_ids=[], grammar_bitmask=None)
structured_output_request_ids={}, model_output = ModelRunnerOutput(
grammar_bitmask=None) req_ids=[req.request_id for req in requests],
model_output = ModelRunnerOutput( req_id_to_index={
req_ids=[req.request_id for req in requests], req.request_id: i
req_id_to_index={ for i, req in enumerate(requests)
req.request_id: i },
for i, req in enumerate(requests) sampled_token_ids=[[EOS_TOKEN_ID], [10, 11]
}, ], # First request hits EOS, second continues
sampled_token_ids=[[EOS_TOKEN_ID], [ logprobs=None,
10, 11 prompt_logprobs_dict={},
]], # First request hits EOS, second continues pooler_output=[])
spec_token_ids=None,
logprobs=None,
prompt_logprobs_dict={},
pooler_output=[])
else:
scheduler_output = SchedulerOutput(
scheduled_new_reqs=[],
scheduled_cached_reqs=[],
num_scheduled_tokens={
requests[0].request_id: 1,
requests[1].request_id: 2
},
total_num_scheduled_tokens=3,
scheduled_encoder_inputs={},
scheduled_spec_decode_tokens={
requests[0].request_id: [],
requests[1].request_id: [10]
},
num_common_prefix_blocks=0,
finished_req_ids=set(),
free_encoder_mm_hashes=[],
structured_output_request_ids={},
grammar_bitmask=None)
model_output = ModelRunnerOutput(
req_ids=[req.request_id for req in requests],
req_id_to_index={
req.request_id: i
for i, req in enumerate(requests)
},
sampled_token_ids=[[EOS_TOKEN_ID], [
10, 11
]], # First request hits EOS, second continues
logprobs=None,
prompt_logprobs_dict={},
pooler_output=[])
scheduler.update_from_output(scheduler_output, model_output) scheduler.update_from_output(scheduler_output, model_output)
@@ -391,67 +328,35 @@ class TestAscendScheduler(TestBase):
scheduler.running.append(req) scheduler.running.append(req)
req.status = RequestStatus.RUNNING req.status = RequestStatus.RUNNING
if vllm_version_is("0.10.1.1") or vllm_version_is("0.10.1"): scheduler_output = SchedulerOutput(scheduled_new_reqs=[],
scheduler_output = SchedulerOutput( scheduled_cached_reqs=[],
scheduled_new_reqs=[], num_scheduled_tokens={
scheduled_cached_reqs=[], requests[0].request_id: 3,
num_scheduled_tokens={ requests[1].request_id: 2
requests[0].request_id: 3, },
requests[1].request_id: 2 total_num_scheduled_tokens=5,
}, scheduled_encoder_inputs={},
total_num_scheduled_tokens=5, scheduled_spec_decode_tokens={
scheduled_encoder_inputs={}, requests[0].request_id:
scheduled_spec_decode_tokens={ [10, 42],
requests[0].request_id: [10, 42], requests[1].request_id: [13]
requests[1].request_id: [13] },
}, num_common_prefix_blocks=0,
num_common_prefix_blocks=0, finished_req_ids=set(),
finished_req_ids=set(), free_encoder_mm_hashes=[],
free_encoder_input_ids=[], structured_output_request_ids={},
structured_output_request_ids={}, grammar_bitmask=None)
grammar_bitmask=None) model_output = ModelRunnerOutput(
model_output = ModelRunnerOutput( req_ids=[req.request_id for req in requests],
req_ids=[req.request_id for req in requests], req_id_to_index={
req_id_to_index={ req.request_id: i
req.request_id: i for i, req in enumerate(requests)
for i, req in enumerate(requests) },
}, sampled_token_ids=[[10, 42, 12],
sampled_token_ids=[[10, 42, 12], [13, 14]], # First request hits stop token
[13, 14]], # First request hits stop token logprobs=None,
spec_token_ids=None, prompt_logprobs_dict={},
logprobs=None, pooler_output=[])
prompt_logprobs_dict={},
pooler_output=[])
else:
scheduler_output = SchedulerOutput(
scheduled_new_reqs=[],
scheduled_cached_reqs=[],
num_scheduled_tokens={
requests[0].request_id: 3,
requests[1].request_id: 2
},
total_num_scheduled_tokens=5,
scheduled_encoder_inputs={},
scheduled_spec_decode_tokens={
requests[0].request_id: [10, 42],
requests[1].request_id: [13]
},
num_common_prefix_blocks=0,
finished_req_ids=set(),
free_encoder_mm_hashes=[],
structured_output_request_ids={},
grammar_bitmask=None)
model_output = ModelRunnerOutput(
req_ids=[req.request_id for req in requests],
req_id_to_index={
req.request_id: i
for i, req in enumerate(requests)
},
sampled_token_ids=[[10, 42, 12],
[13, 14]], # First request hits stop token
logprobs=None,
prompt_logprobs_dict={},
pooler_output=[])
scheduler.update_from_output(scheduler_output, model_output) scheduler.update_from_output(scheduler_output, model_output)
@@ -475,67 +380,35 @@ class TestAscendScheduler(TestBase):
scheduler.running.append(req) scheduler.running.append(req)
req.status = RequestStatus.RUNNING req.status = RequestStatus.RUNNING
if vllm_version_is("0.10.1.1") or vllm_version_is("0.10.1"): scheduler_output = SchedulerOutput(scheduled_new_reqs=[],
scheduler_output = SchedulerOutput( scheduled_cached_reqs=[],
scheduled_new_reqs=[], num_scheduled_tokens={
scheduled_cached_reqs=[], requests[0].request_id: 3,
num_scheduled_tokens={ requests[1].request_id: 1
requests[0].request_id: 3, },
requests[1].request_id: 1 total_num_scheduled_tokens=4,
}, scheduled_encoder_inputs={},
total_num_scheduled_tokens=4, scheduled_spec_decode_tokens={
scheduled_encoder_inputs={}, requests[0].request_id:
scheduled_spec_decode_tokens={ [10, 11],
requests[0].request_id: [10, 11], requests[1].request_id: []
requests[1].request_id: [] },
}, num_common_prefix_blocks=0,
num_common_prefix_blocks=0, finished_req_ids=set(),
finished_req_ids=set(), free_encoder_mm_hashes=[],
free_encoder_input_ids=[], structured_output_request_ids={},
structured_output_request_ids={}, grammar_bitmask=None)
grammar_bitmask=None) model_output = ModelRunnerOutput(
model_output = ModelRunnerOutput( req_ids=[req.request_id for req in requests],
req_ids=[req.request_id for req in requests], req_id_to_index={
req_id_to_index={ req.request_id: i
req.request_id: i for i, req in enumerate(requests)
for i, req in enumerate(requests) },
}, sampled_token_ids=[[10, 11, 12],
sampled_token_ids=[[10, 11, 12], [13]], # First request exceeds max_tokens
[13]], # First request exceeds max_tokens logprobs=None,
spec_token_ids=None, prompt_logprobs_dict={},
logprobs=None, pooler_output=[])
prompt_logprobs_dict={},
pooler_output=[])
else:
scheduler_output = SchedulerOutput(
scheduled_new_reqs=[],
scheduled_cached_reqs=[],
num_scheduled_tokens={
requests[0].request_id: 3,
requests[1].request_id: 1
},
total_num_scheduled_tokens=4,
scheduled_encoder_inputs={},
scheduled_spec_decode_tokens={
requests[0].request_id: [10, 11],
requests[1].request_id: []
},
num_common_prefix_blocks=0,
finished_req_ids=set(),
free_encoder_mm_hashes=[],
structured_output_request_ids={},
grammar_bitmask=None)
model_output = ModelRunnerOutput(
req_ids=[req.request_id for req in requests],
req_id_to_index={
req.request_id: i
for i, req in enumerate(requests)
},
sampled_token_ids=[[10, 11, 12],
[13]], # First request exceeds max_tokens
logprobs=None,
prompt_logprobs_dict={},
pooler_output=[])
scheduler.update_from_output(scheduler_output, model_output) scheduler.update_from_output(scheduler_output, model_output)
# Verify first request stopped due to length # Verify first request stopped due to length
@@ -556,52 +429,27 @@ class TestAscendScheduler(TestBase):
scheduler.requests[requests[0].request_id] = requests[0] scheduler.requests[requests[0].request_id] = requests[0]
scheduler.running.append(requests[0]) scheduler.running.append(requests[0])
if vllm_version_is("0.10.1.1") or vllm_version_is("0.10.1"): scheduler_output = SchedulerOutput(
scheduler_output = SchedulerOutput( scheduled_new_reqs=[],
scheduled_new_reqs=[], scheduled_cached_reqs=[],
scheduled_cached_reqs=[], num_scheduled_tokens={requests[0].request_id: 3},
num_scheduled_tokens={requests[0].request_id: 3}, total_num_scheduled_tokens=3,
total_num_scheduled_tokens=3, scheduled_encoder_inputs={},
scheduled_encoder_inputs={}, scheduled_spec_decode_tokens={
scheduled_spec_decode_tokens={ requests[0].request_id: [EOS_TOKEN_ID, 10]
requests[0].request_id: [EOS_TOKEN_ID, 10] },
}, num_common_prefix_blocks=0,
num_common_prefix_blocks=0, finished_req_ids=set(),
finished_req_ids=set(), free_encoder_mm_hashes=[],
free_encoder_input_ids=[], structured_output_request_ids={},
structured_output_request_ids={}, grammar_bitmask=None)
grammar_bitmask=None) model_output = ModelRunnerOutput(
model_output = ModelRunnerOutput( req_ids=[requests[0].request_id],
req_ids=[requests[0].request_id], req_id_to_index={requests[0].request_id: 0},
req_id_to_index={requests[0].request_id: 0}, sampled_token_ids=[[EOS_TOKEN_ID, 10, 11]],
sampled_token_ids=[[EOS_TOKEN_ID, 10, 11]], logprobs=None,
spec_token_ids=None, prompt_logprobs_dict={},
logprobs=None, pooler_output=[])
prompt_logprobs_dict={},
pooler_output=[])
else:
scheduler_output = SchedulerOutput(
scheduled_new_reqs=[],
scheduled_cached_reqs=[],
num_scheduled_tokens={requests[0].request_id: 3},
total_num_scheduled_tokens=3,
scheduled_encoder_inputs={},
scheduled_spec_decode_tokens={
requests[0].request_id: [EOS_TOKEN_ID, 10]
},
num_common_prefix_blocks=0,
finished_req_ids=set(),
free_encoder_mm_hashes=[],
structured_output_request_ids={},
grammar_bitmask=None)
model_output = ModelRunnerOutput(
req_ids=[requests[0].request_id],
req_id_to_index={requests[0].request_id: 0},
sampled_token_ids=[[EOS_TOKEN_ID, 10, 11]],
logprobs=None,
prompt_logprobs_dict={},
pooler_output=[])
scheduler.update_from_output(scheduler_output, model_output) scheduler.update_from_output(scheduler_output, model_output)
@@ -652,23 +500,13 @@ class TestAscendScheduler(TestBase):
512) 512)
# Model output of the first request. # Model output of the first request.
if vllm_version_is("0.10.1.1") or vllm_version_is("0.10.1"): model_runner_output = ModelRunnerOutput(
model_runner_output = ModelRunnerOutput( req_ids=[requests[0].request_id],
req_ids=[requests[0].request_id], req_id_to_index={requests[0].request_id: 0},
req_id_to_index={requests[0].request_id: 0}, sampled_token_ids=[[0]],
sampled_token_ids=[[0]], logprobs=None,
spec_token_ids=None, prompt_logprobs_dict={},
logprobs=None, pooler_output=[])
prompt_logprobs_dict={},
pooler_output=[])
else:
model_runner_output = ModelRunnerOutput(
req_ids=[requests[0].request_id],
req_id_to_index={requests[0].request_id: 0},
sampled_token_ids=[[0]],
logprobs=None,
prompt_logprobs_dict={},
pooler_output=[])
scheduler.update_from_output(scheduler_output0, scheduler.update_from_output(scheduler_output0,
model_runner_output) model_runner_output)
@@ -678,23 +516,13 @@ class TestAscendScheduler(TestBase):
# request is still running. # request is still running.
scheduler.schedule() scheduler.schedule()
# Model output of the second request. # Model output of the second request.
if vllm_version_is("0.10.1.1") or vllm_version_is("0.10.1"): model_runner_output = ModelRunnerOutput(
model_runner_output = ModelRunnerOutput( req_ids=[requests[1].request_id],
req_ids=[requests[1].request_id], req_id_to_index={requests[1].request_id: 0},
req_id_to_index={requests[1].request_id: 0}, sampled_token_ids=[[0]],
sampled_token_ids=[[0]], logprobs=None,
spec_token_ids=None, prompt_logprobs_dict={},
logprobs=None, pooler_output=[])
prompt_logprobs_dict={},
pooler_output=[])
else:
model_runner_output = ModelRunnerOutput(
req_ids=[requests[1].request_id],
req_id_to_index={requests[1].request_id: 0},
sampled_token_ids=[[0]],
logprobs=None,
prompt_logprobs_dict={},
pooler_output=[])
scheduler.update_from_output(scheduler_output1, scheduler.update_from_output(scheduler_output1,
model_runner_output) model_runner_output)
@@ -746,29 +574,19 @@ class TestAscendScheduler(TestBase):
req_id = requests[i].request_id req_id = requests[i].request_id
self.assertEqual(output.num_scheduled_tokens[req_id], 1) self.assertEqual(output.num_scheduled_tokens[req_id], 1)
self.assertNotIn(req_id, output.scheduled_spec_decode_tokens) self.assertNotIn(req_id, output.scheduled_spec_decode_tokens)
if vllm_version_is("0.10.1.1") or vllm_version_is("0.10.1"):
model_runner_output = ModelRunnerOutput( model_runner_output = ModelRunnerOutput(
req_ids=req_ids, req_ids=req_ids,
req_id_to_index=req_to_index, req_id_to_index=req_to_index,
sampled_token_ids=[[0] for _ in range(len(requests))], sampled_token_ids=[[0] for _ in range(len(requests))],
logprobs=None, logprobs=None,
prompt_logprobs_dict={}, prompt_logprobs_dict={},
spec_token_ids=spec_tokens, pooler_output=[])
pooler_output=[]) draft_token_ids = DraftTokenIds(req_ids, spec_tokens)
else:
model_runner_output = ModelRunnerOutput(
req_ids=req_ids,
req_id_to_index=req_to_index,
sampled_token_ids=[[0] for _ in range(len(requests))],
logprobs=None,
prompt_logprobs_dict={},
pooler_output=[])
draft_token_ids = DraftTokenIds(req_ids, spec_tokens)
engine_core_outputs = scheduler.update_from_output( engine_core_outputs = scheduler.update_from_output(
output, model_runner_output) output, model_runner_output)
if not (vllm_version_is("0.10.1.1") or vllm_version_is("0.10.1")): scheduler.update_draft_token_ids(draft_token_ids)
scheduler.update_draft_token_ids(draft_token_ids)
for i in range(len(requests)): for i in range(len(requests)):
running_req = scheduler.running[i] running_req = scheduler.running[i]
@@ -804,23 +622,14 @@ class TestAscendScheduler(TestBase):
else: else:
self.assertNotIn(req_id, self.assertNotIn(req_id,
output.scheduled_spec_decode_tokens) output.scheduled_spec_decode_tokens)
if vllm_version_is("0.10.1.1") or vllm_version_is("0.10.1"):
model_runner_output = ModelRunnerOutput( model_runner_output = ModelRunnerOutput(
req_ids=req_ids, req_ids=req_ids,
req_id_to_index=req_to_index, req_id_to_index=req_to_index,
sampled_token_ids=output_tokens, sampled_token_ids=output_tokens,
spec_token_ids=None, logprobs=None,
logprobs=None, prompt_logprobs_dict={},
prompt_logprobs_dict={}, pooler_output=[])
pooler_output=[])
else:
model_runner_output = ModelRunnerOutput(
req_ids=req_ids,
req_id_to_index=req_to_index,
sampled_token_ids=output_tokens,
logprobs=None,
prompt_logprobs_dict={},
pooler_output=[])
engine_core_outputs = scheduler.update_from_output( engine_core_outputs = scheduler.update_from_output(
output, model_runner_output) output, model_runner_output)

View File

@@ -19,8 +19,6 @@ from vllm.v1.outputs import ModelRunnerOutput
from vllm.v1.request import Request from vllm.v1.request import Request
from vllm.v1.structured_output import StructuredOutputManager from vllm.v1.structured_output import StructuredOutputManager
from vllm_ascend.utils import vllm_version_is
EOS_TOKEN_ID = 50256 EOS_TOKEN_ID = 50256
os.environ["VLLM_USE_V1"] = "1" os.environ["VLLM_USE_V1"] = "1"
@@ -160,27 +158,14 @@ def create_request(
else: else:
prompt_token_ids = [i * request_id for i in range(num_tokens)] prompt_token_ids = [i * request_id for i in range(num_tokens)]
if vllm_version_is("0.10.1.1") or vllm_version_is("0.10.1"): req = Request(
req = Request( request_id=f"id-{request_id}",
request_id=f"id-{request_id}", prompt_token_ids=prompt_token_ids,
prompt_token_ids=prompt_token_ids, sampling_params=sampling_params,
sampling_params=sampling_params, pooling_params=[],
multi_modal_kwargs=None, eos_token_id=EOS_TOKEN_ID,
multi_modal_placeholders=None, block_hasher=block_hasher,
multi_modal_hashes=None, )
pooling_params=[],
eos_token_id=EOS_TOKEN_ID,
block_hasher=block_hasher,
)
else:
req = Request(
request_id=f"id-{request_id}",
prompt_token_ids=prompt_token_ids,
sampling_params=sampling_params,
pooling_params=[],
eos_token_id=EOS_TOKEN_ID,
block_hasher=block_hasher,
)
req.kv_transfer_params = kv_transfer_params req.kv_transfer_params = kv_transfer_params
return req return req
@@ -208,26 +193,15 @@ def create_model_runner_output(
kv_connector_output = KVConnectorOutput(finished_sending=finished_sending, kv_connector_output = KVConnectorOutput(finished_sending=finished_sending,
finished_recving=finished_recving) finished_recving=finished_recving)
extra_args = {"kv_connector_output": kv_connector_output} extra_args = {"kv_connector_output": kv_connector_output}
if vllm_version_is("0.10.1.1") or vllm_version_is("0.10.1"):
model_runner_output = ModelRunnerOutput( model_runner_output = ModelRunnerOutput(
req_ids=req_ids, req_ids=req_ids,
req_id_to_index=req_id_to_index, req_id_to_index=req_id_to_index,
sampled_token_ids=sampled_token_ids, sampled_token_ids=sampled_token_ids,
spec_token_ids=None, logprobs=None,
logprobs=None, prompt_logprobs_dict={},
prompt_logprobs_dict={}, pooler_output=[],
pooler_output=[], **extra_args,
**extra_args, )
)
else:
model_runner_output = ModelRunnerOutput(
req_ids=req_ids,
req_id_to_index=req_id_to_index,
sampled_token_ids=sampled_token_ids,
logprobs=None,
prompt_logprobs_dict={},
pooler_output=[],
**extra_args,
)
return model_runner_output return model_runner_output

View File

@@ -23,6 +23,7 @@ from vllm.distributed.kv_events import KVEventBatch
from vllm.logger import logger from vllm.logger import logger
from vllm.multimodal import MULTIMODAL_REGISTRY, MultiModalRegistry from vllm.multimodal import MULTIMODAL_REGISTRY, MultiModalRegistry
from vllm.utils import cdiv from vllm.utils import cdiv
from vllm.v1.core.kv_cache_manager import KVCacheBlocks
from vllm.v1.core.sched.output import NewRequestData, SchedulerOutput from vllm.v1.core.sched.output import NewRequestData, SchedulerOutput
from vllm.v1.core.sched.scheduler import Scheduler from vllm.v1.core.sched.scheduler import Scheduler
from vllm.v1.engine import EngineCoreEventType, EngineCoreOutputs from vllm.v1.engine import EngineCoreEventType, EngineCoreOutputs
@@ -31,13 +32,6 @@ from vllm.v1.outputs import ModelRunnerOutput
from vllm.v1.request import Request, RequestStatus from vllm.v1.request import Request, RequestStatus
from vllm.v1.structured_output import StructuredOutputManager from vllm.v1.structured_output import StructuredOutputManager
from vllm_ascend.utils import vllm_version_is
if vllm_version_is("0.10.1.1") or vllm_version_is("0.10.1"):
from vllm.v1.core.kv_cache_manager import KVCacheBlocks
else:
KVCacheBlocks = None
class AscendScheduler(Scheduler): class AscendScheduler(Scheduler):
"""This Scheduler extends vllm's original v1 scheduler """This Scheduler extends vllm's original v1 scheduler
@@ -66,10 +60,7 @@ class AscendScheduler(Scheduler):
scheduled_running_reqs: list[Request] = [] scheduled_running_reqs: list[Request] = []
preempted_reqs: list[Request] = [] preempted_reqs: list[Request] = []
if vllm_version_is("0.10.1.1") or vllm_version_is("0.10.1"): req_to_new_blocks: dict[str, KVCacheBlocks] = {}
req_to_new_block_ids: dict[str, list[list[int]]] = {}
else:
req_to_new_blocks: dict[str, KVCacheBlocks] = {}
num_scheduled_tokens: dict[str, int] = {} num_scheduled_tokens: dict[str, int] = {}
token_budget = self.max_num_scheduled_tokens token_budget = self.max_num_scheduled_tokens
# Spec decode-related. # Spec decode-related.
@@ -227,13 +218,10 @@ class AscendScheduler(Scheduler):
if self.lora_config and request.lora_request: if self.lora_config and request.lora_request:
scheduled_loras.add(request.lora_request.lora_int_id) scheduled_loras.add(request.lora_request.lora_int_id)
if vllm_version_is("0.10.1.1") or vllm_version_is("0.10.1"):
req_to_new_block_ids[request.request_id] = ( req_to_new_blocks[
self.kv_cache_manager.get_block_ids(request.request_id)) request.request_id] = self.kv_cache_manager.get_blocks(
else: request.request_id)
req_to_new_blocks[
request.request_id] = self.kv_cache_manager.get_blocks(
request.request_id)
# Update request info. # Update request info.
num_scheduled_tokens[request.request_id] = num_new_tokens num_scheduled_tokens[request.request_id] = num_new_tokens
token_budget -= num_new_tokens token_budget -= num_new_tokens
@@ -322,11 +310,7 @@ class AscendScheduler(Scheduler):
# Schedule the request. # Schedule the request.
scheduled_running_reqs.append(request) scheduled_running_reqs.append(request)
self.scheduled_req_ids.add(request.request_id) self.scheduled_req_ids.add(request.request_id)
if vllm_version_is("0.10.1.1") or vllm_version_is("0.10.1"): req_to_new_blocks[request.request_id] = new_blocks
req_to_new_block_ids[request.request_id] = (
new_blocks.get_block_ids())
else:
req_to_new_blocks[request.request_id] = new_blocks
num_scheduled_tokens[request.request_id] = num_new_tokens num_scheduled_tokens[request.request_id] = num_new_tokens
token_budget -= num_new_tokens token_budget -= num_new_tokens
req_index += 1 req_index += 1
@@ -365,67 +349,36 @@ class AscendScheduler(Scheduler):
any_request, len(self.running))) any_request, len(self.running)))
# Construct the scheduler output. # Construct the scheduler output.
if vllm_version_is("0.10.1.1") or vllm_version_is("0.10.1"): new_reqs_data = [
new_reqs_data = [ NewRequestData.from_request(
NewRequestData.from_request( req, req_to_new_blocks[req.request_id].get_block_ids())
req, req_to_new_block_ids[req.request_id]) for req in scheduled_new_reqs
for req in scheduled_new_reqs ]
]
cached_reqs_data = self._make_cached_request_data(
scheduled_running_reqs, scheduled_resumed_reqs,
num_scheduled_tokens, scheduled_spec_decode_tokens,
req_to_new_block_ids)
else:
new_reqs_data = [
NewRequestData.from_request(
req, req_to_new_blocks[req.request_id].get_block_ids())
for req in scheduled_new_reqs
]
cached_reqs_data = self._make_cached_request_data( cached_reqs_data = self._make_cached_request_data(
scheduled_running_reqs, scheduled_resumed_reqs, scheduled_running_reqs, scheduled_resumed_reqs,
num_scheduled_tokens, scheduled_spec_decode_tokens, num_scheduled_tokens, scheduled_spec_decode_tokens,
req_to_new_blocks) req_to_new_blocks)
scheduled_cached_reqs = cached_reqs_data scheduled_cached_reqs = cached_reqs_data
if vllm_version_is("0.10.1.1") or vllm_version_is("0.10.1"): scheduler_output = SchedulerOutput(
scheduler_output = SchedulerOutput( scheduled_new_reqs=new_reqs_data,
scheduled_new_reqs=new_reqs_data, scheduled_cached_reqs=scheduled_cached_reqs,
scheduled_cached_reqs=scheduled_cached_reqs, num_scheduled_tokens=num_scheduled_tokens,
num_scheduled_tokens=num_scheduled_tokens, total_num_scheduled_tokens=total_num_scheduled_tokens,
total_num_scheduled_tokens=total_num_scheduled_tokens, scheduled_spec_decode_tokens=scheduled_spec_decode_tokens,
scheduled_spec_decode_tokens=scheduled_spec_decode_tokens, scheduled_encoder_inputs={},
scheduled_encoder_inputs={}, num_common_prefix_blocks=num_common_prefix_blocks,
num_common_prefix_blocks=num_common_prefix_blocks, # finished_req_ids is an existing state in the scheduler,
# finished_req_ids is an existing state in the scheduler, # instead of being newly scheduled in this step.
# instead of being newly scheduled in this step. # It contains the request IDs that are finished in between
# It contains the request IDs that are finished in between # the previous and the current steps.
# the previous and the current steps. finished_req_ids=self.finished_req_ids, # type: ignore
finished_req_ids=self.finished_req_ids, # type: ignore free_encoder_mm_hashes=self.encoder_cache_manager.
free_encoder_input_ids=self.encoder_cache_manager. get_freed_mm_hashes(),
get_freed_ids(), structured_output_request_ids={},
structured_output_request_ids={}, grammar_bitmask=None,
grammar_bitmask=None, )
)
else:
scheduler_output = SchedulerOutput(
scheduled_new_reqs=new_reqs_data,
scheduled_cached_reqs=scheduled_cached_reqs,
num_scheduled_tokens=num_scheduled_tokens,
total_num_scheduled_tokens=total_num_scheduled_tokens,
scheduled_spec_decode_tokens=scheduled_spec_decode_tokens,
scheduled_encoder_inputs={},
num_common_prefix_blocks=num_common_prefix_blocks,
# finished_req_ids is an existing state in the scheduler,
# instead of being newly scheduled in this step.
# It contains the request IDs that are finished in between
# the previous and the current steps.
finished_req_ids=self.finished_req_ids, # type: ignore
free_encoder_mm_hashes=self.encoder_cache_manager.
get_freed_mm_hashes(),
structured_output_request_ids={},
grammar_bitmask=None,
)
# NOTE(Kuntai): this function is designed for multiple purposes: # NOTE(Kuntai): this function is designed for multiple purposes:
# 1. Plan the KV cache store # 1. Plan the KV cache store

View File

@@ -51,7 +51,6 @@ from vllm.sequence import IntermediateTensors
from vllm_ascend.ops.fused_moe import AscendFusedMoE from vllm_ascend.ops.fused_moe import AscendFusedMoE
from vllm_ascend.ops.sequence_parallel import (MetadataForPadding, from vllm_ascend.ops.sequence_parallel import (MetadataForPadding,
init_metadata_for_sp) init_metadata_for_sp)
from vllm_ascend.utils import vllm_version_is
class CustomSparseMoeBlock(Qwen3MoeSparseMoeBlock): class CustomSparseMoeBlock(Qwen3MoeSparseMoeBlock):
@@ -255,11 +254,8 @@ class CustomQwen3MoeModel(Qwen3MoeModel):
quant_config = vllm_config.quant_config quant_config = vllm_config.quant_config
parallel_config = vllm_config.parallel_config parallel_config = vllm_config.parallel_config
if vllm_version_is("0.10.1.1") or vllm_version_is("0.10.1"): eplb_config = parallel_config.eplb_config
self.num_redundant_experts = parallel_config.num_redundant_experts self.num_redundant_experts = eplb_config.num_redundant_experts
else:
eplb_config = parallel_config.eplb_config
self.num_redundant_experts = eplb_config.num_redundant_experts
self.padding_idx = config.pad_token_id self.padding_idx = config.pad_token_id
self.vocab_size = config.vocab_size self.vocab_size = config.vocab_size
self.config = config self.config = config

View File

@@ -34,7 +34,7 @@ from vllm_ascend.ops.moe.experts_selector import select_experts
from vllm_ascend.ops.moe.moe_comm_method import (AllGatherCommImpl, from vllm_ascend.ops.moe.moe_comm_method import (AllGatherCommImpl,
AlltoAllCommImpl, MC2CommImpl) AlltoAllCommImpl, MC2CommImpl)
from vllm_ascend.ops.moe.token_dispatcher import setup_token_dispatchers from vllm_ascend.ops.moe.token_dispatcher import setup_token_dispatchers
from vllm_ascend.utils import ACL_FORMAT_FRACTAL_NZ, is_310p, vllm_version_is from vllm_ascend.utils import ACL_FORMAT_FRACTAL_NZ, is_310p
original_unquantized_fused_moe_init_func = UnquantizedFusedMoEMethod.__init__ original_unquantized_fused_moe_init_func = UnquantizedFusedMoEMethod.__init__
@@ -137,67 +137,6 @@ def unquantized_fused_moe_init_func(self, *args, **kwargs):
self.transpose = True self.transpose = True
def forward_oot_v01011(
self,
layer: torch.nn.Module,
x: torch.Tensor,
use_grouped_topk: bool,
top_k: int,
router_logits: torch.Tensor,
renormalize: bool,
topk_group: Optional[int] = None,
num_expert_group: Optional[int] = None,
custom_routing_function: Optional[Callable] = None,
scoring_func: str = "softmax",
e_score_correction_bias: Optional[torch.Tensor] = None,
global_num_experts: int = -1,
expert_map: Optional[torch.Tensor] = None,
apply_router_weight_on_input: bool = False,
activation: str = "silu",
enable_eplb: bool = False,
expert_load_view: Optional[torch.Tensor] = None,
logical_to_physical_map: Optional[torch.Tensor] = None,
logical_replica_count: Optional[torch.Tensor] = None) -> torch.Tensor:
topk_weights, topk_ids, row_idx = select_experts(
hidden_states=x,
router_logits=router_logits,
top_k=top_k,
use_grouped_topk=use_grouped_topk,
renormalize=renormalize,
topk_group=topk_group,
num_expert_group=num_expert_group,
custom_routing_function=custom_routing_function,
scoring_func=scoring_func,
routed_scaling_factor=1.0,
e_score_correction_bias=e_score_correction_bias,
global_num_experts=global_num_experts)
if topk_ids.shape[1] < top_k or is_310p():
assert global_num_experts is not None
return fused_experts_moge(
hidden_states=x,
w1=layer.w13_weight,
w2=layer.w2_weight,
moe_parallel_config=self.moe.moe_parallel_config,
topk_weights=topk_weights,
topk_ids=topk_ids,
top_k=top_k,
global_num_experts=global_num_experts,
expert_map=expert_map,
apply_router_weight_on_input=apply_router_weight_on_input)
moe_comm_method = get_forward_context().moe_comm_method
return moe_comm_method.fused_experts(hidden_states=x,
w1=layer.w13_weight,
w2=layer.w2_weight,
topk_weights=topk_weights,
topk_ids=topk_ids,
row_idx=row_idx,
global_num_experts=global_num_experts,
expert_map=expert_map)
def forward_oot( def forward_oot(
self, self,
layer: torch.nn.Module, layer: torch.nn.Module,
@@ -315,59 +254,32 @@ class AscendFusedMoE(FusedMoE):
num_redundant_experts=0, num_redundant_experts=0,
has_bias=False, has_bias=False,
): ):
if vllm_version_is("0.10.1.1") or vllm_version_is("0.10.1"): super().__init__(
super().__init__( num_experts,
num_experts, top_k,
top_k, hidden_size,
hidden_size, intermediate_size,
intermediate_size, params_dtype,
params_dtype, reduce_results,
reduce_results, renormalize,
renormalize, use_grouped_topk,
use_grouped_topk, num_expert_group,
num_expert_group, topk_group,
topk_group, quant_config,
quant_config, tp_size,
tp_size, ep_size,
ep_size, dp_size,
dp_size, prefix,
prefix, custom_routing_function,
custom_routing_function, scoring_func,
scoring_func, routed_scaling_fator,
e_score_correction_bias, e_score_correction_bias,
apply_router_weight_on_input, apply_router_weight_on_input,
activation, activation,
enable_eplb, enable_eplb,
num_redundant_experts, num_redundant_experts,
has_bias, has_bias,
) )
else:
super().__init__(
num_experts,
top_k,
hidden_size,
intermediate_size,
params_dtype,
reduce_results,
renormalize,
use_grouped_topk,
num_expert_group,
topk_group,
quant_config,
tp_size,
ep_size,
dp_size,
prefix,
custom_routing_function,
scoring_func,
routed_scaling_fator,
e_score_correction_bias,
apply_router_weight_on_input,
activation,
enable_eplb,
num_redundant_experts,
has_bias,
)
setup_token_dispatchers(self.moe_config.ep_size, setup_token_dispatchers(self.moe_config.ep_size,
top_k=self.top_k, top_k=self.top_k,
num_experts=self.global_num_experts, num_experts=self.global_num_experts,
@@ -529,8 +441,4 @@ class AscendSharedFusedMoE(AscendFusedMoE):
UnquantizedFusedMoEMethod.__init__ = unquantized_fused_moe_init_func UnquantizedFusedMoEMethod.__init__ = unquantized_fused_moe_init_func
UnquantizedFusedMoEMethod.process_weights_after_loading = process_weights_after_loading UnquantizedFusedMoEMethod.process_weights_after_loading = process_weights_after_loading
UnquantizedFusedMoEMethod.forward_oot = forward_oot
if vllm_version_is("0.10.1.1") or vllm_version_is("0.10.1"):
UnquantizedFusedMoEMethod.forward_oot = forward_oot_v01011
else:
UnquantizedFusedMoEMethod.forward_oot = forward_oot

View File

@@ -1,16 +1,12 @@
import torch import torch
import torch_npu import torch_npu
from vllm.config import LogprobsMode
from vllm.v1.sample.ops.topk_topp_sampler import TopKTopPSampler, random_sample from vllm.v1.sample.ops.topk_topp_sampler import TopKTopPSampler, random_sample
from vllm.v1.sample.sampler import Sampler from vllm.v1.sample.sampler import Sampler
from vllm_ascend.utils import is_310p, vllm_version_is from vllm_ascend.utils import is_310p
if not (vllm_version_is("0.10.1.1") or vllm_version_is("0.10.1")): DEFAULT_LOGPROBS_MODE = LogprobsMode.RAW_LOGPROBS
from vllm.config import LogprobsMode
DEFAULT_LOGPROBS_MODE = LogprobsMode.RAW_LOGPROBS
else:
LogprobsMode = None
DEFAULT_LOGPROBS_MODE = "raw_logprobs"
class AscendSampler(Sampler): class AscendSampler(Sampler):
@@ -68,19 +64,11 @@ class AscendTopKTopPSampler(TopKTopPSampler):
def forward_native(self, logits, generators, k, p): def forward_native(self, logits, generators, k, p):
"""Override pytorch native implementation to torch_npu""" """Override pytorch native implementation to torch_npu"""
logits = self._apply_top_k_top_p(logits, k, p) logits = self._apply_top_k_top_p(logits, k, p)
if not (vllm_version_is("0.10.1.1") or vllm_version_is("0.10.1")): logits_to_return = None
if self.logprobs_mode == LogprobsMode.PROCESSED_LOGITS:
logits_to_return = None logits_to_return = logits
if self.logprobs_mode == LogprobsMode.PROCESSED_LOGITS: elif self.logprobs_mode == LogprobsMode.PROCESSED_LOGPROBS:
logits_to_return = logits logits_to_return = logits.log_softmax(dim=-1, dtype=torch.float32)
elif self.logprobs_mode == LogprobsMode.PROCESSED_LOGPROBS:
logits_to_return = logits.log_softmax(dim=-1,
dtype=torch.float32)
probs = logits.softmax(dim=-1, dtype=torch.float32) probs = logits.softmax(dim=-1, dtype=torch.float32)
output = None return random_sample(probs, generators), logits_to_return
if vllm_version_is("0.10.1.1") or vllm_version_is("0.10.1"):
output = random_sample(probs, generators)
else:
output = (random_sample(probs, generators), logits_to_return)
return output

View File

@@ -63,8 +63,8 @@ from vllm.utils import (STR_DTYPE_TO_TORCH_DTYPE, DeviceMemoryProfiler,
from vllm.v1.cudagraph_dispatcher import CudagraphDispatcher from vllm.v1.cudagraph_dispatcher import CudagraphDispatcher
from vllm.v1.kv_cache_interface import (FullAttentionSpec, KVCacheConfig, from vllm.v1.kv_cache_interface import (FullAttentionSpec, KVCacheConfig,
KVCacheSpec) KVCacheSpec)
from vllm.v1.outputs import (EMPTY_MODEL_RUNNER_OUTPUT, LogprobsTensors, from vllm.v1.outputs import (EMPTY_MODEL_RUNNER_OUTPUT, DraftTokenIds,
ModelRunnerOutput) LogprobsTensors, ModelRunnerOutput)
from vllm.v1.pool.metadata import PoolingMetadata from vllm.v1.pool.metadata import PoolingMetadata
from vllm.v1.sample.logits_processor import build_logitsprocs from vllm.v1.sample.logits_processor import build_logitsprocs
from vllm.v1.sample.metadata import SamplingMetadata from vllm.v1.sample.metadata import SamplingMetadata
@@ -96,14 +96,9 @@ from vllm_ascend.torchair.torchair_mla import AscendMLATorchairMetadata
from vllm_ascend.utils import (ACL_FORMAT_FRACTAL_ND, ACL_FORMAT_FRACTAL_NZ, from vllm_ascend.utils import (ACL_FORMAT_FRACTAL_ND, ACL_FORMAT_FRACTAL_NZ,
AscendSocVersion, ProfileExecuteDuration, AscendSocVersion, ProfileExecuteDuration,
get_ascend_soc_version, is_310p, get_ascend_soc_version, is_310p,
lmhead_tp_enable, vllm_version_is) lmhead_tp_enable)
from vllm_ascend.worker.npu_input_batch import CachedRequestState, InputBatch from vllm_ascend.worker.npu_input_batch import CachedRequestState, InputBatch
if not (vllm_version_is("0.10.1.1") or vllm_version_is("0.10.1")):
from vllm.v1.outputs import DraftTokenIds
else:
DraftTokenIds = None
if TYPE_CHECKING: if TYPE_CHECKING:
import xgrammar as xgr # type: ignore[import-untyped] import xgrammar as xgr # type: ignore[import-untyped]
from vllm.v1.core.sched.output import SchedulerOutput from vllm.v1.core.sched.output import SchedulerOutput
@@ -195,9 +190,7 @@ class NPUModelRunner(LoRAModelRunnerMixin):
# Lazy initialization, these will be set after __init__ # Lazy initialization, these will be set after __init__
self.kv_caches: List[torch.Tensor] = [] self.kv_caches: List[torch.Tensor] = []
# TODO: remove Dict[str, Dict[int, torch.Tensor]] type after 0.10.1.1 self.encoder_cache: Dict[str, torch.Tensor] = {}
self.encoder_cache: Union[Dict[str, Dict[int, torch.Tensor]],
Dict[str, torch.Tensor]] = {}
self.attn_mask = None self.attn_mask = None
self.attn_state = None self.attn_state = None
self.requests: Dict[str, CachedRequestState] = {} self.requests: Dict[str, CachedRequestState] = {}
@@ -369,8 +362,7 @@ class NPUModelRunner(LoRAModelRunnerMixin):
# Remove finished requests from the cached states. # Remove finished requests from the cached states.
for req_id in scheduler_output.finished_req_ids: for req_id in scheduler_output.finished_req_ids:
self.requests.pop(req_id, None) self.requests.pop(req_id, None)
if vllm_version_is("0.10.1.1") or vllm_version_is("0.10.1"):
self.encoder_cache.pop(req_id, None)
# Remove the finished requests from the persistent batch. # Remove the finished requests from the persistent batch.
# NOTE(woosuk): There could be an edge case where finished_req_ids and # NOTE(woosuk): There could be an edge case where finished_req_ids and
# scheduled_req_ids overlap. This happens when a request is aborted and # scheduled_req_ids overlap. This happens when a request is aborted and
@@ -379,17 +371,8 @@ class NPUModelRunner(LoRAModelRunnerMixin):
# and handling the second as a new request. # and handling the second as a new request.
for req_id in scheduler_output.finished_req_ids: for req_id in scheduler_output.finished_req_ids:
self.input_batch.remove_request(req_id) self.input_batch.remove_request(req_id)
if vllm_version_is("0.10.1.1") or vllm_version_is("0.10.1"): for mm_hash in scheduler_output.free_encoder_mm_hashes:
# Free the cached encoder outputs. self.encoder_cache.pop(mm_hash, None)
for req_id, input_id in scheduler_output.free_encoder_input_ids:
encoder_outputs = self.encoder_cache.get(req_id)
if encoder_outputs is not None:
encoder_outputs.pop(input_id, None)
if not encoder_outputs:
self.encoder_cache.pop(req_id, None)
else:
for mm_hash in scheduler_output.free_encoder_mm_hashes:
self.encoder_cache.pop(mm_hash, None)
# Remove the unscheduled requests from the persistent batch. # Remove the unscheduled requests from the persistent batch.
# NOTE(woosuk): The unscheduled requests are either preempted requests # NOTE(woosuk): The unscheduled requests are either preempted requests
# or running requests that are not scheduled in this step. We remove # or running requests that are not scheduled in this step. We remove
@@ -438,12 +421,7 @@ class NPUModelRunner(LoRAModelRunnerMixin):
num_computed_tokens=new_req_data.num_computed_tokens, num_computed_tokens=new_req_data.num_computed_tokens,
output_token_ids=[], output_token_ids=[],
lora_request=new_req_data.lora_request, lora_request=new_req_data.lora_request,
**({ mm_hashes=new_req_data.mm_hashes,
"mm_hashes": new_req_data.mm_hashes
} if not (vllm_version_is("0.10.1.1")
or vllm_version_is("0.10.1")) else {
"mm_hashes": None
}),
) )
# Only relevant for models using M-RoPE (e.g, Qwen2-VL) # Only relevant for models using M-RoPE (e.g, Qwen2-VL)
@@ -750,25 +728,14 @@ class NPUModelRunner(LoRAModelRunnerMixin):
# Batch the multi-modal inputs. # Batch the multi-modal inputs.
mm_kwargs = list[MultiModalKwargsItem]() mm_kwargs = list[MultiModalKwargsItem]()
if vllm_version_is("0.10.1.1") or vllm_version_is("0.10.1"): mm_hashes_pos = list[tuple[str, PlaceholderRange]]()
req_ids_pos = list[tuple[str, int, PlaceholderRange]]()
else:
mm_hashes_pos = list[tuple[str, PlaceholderRange]]()
for req_id, encoder_input_ids in scheduled_encoder_inputs.items(): for req_id, encoder_input_ids in scheduled_encoder_inputs.items():
req_state = self.requests[req_id] req_state = self.requests[req_id]
if vllm_version_is("0.10.1.1") or vllm_version_is("0.10.1"): for mm_input_id in encoder_input_ids:
for mm_input_id in encoder_input_ids: mm_hash = req_state.mm_hashes[mm_input_id]
mm_kwargs.append(req_state.mm_kwargs[mm_input_id]) mm_kwargs.append(req_state.mm_kwargs[mm_input_id])
req_ids_pos.append((req_id, mm_input_id, mm_hashes_pos.append(
req_state.mm_positions[mm_input_id])) (mm_hash, req_state.mm_positions[mm_input_id]))
else:
for mm_input_id in encoder_input_ids:
# TODO remove this assert after 0.10.1.1
assert req_state.mm_hashes is not None
mm_hash = req_state.mm_hashes[mm_input_id]
mm_kwargs.append(req_state.mm_kwargs[mm_input_id])
mm_hashes_pos.append(
(mm_hash, req_state.mm_positions[mm_input_id]))
# Batch mm inputs as much as we can: if a request in the batch has # Batch mm inputs as much as we can: if a request in the batch has
# multiple modalities or a different modality than the previous one, # multiple modalities or a different modality than the previous one,
# we process it separately to preserve item order. # we process it separately to preserve item order.
@@ -799,26 +766,12 @@ class NPUModelRunner(LoRAModelRunnerMixin):
for output in curr_group_outputs: for output in curr_group_outputs:
encoder_outputs.append(output) encoder_outputs.append(output)
if vllm_version_is("0.10.1.1") or vllm_version_is("0.10.1"):
# Cache the encoder outputs.
for (req_id, input_id, pos_info), output in zip(
req_ids_pos,
encoder_outputs,
):
if req_id not in self.encoder_cache:
self.encoder_cache[req_id] = {}
self.encoder_cache[req_id][input_id] = scatter_mm_placeholders( for (mm_hash, pos_info), output in zip(mm_hashes_pos, encoder_outputs):
output, self.encoder_cache[mm_hash] = scatter_mm_placeholders(
is_embed=pos_info.is_embed, output,
) is_embed=pos_info.is_embed,
else: )
for (mm_hash, pos_info), output in zip(mm_hashes_pos,
encoder_outputs):
self.encoder_cache[mm_hash] = scatter_mm_placeholders(
output,
is_embed=pos_info.is_embed,
)
def _gather_mm_embeddings( def _gather_mm_embeddings(
self, self,
@@ -831,8 +784,7 @@ class NPUModelRunner(LoRAModelRunnerMixin):
req_state = self.requests[req_id] req_state = self.requests[req_id]
num_computed_tokens = req_state.num_computed_tokens num_computed_tokens = req_state.num_computed_tokens
mm_positions = req_state.mm_positions mm_positions = req_state.mm_positions
if not (vllm_version_is("0.10.1.1") or vllm_version_is("0.10.1")): mm_hashes = req_state.mm_hashes
mm_hashes = req_state.mm_hashes
for i, pos_info in enumerate(mm_positions): for i, pos_info in enumerate(mm_positions):
start_pos = pos_info.offset start_pos = pos_info.offset
num_encoder_tokens = pos_info.length num_encoder_tokens = pos_info.length
@@ -850,26 +802,15 @@ class NPUModelRunner(LoRAModelRunnerMixin):
continue continue
start_idx = max(num_computed_tokens - start_pos, 0) start_idx = max(num_computed_tokens - start_pos, 0)
if vllm_version_is("0.10.1.1") or vllm_version_is("0.10.1"): end_idx = min(
end_idx = min( num_computed_tokens - start_pos + num_scheduled_tokens,
num_computed_tokens - start_pos + num_scheduled_tokens, num_encoder_tokens,
num_encoder_tokens) )
assert start_idx < end_idx assert start_idx < end_idx
assert req_id in self.encoder_cache mm_hash = mm_hashes[i]
assert i in self.encoder_cache[req_id] encoder_output = self.encoder_cache.get(mm_hash, None)
encoder_output = self.encoder_cache[req_id][i] assert encoder_output is not None,\
else: f"Encoder cache miss for {mm_hash}."
end_idx = min(
num_computed_tokens - start_pos + num_scheduled_tokens,
num_encoder_tokens,
)
assert start_idx < end_idx
# TODO remove this assert after 0.10.1.1
assert mm_hashes is not None
mm_hash = mm_hashes[i]
encoder_output = self.encoder_cache.get(mm_hash, None)
assert encoder_output is not None,\
f"Encoder cache miss for {mm_hash}."
if (is_embed := pos_info.is_embed) is not None: if (is_embed := pos_info.is_embed) is not None:
is_embed = is_embed[start_idx:end_idx] is_embed = is_embed[start_idx:end_idx]
@@ -1389,52 +1330,6 @@ class NPUModelRunner(LoRAModelRunnerMixin):
hidden_states, attn_metadata, aux_hidden_states) hidden_states, attn_metadata, aux_hidden_states)
return draft_token_ids return draft_token_ids
def _pool_v010(
self,
hidden_states: torch.Tensor,
num_scheduled_tokens: int,
num_scheduled_tokens_np: np.ndarray,
finished_sending: Optional[set[str]] = None,
finished_recving: Optional[set[str]] = None,
kv_connector_output: Optional["KVConnectorOutput"] = None,
) -> ModelRunnerOutput:
assert self.input_batch.num_reqs ==\
len(self.input_batch.pooling_params), \
"Either all or none of the requests in" \
" a batch must be pooling request"
extracted_hidden_states = list(
torch.split(hidden_states[:num_scheduled_tokens],
num_scheduled_tokens_np.tolist()))
pooling_metadata = self.input_batch.pooling_metadata
raw_pooler_output = self.model.pooler(
hidden_states=extracted_hidden_states,
pooling_metadata=pooling_metadata)
pooler_output: list[Optional[torch.Tensor]] = []
seq_lens = self.seq_lens[:self.input_batch.num_reqs]
for raw_output, seq_len, prompt_len in zip(
raw_pooler_output, seq_lens, pooling_metadata.prompt_lens):
if seq_len == prompt_len:
pooler_output.append(raw_output.data.cpu())
else:
pooler_output.append(None)
extra_args = ({"kv_connector_output": kv_connector_output})
modelrunner_output = ModelRunnerOutput(
req_ids=self.input_batch.req_ids,
req_id_to_index=self.input_batch.req_id_to_index,
sampled_token_ids=[],
spec_token_ids=None,
logprobs=None,
prompt_logprobs_dict={},
pooler_output=pooler_output,
**extra_args,
)
return modelrunner_output
def _pool( def _pool(
self, self,
hidden_states: torch.Tensor, hidden_states: torch.Tensor,
@@ -1606,19 +1501,11 @@ class NPUModelRunner(LoRAModelRunnerMixin):
logits = None logits = None
else: else:
if self.input_batch.pooling_params: if self.input_batch.pooling_params:
if vllm_version_is("0.10.1.1") or vllm_version_is( return self._pool(
"0.10.1"): hidden_states,
return self._pool_v010( scheduler_output.total_num_scheduled_tokens,
hidden_states, num_scheduled_tokens_np, finished_sending,
scheduler_output.total_num_scheduled_tokens, finished_recving, kv_connector_output)
num_scheduled_tokens_np, finished_sending,
finished_recving, kv_connector_output)
else:
return self._pool(
hidden_states,
scheduler_output.total_num_scheduled_tokens,
num_scheduled_tokens_np, finished_sending,
finished_recving, kv_connector_output)
sample_hidden_states = hidden_states[logits_indices] sample_hidden_states = hidden_states[logits_indices]
logits = self.model.compute_logits(sample_hidden_states, None) logits = self.model.compute_logits(sample_hidden_states, None)
if broadcast_pp_output: if broadcast_pp_output:
@@ -1759,27 +1646,15 @@ class NPUModelRunner(LoRAModelRunnerMixin):
extra_args = ({"kv_connector_output": kv_connector_output}) extra_args = ({"kv_connector_output": kv_connector_output})
if vllm_version_is("0.10.1.1") or vllm_version_is("0.10.1"): model_runner_output = ModelRunnerOutput(
model_runner_output = ModelRunnerOutput( req_ids=self.input_batch.req_ids,
req_ids=self.input_batch.req_ids, req_id_to_index=self.input_batch.req_id_to_index,
req_id_to_index=self.input_batch.req_id_to_index, sampled_token_ids=valid_sampled_token_ids,
sampled_token_ids=valid_sampled_token_ids, logprobs=logprobs_lists,
logprobs=logprobs_lists, prompt_logprobs_dict=prompt_logprobs_dict,
spec_token_ids=self._draft_token_ids, pooler_output=[],
prompt_logprobs_dict=prompt_logprobs_dict, **extra_args,
pooler_output=[], )
**extra_args,
)
else:
model_runner_output = ModelRunnerOutput(
req_ids=self.input_batch.req_ids,
req_id_to_index=self.input_batch.req_id_to_index,
sampled_token_ids=valid_sampled_token_ids,
logprobs=logprobs_lists,
prompt_logprobs_dict=prompt_logprobs_dict,
pooler_output=[],
**extra_args,
)
durations = ProfileExecuteDuration().pop_captured_sync() durations = ProfileExecuteDuration().pop_captured_sync()
if durations: if durations:
@@ -2079,8 +1954,6 @@ class NPUModelRunner(LoRAModelRunnerMixin):
assert sum(num_scheduled_tokens_list) == num_tokens assert sum(num_scheduled_tokens_list) == num_tokens
assert len(num_scheduled_tokens_list) == num_reqs assert len(num_scheduled_tokens_list) == num_reqs
hidden_states_list = list(
torch.split(hidden_states, num_scheduled_tokens_list))
req_num_tokens = num_tokens // num_reqs req_num_tokens = num_tokens // num_reqs
dummy_token_ids = torch.zeros((num_reqs, req_num_tokens), dummy_token_ids = torch.zeros((num_reqs, req_num_tokens),
@@ -2091,55 +1964,32 @@ class NPUModelRunner(LoRAModelRunnerMixin):
dummy_pooling_params = PoolingParams(task=task) dummy_pooling_params = PoolingParams(task=task)
to_update = model.pooler.get_pooling_updates(task) to_update = model.pooler.get_pooling_updates(task)
to_update.apply(dummy_pooling_params) to_update.apply(dummy_pooling_params)
if vllm_version_is("0.10.1.1") or vllm_version_is("0.10.1"):
dummy_prompt_lens = torch.tensor(
[h.shape[0] for h in hidden_states_list],
device=self.device,
)
dummy_metadata = PoolingMetadata(
prompt_lens=dummy_prompt_lens,
prompt_token_ids=dummy_token_ids,
pooling_params=[dummy_pooling_params] * num_reqs,
)
try: dummy_prompt_lens = torch.tensor(
return model.pooler(hidden_states=hidden_states_list, num_scheduled_tokens_list,
pooling_metadata=dummy_metadata) device="cpu",
except RuntimeError as e: )
if 'out of memory' in str(e): dummy_metadata = PoolingMetadata(
raise RuntimeError( prompt_lens=dummy_prompt_lens,
"NPU out of memory occurred when warming up pooler " prompt_token_ids=dummy_token_ids,
f"({task=}) with {num_reqs} dummy requests. Please try " pooling_params=[dummy_pooling_params] * num_reqs,
"lowering `max_num_seqs` or `gpu_memory_utilization` when " )
"initializing the engine.") from e
else:
raise e
else:
dummy_prompt_lens = torch.tensor(
num_scheduled_tokens_list,
device="cpu",
)
dummy_metadata = PoolingMetadata(
prompt_lens=dummy_prompt_lens,
prompt_token_ids=dummy_token_ids,
pooling_params=[dummy_pooling_params] * num_reqs,
)
dummy_metadata.build_pooling_cursor(num_scheduled_tokens_list, dummy_metadata.build_pooling_cursor(num_scheduled_tokens_list,
device=hidden_states.device) device=hidden_states.device)
try: try:
return model.pooler(hidden_states=hidden_states, return model.pooler(hidden_states=hidden_states,
pooling_metadata=dummy_metadata) pooling_metadata=dummy_metadata)
except RuntimeError as e: except RuntimeError as e:
if 'out of memory' in str(e): if 'out of memory' in str(e):
raise RuntimeError( raise RuntimeError(
"CUDA out of memory occurred when warming up pooler " "CUDA out of memory occurred when warming up pooler "
f"({task=}) with {num_reqs} dummy requests. Please try " f"({task=}) with {num_reqs} dummy requests. Please try "
"lowering `max_num_seqs` or `gpu_memory_utilization` when " "lowering `max_num_seqs` or `gpu_memory_utilization` when "
"initializing the engine.") from e "initializing the engine.") from e
else: else:
raise e raise e
@torch.inference_mode() @torch.inference_mode()
def _dummy_pooler_run( def _dummy_pooler_run(

View File

@@ -39,8 +39,6 @@ from vllm.v1.spec_decode.utils import is_spec_decode_unsupported
from vllm.v1.utils import copy_slice from vllm.v1.utils import copy_slice
from vllm.v1.worker.block_table import MultiGroupBlockTable from vllm.v1.worker.block_table import MultiGroupBlockTable
from vllm_ascend.utils import vllm_version_is
@dataclass @dataclass
class CachedRequestState: class CachedRequestState:
@@ -49,8 +47,7 @@ class CachedRequestState:
prompt_token_ids: list[int] prompt_token_ids: list[int]
mm_kwargs: list[MultiModalKwargsItem] mm_kwargs: list[MultiModalKwargsItem]
mm_positions: list[PlaceholderRange] mm_positions: list[PlaceholderRange]
# TODO: remove Optional after 0.10.1.1 mm_hashes: list[str]
mm_hashes: Optional[list[str]]
sampling_params: Optional[SamplingParams] sampling_params: Optional[SamplingParams]
pooling_params: Optional[PoolingParams] pooling_params: Optional[PoolingParams]
generator: Optional[torch.Generator] generator: Optional[torch.Generator]
@@ -726,20 +723,13 @@ class InputBatch:
pooling_params = [ pooling_params = [
self.pooling_params[req_id] for req_id in self.req_ids self.pooling_params[req_id] for req_id in self.req_ids
] ]
if vllm_version_is("0.10.1.1") or vllm_version_is("0.10.1"):
return PoolingMetadata( return PoolingMetadata(
prompt_lens=torch.from_numpy( prompt_lens=torch.from_numpy(
self.num_prompt_tokens[:self.num_reqs]).to(self.device), self.num_prompt_tokens[:self.num_reqs]),
prompt_token_ids=self.sampling_metadata.prompt_token_ids, prompt_token_ids=self.sampling_metadata.prompt_token_ids,
pooling_params=pooling_params, pooling_params=pooling_params,
) )
else:
return PoolingMetadata(
prompt_lens=torch.from_numpy(
self.num_prompt_tokens[:self.num_reqs]),
prompt_token_ids=self.sampling_metadata.prompt_token_ids,
pooling_params=pooling_params,
)
def _make_prompt_token_ids_tensor(self) -> torch.Tensor: def _make_prompt_token_ids_tensor(self) -> torch.Tensor:
max_prompt_len = self.num_prompt_tokens[:self.num_reqs].max() max_prompt_len = self.num_prompt_tokens[:self.num_reqs].max()

View File

@@ -38,7 +38,8 @@ from vllm.tasks import SupportedTask
from vllm.utils import STR_DTYPE_TO_TORCH_DTYPE, GiB_bytes from vllm.utils import STR_DTYPE_TO_TORCH_DTYPE, GiB_bytes
from vllm.v1.core.sched.output import SchedulerOutput from vllm.v1.core.sched.output import SchedulerOutput
from vllm.v1.kv_cache_interface import KVCacheConfig, KVCacheSpec from vllm.v1.kv_cache_interface import KVCacheConfig, KVCacheSpec
from vllm.v1.outputs import EMPTY_MODEL_RUNNER_OUTPUT, ModelRunnerOutput from vllm.v1.outputs import (EMPTY_MODEL_RUNNER_OUTPUT, DraftTokenIds,
ModelRunnerOutput)
from vllm.v1.worker.worker_base import WorkerBase from vllm.v1.worker.worker_base import WorkerBase
from vllm_ascend.ascend_config import init_ascend_config from vllm_ascend.ascend_config import init_ascend_config
@@ -47,14 +48,9 @@ from vllm_ascend.distributed.parallel_state import init_ascend_model_parallel
from vllm_ascend.platform import NPUPlatform from vllm_ascend.platform import NPUPlatform
from vllm_ascend.utils import (init_ascend_soc_version, from vllm_ascend.utils import (init_ascend_soc_version,
register_ascend_customop, sleep_mode_enabled, register_ascend_customop, sleep_mode_enabled,
try_register_lib, vllm_version_is) try_register_lib)
from vllm_ascend.worker.model_runner_v1 import NPUModelRunner from vllm_ascend.worker.model_runner_v1 import NPUModelRunner
if not (vllm_version_is("0.10.1.1") or vllm_version_is("0.10.1")):
from vllm.v1.outputs import DraftTokenIds
else:
DraftTokenIds = None
class NPUWorker(WorkerBase): class NPUWorker(WorkerBase):