[CI] Upgrade vLLM to 20250919 (6d8246aa) and fix some broken issue (#2907)

### What this PR does / why we need it?
1. This pr bump vllm commit to
6d8246aaff
2. fix upstream changes https://github.com/vllm-project/vllm/pull/24548
abort multi-modal kwargs, make vllm main and `v0.10.2` both adaptable
3. fix metadata_builder changes introduced by
https://github.com/vllm-project/vllm/pull/23693
4. fix `structured_outputs_config` changes introduced by
https://github.com/vllm-project/vllm/pull/22772
5. fix `moe_config` changes introduced by
https://github.com/vllm-project/vllm/pull/22537

Co-authored-by:  MengqingCao <cmq0113@163.com>
Co-authored-by:  Yikun Jiang <yikunkero@gmail.com>


- vLLM version: v0.10.2
- vLLM main:
c60e6137f0

---------

Signed-off-by: wangli <wangli858794774@gmail.com>
Signed-off-by: MengqingCao <cmq0113@163.com>
Co-authored-by: MengqingCao <cmq0113@163.com>
This commit is contained in:
Li Wang
2025-09-20 17:37:57 +08:00
committed by GitHub
parent 53ecd89e8f
commit 12bcbd02bb
14 changed files with 359 additions and 143 deletions

View File

@@ -33,16 +33,10 @@ jobs:
runs-on: ubuntu-latest runs-on: ubuntu-latest
steps: steps:
- name: Checkout vllm-project/vllm repo
uses: actions/checkout@v4
with:
repository: vllm-project/vllm
path: ./vllm-empty
- name: Get vLLM version - name: Get vLLM version
working-directory: ./vllm-empty
run: | run: |
VLLM_COMMIT=$(git rev-parse HEAD) VLLM_COMMIT=6d8246aaffff3ebec84767e373212a7b8da328e2
echo "VLLM_COMMIT=https://github.com/vllm-project/vllm/commit/$VLLM_COMMIT" >> $GITHUB_ENV echo "VLLM_COMMIT=https://github.com/vllm-project/vllm/commit/$VLLM_COMMIT" >> $GITHUB_ENV
- name: Checkout repository - name: Checkout repository

View File

@@ -82,7 +82,7 @@ jobs:
VLLM_USE_MODELSCOPE: True VLLM_USE_MODELSCOPE: True
strategy: strategy:
matrix: matrix:
vllm_version: [v0.10.2] vllm_version: [6d8246aaffff3ebec84767e373212a7b8da328e2, v0.10.2]
steps: steps:
- name: Install packages - name: Install packages
run: | run: |
@@ -118,10 +118,12 @@ jobs:
TORCH_DEVICE_BACKEND_AUTOLOAD: 0 TORCH_DEVICE_BACKEND_AUTOLOAD: 0
run: | run: |
export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/usr/local/Ascend/ascend-toolkit/latest/x86_64-linux/devlib export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/usr/local/Ascend/ascend-toolkit/latest/x86_64-linux/devlib
pytest -sv --cov --cov-report=xml:unittests-coverage.xml tests/ut --ignore=tests/ut/test_platform.py pytest -sv --cov --cov-report=xml:unittests-coverage.xml tests/ut \
--ignore=tests/ut/test_platform.py \
--ignore=tests/ut/patch/worker/patch_common/test_patch_minicpm.py
- name: Upload coverage to Codecov - name: Upload coverage to Codecov
if: ${{ matrix.vllm_version == 'main' }} if: ${{ matrix.vllm_version != 'v0.10.2' }}
uses: codecov/codecov-action@v5 uses: codecov/codecov-action@v5
env: env:
CODECOV_TOKEN: ${{ secrets.CODECOV_TOKEN }} CODECOV_TOKEN: ${{ secrets.CODECOV_TOKEN }}
@@ -138,7 +140,7 @@ jobs:
max-parallel: 2 max-parallel: 2
matrix: matrix:
os: [linux-aarch64-a2-1] os: [linux-aarch64-a2-1]
vllm_version: [v0.10.2] vllm_version: [6d8246aaffff3ebec84767e373212a7b8da328e2, v0.10.2]
name: singlecard e2e test - light name: singlecard e2e test - light
runs-on: ${{ matrix.os }} runs-on: ${{ matrix.os }}
container: container:
@@ -174,6 +176,7 @@ jobs:
repository: vllm-project/vllm repository: vllm-project/vllm
ref: ${{ matrix.vllm_version }} ref: ${{ matrix.vllm_version }}
path: ./vllm-empty path: ./vllm-empty
fetch-depth: 1
- name: Install vllm-project/vllm from source - name: Install vllm-project/vllm from source
working-directory: ./vllm-empty working-directory: ./vllm-empty
@@ -203,7 +206,7 @@ jobs:
max-parallel: 2 max-parallel: 2
matrix: matrix:
os: [linux-aarch64-a2-2] os: [linux-aarch64-a2-2]
vllm_version: [v0.10.2] vllm_version: [6d8246aaffff3ebec84767e373212a7b8da328e2, v0.10.2]
name: multicard e2e test - light name: multicard e2e test - light
runs-on: ${{ matrix.os }} runs-on: ${{ matrix.os }}
container: container:
@@ -239,6 +242,7 @@ jobs:
repository: vllm-project/vllm repository: vllm-project/vllm
ref: ${{ matrix.vllm_version }} ref: ${{ matrix.vllm_version }}
path: ./vllm-empty path: ./vllm-empty
fetch-depth: 1
- name: Install vllm-project/vllm from source - name: Install vllm-project/vllm from source
working-directory: ./vllm-empty working-directory: ./vllm-empty

View File

@@ -72,7 +72,7 @@ jobs:
max-parallel: 2 max-parallel: 2
matrix: matrix:
os: [linux-aarch64-a2-1] os: [linux-aarch64-a2-1]
vllm_version: [v0.10.2] vllm_version: [6d8246aaffff3ebec84767e373212a7b8da328e2, v0.10.2]
name: singlecard e2e test - full name: singlecard e2e test - full
runs-on: ${{ matrix.os }} runs-on: ${{ matrix.os }}
container: container:
@@ -156,7 +156,7 @@ jobs:
max-parallel: 2 max-parallel: 2
matrix: matrix:
os: [linux-aarch64-a2-2] os: [linux-aarch64-a2-2]
vllm_version: [v0.10.2] vllm_version: [6d8246aaffff3ebec84767e373212a7b8da328e2, v0.10.2]
name: multicard e2e test - full name: multicard e2e test - full
runs-on: ${{ matrix.os }} runs-on: ${{ matrix.os }}
container: container:
@@ -210,7 +210,7 @@ jobs:
VLLM_WORKER_MULTIPROC_METHOD: spawn VLLM_WORKER_MULTIPROC_METHOD: spawn
VLLM_USE_MODELSCOPE: True VLLM_USE_MODELSCOPE: True
run: | run: |
pytest -sv tests/e2e/multicard/test_data_parallel.py #pytest -sv tests/e2e/multicard/test_data_parallel.py
pytest -sv tests/e2e/multicard/test_expert_parallel.py pytest -sv tests/e2e/multicard/test_expert_parallel.py
# external_launcher test is not stable enough. Fix it later # external_launcher test is not stable enough. Fix it later
# pytest -sv tests/e2e/multicard/test_external_launcher.py # pytest -sv tests/e2e/multicard/test_external_launcher.py

View File

@@ -18,12 +18,20 @@
# #
import json import json
import os import os
from typing import Any, Dict
import jsonschema import jsonschema
import pytest import pytest
import regex as re import regex as re
from vllm_ascend.utils import vllm_version_is
if vllm_version_is("0.10.2"):
from vllm.sampling_params import GuidedDecodingParams, SamplingParams
else:
from vllm.sampling_params import SamplingParams, StructuredOutputsParams
from vllm.outputs import RequestOutput from vllm.outputs import RequestOutput
from vllm.sampling_params import GuidedDecodingParams, SamplingParams
from tests.e2e.conftest import VllmRunner from tests.e2e.conftest import VllmRunner
@@ -84,16 +92,29 @@ def sample_json_schema():
@pytest.mark.parametrize("guided_decoding_backend", GuidedDecodingBackend) @pytest.mark.parametrize("guided_decoding_backend", GuidedDecodingBackend)
def test_guided_json_completion(guided_decoding_backend: str, def test_guided_json_completion(guided_decoding_backend: str,
sample_json_schema): sample_json_schema):
runner_kwargs: Dict[str, Any] = {}
if vllm_version_is("0.10.2"):
sampling_params = SamplingParams( sampling_params = SamplingParams(
temperature=1.0, temperature=1.0,
max_tokens=500, max_tokens=500,
guided_decoding=GuidedDecodingParams(json=sample_json_schema)) guided_decoding=GuidedDecodingParams(json=sample_json_schema))
runner_kwargs = {
with VllmRunner( "seed": 0,
MODEL_NAME, "guided_decoding_backend": guided_decoding_backend,
seed=0, }
guided_decoding_backend=guided_decoding_backend, else:
) as vllm_model: sampling_params = SamplingParams(
temperature=1.0,
max_tokens=500,
structured_outputs=StructuredOutputsParams(
json=sample_json_schema))
runner_kwargs = {
"seed": 0,
"structured_outputs_config": {
"backend": guided_decoding_backend
},
}
with VllmRunner(MODEL_NAME, **runner_kwargs) as vllm_model:
prompts = [ prompts = [
f"Give an example JSON for an employee profile " f"Give an example JSON for an employee profile "
f"that fits this schema: {sample_json_schema}" f"that fits this schema: {sample_json_schema}"
@@ -121,17 +142,29 @@ def test_guided_json_completion(guided_decoding_backend: str,
def test_guided_regex(guided_decoding_backend: str, sample_regex): def test_guided_regex(guided_decoding_backend: str, sample_regex):
if guided_decoding_backend == "outlines": if guided_decoding_backend == "outlines":
pytest.skip("Outlines doesn't support regex-based guided decoding.") pytest.skip("Outlines doesn't support regex-based guided decoding.")
runner_kwargs: Dict[str, Any] = {}
if vllm_version_is("0.10.2"):
sampling_params = SamplingParams( sampling_params = SamplingParams(
temperature=0.8, temperature=0.8,
top_p=0.95, top_p=0.95,
guided_decoding=GuidedDecodingParams(regex=sample_regex)) guided_decoding=GuidedDecodingParams(regex=sample_regex))
runner_kwargs = {
"seed": 0,
"guided_decoding_backend": guided_decoding_backend,
}
else:
sampling_params = SamplingParams(
temperature=0.8,
top_p=0.95,
structured_outputs=StructuredOutputsParams(regex=sample_regex))
runner_kwargs = {
"seed": 0,
"structured_outputs_config": {
"backend": guided_decoding_backend
},
}
with VllmRunner( with VllmRunner(MODEL_NAME, **runner_kwargs) as vllm_model:
MODEL_NAME,
seed=0,
guided_decoding_backend=guided_decoding_backend,
) as vllm_model:
prompts = [ prompts = [
f"Give an example IPv4 address with this regex: {sample_regex}" f"Give an example IPv4 address with this regex: {sample_regex}"
] * 2 ] * 2

View File

@@ -231,6 +231,9 @@ class MockFusedMoEMethod(FusedMoEMethodBase):
expert_weights: torch.Tensor) -> torch.Tensor: expert_weights: torch.Tensor) -> torch.Tensor:
pass pass
def get_fused_moe_quant_config(self, layer: torch.nn.Module):
pass
class TestAscendFusedMoe: class TestAscendFusedMoe:

View File

@@ -197,6 +197,9 @@ class MockFusedMoEMethod(FusedMoEMethodBase):
expert_weights: torch.Tensor) -> torch.Tensor: expert_weights: torch.Tensor) -> torch.Tensor:
pass pass
def get_fused_moe_quant_config(self, layer: torch.nn.Module):
pass
class TestTorchairAscendFusedMoe: class TestTorchairAscendFusedMoe:

View File

@@ -47,7 +47,8 @@ from vllm_ascend.ops.moe.moe_comm_method import (AllGatherCommImpl,
from vllm_ascend.ops.sequence_parallel import MetadataForPadding from vllm_ascend.ops.sequence_parallel import MetadataForPadding
from vllm_ascend.utils import (ACL_FORMAT_FRACTAL_NZ, from vllm_ascend.utils import (ACL_FORMAT_FRACTAL_NZ,
get_all_reduce_merge_state, get_all_reduce_merge_state,
get_rm_router_logits_state, is_310p) get_rm_router_logits_state, is_310p,
vllm_version_is)
class AscendUnquantizedFusedMoEMethod(UnquantizedFusedMoEMethod): class AscendUnquantizedFusedMoEMethod(UnquantizedFusedMoEMethod):
@@ -278,6 +279,7 @@ class AscendFusedMoE(FusedMoE):
if self.scoring_func != "softmax" and not self.use_grouped_topk: if self.scoring_func != "softmax" and not self.use_grouped_topk:
raise ValueError("Only softmax scoring function is supported for " raise ValueError("Only softmax scoring function is supported for "
"non-grouped topk.") "non-grouped topk.")
if vllm_version_is("0.10.2"):
moe = FusedMoEConfig.make( moe = FusedMoEConfig.make(
num_experts=self.global_num_experts, num_experts=self.global_num_experts,
experts_per_token=top_k, experts_per_token=top_k,
@@ -287,7 +289,15 @@ class AscendFusedMoE(FusedMoE):
# TODO (bnell): this needs to be fixed for quantized types. # TODO (bnell): this needs to be fixed for quantized types.
in_dtype=params_dtype, in_dtype=params_dtype,
quant_config=quant_config) quant_config=quant_config)
else:
moe = FusedMoEConfig(
num_experts=self.global_num_experts,
experts_per_token=top_k,
hidden_dim=hidden_size,
num_local_experts=self.local_num_experts,
moe_parallel_config=self.moe_parallel_config,
in_dtype=params_dtype,
)
self.moe_config = moe self.moe_config = moe
if quant_config is None: if quant_config is None:

View File

@@ -17,4 +17,6 @@
import vllm_ascend.patch.worker.patch_common.patch_distributed # noqa import vllm_ascend.patch.worker.patch_common.patch_distributed # noqa
import vllm_ascend.patch.worker.patch_common.patch_logits # noqa import vllm_ascend.patch.worker.patch_common.patch_logits # noqa
import vllm_ascend.patch.worker.patch_common.patch_minicpm # noqa
# TODO: revert me when triton import is fixed
# import vllm_ascend.patch.worker.patch_common.patch_minicpm # noqa

View File

@@ -31,7 +31,7 @@ from vllm_ascend.ascend_config import (check_ascend_config, get_ascend_config,
from vllm_ascend.torchair.utils import (check_torchair_cache_exist, from vllm_ascend.torchair.utils import (check_torchair_cache_exist,
delete_torchair_cache_file) delete_torchair_cache_file)
from vllm_ascend.utils import (ASCEND_QUANTIZATION_METHOD, is_310p, from vllm_ascend.utils import (ASCEND_QUANTIZATION_METHOD, is_310p,
update_aclgraph_sizes) update_aclgraph_sizes, vllm_version_is)
if TYPE_CHECKING: if TYPE_CHECKING:
from vllm.config import ModelConfig, VllmConfig from vllm.config import ModelConfig, VllmConfig
@@ -128,9 +128,12 @@ class NPUPlatform(Platform):
model_config = vllm_config.model_config model_config = vllm_config.model_config
parallel_config = vllm_config.parallel_config parallel_config = vllm_config.parallel_config
cache_config = vllm_config.cache_config cache_config = vllm_config.cache_config
decoding_config = vllm_config.decoding_config
scheduler_config = vllm_config.scheduler_config scheduler_config = vllm_config.scheduler_config
ascend_scheduler_config = ascend_config.ascend_scheduler_config ascend_scheduler_config = ascend_config.ascend_scheduler_config
if vllm_version_is("0.10.2"):
structured_outputs_config = vllm_config.decoding_config
else:
structured_outputs_config = vllm_config.structured_outputs_config
if model_config is not None and not model_config.use_mla: if model_config is not None and not model_config.use_mla:
logger.info( logger.info(
@@ -138,7 +141,7 @@ class NPUPlatform(Platform):
"as the performance of operators supporting this feature " "as the performance of operators supporting this feature "
"functionality is currently suboptimal.") "functionality is currently suboptimal.")
if not model_config.is_multimodal_model and \ if not model_config.is_multimodal_model and \
decoding_config.backend == "auto" and \ structured_outputs_config.backend == "auto" and \
not scheduler_config.delay_factor > 0 and \ not scheduler_config.delay_factor > 0 and \
not scheduler_config.send_delta_data and \ not scheduler_config.send_delta_data and \
scheduler_config.policy == "fcfs": scheduler_config.policy == "fcfs":

View File

@@ -404,6 +404,10 @@ class AscendFusedMoEMethod(FusedMoEMethodBase):
if hasattr(self.quant_method, "process_weights_after_loading"): if hasattr(self.quant_method, "process_weights_after_loading"):
self.quant_method.process_weights_after_loading(layer) self.quant_method.process_weights_after_loading(layer)
def get_fused_moe_quant_config(self, layer: torch.nn.Module):
# TODO: implement this function
pass
class AscendEmbeddingMethod(AscendLinearMethod): class AscendEmbeddingMethod(AscendLinearMethod):
"""Embedding method for Ascend quantization. """Embedding method for Ascend quantization.

View File

@@ -24,7 +24,8 @@ from vllm_ascend.torchair.models.torchair_deepseek_mtp import \
TorchairDeepSeekMTP TorchairDeepSeekMTP
from vllm_ascend.torchair.utils import (TORCHAIR_CACHE_DIR, from vllm_ascend.torchair.utils import (TORCHAIR_CACHE_DIR,
TorchairCommonAttentionMetadata) TorchairCommonAttentionMetadata)
from vllm_ascend.utils import ProfileExecuteDuration, lmhead_tp_enable from vllm_ascend.utils import (ProfileExecuteDuration, lmhead_tp_enable,
vllm_version_is)
PADDING_SLOT_ID = -1 PADDING_SLOT_ID = -1
@@ -395,7 +396,10 @@ class MtpProposer(Proposer):
seq_lens=None) seq_lens=None)
if not self.torchair_graph_enabled: if not self.torchair_graph_enabled:
if vllm_version_is("0.10.2"):
builder = self.runner.attn_groups[0][0].metadata_builder builder = self.runner.attn_groups[0][0].metadata_builder
else:
builder = self.runner.attn_groups[0][0].get_metadata_builder()
attn_metadata_mtp = builder.build(0, common_attn_metadata, attn_metadata_mtp = builder.build(0, common_attn_metadata,
self.runner.get_model()) self.runner.get_model())

View File

@@ -50,7 +50,8 @@ from vllm_ascend.torchair.utils import npu_stream_switch, npu_wait_tensor
from vllm_ascend.utils import (AscendSocVersion, dispose_tensor, from vllm_ascend.utils import (AscendSocVersion, dispose_tensor,
get_all_reduce_merge_state, get_all_reduce_merge_state,
get_ascend_soc_version, get_ascend_soc_version,
get_rm_router_logits_state, is_310p) get_rm_router_logits_state, is_310p,
vllm_version_is)
def torchair_fused_experts_with_mc2( def torchair_fused_experts_with_mc2(
@@ -1057,6 +1058,8 @@ class TorchairAscendFusedMoE(FusedMoE):
if self.scoring_func != "softmax" and not self.use_grouped_topk: if self.scoring_func != "softmax" and not self.use_grouped_topk:
raise ValueError("Only softmax scoring function is supported for " raise ValueError("Only softmax scoring function is supported for "
"non-grouped topk.") "non-grouped topk.")
if vllm_version_is("0.10.2"):
self.moe = FusedMoEConfig.make( self.moe = FusedMoEConfig.make(
num_experts=self.global_num_experts, num_experts=self.global_num_experts,
experts_per_token=top_k, experts_per_token=top_k,
@@ -1066,7 +1069,15 @@ class TorchairAscendFusedMoE(FusedMoE):
# TODO (bnell): this needs to be fixed for quantized types. # TODO (bnell): this needs to be fixed for quantized types.
in_dtype=params_dtype, in_dtype=params_dtype,
quant_config=quant_config) quant_config=quant_config)
else:
self.moe = FusedMoEConfig(
num_experts=self.global_num_experts,
experts_per_token=top_k,
hidden_dim=hidden_size,
num_local_experts=self.local_num_experts,
moe_parallel_config=self.moe_parallel_config,
in_dtype=params_dtype,
)
if quant_config is None: if quant_config is None:
self.quant_method = TorchairAscendUnquantizedFusedMoEMethod( self.quant_method = TorchairAscendUnquantizedFusedMoEMethod(
self.moe) self.moe)

View File

@@ -112,7 +112,7 @@ from vllm_ascend.spec_decode.mtp_proposer import MtpProposer
from vllm_ascend.utils import (ACL_FORMAT_FRACTAL_ND, ACL_FORMAT_FRACTAL_NZ, from vllm_ascend.utils import (ACL_FORMAT_FRACTAL_ND, ACL_FORMAT_FRACTAL_NZ,
AscendSocVersion, ProfileExecuteDuration, AscendSocVersion, ProfileExecuteDuration,
get_ascend_soc_version, is_310p, get_ascend_soc_version, is_310p,
lmhead_tp_enable) lmhead_tp_enable, vllm_version_is)
from vllm_ascend.worker.npu_input_batch import CachedRequestState, InputBatch from vllm_ascend.worker.npu_input_batch import CachedRequestState, InputBatch
if TYPE_CHECKING: if TYPE_CHECKING:
@@ -569,11 +569,17 @@ class NPUModelRunner(LoRAModelRunnerMixin):
to_update = model.pooler.get_pooling_updates(task) to_update = model.pooler.get_pooling_updates(task)
to_update.apply(pooling_params) to_update.apply(pooling_params)
backward_kwargs = {}
if vllm_version_is("0.10.2"):
backward_kwargs["mm_kwargs"] = new_req_data.mm_kwargs
backward_kwargs["mm_hashes"] = new_req_data.mm_hashes
backward_kwargs["mm_positions"] = new_req_data.mm_positions
else:
backward_kwargs["mm_features"] = new_req_data.mm_features
self.requests[req_id] = CachedRequestState( self.requests[req_id] = CachedRequestState(
req_id=req_id, req_id=req_id,
prompt_token_ids=new_req_data.prompt_token_ids, prompt_token_ids=new_req_data.prompt_token_ids,
mm_kwargs=new_req_data.mm_kwargs,
mm_positions=new_req_data.mm_positions,
sampling_params=sampling_params, sampling_params=sampling_params,
pooling_params=pooling_params, pooling_params=pooling_params,
generator=generator, generator=generator,
@@ -581,46 +587,15 @@ class NPUModelRunner(LoRAModelRunnerMixin):
num_computed_tokens=new_req_data.num_computed_tokens, num_computed_tokens=new_req_data.num_computed_tokens,
output_token_ids=[], output_token_ids=[],
lora_request=new_req_data.lora_request, lora_request=new_req_data.lora_request,
mm_hashes=new_req_data.mm_hashes, **backward_kwargs,
) )
# Only relevant for models using M-RoPE (e.g, Qwen2-VL) # Only relevant for models using M-RoPE (e.g, Qwen2-VL)
if self.uses_mrope: if self.uses_mrope:
image_grid_thw = [] if vllm_version_is("0.10.2"):
video_grid_thw = [] self._init_mrope_positions_0102(self.requests[req_id])
second_per_grid_ts = [] else:
audio_feature_lengths = [] self._init_mrope_positions(self.requests[req_id])
use_audio_in_video = False
for mm_item in self.requests[req_id].mm_kwargs:
mm_input = mm_item.get_data()
if mm_input.get("image_grid_thw") is not None:
image_grid_thw.append(
mm_input["image_grid_thw"].tolist())
if mm_input.get("video_grid_thw") is not None:
video_grid_thw.append(
mm_input["video_grid_thw"].tolist())
if mm_input.get("second_per_grid_ts") is not None:
second_per_grid_ts.append(
mm_input["second_per_grid_ts"])
if mm_input.get("audio_feature_lengths") is not None:
audio_feature_lengths.append(
mm_input["audio_feature_lengths"])
if mm_input.get("use_audio_in_video") is True:
use_audio_in_video = True
hf_config = self.model_config.hf_config
self.requests[req_id].mrope_positions, \
self.requests[req_id].mrope_position_delta = \
MRotaryEmbedding.get_input_positions_tensor(
self.requests[req_id].prompt_token_ids,
hf_config=hf_config,
image_grid_thw=image_grid_thw,
video_grid_thw=video_grid_thw,
second_per_grid_ts=second_per_grid_ts,
audio_feature_lengths=audio_feature_lengths,
use_audio_in_video=use_audio_in_video,
)
req_ids_to_add.append(req_id) req_ids_to_add.append(req_id)
@@ -718,6 +693,73 @@ class NPUModelRunner(LoRAModelRunnerMixin):
# Refresh batch metadata with any pending updates. # Refresh batch metadata with any pending updates.
self.input_batch.refresh_metadata() self.input_batch.refresh_metadata()
def _init_mrope_positions(self, req_state: CachedRequestState):
image_grid_thw = []
video_grid_thw = []
second_per_grid_ts = []
audio_feature_lengths = []
use_audio_in_video = False
assert req_state.mm_features is not None
for mm_feature in req_state.mm_features:
mm_item = mm_feature.data
if mm_item is None:
continue
mm_input = mm_item.get_data()
if (t := mm_input.get("image_grid_thw")) is not None:
image_grid_thw.append(t.tolist())
if (t := mm_input.get("video_grid_thw")) is not None:
video_grid_thw.append(t.tolist())
if (t := mm_input.get("second_per_grid_ts")) is not None:
second_per_grid_ts.append(t)
if (t := mm_input.get("audio_feature_lengths")) is not None:
audio_feature_lengths.append(t)
if mm_input.get("use_audio_in_video") is True:
use_audio_in_video = True
req_state.mrope_positions, req_state.mrope_position_delta = \
MRotaryEmbedding.get_input_positions_tensor(
req_state.prompt_token_ids,
hf_config=self.model_config.hf_config,
image_grid_thw=image_grid_thw,
video_grid_thw=video_grid_thw,
second_per_grid_ts=second_per_grid_ts,
audio_feature_lengths=audio_feature_lengths,
use_audio_in_video=use_audio_in_video,
)
def _init_mrope_positions_0102(self, req_state: CachedRequestState):
image_grid_thw = []
video_grid_thw = []
second_per_grid_ts = []
audio_feature_lengths = []
use_audio_in_video = False
assert req_state.mm_kwargs is not None
for mm_item in req_state.mm_kwargs:
mm_input = mm_item.get_data()
if mm_input.get("image_grid_thw") is not None:
image_grid_thw.append(mm_input["image_grid_thw"].tolist())
if mm_input.get("video_grid_thw") is not None:
video_grid_thw.append(mm_input["video_grid_thw"].tolist())
if mm_input.get("second_per_grid_ts") is not None:
second_per_grid_ts.append(mm_input["second_per_grid_ts"])
if mm_input.get("audio_feature_lengths") is not None:
audio_feature_lengths.append(mm_input["audio_feature_lengths"])
if mm_input.get("use_audio_in_video") is True:
use_audio_in_video = True
hf_config = self.model_config.hf_config
req_state.mrope_positions, req_state.mrope_position_delta = \
MRotaryEmbedding.get_input_positions_tensor(
req_state.prompt_token_ids,
hf_config=hf_config,
image_grid_thw=image_grid_thw,
video_grid_thw=video_grid_thw,
second_per_grid_ts=second_per_grid_ts,
audio_feature_lengths=audio_feature_lengths,
use_audio_in_video=use_audio_in_video,
)
def _sync_metadata_across_dp( def _sync_metadata_across_dp(
self, num_tokens: int, with_prefill: bool, enable_dbo: bool self, num_tokens: int, with_prefill: bool, enable_dbo: bool
) -> tuple[int, Optional[torch.Tensor], bool, bool]: ) -> tuple[int, Optional[torch.Tensor], bool, bool]:
@@ -888,23 +930,14 @@ class NPUModelRunner(LoRAModelRunnerMixin):
return return
# Batch the multi-modal inputs. # Batch the multi-modal inputs.
mm_kwargs = list[MultiModalKwargsItem]() if vllm_version_is("0.10.2"):
mm_hashes_pos = list[tuple[str, PlaceholderRange]]() mm_kwargs, mm_hashes_pos = self._batch_mm_kwargs_from_scheduler_0102(
for req_id, encoder_input_ids in scheduled_encoder_inputs.items(): scheduler_output)
req_state = self.requests[req_id] else:
for mm_input_id in encoder_input_ids: mm_kwargs, mm_hashes_pos = self._batch_mm_kwargs_from_scheduler(
mm_hash = req_state.mm_hashes[mm_input_id] scheduler_output)
mm_kwargs.append(req_state.mm_kwargs[mm_input_id])
mm_hashes_pos.append(
(mm_hash, req_state.mm_positions[mm_input_id]))
# Batch mm inputs as much as we can: if a request in the batch has
# multiple modalities or a different modality than the previous one,
# we process it separately to preserve item order.
# FIXME(ywang96): This is a hacky way to deal with multiple modalities
# in the same batch while still being able to benefit from batching
# multimodal inputs. The proper solution should be reordering the
# encoder outputs.
encoder_outputs = [] encoder_outputs = []
for _, num_items, mm_kwargs_group in group_mm_kwargs_by_modality( for _, num_items, mm_kwargs_group in group_mm_kwargs_by_modality(
mm_kwargs, mm_kwargs,
device=self.device, device=self.device,
@@ -934,32 +967,100 @@ class NPUModelRunner(LoRAModelRunnerMixin):
is_embed=pos_info.is_embed, is_embed=pos_info.is_embed,
) )
# TODO: remove this once we drop support for vLLM 0.10.2
def _batch_mm_kwargs_from_scheduler_0102(
self,
scheduler_output: "SchedulerOutput",
) -> tuple[list[MultiModalKwargsItem], list[tuple[str, PlaceholderRange]]]:
scheduled_encoder_inputs = scheduler_output.scheduled_encoder_inputs
if not scheduled_encoder_inputs:
return [], []
# Batch the multi-modal inputs.
mm_kwargs = list[MultiModalKwargsItem]()
# list of tuple (mm_hash, position_info)
mm_hashes_pos = list[tuple[str, PlaceholderRange]]()
for req_id, encoder_input_ids in scheduled_encoder_inputs.items():
req_state = self.requests[req_id]
assert req_state.mm_hashes is not None
assert req_state.mm_kwargs is not None
assert req_state.mm_positions is not None
for mm_input_id in encoder_input_ids:
mm_hash = req_state.mm_hashes[mm_input_id]
mm_kwargs.append(req_state.mm_kwargs[mm_input_id])
mm_hashes_pos.append(
(mm_hash, req_state.mm_positions[mm_input_id]))
return mm_kwargs, mm_hashes_pos
def _batch_mm_kwargs_from_scheduler(
self,
scheduler_output: "SchedulerOutput",
) -> tuple[list[MultiModalKwargsItem], list[tuple[str, PlaceholderRange]]]:
"""Batch multimodal kwargs from scheduled encoder inputs.
Args:
scheduler_output: The scheduler output containing scheduled encoder
inputs.
Returns:
A tuple of (mm_kwargs, req_ids_pos) where:
- mm_kwargs: List of multimodal kwargs items to be batched
- mm_hashes_pos: List of (mm_hash, position_info) tuples
"""
scheduled_encoder_inputs = scheduler_output.scheduled_encoder_inputs
if not scheduled_encoder_inputs:
return [], []
# Batch the multi-modal inputs.
mm_kwargs = list[MultiModalKwargsItem]()
# list of tuple (mm_hash, position_info)
mm_hashes_pos = list[tuple[str, PlaceholderRange]]()
for req_id, encoder_input_ids in scheduled_encoder_inputs.items():
req_state = self.requests[req_id]
assert req_state.mm_features is not None
for mm_input_id in encoder_input_ids:
mm_feature = req_state.mm_features[mm_input_id]
mm_hash = mm_feature.identifier
mm_kwargs.append(mm_feature.data)
mm_hashes_pos.append((mm_hash, mm_feature.mm_position))
return mm_kwargs, mm_hashes_pos
def _gather_mm_embeddings( def _gather_mm_embeddings(
self, self,
scheduler_output: "SchedulerOutput", scheduler_output: "SchedulerOutput",
) -> list[torch.Tensor]: ) -> list[torch.Tensor]:
def _iter_mm_features(req_state: CachedRequestState):
if vllm_version_is("0.10.2"):
# legacy path (to be removed later)
assert req_state.mm_hashes is not None
assert req_state.mm_positions is not None
for mm_hash, pos_info in zip(req_state.mm_hashes,
req_state.mm_positions):
yield mm_hash, pos_info, getattr(pos_info, "is_embed",
None)
else:
assert req_state.mm_features is not None
for mm_feature in req_state.mm_features:
pos_info = mm_feature.mm_position
yield mm_feature.identifier, pos_info, getattr(
pos_info, "is_embed", None)
mm_embeds: list[torch.Tensor] = [] mm_embeds: list[torch.Tensor] = []
for req_id in self.input_batch.req_ids: for req_id in self.input_batch.req_ids:
num_scheduled_tokens = scheduler_output.num_scheduled_tokens[ num_scheduled_tokens = scheduler_output.num_scheduled_tokens[
req_id] req_id]
req_state = self.requests[req_id] req_state = self.requests[req_id]
num_computed_tokens = req_state.num_computed_tokens num_computed_tokens = req_state.num_computed_tokens
mm_positions = req_state.mm_positions
mm_hashes = req_state.mm_hashes for mm_hash, pos_info, is_embed in _iter_mm_features(req_state):
for i, pos_info in enumerate(mm_positions):
start_pos = pos_info.offset start_pos = pos_info.offset
num_encoder_tokens = pos_info.length num_encoder_tokens = pos_info.length
# The encoder output is needed if the two ranges overlap:
# [num_computed_tokens,
# num_computed_tokens + num_scheduled_tokens) and
# [start_pos, start_pos + num_encoder_tokens)
if start_pos >= num_computed_tokens + num_scheduled_tokens: if start_pos >= num_computed_tokens + num_scheduled_tokens:
# The encoder output is not needed in this step.
break break
if start_pos + num_encoder_tokens <= num_computed_tokens: if start_pos + num_encoder_tokens <= num_computed_tokens:
# The encoder output is already processed and stored
# in the decoder's KV cache.
continue continue
start_idx = max(num_computed_tokens - start_pos, 0) start_idx = max(num_computed_tokens - start_pos, 0)
@@ -968,12 +1069,12 @@ class NPUModelRunner(LoRAModelRunnerMixin):
num_encoder_tokens, num_encoder_tokens,
) )
assert start_idx < end_idx assert start_idx < end_idx
mm_hash = mm_hashes[i]
encoder_output = self.encoder_cache.get(mm_hash, None) encoder_output = self.encoder_cache.get(mm_hash, None)
assert encoder_output is not None,\ assert encoder_output is not None, \
f"Encoder cache miss for {mm_hash}." f"Encoder cache miss for {mm_hash}."
if (is_embed := pos_info.is_embed) is not None: if is_embed is not None:
is_embed = is_embed[start_idx:end_idx] is_embed = is_embed[start_idx:end_idx]
mm_embeds_item = gather_mm_placeholders( mm_embeds_item = gather_mm_placeholders(
@@ -1393,7 +1494,10 @@ class NPUModelRunner(LoRAModelRunnerMixin):
for attn_group in self.attn_groups[kv_cache_group_id]: for attn_group in self.attn_groups[kv_cache_group_id]:
common_prefix_len = 0 common_prefix_len = 0
extra_attn_metadata_args = {} extra_attn_metadata_args = {}
if vllm_version_is("0.10.2"):
builder = attn_group.metadata_builder builder = attn_group.metadata_builder
else:
builder = attn_group.get_metadata_builder()
if isinstance(builder, GDNAttentionMetadataBuilder): if isinstance(builder, GDNAttentionMetadataBuilder):
if use_spec_decode: if use_spec_decode:
extra_attn_metadata_args = dict( extra_attn_metadata_args = dict(
@@ -2828,7 +2932,7 @@ class NPUModelRunner(LoRAModelRunnerMixin):
for k, v in attn_backend_layers.items() for k, v in attn_backend_layers.items()
} }
def create_attn_groups( def create_attn_groups_v0102(
attn_backends_map: dict[AttentionBackend, list[str]], attn_backends_map: dict[AttentionBackend, list[str]],
kv_cache_spec: KVCacheSpec, kv_cache_spec: KVCacheSpec,
) -> list[AttentionGroup]: ) -> list[AttentionGroup]:
@@ -2846,10 +2950,33 @@ class NPUModelRunner(LoRAModelRunnerMixin):
attn_groups.append(attn_group) attn_groups.append(attn_group)
return attn_groups return attn_groups
def create_attn_groups(
attn_backends_map: dict[AttentionBackend, list[str]],
kv_cache_spec: KVCacheSpec,
) -> list[AttentionGroup]:
attn_groups: list[AttentionGroup] = []
for attn_backend, layer_names in attn_backends_map.items():
attn_metadata_builders = []
attn_metadata_builders.append(attn_backend.get_builder_cls()(
kv_cache_spec,
layer_names,
self.vllm_config,
self.device,
))
attn_group = AttentionGroup(attn_backend,
attn_metadata_builders,
layer_names)
attn_groups.append(attn_group)
return attn_groups
for kv_cache_group_spec in kv_cache_config.kv_cache_groups: for kv_cache_group_spec in kv_cache_config.kv_cache_groups:
kv_cache_spec = kv_cache_group_spec.kv_cache_spec kv_cache_spec = kv_cache_group_spec.kv_cache_spec
attn_backends = get_attn_backends_for_layers( attn_backends = get_attn_backends_for_layers(
kv_cache_group_spec.layer_names) kv_cache_group_spec.layer_names)
if vllm_version_is("0.10.2"):
self.attn_groups.append(
create_attn_groups_v0102(attn_backends, kv_cache_spec))
else:
self.attn_groups.append( self.attn_groups.append(
create_attn_groups(attn_backends, kv_cache_spec)) create_attn_groups(attn_backends, kv_cache_spec))
@@ -2865,7 +2992,10 @@ class NPUModelRunner(LoRAModelRunnerMixin):
is compatible (e.g., decode threshold is the same) is compatible (e.g., decode threshold is the same)
""" """
for group in self._attn_group_iterator(): for group in self._attn_group_iterator():
if vllm_version_is("0.10.2"):
attn_metadata_builder_i = group.metadata_builder attn_metadata_builder_i = group.metadata_builder
else:
attn_metadata_builder_i = group.get_metadata_builder()
if hasattr(attn_metadata_builder_i, "reorder_batch_threshold"): if hasattr(attn_metadata_builder_i, "reorder_batch_threshold"):
# check that if any backends reorder batches; that the reordering # check that if any backends reorder batches; that the reordering
# is compatible (e.g., decode threshold is the same) # is compatible (e.g., decode threshold is the same)

View File

@@ -24,8 +24,9 @@ import numpy as np
import torch import torch
from typing_extensions import deprecated from typing_extensions import deprecated
from vllm.lora.request import LoRARequest from vllm.lora.request import LoRARequest
from vllm.multimodal.inputs import (MultiModalKwargs, MultiModalKwargsItem, from vllm.multimodal.inputs import (MultiModalFeatureSpec,
PlaceholderRange) MultiModalKwargsItem,
MultiModalKwargsItems, PlaceholderRange)
from vllm.pooling_params import PoolingParams from vllm.pooling_params import PoolingParams
from vllm.sampling_params import SamplingParams, SamplingType from vllm.sampling_params import SamplingParams, SamplingType
from vllm.utils import swap_dict_values from vllm.utils import swap_dict_values
@@ -38,6 +39,7 @@ from vllm.v1.sample.metadata import SamplingMetadata
from vllm.v1.spec_decode.utils import is_spec_decode_unsupported from vllm.v1.spec_decode.utils import is_spec_decode_unsupported
from vllm.v1.utils import copy_slice from vllm.v1.utils import copy_slice
from vllm_ascend.utils import vllm_version_is
from vllm_ascend.worker.block_table import MultiGroupBlockTable from vllm_ascend.worker.block_table import MultiGroupBlockTable
@@ -46,9 +48,6 @@ class CachedRequestState:
req_id: str req_id: str
prompt_token_ids: list[int] prompt_token_ids: list[int]
mm_kwargs: list[MultiModalKwargsItem]
mm_positions: list[PlaceholderRange]
mm_hashes: list[str]
sampling_params: Optional[SamplingParams] sampling_params: Optional[SamplingParams]
pooling_params: Optional[PoolingParams] pooling_params: Optional[PoolingParams]
generator: Optional[torch.Generator] generator: Optional[torch.Generator]
@@ -60,6 +59,12 @@ class CachedRequestState:
mrope_positions: Optional[torch.Tensor] = None mrope_positions: Optional[torch.Tensor] = None
mrope_position_delta: Optional[int] = None mrope_position_delta: Optional[int] = None
mm_features: Optional[list[MultiModalFeatureSpec]] = None
# for back-compatibility, will be removed in next major release
mm_kwargs: Optional[list[MultiModalKwargsItem]] = None
mm_positions: Optional[list[PlaceholderRange]] = None
mm_hashes: Optional[list[PlaceholderRange]] = None
lora_request: Optional[LoRARequest] = None lora_request: Optional[LoRARequest] = None
def __post_init__(self): def __post_init__(self):
@@ -73,8 +78,18 @@ class CachedRequestState:
@property @property
@deprecated("`mm_inputs` is superseded by `mm_kwargs` and will be " @deprecated("`mm_inputs` is superseded by `mm_kwargs` and will be "
"removed in v0.13. Please use `mm_kwargs` instead.") "removed in v0.13. Please use `mm_kwargs` instead.")
def mm_inputs(self) -> list[MultiModalKwargs]: def mm_inputs(self) -> list[MultiModalKwargsItems]:
return [MultiModalKwargs([item]) for item in self.mm_kwargs] if vllm_version_is("0.10.2"):
assert self.mm_kwargs is not None
return [
MultiModalKwargsItems.from_seq([item])
for item in self.mm_kwargs
]
assert self.mm_features is not None
return [
MultiModalKwargsItems.from_seq([f.data]) for f in self.mm_features
if f.data is not None
]
def get_token_id(self, idx: int) -> int: def get_token_id(self, idx: int) -> int:
if idx < self.num_prompt_tokens: if idx < self.num_prompt_tokens: