[CI] Upgrade vLLM to 20250919 (6d8246aa) and fix some broken issue (#2907)

### What this PR does / why we need it?
1. This pr bump vllm commit to
6d8246aaff
2. fix upstream changes https://github.com/vllm-project/vllm/pull/24548
abort multi-modal kwargs, make vllm main and `v0.10.2` both adaptable
3. fix metadata_builder changes introduced by
https://github.com/vllm-project/vllm/pull/23693
4. fix `structured_outputs_config` changes introduced by
https://github.com/vllm-project/vllm/pull/22772
5. fix `moe_config` changes introduced by
https://github.com/vllm-project/vllm/pull/22537

Co-authored-by:  MengqingCao <cmq0113@163.com>
Co-authored-by:  Yikun Jiang <yikunkero@gmail.com>


- vLLM version: v0.10.2
- vLLM main:
c60e6137f0

---------

Signed-off-by: wangli <wangli858794774@gmail.com>
Signed-off-by: MengqingCao <cmq0113@163.com>
Co-authored-by: MengqingCao <cmq0113@163.com>
This commit is contained in:
Li Wang
2025-09-20 17:37:57 +08:00
committed by GitHub
parent 53ecd89e8f
commit 12bcbd02bb
14 changed files with 359 additions and 143 deletions

View File

@@ -18,12 +18,20 @@
#
import json
import os
from typing import Any, Dict
import jsonschema
import pytest
import regex as re
from vllm_ascend.utils import vllm_version_is
if vllm_version_is("0.10.2"):
from vllm.sampling_params import GuidedDecodingParams, SamplingParams
else:
from vllm.sampling_params import SamplingParams, StructuredOutputsParams
from vllm.outputs import RequestOutput
from vllm.sampling_params import GuidedDecodingParams, SamplingParams
from tests.e2e.conftest import VllmRunner
@@ -84,16 +92,29 @@ def sample_json_schema():
@pytest.mark.parametrize("guided_decoding_backend", GuidedDecodingBackend)
def test_guided_json_completion(guided_decoding_backend: str,
sample_json_schema):
sampling_params = SamplingParams(
temperature=1.0,
max_tokens=500,
guided_decoding=GuidedDecodingParams(json=sample_json_schema))
with VllmRunner(
MODEL_NAME,
seed=0,
guided_decoding_backend=guided_decoding_backend,
) as vllm_model:
runner_kwargs: Dict[str, Any] = {}
if vllm_version_is("0.10.2"):
sampling_params = SamplingParams(
temperature=1.0,
max_tokens=500,
guided_decoding=GuidedDecodingParams(json=sample_json_schema))
runner_kwargs = {
"seed": 0,
"guided_decoding_backend": guided_decoding_backend,
}
else:
sampling_params = SamplingParams(
temperature=1.0,
max_tokens=500,
structured_outputs=StructuredOutputsParams(
json=sample_json_schema))
runner_kwargs = {
"seed": 0,
"structured_outputs_config": {
"backend": guided_decoding_backend
},
}
with VllmRunner(MODEL_NAME, **runner_kwargs) as vllm_model:
prompts = [
f"Give an example JSON for an employee profile "
f"that fits this schema: {sample_json_schema}"
@@ -121,17 +142,29 @@ def test_guided_json_completion(guided_decoding_backend: str,
def test_guided_regex(guided_decoding_backend: str, sample_regex):
if guided_decoding_backend == "outlines":
pytest.skip("Outlines doesn't support regex-based guided decoding.")
runner_kwargs: Dict[str, Any] = {}
if vllm_version_is("0.10.2"):
sampling_params = SamplingParams(
temperature=0.8,
top_p=0.95,
guided_decoding=GuidedDecodingParams(regex=sample_regex))
runner_kwargs = {
"seed": 0,
"guided_decoding_backend": guided_decoding_backend,
}
else:
sampling_params = SamplingParams(
temperature=0.8,
top_p=0.95,
structured_outputs=StructuredOutputsParams(regex=sample_regex))
runner_kwargs = {
"seed": 0,
"structured_outputs_config": {
"backend": guided_decoding_backend
},
}
sampling_params = SamplingParams(
temperature=0.8,
top_p=0.95,
guided_decoding=GuidedDecodingParams(regex=sample_regex))
with VllmRunner(
MODEL_NAME,
seed=0,
guided_decoding_backend=guided_decoding_backend,
) as vllm_model:
with VllmRunner(MODEL_NAME, **runner_kwargs) as vllm_model:
prompts = [
f"Give an example IPv4 address with this regex: {sample_regex}"
] * 2

View File

@@ -231,6 +231,9 @@ class MockFusedMoEMethod(FusedMoEMethodBase):
expert_weights: torch.Tensor) -> torch.Tensor:
pass
def get_fused_moe_quant_config(self, layer: torch.nn.Module):
pass
class TestAscendFusedMoe:

View File

@@ -197,6 +197,9 @@ class MockFusedMoEMethod(FusedMoEMethodBase):
expert_weights: torch.Tensor) -> torch.Tensor:
pass
def get_fused_moe_quant_config(self, layer: torch.nn.Module):
pass
class TestTorchairAscendFusedMoe: