[Test] Add initial multimodal cases of Qwen2.5-VL-7B-Instruct for disaggregated encoder (#5301)

### What this PR does / why we need it?
This PR adds disaggregated encoder e2e tests for Qwen2.5-VL-7B-Instruct.
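At a high level, each test launches one encoder-only `vllm serve` instance (EC producer), one prefill/decode instance (EC consumer), and the `disagg_epd_proxy.py` front end, then sends multimodal requests through the proxy. Below is a condensed sketch of that setup, mirroring the test code added in this PR; the real tests additionally set memory, batching, and eager-mode flags, and the `ec_config` helper is only a shorthand here for the JSON string the tests build inline.

```python
from vllm.utils.network_utils import get_open_port

from tests.e2e.conftest import DisaggEpdProxy, RemoteEPDServer

MODEL = "Qwen/Qwen2.5-VL-7B-Instruct"
# Path the ECExampleConnector uses to hand encoder outputs to the consumer.
SHARED_STORAGE_PATH = "/dev/shm/epd/storage"

encode_port, pd_port, proxy_port = get_open_port(), get_open_port(), get_open_port()


def ec_config(role: str) -> str:
    # Hypothetical helper for brevity: builds the same --ec-transfer-config
    # JSON string the tests construct inline.
    return ('{"ec_connector_extra_config":{"shared_storage_path":"'
            + SHARED_STORAGE_PATH +
            '"},"ec_connector":"ECExampleConnector","ec_role": "' + role + '"}')


# One encode instance (producer) and one prefill/decode instance (consumer);
# RemoteEPDServer starts a separate `vllm serve` process per argument list.
server_args = [
    ["--port", str(encode_port), "--model", MODEL,
     "--ec-transfer-config", ec_config("ec_producer")],
    ["--port", str(pd_port), "--model", MODEL,
     "--ec-transfer-config", ec_config("ec_consumer")],
]

# The proxy routes image requests to the encoder first, then to prefill/decode.
proxy_args = [
    "--host", "127.0.0.1", "--port", str(proxy_port),
    "--encode-servers-urls", f"http://localhost:{encode_port}",
    "--decode-servers-urls", f"http://localhost:{pd_port}",
    "--prefill-servers-urls", "disable",
]

with RemoteEPDServer(vllm_serve_args=server_args), DisaggEpdProxy(proxy_args=proxy_args):
    ...  # send OpenAI-compatible multimodal requests against proxy_port
```

The two new test files drive this setup with `send_image_request` (a single-image smoke test) and `run_aisbench_cases` (textvqa accuracy and performance cases).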
### Does this PR introduce _any_ user-facing change?
No
### How was this patch tested?
By running the new tests and CI.

- vLLM version: release/v0.12.0

---------

Signed-off-by: wangyu31577 <wangyu31577@hundsun.com>
Signed-off-by: wangyu <53896905+yenuo26@users.noreply.github.com>
Co-authored-by: wangyu31577 <wangyu31577@hundsun.com>
wangyu committed via GitHub on 2026-02-06 17:30:17 +08:00
parent 06c0aed124, commit c63b7a1188
8 changed files with 1361 additions and 1 deletion


@@ -18,6 +18,7 @@
#
import contextlib
import copy
import functools
import gc
import json
@@ -27,11 +28,15 @@ import os
import shlex
import subprocess
import sys
import threading
import time
import traceback
from pathlib import Path
from typing import Any, Optional, Tuple, TypeVar, Union
import numpy as np
import openai
import psutil
import pytest
import requests
import torch
@@ -80,6 +85,10 @@ logger = logging.getLogger(__name__)
_TEST_DIR = os.path.dirname(__file__)
_LONG_PROMPTS = [os.path.join(_TEST_DIR, "prompts", "long_prompt.txt")]
DISAGG_EPD_PROXY_SCRIPT = Path(
__file__
).parent.parent.parent / "examples" / "disaggregated_encoder" / "disagg_epd_proxy.py"
def _check_npu_memory_worker(target_free_percentage: float, max_wait_seconds: float):
import torch_npu # type: ignore
@@ -441,6 +450,216 @@ class RemoteOpenAIServer:
**kwargs)
class RemoteEPDServer(RemoteOpenAIServer):
def _start_server(self, model: str, server_cmd: list[str],
env_dict: Optional[dict[str, str]]) -> None:
"""Subclasses override this method to customize server process launch
"""
raise NotImplementedError("RemoteEPDServer should use _start_server_with_prefix instead")
def __init__(self,
vllm_serve_args: Union[list[str], list[list[str]]],
server_host: str = '0.0.0.0',
env_dict: Optional[dict[str, str]] = None,
max_wait_seconds: Optional[float] = 2800) -> None:
self._proc_list = []
self.env_dict: dict[str, str] = {}
if env_dict is not None:
self.env_dict.update(env_dict)
self.env_dict['VLLM_ALLOW_LONG_MAX_MODEL_LEN'] = "1"
self.env_dict['VLLM_USE_V1'] = "1"
self.env_dict['PYTORCH_NPU_ALLOC_CONF'] = "expandable_segments:True"
self.env_dict['VLLM_WORKER_MULTIPROC_METHOD'] = 'spawn'
self.vllm_serve_args_list = []
self.health_url_list = []
self.host = server_host
if isinstance(vllm_serve_args, list):
if not all(isinstance(item, list) for item in vllm_serve_args):
args_copy = copy.deepcopy(vllm_serve_args)
self.vllm_serve_args_list.append([str(arg) for arg in args_copy])
else:
self.vllm_serve_args_list = [
[str(arg) for arg in sublist]
for sublist in copy.deepcopy(vllm_serve_args)
]
else:
raise RuntimeError("vllm_serve_args must be a list")
serve_arg_cmd = ["vllm", "serve"]
for i, vllm_serve_arg in enumerate(self.vllm_serve_args_list):
self.env_dict['ASCEND_RT_VISIBLE_DEVICES'] = str(i)
if isinstance(vllm_serve_arg, list):
if "--port" not in vllm_serve_arg:
raise ValueError("You have manually specified the port ")
else:
port_arg = "--port"
try:
index = vllm_serve_arg.index(port_arg)
except ValueError:
raise ValueError(f"--port not found in args: {vllm_serve_arg}")
port_str = vllm_serve_arg[index + 1]
self.port = int(port_str)
else:
vllm_serve_arg_str = str(vllm_serve_arg)
if "--port" not in vllm_serve_arg_str:
raise ValueError("You have manually specified the port ")
else:
raise ValueError(f"Unexpected type for vllm_serve_arg: {type(vllm_serve_arg)}")
self.health_url_list.append(super().url_for("health"))
vllm_serve_arg = [*serve_arg_cmd, *vllm_serve_arg]
proc = self._start_server_with_prefix(vllm_serve_arg, self.env_dict,
f"[VLLM_{i}] ")
self._proc_list.append(proc)
timeout_value = float(max_wait_seconds) if max_wait_seconds is not None else 2800.0
super()._wait_for_multiple_servers([(self.host, url)
for url in self.health_url_list],
timeout=timeout_value)
def _poll(self) -> Optional[int]:
return None
def _delete_shm(self) -> None:
for i, arg in enumerate(self.vllm_serve_args_list):
if "--ec-transfer-config" in arg:
index = arg.index("--ec-transfer-config")
config_str = arg[index + 1]
config_dict = json.loads(config_str)
ec_connector_extra_config = config_dict.get("ec_connector_extra_config", {})
shm_path = ec_connector_extra_config.get("shared_storage_path")
if shm_path:
args = ["rm", "-r", "-f", str(shm_path)]
print(f"delete shm_path is: {shm_path}")
self._start_server_with_prefix(args, None, "[DELETE] ")
def _read_output(self, pipe, prefix):
try:
with pipe:
for line in iter(pipe.readline, ''):
if line:
print(f"{prefix}: {line}", end='')
except Exception as e:
print(f"error: {e}")
traceback.print_exc()
def _start_server_with_prefix(self, server_cmd: list[str],
env_dict: Optional[dict[str, str]], log_prefix: str):
env = os.environ.copy()
if env_dict is not None:
env.update(env_dict)
proc = subprocess.Popen(server_cmd,
env=env,
stdout=subprocess.PIPE,
stderr=subprocess.PIPE,
universal_newlines=True,
bufsize=1)
stdout_thread = threading.Thread(target=self._read_output,
args=(proc.stdout, log_prefix),
daemon=True)
stderr_thread = threading.Thread(target=self._read_output,
args=(proc.stderr, log_prefix),
daemon=True)
stdout_thread.start()
stderr_thread.start()
return proc
def _terminate_server(self) -> None:
"""kill process and its children"""
print("vllm instance is stopping")
for proc in self._proc_list:
parent = psutil.Process(proc.pid)
children = parent.children(recursive=True)
for child in children:
try:
child.terminate()
except psutil.NoSuchProcess:
pass
gone, still_alive = psutil.wait_procs(children, timeout=10)
for child in still_alive:
try:
child.kill()
except psutil.NoSuchProcess:
pass
try:
parent.terminate()
parent.wait(timeout=10)
except (psutil.NoSuchProcess, psutil.TimeoutExpired):
try:
parent.kill()
except psutil.NoSuchProcess:
pass
def __enter__(self):
"""Context manager entry point."""
return self
def __exit__(self, exc_type, exc_val, exc_tb):
"""Context manager exit point - clean up all processes."""
self._terminate_server()
class DisaggEpdProxy(RemoteEPDServer):
def __init__(self,
proxy_args: Optional[Union[list[str], str]] = None,
env_dict: Optional[dict[str, str]] = None,
server_host: str = '0.0.0.0',
max_wait_seconds: Optional[float] = 2800) -> None:
if proxy_args is None:
proxy_args_list: list[str] = []
elif isinstance(proxy_args, str):
proxy_args_list = shlex.split(proxy_args)
else:
proxy_args_list = proxy_args
self.proxy_args = proxy_args_list
self.env_dict: dict[str, str] = {}
if env_dict is not None:
self.env_dict.update(env_dict)
self._proc_list = list()
self.host = server_host
print(f"proxy param is: {self.proxy_args}")
proxy_cmd = ["python", str(DISAGG_EPD_PROXY_SCRIPT), *self.proxy_args]
proc = self._start_server_with_prefix(proxy_cmd, self.env_dict, "[PROXY] ")
self._proc_list.append(proc)
if "--port" not in self.proxy_args:
raise ValueError("You have manually specified the port ")
else:
try:
index = self.proxy_args.index("--port")
except ValueError:
raise ValueError("--port not found in proxy args")
port_str = self.proxy_args[index + 1]
self.port = int(port_str)
timeout_value = float(max_wait_seconds) if max_wait_seconds is not None else 2800.0
super()._wait_for_multiple_servers(
[(self.host, super().url_for("health"))], timeout=timeout_value)
def __enter__(self):
"""Context manager entry point."""
return self
def __exit__(self, exc_type, exc_val, exc_tb):
"""Context manager exit point - clean up all processes."""
super()._terminate_server()
class VllmRunner:
def __init__(


@@ -0,0 +1,71 @@
# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved.
# Copyright 2023 The vLLM team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# This file is a part of the vllm-ascend project.
#
import pytest
from vllm.utils.network_utils import get_open_port
from tests.e2e.conftest import DisaggEpdProxy, RemoteEPDServer
from tools.send_mm_request import send_image_request
MODELS = [
"Qwen/Qwen2.5-VL-7B-Instruct",
]
SHARED_STORAGE_PATH = "/dev/shm/epd/storage"
TENSOR_PARALLELS = [1]
@pytest.mark.asyncio
@pytest.mark.parametrize("model", MODELS)
@pytest.mark.parametrize("tp_size", TENSOR_PARALLELS)
async def test_models(model: str, tp_size: int) -> None:
encode_port = get_open_port()
pd_port = get_open_port()
vllm_server_args = [
[
"--port",
str(encode_port), "--model", model, "--gpu-memory-utilization",
"0.01", "--tensor-parallel-size",
str(tp_size), "--enforce-eager", "--no-enable-prefix-caching",
"--max-model-len", "10000", "--max-num-batched-tokens", "10000",
"--max-num-seqs", "1", "--ec-transfer-config",
'{"ec_connector_extra_config":{"shared_storage_path":"' +
SHARED_STORAGE_PATH +
'"},"ec_connector":"ECExampleConnector","ec_role": "ec_producer"}'
],
[
"--port",
str(pd_port), "--model", model, "--gpu-memory-utilization", "0.95",
"--tensor-parallel-size",
str(tp_size), "--enforce-eager", "--max-model-len", "10000",
"--max-num-batched-tokens", "10000", "--max-num-seqs", "128",
"--ec-transfer-config",
'{"ec_connector_extra_config":{"shared_storage_path":"' +
SHARED_STORAGE_PATH +
'"},"ec_connector":"ECExampleConnector","ec_role": "ec_consumer"}'
]
]
proxy_port = get_open_port()
proxy_args = [
"--host", "127.0.0.1", "--port",
str(proxy_port), "--encode-servers-urls",
f"http://localhost:{encode_port}", "--decode-servers-urls",
f"http://localhost:{pd_port}", "--prefill-servers-urls", "disable"
]
with RemoteEPDServer(vllm_serve_args=vllm_server_args) as _:
with DisaggEpdProxy(proxy_args=proxy_args) as proxy:
send_image_request(model, proxy)


@@ -0,0 +1,110 @@
# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved.
# Copyright 2023 The vLLM team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# This file is a part of the vllm-ascend project.
#
import pytest
from vllm.utils.network_utils import get_open_port
from tests.e2e.conftest import DisaggEpdProxy, RemoteEPDServer
from tools.aisbench import run_aisbench_cases
MODELS = [
"Qwen/Qwen2.5-VL-7B-Instruct",
]
SHARED_STORAGE_PATH = "/dev/shm/epd/storage"
TENSOR_PARALLELS = [1]
warmup_cases = [{
"case_type": "performance",
"dataset_path": "vllm-ascend/textvqa-perf-1080p",
"request_conf": "vllm_api_stream_chat",
"dataset_conf": "textvqa/textvqa_gen_base64",
"num_prompts": 50,
"max_out_len": 20,
"batch_size": 32,
"request_rate": 0,
"baseline": 1,
"threshold": 0.97
}]
aisbench_cases = [{
"case_type": "accuracy",
"dataset_path": "vllm-ascend/textvqa-lite",
"request_conf": "vllm_api_stream_chat",
"dataset_conf": "textvqa/textvqa_gen_base64",
"max_out_len": 2048,
"batch_size": 128,
"baseline": 82.05,
"threshold": 5
}, {
"case_type": "performance",
"dataset_path": "vllm-ascend/textvqa-perf-1080p",
"request_conf": "vllm_api_stream_chat",
"dataset_conf": "textvqa/textvqa_gen_base64",
"num_prompts": 512,
"max_out_len": 256,
"batch_size": 128,
"request_rate": 0,
"baseline": 1,
"threshold": 0.97
}]
@pytest.mark.asyncio
@pytest.mark.parametrize("model", MODELS)
@pytest.mark.parametrize("tp_size", TENSOR_PARALLELS)
async def test_models(model: str, tp_size: int) -> None:
encode_port = get_open_port()
pd_port = get_open_port()
vllm_server_args = [
[
"--port",
str(encode_port), "--model", model, "--gpu-memory-utilization",
"0.01", "--tensor-parallel-size",
str(tp_size), "--enforce-eager", "--no-enable-prefix-caching",
"--max-model-len", "10000", "--max-num-batched-tokens", "10000",
"--max-num-seqs", "1", "--ec-transfer-config",
'{"ec_connector_extra_config":{"shared_storage_path":"' +
SHARED_STORAGE_PATH +
'"},"ec_connector":"ECExampleConnector","ec_role": "ec_producer"}'
],
[
"--port",
str(pd_port), "--model", model, "--gpu-memory-utilization", "0.95",
"--tensor-parallel-size",
str(tp_size), "--enforce-eager", "--max-model-len", "10000",
"--max-num-batched-tokens", "10000", "--max-num-seqs", "128",
"--ec-transfer-config",
'{"ec_connector_extra_config":{"shared_storage_path":"' +
SHARED_STORAGE_PATH +
'"},"ec_connector":"ECExampleConnector","ec_role": "ec_consumer"}'
]
]
proxy_port = get_open_port()
proxy_args = [
"--host", "127.0.0.1", "--port",
str(proxy_port), "--encode-servers-urls",
f"http://localhost:{encode_port}", "--decode-servers-urls",
f"http://localhost:{pd_port}", "--prefill-servers-urls", "disable"
]
with RemoteEPDServer(vllm_serve_args=vllm_server_args) as _:
with DisaggEpdProxy(proxy_args=proxy_args) as _:
# warm up
run_aisbench_cases(model=model,
port=proxy_port,
aisbench_cases=warmup_cases)
# aisbench test
run_aisbench_cases(model, proxy_port, aisbench_cases)