[Test] Add initial multi modal cases of Qwen2.5-VL-7B-Instruct for disaggregated encoder (#5301)
### What this PR does / why we need it?
This PR adds disaggregated encoder tests for Qwen2.5-VL-7B-Instruct.

### Does this PR introduce _any_ user-facing change?
No.

### How was this patch tested?
By running the new tests in CI.

- vLLM version: release/v0.12.0

---------

Signed-off-by: wangyu31577 <wangyu31577@hundsun.com>
Signed-off-by: wangyu <53896905+yenuo26@users.noreply.github.com>
Co-authored-by: wangyu31577 <wangyu31577@hundsun.com>
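The new tests stand up one encoder instance (`ec_producer`) and one colocated prefill+decode instance (`ec_consumer`) that hand off encoder outputs through a shared-storage EC connector, with the example EPD proxy routing requests in front. A rough sketch of the wiring, using values taken from the tests in this diff (illustrative only, not a runnable deployment):

    # Both instances point at the same shared storage path; only the role differs.
    ec_producer_config = {
        "ec_connector": "ECExampleConnector",
        "ec_role": "ec_producer",
        "ec_connector_extra_config": {"shared_storage_path": "/dev/shm/epd/storage"},
    }
    ec_consumer_config = {**ec_producer_config, "ec_role": "ec_consumer"}
    # The proxy fronts both instances; prefill is colocated with decode, so the
    # separate prefill pool is disabled:
    #   --encode-servers-urls http://localhost:<ENCODE_PORT>
    #   --decode-servers-urls http://localhost:<PD_PORT>
    #   --prefill-servers-urls disable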
@@ -18,6 +18,7 @@
#
import contextlib
import copy
import functools
import gc
import json
@@ -27,11 +28,15 @@ import os
import shlex
import subprocess
import sys
import threading
import time
import traceback
from pathlib import Path
from typing import Any, Optional, Tuple, TypeVar, Union

import numpy as np
import openai
import psutil
import pytest
import requests
import torch
@@ -80,6 +85,10 @@ logger = logging.getLogger(__name__)
_TEST_DIR = os.path.dirname(__file__)
_LONG_PROMPTS = [os.path.join(_TEST_DIR, "prompts", "long_prompt.txt")]

DISAGG_EPD_PROXY_SCRIPT = Path(
    __file__
).parent.parent.parent / "examples" / "disaggregated_encoder" / "disagg_epd_proxy.py"


def _check_npu_memory_worker(target_free_percentage: float, max_wait_seconds: float):
    import torch_npu  # type: ignore
@@ -441,6 +450,216 @@ class RemoteOpenAIServer:
                         **kwargs)


class RemoteEPDServer(RemoteOpenAIServer):

    def _start_server(self, model: str, server_cmd: list[str],
                      env_dict: Optional[dict[str, str]]) -> None:
        """Not supported here; RemoteEPDServer launches its processes via
        _start_server_with_prefix instead."""
        raise NotImplementedError(
            "RemoteEPDServer should use _start_server_with_prefix instead")

    def __init__(self,
                 vllm_serve_args: Union[list[str], list[list[str]]],
                 server_host: str = '0.0.0.0',
                 env_dict: Optional[dict[str, str]] = None,
                 max_wait_seconds: Optional[float] = 2800) -> None:

        self._proc_list = []

        self.env_dict: dict[str, str] = {}
        if env_dict is not None:
            self.env_dict.update(env_dict)

        self.env_dict['VLLM_ALLOW_LONG_MAX_MODEL_LEN'] = "1"
        self.env_dict['VLLM_USE_V1'] = "1"
        self.env_dict['PYTORCH_NPU_ALLOC_CONF'] = "expandable_segments:True"
        self.env_dict['VLLM_WORKER_MULTIPROC_METHOD'] = 'spawn'

        self.vllm_serve_args_list = []
        self.health_url_list = []
        self.host = server_host

        # Normalize the args into a list of argv lists: a flat list launches a
        # single server, a list of lists launches one server per sublist.
        if isinstance(vllm_serve_args, list):
            if not all(isinstance(item, list) for item in vllm_serve_args):
                args_copy = copy.deepcopy(vllm_serve_args)
                self.vllm_serve_args_list.append([str(arg) for arg in args_copy])
            else:
                self.vllm_serve_args_list = [
                    [str(arg) for arg in sublist]
                    for sublist in copy.deepcopy(vllm_serve_args)
                ]
        else:
            raise RuntimeError("vllm_serve_args must be a list")

        serve_arg_cmd = ["vllm", "serve"]
        # Launch one `vllm serve` process per argv list, pinning instance i to
        # NPU i via ASCEND_RT_VISIBLE_DEVICES.
        for i, vllm_serve_arg in enumerate(self.vllm_serve_args_list):
            self.env_dict['ASCEND_RT_VISIBLE_DEVICES'] = str(i)
            if isinstance(vllm_serve_arg, list):
                if "--port" not in vllm_serve_arg:
                    raise ValueError(
                        "--port must be specified explicitly in the serve args")
                try:
                    index = vllm_serve_arg.index("--port")
                except ValueError:
                    raise ValueError(
                        f"--port not found in args: {vllm_serve_arg}")
                port_str = vllm_serve_arg[index + 1]
                self.port = int(port_str)
            else:
                raise ValueError(
                    f"Unexpected type for vllm_serve_arg: {type(vllm_serve_arg)}")

            self.health_url_list.append(super().url_for("health"))
            vllm_serve_arg = [*serve_arg_cmd, *vllm_serve_arg]
            proc = self._start_server_with_prefix(vllm_serve_arg, self.env_dict,
                                                  f"[VLLM_{i}] ")
            self._proc_list.append(proc)

        timeout_value = float(
            max_wait_seconds) if max_wait_seconds is not None else 2800.0
        super()._wait_for_multiple_servers([(self.host, url)
                                            for url in self.health_url_list],
                                           timeout=timeout_value)

    def _poll(self) -> Optional[int]:
        # Always report "still running"; process liveness is handled per-child
        # in _terminate_server.
        return None

    def _delete_shm(self) -> None:
        for arg in self.vllm_serve_args_list:
            if "--ec-transfer-config" in arg:
                index = arg.index("--ec-transfer-config")
                config_str = arg[index + 1]
                config_dict = json.loads(config_str)
                ec_connector_extra_config = config_dict.get(
                    "ec_connector_extra_config", {})
                shm_path = ec_connector_extra_config.get("shared_storage_path")
                if shm_path:
                    args = ["rm", "-r", "-f", str(shm_path)]
                    print(f"deleting shm_path: {shm_path}")
                    self._start_server_with_prefix(args, None, "[DELETE] ")
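    # A sample --ec-transfer-config value, as passed by the tests in this PR
    # (illustrative; _delete_shm only consumes shared_storage_path):
    #
    #   {"ec_connector_extra_config": {"shared_storage_path": "/dev/shm/epd/storage"},
    #    "ec_connector": "ECExampleConnector",
    #    "ec_role": "ec_producer"}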

    def _read_output(self, pipe, prefix):
        try:
            with pipe:
                for line in iter(pipe.readline, ''):
                    if line:
                        print(f"{prefix}: {line}", end='')
        except Exception as e:
            print(f"error: {e}")
            traceback.print_exc()

    def _start_server_with_prefix(self, server_cmd: list[str],
                                  env_dict: Optional[dict[str, str]],
                                  log_prefix: str):
        env = os.environ.copy()
        if env_dict is not None:
            env.update(env_dict)
        proc = subprocess.Popen(server_cmd,
                                env=env,
                                stdout=subprocess.PIPE,
                                stderr=subprocess.PIPE,
                                universal_newlines=True,
                                bufsize=1)
        # Mirror the child's stdout/stderr to our stdout line by line, tagged
        # with log_prefix; daemon threads so they never block interpreter exit.
        stdout_thread = threading.Thread(target=self._read_output,
                                         args=(proc.stdout, log_prefix),
                                         daemon=True)
        stderr_thread = threading.Thread(target=self._read_output,
                                         args=(proc.stderr, log_prefix),
                                         daemon=True)

        stdout_thread.start()
        stderr_thread.start()
        return proc

    def _terminate_server(self) -> None:
        """Kill each server process and all of its children."""
        print("vllm instance is stopping")
        for proc in self._proc_list:
            parent = psutil.Process(proc.pid)
            children = parent.children(recursive=True)
            # Terminate the children first, escalating to kill for stragglers.
            for child in children:
                try:
                    child.terminate()
                except psutil.NoSuchProcess:
                    pass

            gone, still_alive = psutil.wait_procs(children, timeout=10)

            for child in still_alive:
                try:
                    child.kill()
                except psutil.NoSuchProcess:
                    pass

            try:
                parent.terminate()
                parent.wait(timeout=10)
            except (psutil.NoSuchProcess, psutil.TimeoutExpired):
                try:
                    parent.kill()
                except psutil.NoSuchProcess:
                    pass

    def __enter__(self):
        """Context manager entry point."""
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        """Context manager exit point - clean up all processes."""
        self._terminate_server()


class DisaggEpdProxy(RemoteEPDServer):

    def __init__(self,
                 proxy_args: Optional[Union[list[str], str]] = None,
                 env_dict: Optional[dict[str, str]] = None,
                 server_host: str = '0.0.0.0',
                 max_wait_seconds: Optional[float] = 2800) -> None:

        if proxy_args is None:
            proxy_args_list: list[str] = []
        elif isinstance(proxy_args, str):
            proxy_args_list = shlex.split(proxy_args)
        else:
            proxy_args_list = proxy_args

        self.proxy_args = proxy_args_list
        self.env_dict: dict[str, str] = {}
        if env_dict is not None:
            self.env_dict.update(env_dict)
        self._proc_list = list()
        self.host = server_host

        print(f"proxy args: {self.proxy_args}")
        proxy_cmd = ["python", str(DISAGG_EPD_PROXY_SCRIPT), *self.proxy_args]
        proc = self._start_server_with_prefix(proxy_cmd, self.env_dict, "[PROXY] ")
        self._proc_list.append(proc)

        if "--port" not in self.proxy_args:
            raise ValueError("--port must be specified in the proxy args")
        try:
            index = self.proxy_args.index("--port")
        except ValueError:
            raise ValueError("--port not found in proxy args")
        port_str = self.proxy_args[index + 1]
        self.port = int(port_str)

        timeout_value = float(
            max_wait_seconds) if max_wait_seconds is not None else 2800.0
        super()._wait_for_multiple_servers(
            [(self.host, super().url_for("health"))], timeout=timeout_value)

    def __enter__(self):
        """Context manager entry point."""
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        """Context manager exit point - clean up all processes."""
        super()._terminate_server()


class VllmRunner:

    def __init__(
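For reference, a minimal sketch of how the two helpers compose, mirroring the tests added below (ports and argv lists are illustrative placeholders):

    # encode_args / pd_args are per-instance `vllm serve` argv lists, each
    # carrying its own --port and --ec-transfer-config (see the tests below).
    with RemoteEPDServer(vllm_serve_args=[encode_args, pd_args]):
        with DisaggEpdProxy(proxy_args=[
                "--host", "127.0.0.1", "--port", str(proxy_port),
                "--encode-servers-urls", f"http://localhost:{encode_port}",
                "--decode-servers-urls", f"http://localhost:{pd_port}",
                "--prefill-servers-urls", "disable"
        ]) as proxy:
            ...  # issue OpenAI-style requests against proxy.port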

tests/e2e/multicard/2-cards/test_disaggregated_encoder.py (new file, 71 lines)
@@ -0,0 +1,71 @@
# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved.
# Copyright 2023 The vLLM team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# This file is a part of the vllm-ascend project.
#

import pytest
from vllm.utils.network_utils import get_open_port

from tests.e2e.conftest import DisaggEpdProxy, RemoteEPDServer
from tools.send_mm_request import send_image_request

MODELS = [
    "Qwen/Qwen2.5-VL-7B-Instruct",
]
SHARED_STORAGE_PATH = "/dev/shm/epd/storage"
TENSOR_PARALLELS = [1]


@pytest.mark.asyncio
@pytest.mark.parametrize("model", MODELS)
@pytest.mark.parametrize("tp_size", TENSOR_PARALLELS)
async def test_models(model: str, tp_size: int) -> None:
    encode_port = get_open_port()
    pd_port = get_open_port()
    vllm_server_args = [
        [
            "--port",
            str(encode_port), "--model", model, "--gpu-memory-utilization",
            "0.01", "--tensor-parallel-size",
            str(tp_size), "--enforce-eager", "--no-enable-prefix-caching",
            "--max-model-len", "10000", "--max-num-batched-tokens", "10000",
            "--max-num-seqs", "1", "--ec-transfer-config",
            '{"ec_connector_extra_config":{"shared_storage_path":"' +
            SHARED_STORAGE_PATH +
            '"},"ec_connector":"ECExampleConnector","ec_role": "ec_producer"}'
        ],
        [
            "--port",
            str(pd_port), "--model", model, "--gpu-memory-utilization", "0.95",
            "--tensor-parallel-size",
            str(tp_size), "--enforce-eager", "--max-model-len", "10000",
            "--max-num-batched-tokens", "10000", "--max-num-seqs", "128",
            "--ec-transfer-config",
            '{"ec_connector_extra_config":{"shared_storage_path":"' +
            SHARED_STORAGE_PATH +
            '"},"ec_connector":"ECExampleConnector","ec_role": "ec_consumer"}'
        ]
    ]
    proxy_port = get_open_port()
    proxy_args = [
        "--host", "127.0.0.1", "--port",
        str(proxy_port), "--encode-servers-urls",
        f"http://localhost:{encode_port}", "--decode-servers-urls",
        f"http://localhost:{pd_port}", "--prefill-servers-urls", "disable"
    ]

    with RemoteEPDServer(vllm_serve_args=vllm_server_args) as _:
        with DisaggEpdProxy(proxy_args=proxy_args) as proxy:
            send_image_request(model, proxy)

tests/e2e/nightly/single_node/models/test_qwen2_5_vl_7b_epd.py (new file, 110 lines)
@@ -0,0 +1,110 @@
# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved.
# Copyright 2023 The vLLM team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# This file is a part of the vllm-ascend project.
#

import pytest
from vllm.utils.network_utils import get_open_port

from tests.e2e.conftest import DisaggEpdProxy, RemoteEPDServer
from tools.aisbench import run_aisbench_cases

MODELS = [
    "Qwen/Qwen2.5-VL-7B-Instruct",
]
SHARED_STORAGE_PATH = "/dev/shm/epd/storage"
TENSOR_PARALLELS = [1]

warmup_cases = [{
    "case_type": "performance",
    "dataset_path": "vllm-ascend/textvqa-perf-1080p",
    "request_conf": "vllm_api_stream_chat",
    "dataset_conf": "textvqa/textvqa_gen_base64",
    "num_prompts": 50,
    "max_out_len": 20,
    "batch_size": 32,
    "request_rate": 0,
    "baseline": 1,
    "threshold": 0.97
}]
aisbench_cases = [{
    "case_type": "accuracy",
    "dataset_path": "vllm-ascend/textvqa-lite",
    "request_conf": "vllm_api_stream_chat",
    "dataset_conf": "textvqa/textvqa_gen_base64",
    "max_out_len": 2048,
    "batch_size": 128,
    "baseline": 82.05,
    "threshold": 5
}, {
    "case_type": "performance",
    "dataset_path": "vllm-ascend/textvqa-perf-1080p",
    "request_conf": "vllm_api_stream_chat",
    "dataset_conf": "textvqa/textvqa_gen_base64",
    "num_prompts": 512,
    "max_out_len": 256,
    "batch_size": 128,
    "request_rate": 0,
    "baseline": 1,
    "threshold": 0.97
}]


@pytest.mark.asyncio
@pytest.mark.parametrize("model", MODELS)
@pytest.mark.parametrize("tp_size", TENSOR_PARALLELS)
async def test_models(model: str, tp_size: int) -> None:
    encode_port = get_open_port()
    pd_port = get_open_port()
    vllm_server_args = [
        [
            "--port",
            str(encode_port), "--model", model, "--gpu-memory-utilization",
            "0.01", "--tensor-parallel-size",
            str(tp_size), "--enforce-eager", "--no-enable-prefix-caching",
            "--max-model-len", "10000", "--max-num-batched-tokens", "10000",
            "--max-num-seqs", "1", "--ec-transfer-config",
            '{"ec_connector_extra_config":{"shared_storage_path":"' +
            SHARED_STORAGE_PATH +
            '"},"ec_connector":"ECExampleConnector","ec_role": "ec_producer"}'
        ],
        [
            "--port",
            str(pd_port), "--model", model, "--gpu-memory-utilization", "0.95",
            "--tensor-parallel-size",
            str(tp_size), "--enforce-eager", "--max-model-len", "10000",
            "--max-num-batched-tokens", "10000", "--max-num-seqs", "128",
            "--ec-transfer-config",
            '{"ec_connector_extra_config":{"shared_storage_path":"' +
            SHARED_STORAGE_PATH +
            '"},"ec_connector":"ECExampleConnector","ec_role": "ec_consumer"}'
        ]
    ]
    proxy_port = get_open_port()
    proxy_args = [
        "--host", "127.0.0.1", "--port",
        str(proxy_port), "--encode-servers-urls",
        f"http://localhost:{encode_port}", "--decode-servers-urls",
        f"http://localhost:{pd_port}", "--prefill-servers-urls", "disable"
    ]

    with RemoteEPDServer(vllm_serve_args=vllm_server_args) as _:
        with DisaggEpdProxy(proxy_args=proxy_args) as _:
            # warm up
            run_aisbench_cases(model=model,
                               port=proxy_port,
                               aisbench_cases=warmup_cases)
            # aisbench test
            run_aisbench_cases(model, proxy_port, aisbench_cases)