[Test] Add initial multimodal cases of Qwen2.5-VL-7B-Instruct for disaggregated encoder (#5301)

### What this PR does / why we need it?
This PR adds disaggregated encoder e2e tests for Qwen2.5-VL-7B-Instruct.
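At a high level, each test launches one encoder-only `vllm serve` instance (EC producer), one prefill/decode instance (EC consumer), and the `disagg_epd_proxy.py` front end, then sends multimodal requests through the proxy. Below is a condensed sketch of that setup, mirroring the test code added in this PR; the real tests additionally set memory, batching, and eager-mode flags, and the `ec_config` helper is only a shorthand here for the JSON string the tests build inline.

```python
from vllm.utils.network_utils import get_open_port

from tests.e2e.conftest import DisaggEpdProxy, RemoteEPDServer

MODEL = "Qwen/Qwen2.5-VL-7B-Instruct"
# Path the ECExampleConnector uses to hand encoder outputs to the consumer.
SHARED_STORAGE_PATH = "/dev/shm/epd/storage"

encode_port, pd_port, proxy_port = get_open_port(), get_open_port(), get_open_port()


def ec_config(role: str) -> str:
    # Hypothetical helper for brevity: builds the same --ec-transfer-config
    # JSON string the tests construct inline.
    return ('{"ec_connector_extra_config":{"shared_storage_path":"'
            + SHARED_STORAGE_PATH +
            '"},"ec_connector":"ECExampleConnector","ec_role": "' + role + '"}')


# One encode instance (producer) and one prefill/decode instance (consumer);
# RemoteEPDServer starts a separate `vllm serve` process per argument list.
server_args = [
    ["--port", str(encode_port), "--model", MODEL,
     "--ec-transfer-config", ec_config("ec_producer")],
    ["--port", str(pd_port), "--model", MODEL,
     "--ec-transfer-config", ec_config("ec_consumer")],
]

# The proxy routes image requests to the encoder first, then to prefill/decode.
proxy_args = [
    "--host", "127.0.0.1", "--port", str(proxy_port),
    "--encode-servers-urls", f"http://localhost:{encode_port}",
    "--decode-servers-urls", f"http://localhost:{pd_port}",
    "--prefill-servers-urls", "disable",
]

with RemoteEPDServer(vllm_serve_args=server_args), DisaggEpdProxy(proxy_args=proxy_args):
    ...  # send OpenAI-compatible multimodal requests against proxy_port
```

The two new test files drive this setup with `send_image_request` (a single-image smoke test) and `run_aisbench_cases` (textvqa accuracy and performance cases).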
### Does this PR introduce _any_ user-facing change?
No
### How was this patch tested?
By running the new tests and CI.

- vLLM version: release/v0.12.0

---------

Signed-off-by: wangyu31577 <wangyu31577@hundsun.com>
Signed-off-by: wangyu <53896905+yenuo26@users.noreply.github.com>
Co-authored-by: wangyu31577 <wangyu31577@hundsun.com>
wangyu committed via GitHub on 2026-02-06 17:30:17 +08:00
parent 06c0aed124, commit c63b7a1188
8 changed files with 1361 additions and 1 deletion


@@ -18,6 +18,7 @@
#
import contextlib
import copy
import functools
import gc
import json
@@ -27,11 +28,15 @@ import os
import shlex
import subprocess
import sys
import threading
import time
import traceback
from pathlib import Path
from typing import Any, Optional, Tuple, TypeVar, Union
import numpy as np
import openai
import psutil
import pytest
import requests
import torch
@@ -80,6 +85,10 @@ logger = logging.getLogger(__name__)
_TEST_DIR = os.path.dirname(__file__)
_LONG_PROMPTS = [os.path.join(_TEST_DIR, "prompts", "long_prompt.txt")]
DISAGG_EPD_PROXY_SCRIPT = Path(
__file__
).parent.parent.parent / "examples" / "disaggregated_encoder" / "disagg_epd_proxy.py"
def _check_npu_memory_worker(target_free_percentage: float, max_wait_seconds: float):
import torch_npu # type: ignore
@@ -441,6 +450,216 @@ class RemoteOpenAIServer:
**kwargs)
class RemoteEPDServer(RemoteOpenAIServer):
def _start_server(self, model: str, server_cmd: list[str],
env_dict: Optional[dict[str, str]]) -> None:
"""Subclasses override this method to customize server process launch
"""
raise NotImplementedError("RemoteEPDServer should use _start_server_with_prefix instead")
def __init__(self,
vllm_serve_args: Union[list[str], list[list[str]]],
server_host: str = '0.0.0.0',
env_dict: Optional[dict[str, str]] = None,
max_wait_seconds: Optional[float] = 2800) -> None:
self._proc_list = []
self.env_dict: dict[str, str] = {}
if env_dict is not None:
self.env_dict.update(env_dict)
self.env_dict['VLLM_ALLOW_LONG_MAX_MODEL_LEN'] = "1"
self.env_dict['VLLM_USE_V1'] = "1"
self.env_dict['PYTORCH_NPU_ALLOC_CONF'] = "expandable_segments:True"
self.env_dict['VLLM_WORKER_MULTIPROC_METHOD'] = 'spawn'
self.vllm_serve_args_list = []
self.health_url_list = []
self.host = server_host
if isinstance(vllm_serve_args, list):
if not all(isinstance(item, list) for item in vllm_serve_args):
args_copy = copy.deepcopy(vllm_serve_args)
self.vllm_serve_args_list.append([str(arg) for arg in args_copy])
else:
self.vllm_serve_args_list = [
[str(arg) for arg in sublist]
for sublist in copy.deepcopy(vllm_serve_args)
]
else:
raise RuntimeError("vllm_serve_args must be a list")
serve_arg_cmd = ["vllm", "serve"]
for i, vllm_serve_arg in enumerate(self.vllm_serve_args_list):
self.env_dict['ASCEND_RT_VISIBLE_DEVICES'] = str(i)
if isinstance(vllm_serve_arg, list):
if "--port" not in vllm_serve_arg:
raise ValueError("You have manually specified the port ")
else:
port_arg = "--port"
try:
index = vllm_serve_arg.index(port_arg)
except ValueError:
raise ValueError(f"--port not found in args: {vllm_serve_arg}")
port_str = vllm_serve_arg[index + 1]
self.port = int(port_str)
else:
vllm_serve_arg_str = str(vllm_serve_arg)
if "--port" not in vllm_serve_arg_str:
raise ValueError("You have manually specified the port ")
else:
raise ValueError(f"Unexpected type for vllm_serve_arg: {type(vllm_serve_arg)}")
self.health_url_list.append(super().url_for("health"))
vllm_serve_arg = [*serve_arg_cmd, *vllm_serve_arg]
proc = self._start_server_with_prefix(vllm_serve_arg, self.env_dict,
f"[VLLM_{i}] ")
self._proc_list.append(proc)
timeout_value = float(max_wait_seconds) if max_wait_seconds is not None else 2800.0
super()._wait_for_multiple_servers([(self.host, url)
for url in self.health_url_list],
timeout=timeout_value)
def _poll(self) -> Optional[int]:
return None
def _delete_shm(self) -> None:
for i, arg in enumerate(self.vllm_serve_args_list):
if "--ec-transfer-config" in arg:
index = arg.index("--ec-transfer-config")
config_str = arg[index + 1]
config_dict = json.loads(config_str)
ec_connector_extra_config = config_dict.get("ec_connector_extra_config", {})
shm_path = ec_connector_extra_config.get("shared_storage_path")
if shm_path:
args = ["rm", "-r", "-f", str(shm_path)]
print(f"delete shm_path is: {shm_path}")
self._start_server_with_prefix(args, None, "[DELETE] ")
def _read_output(self, pipe, prefix):
try:
with pipe:
for line in iter(pipe.readline, ''):
if line:
print(f"{prefix}: {line}", end='')
except Exception as e:
print(f"error: {e}")
traceback.print_exc()
def _start_server_with_prefix(self, server_cmd: list[str],
env_dict: Optional[dict[str, str]], log_prefix: str):
env = os.environ.copy()
if env_dict is not None:
env.update(env_dict)
proc = subprocess.Popen(server_cmd,
env=env,
stdout=subprocess.PIPE,
stderr=subprocess.PIPE,
universal_newlines=True,
bufsize=1)
stdout_thread = threading.Thread(target=self._read_output,
args=(proc.stdout, log_prefix),
daemon=True)
stderr_thread = threading.Thread(target=self._read_output,
args=(proc.stderr, log_prefix),
daemon=True)
stdout_thread.start()
stderr_thread.start()
return proc
def _terminate_server(self) -> None:
"""kill process and its children"""
print("vllm instance is stopping")
for proc in self._proc_list:
parent = psutil.Process(proc.pid)
children = parent.children(recursive=True)
for child in children:
try:
child.terminate()
except psutil.NoSuchProcess:
pass
gone, still_alive = psutil.wait_procs(children, timeout=10)
for child in still_alive:
try:
child.kill()
except psutil.NoSuchProcess:
pass
try:
parent.terminate()
parent.wait(timeout=10)
except (psutil.NoSuchProcess, psutil.TimeoutExpired):
try:
parent.kill()
except psutil.NoSuchProcess:
pass
def __enter__(self):
"""Context manager entry point."""
return self
def __exit__(self, exc_type, exc_val, exc_tb):
"""Context manager exit point - clean up all processes."""
self._terminate_server()
class DisaggEpdProxy(RemoteEPDServer):
def __init__(self,
proxy_args: Optional[Union[list[str], str]] = None,
env_dict: Optional[dict[str, str]] = None,
server_host: str = '0.0.0.0',
max_wait_seconds: Optional[float] = 2800) -> None:
if proxy_args is None:
proxy_args_list: list[str] = []
elif isinstance(proxy_args, str):
proxy_args_list = shlex.split(proxy_args)
else:
proxy_args_list = proxy_args
self.proxy_args = proxy_args_list
self.env_dict: dict[str, str] = {}
if env_dict is not None:
self.env_dict.update(env_dict)
self._proc_list = list()
self.host = server_host
print(f"proxy param is: {self.proxy_args}")
proxy_cmd = ["python", str(DISAGG_EPD_PROXY_SCRIPT), *self.proxy_args]
proc = self._start_server_with_prefix(proxy_cmd, self.env_dict, "[PROXY] ")
self._proc_list.append(proc)
if "--port" not in self.proxy_args:
raise ValueError("You have manually specified the port ")
else:
try:
index = self.proxy_args.index("--port")
except ValueError:
raise ValueError("--port not found in proxy args")
port_str = self.proxy_args[index + 1]
self.port = int(port_str)
timeout_value = float(max_wait_seconds) if max_wait_seconds is not None else 2800.0
super()._wait_for_multiple_servers(
[(self.host, super().url_for("health"))], timeout=timeout_value)
def __enter__(self):
"""Context manager entry point."""
return self
def __exit__(self, exc_type, exc_val, exc_tb):
"""Context manager exit point - clean up all processes."""
super()._terminate_server()
class VllmRunner:
def __init__(


@@ -0,0 +1,71 @@
# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved.
# Copyright 2023 The vLLM team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# This file is a part of the vllm-ascend project.
#
import pytest
from vllm.utils.network_utils import get_open_port
from tests.e2e.conftest import DisaggEpdProxy, RemoteEPDServer
from tools.send_mm_request import send_image_request
MODELS = [
"Qwen/Qwen2.5-VL-7B-Instruct",
]
SHARED_STORAGE_PATH = "/dev/shm/epd/storage"
TENSOR_PARALLELS = [1]
@pytest.mark.asyncio
@pytest.mark.parametrize("model", MODELS)
@pytest.mark.parametrize("tp_size", TENSOR_PARALLELS)
async def test_models(model: str, tp_size: int) -> None:
encode_port = get_open_port()
pd_port = get_open_port()
vllm_server_args = [
[
"--port",
str(encode_port), "--model", model, "--gpu-memory-utilization",
"0.01", "--tensor-parallel-size",
str(tp_size), "--enforce-eager", "--no-enable-prefix-caching",
"--max-model-len", "10000", "--max-num-batched-tokens", "10000",
"--max-num-seqs", "1", "--ec-transfer-config",
'{"ec_connector_extra_config":{"shared_storage_path":"' +
SHARED_STORAGE_PATH +
'"},"ec_connector":"ECExampleConnector","ec_role": "ec_producer"}'
],
[
"--port",
str(pd_port), "--model", model, "--gpu-memory-utilization", "0.95",
"--tensor-parallel-size",
str(tp_size), "--enforce-eager", "--max-model-len", "10000",
"--max-num-batched-tokens", "10000", "--max-num-seqs", "128",
"--ec-transfer-config",
'{"ec_connector_extra_config":{"shared_storage_path":"' +
SHARED_STORAGE_PATH +
'"},"ec_connector":"ECExampleConnector","ec_role": "ec_consumer"}'
]
]
proxy_port = get_open_port()
proxy_args = [
"--host", "127.0.0.1", "--port",
str(proxy_port), "--encode-servers-urls",
f"http://localhost:{encode_port}", "--decode-servers-urls",
f"http://localhost:{pd_port}", "--prefill-servers-urls", "disable"
]
with RemoteEPDServer(vllm_serve_args=vllm_server_args) as _:
with DisaggEpdProxy(proxy_args=proxy_args) as proxy:
send_image_request(model, proxy)


@@ -0,0 +1,110 @@
# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved.
# Copyright 2023 The vLLM team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# This file is a part of the vllm-ascend project.
#
import pytest
from vllm.utils.network_utils import get_open_port
from tests.e2e.conftest import DisaggEpdProxy, RemoteEPDServer
from tools.aisbench import run_aisbench_cases
MODELS = [
"Qwen/Qwen2.5-VL-7B-Instruct",
]
SHARED_STORAGE_PATH = "/dev/shm/epd/storage"
TENSOR_PARALLELS = [1]
warmup_cases = [{
"case_type": "performance",
"dataset_path": "vllm-ascend/textvqa-perf-1080p",
"request_conf": "vllm_api_stream_chat",
"dataset_conf": "textvqa/textvqa_gen_base64",
"num_prompts": 50,
"max_out_len": 20,
"batch_size": 32,
"request_rate": 0,
"baseline": 1,
"threshold": 0.97
}]
aisbench_cases = [{
"case_type": "accuracy",
"dataset_path": "vllm-ascend/textvqa-lite",
"request_conf": "vllm_api_stream_chat",
"dataset_conf": "textvqa/textvqa_gen_base64",
"max_out_len": 2048,
"batch_size": 128,
"baseline": 82.05,
"threshold": 5
}, {
"case_type": "performance",
"dataset_path": "vllm-ascend/textvqa-perf-1080p",
"request_conf": "vllm_api_stream_chat",
"dataset_conf": "textvqa/textvqa_gen_base64",
"num_prompts": 512,
"max_out_len": 256,
"batch_size": 128,
"request_rate": 0,
"baseline": 1,
"threshold": 0.97
}]
@pytest.mark.asyncio
@pytest.mark.parametrize("model", MODELS)
@pytest.mark.parametrize("tp_size", TENSOR_PARALLELS)
async def test_models(model: str, tp_size: int) -> None:
encode_port = get_open_port()
pd_port = get_open_port()
vllm_server_args = [
[
"--port",
str(encode_port), "--model", model, "--gpu-memory-utilization",
"0.01", "--tensor-parallel-size",
str(tp_size), "--enforce-eager", "--no-enable-prefix-caching",
"--max-model-len", "10000", "--max-num-batched-tokens", "10000",
"--max-num-seqs", "1", "--ec-transfer-config",
'{"ec_connector_extra_config":{"shared_storage_path":"' +
SHARED_STORAGE_PATH +
'"},"ec_connector":"ECExampleConnector","ec_role": "ec_producer"}'
],
[
"--port",
str(pd_port), "--model", model, "--gpu-memory-utilization", "0.95",
"--tensor-parallel-size",
str(tp_size), "--enforce-eager", "--max-model-len", "10000",
"--max-num-batched-tokens", "10000", "--max-num-seqs", "128",
"--ec-transfer-config",
'{"ec_connector_extra_config":{"shared_storage_path":"' +
SHARED_STORAGE_PATH +
'"},"ec_connector":"ECExampleConnector","ec_role": "ec_consumer"}'
]
]
proxy_port = get_open_port()
proxy_args = [
"--host", "127.0.0.1", "--port",
str(proxy_port), "--encode-servers-urls",
f"http://localhost:{encode_port}", "--decode-servers-urls",
f"http://localhost:{pd_port}", "--prefill-servers-urls", "disable"
]
with RemoteEPDServer(vllm_serve_args=vllm_server_args) as _:
with DisaggEpdProxy(proxy_args=proxy_args) as _:
# warm up
run_aisbench_cases(model=model,
port=proxy_port,
aisbench_cases=warmup_cases)
# aisbench test
run_aisbench_cases(model, proxy_port, aisbench_cases)