[Lint] Style: Convert test/ to ruff format (Batch #1) (#6738)

### What this PR does / why we need it?
**Scope of Changes**: convert the following test files to `ruff format` (the `exclude` list in `pyproject.toml` is updated accordingly):
| File Path |
| :--- |
| `tests/e2e/310p/multicard/test_vl_model_multicard.py` |
| `tests/e2e/310p/singlecard/test_vl_model_singlecard.py` |
| `tests/e2e/310p/test_utils.py` |
| `tests/e2e/conftest.py` |
| `tests/e2e/model_utils.py` |
| `tests/e2e/models/conftest.py` |
| `tests/e2e/models/test_lm_eval_correctness.py` |
| `tests/e2e/multicard/2-cards/spec_decode/test_spec_decode.py` |
| `tests/e2e/multicard/2-cards/test_aclgraph_capture_replay.py` |
| `tests/e2e/multicard/2-cards/test_data_parallel.py` |
| `tests/e2e/multicard/2-cards/test_disaggregated_encoder.py` |
| `tests/e2e/multicard/2-cards/test_expert_parallel.py` |
| `tests/e2e/multicard/2-cards/test_external_launcher.py` |
| `tests/e2e/multicard/2-cards/test_full_graph_mode.py` |
| `tests/e2e/multicard/2-cards/test_ilama_lora_tp2.py` |
| `tests/e2e/multicard/2-cards/test_offline_inference_distributed.py` |
| `tests/e2e/multicard/2-cards/test_offline_weight_load.py` |
| `tests/e2e/multicard/2-cards/test_pipeline_parallel.py` |
| `tests/e2e/multicard/2-cards/test_prefix_caching.py` |
| `tests/e2e/multicard/2-cards/test_quantization.py` |
| `tests/e2e/multicard/2-cards/test_qwen3_moe.py` |
| `tests/e2e/multicard/2-cards/test_qwen3_moe_routing_replay.py` |
| `tests/e2e/multicard/2-cards/test_qwen3_performance.py` |
| `tests/e2e/multicard/2-cards/test_shared_expert_dp.py` |
| `tests/e2e/multicard/2-cards/test_single_request_aclgraph.py` |
| `tests/e2e/multicard/2-cards/test_sp_pass.py` |

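Together with the `pyproject.toml` update shown below, these 26 test files make up the 27 changed files in this batch. The conversion itself is mechanical: `ruff format` (at the project's `line-length = 120`) normalizes strings to double quotes, adds trailing commas to exploded argument lists, and joins calls that fit on a single line. A minimal sketch of the before/after shape, with `run_vl_model_test` stubbed purely so the sketch runs on its own:

```python
# Stub standing in for tests/e2e/310p/test_utils.py:run_vl_model_test,
# so this illustration is self-contained.
def run_vl_model_test(model_name: str, tensor_parallel_size: int, max_tokens: int) -> None:
    print(model_name, tensor_parallel_size, max_tokens)


# Before: the pre-PR style stacked one argument per line.
run_vl_model_test(
    model_name="Qwen/Qwen3-VL-8B-Instruct",
    tensor_parallel_size=2,
    max_tokens=5
)

# After: the call fits within 120 columns (and carries no magic trailing comma),
# so `ruff format` joins it onto one line.
run_vl_model_test(model_name="Qwen/Qwen3-VL-8B-Instruct", tensor_parallel_size=2, max_tokens=5)
```
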
### Does this PR introduce _any_ user-facing change?

No. This is a lint/format-only change; no runtime behavior is affected.

### How was this patch tested?

No new tests are required: the change only reformats existing test code, which is exercised by the project's regular e2e CI.

- vLLM version: v0.15.0
- vLLM main: 9562912cea

Signed-off-by: MrZ20 <2609716663@qq.com>
Co-authored-by: wangxiyuan <wangxiyuan1007@gmail.com>
**Commit metadata**

- Author: SILONG ZENG
- Date: 2026-03-10 09:52:50 +08:00 (committed by GitHub)
- Commit: 43df2cb2fc (parent: 9216e1b050)
- 27 changed files with 753 additions and 859 deletions

#### `pyproject.toml`

```diff
@@ -51,35 +51,26 @@ line-length = 120
 # Folder to be modified
 exclude = [
     # Batch (1)
-    "tests/e2e/__init__.py",
-    "tests/e2e/310p/",
-    "tests/e2e/conftest.py",
-    "tests/e2e/doctests/",
-    "tests/e2e/model_utils.py",
-    "tests/e2e/models/",
-    "tests/e2e/multicard/2-cards/",
+    # "tests/e2e/__init__.py",
+    # "tests/e2e/310p/",
+    # "tests/e2e/conftest.py",
+    # "tests/e2e/doctests/",
+    # "tests/e2e/model_utils.py",
+    # "tests/e2e/models/",
+    # "tests/e2e/multicard/2-cards/",
     # Batch (2)
     "tests/e2e/multicard/4-cards/",
     "tests/e2e/nightly/multi_node/",
-    # Batch (3)
-    "tests/e2e/nightly/single_node/models/",
-    # Batch (4)
-    "tests/e2e/nightly/single_node/ops/",
-    # Batch (5)
-    # "tests/e2e/singlecard/",
-    # Batch (6)
-    "tests/e2e/nightly/single_node/ops/singlecard_ops/triton/",
     "tests/e2e/singlecard/pooling/",
     "tests/e2e/singlecard/spec_decode/",
     "tests/e2e/utils.py",
     "tests/e2e/vllm_interface/",
     "tests/e2e/weekly/",
+    # Batch (3)
+    "tests/e2e/nightly/single_node/",
     "tests/ut/",
 ]
```
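The `exclude` list doubles as the batching mechanism: every path still listed is skipped by ruff, and each batch PR comments out (or removes) one group of entries to bring those files under enforcement. A hedged sketch for checking locally that the Batch #1 paths are clean (paths taken from this PR; `ruff format --check` is the standard dry-run invocation, assumed to be run from the repo root):

```python
# Exit code 0 means every Batch #1 file already matches `ruff format` output.
import subprocess
import sys

BATCH_1 = [
    "tests/e2e/310p/",
    "tests/e2e/conftest.py",
    "tests/e2e/model_utils.py",
    "tests/e2e/models/",
    "tests/e2e/multicard/2-cards/",
]

result = subprocess.run(["ruff", "format", "--check", *BATCH_1])
sys.exit(result.returncode)
```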

#### `tests/e2e/310p/multicard/test_vl_model_multicard.py`

```diff
@@ -15,28 +15,23 @@
 # limitations under the License.
 # This file is a part of the vllm-ascend project.
-import sys
 import os
+import sys
 
 # Add 310p directory to sys.path
 current_dir = os.path.dirname(os.path.abspath(__file__))
 parent_dir = os.path.dirname(current_dir)  # 310p directory
 sys.path.insert(0, parent_dir)
 
+# ruff: noqa: E402
 from test_utils import run_vl_model_test
 
 
 def test_qwen3_vl_8b_tp2_fp16():
     """Qwen3-VL-8B dual-card FP16 test"""
-    run_vl_model_test(
-        model_name="Qwen/Qwen3-VL-8B-Instruct",
-        tensor_parallel_size=2,
-        max_tokens=5
-    )
+    run_vl_model_test(model_name="Qwen/Qwen3-VL-8B-Instruct", tensor_parallel_size=2, max_tokens=5)
 
 
 def test_qwen3_vl_32b_tp1_fp16():
     """Qwen3-VL-32B 4-card FP16 test"""
-    run_vl_model_test(
-        model_name="Qwen/Qwen3-VL-32B-Instruct",
-        tensor_parallel_size=4,
-        max_tokens=5
-    )
+    run_vl_model_test(model_name="Qwen/Qwen3-VL-32B-Instruct", tensor_parallel_size=4, max_tokens=5)
```
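Both 310p test files keep a deliberate rule suppression: they must mutate `sys.path` before importing `test_utils`, which would otherwise trip E402 ("module level import not at top of file") once ruff starts checking them. The diff uses a file-level `# ruff: noqa: E402` directive instead of reordering the imports. A standalone sketch of the pattern (the `helpers` module is hypothetical):

```python
# ruff: noqa: E402  -- file-level directive: E402 is ignored for this whole file
import os
import sys

# The import target lives next to this file rather than on sys.path,
# so the path tweak has to happen before the import below.
sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))

from helpers import do_something  # hypothetical; would flag E402 without the directive
```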

#### `tests/e2e/310p/singlecard/test_vl_model_singlecard.py`

```diff
@@ -15,20 +15,18 @@
 # limitations under the License.
 # This file is a part of the vllm-ascend project.
-import sys
 import os
+import sys
 
 # Add 310p directory to sys.path
 current_dir = os.path.dirname(os.path.abspath(__file__))
 parent_dir = os.path.dirname(current_dir)  # 310p directory
 sys.path.insert(0, parent_dir)
 
+# ruff: noqa: E402
 from test_utils import run_vl_model_test
 
 
 def test_qwen3_vl_8b_tp1_fp16():
     """Qwen3-VL-8B single-card FP16 test"""
-    run_vl_model_test(
-        model_name="Qwen/Qwen3-VL-8B-Instruct",
-        tensor_parallel_size=1,
-        max_tokens=5
-    )
+    run_vl_model_test(model_name="Qwen/Qwen3-VL-8B-Instruct", tensor_parallel_size=1, max_tokens=5)
```

#### `tests/e2e/310p/test_utils.py`

```diff
@@ -15,10 +15,12 @@
 # limitations under the License.
 # This file is a part of the vllm-ascend project.
-from tests.e2e.conftest import VllmRunner
-from PIL import Image
 import os
 
+from PIL import Image
+
+from tests.e2e.conftest import VllmRunner
+
 
 def get_test_image():
     """Get the image object for testing"""
@@ -32,11 +34,9 @@ def get_test_prompts():
     return ["<|image_pad|>Describe this image in detail."]
 
 
-def run_vl_model_test(model_name: str,
-                      tensor_parallel_size: int,
-                      max_tokens: int,
-                      dtype: str = "float16",
-                      enforce_eager: bool = True):
+def run_vl_model_test(
+    model_name: str, tensor_parallel_size: int, max_tokens: int, dtype: str = "float16", enforce_eager: bool = True
+):
     """
     Generic visual language model test function
@@ -52,9 +52,6 @@ def run_vl_model_test(model_name: str,
     prompts = get_test_prompts()
 
     with VllmRunner(
-        model_name,
-        tensor_parallel_size=tensor_parallel_size,
-        enforce_eager=enforce_eager,
-        dtype=dtype
+        model_name, tensor_parallel_size=tensor_parallel_size, enforce_eager=enforce_eager, dtype=dtype
     ) as vllm_model:
         vllm_model.generate_greedy(prompts, max_tokens, images=images)
```
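This file shows both directions the formatter can take: the `run_vl_model_test` signature is too wide for 120 columns, so it is exploded with a hanging indent and `):` on its own line, while the `VllmRunner(...)` arguments fit and are joined. Which form survives is governed by line width plus the black-compatible "magic trailing comma", sketched below:

```python
# With a trailing comma after the last element, ruff format keeps the
# collection exploded even though it would fit on one line...
kept_multiline = [
    "tests/e2e/310p/",
    "tests/e2e/conftest.py",
]

# ...without one, a short literal is collapsed onto a single line.
collapsed = ["tests/e2e/310p/", "tests/e2e/conftest.py"]

assert kept_multiline == collapsed
```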

#### `tests/e2e/conftest.py`

```diff
@@ -32,7 +32,7 @@ import threading
 import time
 import traceback
 from pathlib import Path
-from typing import Any, Optional, Tuple, TypeVar, Union
+from typing import Any, TypeVar
 
 import numpy as np
 import openai
@@ -44,23 +44,20 @@ from modelscope import snapshot_download  # type: ignore[import-untyped]
 from PIL import Image
 from requests.exceptions import RequestException
 from torch import nn
-from transformers import (AutoConfig, AutoModelForCausalLM, AutoTokenizer,
-                          BatchEncoding, BatchFeature)
+from transformers import AutoConfig, AutoModelForCausalLM, AutoTokenizer, BatchEncoding, BatchFeature
 from transformers.models.auto.auto_factory import _BaseAutoModelClass
 from vllm import LLM, SamplingParams
-from vllm.config.model import (ConvertOption, RunnerOption,
-                               _get_and_verify_dtype)
+from vllm.config.model import ConvertOption, RunnerOption, _get_and_verify_dtype
 from vllm.inputs import TextPrompt
 from vllm.outputs import RequestOutput
 from vllm.platforms import current_platform
 from vllm.transformers_utils.utils import maybe_model_redirect
 from vllm.utils.network_utils import get_open_port
 
-from tests.e2e.model_utils import (TokensTextLogprobs,
-                                   TokensTextLogprobsPromptLogprobs)
-from tests.e2e.nightly.multi_node.scripts.multi_node_config import (
-    DisaggregatedPrefillCfg, NodeInfo)
+from tests.e2e.model_utils import TokensTextLogprobs, TokensTextLogprobsPromptLogprobs
+from tests.e2e.nightly.multi_node.scripts.multi_node_config import DisaggregatedPrefillCfg, NodeInfo
 from vllm_ascend.ascend_config import clear_ascend_config
 
 # TODO: remove this part after the patch merged into vllm, if
 # we not explicitly patch here, some of them might be effectiveless
 # in pytest scenario
@@ -70,29 +67,29 @@ adapt_patch(True)
 adapt_patch(False)
 
 from vllm.distributed.parallel_state import (  # noqa E402
-    destroy_distributed_environment, destroy_model_parallel)
+    destroy_distributed_environment,
+    destroy_model_parallel,
+)
 
 _T = TypeVar("_T", nn.Module, torch.Tensor, BatchEncoding, BatchFeature, dict)
 _M = TypeVar("_M")
-_PromptMultiModalInput = Union[list[_M], list[list[_M]]]
+_PromptMultiModalInput = list[_M] | list[list[_M]]
 
 PromptImageInput = _PromptMultiModalInput[Image.Image]
-PromptAudioInput = _PromptMultiModalInput[Tuple[np.ndarray, int]]
+PromptAudioInput = _PromptMultiModalInput[tuple[np.ndarray, int]]
 PromptVideoInput = _PromptMultiModalInput[np.ndarray]
 
 logger = logging.getLogger(__name__)
 
 _TEST_DIR = os.path.dirname(__file__)
 _LONG_PROMPTS = [os.path.join(_TEST_DIR, "prompts", "long_prompt.txt")]
 
-DISAGG_EPD_PROXY_SCRIPT = Path(
-    __file__
-).parent.parent.parent / "examples" / "disaggregated_encoder" / "disagg_epd_proxy.py"
+DISAGG_EPD_PROXY_SCRIPT = (
+    Path(__file__).parent.parent.parent / "examples" / "disaggregated_encoder" / "disagg_epd_proxy.py"
+)
 
 
 def _check_npu_memory_worker(target_free_percentage: float, max_wait_seconds: float):
-    import torch_npu  # type: ignore
-
     # We can try to clean up memory in this subprocess, though it mostly affects this process.
     # But if there are any lingering contexts in this process (unlikely for a fresh spawn), it helps.
     gc.collect()
@@ -104,7 +101,7 @@ def _check_npu_memory_worker(target_free_percentage: float, max_wait_seconds: float):
     while True:
         free_bytes, _ = torch.npu.mem_get_info()
         if free_bytes / total_npu_memory >= target_free_percentage:
-            print(f'check_npu_memory_worker: npu free memory decreased target value.')
+            print("check_npu_memory_worker: npu free memory decreased target value.")
             return  # Success
 
         elapsed = time.time() - start_time
@@ -113,7 +110,7 @@ def _check_npu_memory_worker(target_free_percentage: float, max_wait_seconds: float):
             print(
                 f"Timeout: NPU memory free size did not reach "
                 f"{target_free_percentage} of total npu memory within {max_wait_seconds} seconds.",
-                file=sys.stderr
+                file=sys.stderr,
             )
             sys.exit(1)  # Failure
@@ -135,6 +132,7 @@ def wait_until_npu_memory_free(target_free_percentage: float = 0.5, max_wait_sec
         target_free_percentage (float): Target free memory percentage of total.
         max_wait_seconds (float): Maximum wait time in seconds.
     """
+
     def decorator(func):
         @functools.wraps(func)
         def wrapper(*args, **kwargs):
@@ -143,10 +141,7 @@ def wait_until_npu_memory_free(target_free_percentage: float = 0.5, max_wait_sec
             # Use a spawned subprocess to check NPU memory to avoid initializing NPU in the main process
             ctx = multiprocessing.get_context("spawn")
-            p = ctx.Process(
-                target=_check_npu_memory_worker,
-                args=(target_free_percentage, max_wait_seconds)
-            )
+            p = ctx.Process(target=_check_npu_memory_worker, args=(target_free_percentage, max_wait_seconds))
             p.start()
             p.join()
@@ -157,7 +152,9 @@ def wait_until_npu_memory_free(target_free_percentage: float = 0.5, max_wait_sec
                 )
 
             return func(*args, **kwargs)
+
         return wrapper
+
     return decorator
@@ -168,6 +165,7 @@ def cleanup_dist_env_and_memory(shutdown_ray: bool = False):
         torch.distributed.destroy_process_group()
     if shutdown_ray:
         import ray  # Lazy import Ray
+
         ray.shutdown()
     gc.collect()
@@ -180,7 +178,6 @@ def cleanup_dist_env_and_memory(shutdown_ray: bool = False):
 
 class MooncakeLauncher:
-
     def __init__(
         self,
         mooncake_port,
@@ -228,14 +225,12 @@ class MooncakeLauncher:
 class RemoteOpenAIServer:
     DUMMY_API_KEY = "token-abc123"  # vLLM's OpenAI server does not need API key
 
-    def _start_server(self, model: str, server_cmd: list[str],
-                      env_dict: Optional[dict[str, str]]) -> None:
-        """Subclasses override this method to customize server process launch
-        """
+    def _start_server(self, model: str, server_cmd: list[str], env_dict: dict[str, str] | None) -> None:
+        """Subclasses override this method to customize server process launch"""
         env = os.environ.copy()
         # the current process might initialize npu,
         # to be safe, we should use spawn method
-        env['VLLM_WORKER_MULTIPROC_METHOD'] = 'spawn'
+        env["VLLM_WORKER_MULTIPROC_METHOD"] = "spawn"
         if env_dict is not None:
             env.update(env_dict)
         logger.info(f"Starting server with command: {' '.join(server_cmd)}")
@@ -249,45 +244,39 @@ class RemoteOpenAIServer:
     def __init__(
         self,
         model: str,
-        vllm_serve_args: Union[list[str], str],
+        vllm_serve_args: list[str] | str,
         *,
-        server_host: str = '0.0.0.0',
+        server_host: str = "0.0.0.0",
         server_port: int = 8080,
-        env_dict: Optional[dict[str, str]] = None,
-        seed: Optional[int] = None,
+        env_dict: dict[str, str] | None = None,
+        seed: int | None = None,
         auto_port: bool = True,
-        nodes_info: Optional[list[NodeInfo]] = None,
-        disaggregated_prefill: Optional[DisaggregatedPrefillCfg] = None,
-        proxy_port: Optional[int] = None,
-        max_wait_seconds: Optional[float] = None,
-        override_hf_configs: Optional[dict[str, Any]] = None) -> None:
+        nodes_info: list[NodeInfo] | None = None,
+        disaggregated_prefill: DisaggregatedPrefillCfg | None = None,
+        proxy_port: int | None = None,
+        max_wait_seconds: float | None = None,
+        override_hf_configs: dict[str, Any] | None = None,
+    ) -> None:
         if isinstance(vllm_serve_args, str):
             vllm_serve_args = shlex.split(vllm_serve_args)
         else:
             vllm_serve_args = ["vllm", "serve", model, *vllm_serve_args]
         if auto_port:
             if "-p" in vllm_serve_args or "--port" in vllm_serve_args:
-                raise ValueError("You have manually specified the port "
-                                 "when `auto_port=True`.")
+                raise ValueError("You have manually specified the port when `auto_port=True`.")
 
             # No need for a port if using unix sockets
             if "--uds" not in vllm_serve_args:
                 # Don't mutate the input args
-                vllm_serve_args = vllm_serve_args + [
-                    "--port", str(get_open_port())
-                ]
+                vllm_serve_args = vllm_serve_args + ["--port", str(get_open_port())]
         if seed is not None:
             if "--seed" in vllm_serve_args:
-                raise ValueError("You have manually specified the seed "
-                                 f"when `seed={seed}`.")
+                raise ValueError(f"You have manually specified the seed when `seed={seed}`.")
 
             vllm_serve_args = vllm_serve_args + ["--seed", str(seed)]
         if override_hf_configs is not None:
-            vllm_serve_args = vllm_serve_args + [
-                "--hf-overrides",
-                json.dumps(override_hf_configs)
-            ]
+            vllm_serve_args = vllm_serve_args + ["--hf-overrides", json.dumps(override_hf_configs)]
 
         self.host = str(server_host)
         self.port = int(server_port)
@@ -303,9 +292,7 @@ class RemoteOpenAIServer:
             assert proxy_port is not None, "for disaggregated_prefill, proxy port must be provided"
             self._wait_for_server_pd(timeout=max_wait_seconds)
         else:
-            self._wait_for_multiple_servers(
-                [(self.host, self.url_for("health"))],
-                timeout=max_wait_seconds)
+            self._wait_for_multiple_servers([(self.host, self.url_for("health"))], timeout=max_wait_seconds)
 
     def __enter__(self):
         return self
@@ -313,7 +300,7 @@ class RemoteOpenAIServer:
     def __exit__(self, exc_type, exc_value, traceback):
         self._terminate_server()
 
-    def _poll(self) -> Optional[int]:
+    def _poll(self) -> int | None:
         """Subclasses override this method to customize process polling"""
         return self.proc.poll()
@@ -345,24 +332,23 @@ class RemoteOpenAIServer:
         def url_health(ip: str, port: int) -> str:
             return f"http://{ip}:{port}/health"
 
-        targets = [(node_info.ip, url_health(node_info.ip, self.port))
-                   for node_info in self.nodes_info if not node_info.headless]
+        targets = [
+            (node_info.ip, url_health(node_info.ip, self.port))
+            for node_info in self.nodes_info
+            if not node_info.headless
+        ]
 
         # Wait for proxy ready
         master_node = self.nodes_info[0]
         url_proxy = f"http://{master_node.ip}:{proxy_port}/healthcheck"
 
         # Wait for master node proxy first
-        self._wait_for_multiple_servers([(master_node.ip, url_proxy)],
-                                        timeout=timeout)
+        self._wait_for_multiple_servers([(master_node.ip, url_proxy)], timeout=timeout)
         # Then wait for all api_server nodes
         self._wait_for_multiple_servers(targets=targets, timeout=timeout)
 
-    def _wait_for_multiple_servers(self,
-                                   targets,
-                                   timeout: float,
-                                   log_interval: float = 30.0):
+    def _wait_for_multiple_servers(self, targets, timeout: float, log_interval: float = 30.0):
         """
         targets: List[(node_ip, url)]
         log_interval
@@ -396,9 +382,7 @@ class RemoteOpenAIServer:
                     # check unexpected exit
                     result = self._poll()
                     if result is not None and result != 0:
-                        raise RuntimeError(
-                            f"Server at {node_ip} exited unexpectedly."
-                        ) from None
+                        raise RuntimeError(f"Server at {node_ip} exited unexpectedly.") from None
 
                     if should_log:
                         last_log_time = now
@@ -444,35 +428,31 @@ class RemoteOpenAIServer:
     def get_async_client(self, **kwargs):
         if "timeout" not in kwargs:
             kwargs["timeout"] = 600
-        return openai.AsyncOpenAI(base_url=self.url_for("v1"),
-                                  api_key=self.DUMMY_API_KEY,
-                                  max_retries=0,
-                                  **kwargs)
+        return openai.AsyncOpenAI(base_url=self.url_for("v1"), api_key=self.DUMMY_API_KEY, max_retries=0, **kwargs)
 
 
 class RemoteEPDServer(RemoteOpenAIServer):
-    def _start_server(self, model: str, server_cmd: list[str],
-                      env_dict: Optional[dict[str, str]]) -> None:
-        """Subclasses override this method to customize server process launch
-        """
+    def _start_server(self, model: str, server_cmd: list[str], env_dict: dict[str, str] | None) -> None:
+        """Subclasses override this method to customize server process launch"""
        raise NotImplementedError("RemoteEPDServer should use _start_server_with_prefix instead")
 
-    def __init__(self,
-                 vllm_serve_args: Union[list[str], list[list[str]]],
-                 server_host: str = '0.0.0.0',
-                 env_dict: Optional[dict[str, str]] = None,
-                 max_wait_seconds: Optional[float] = 2800) -> None:
+    def __init__(
+        self,
+        vllm_serve_args: list[str] | list[list[str]],
+        server_host: str = "0.0.0.0",
+        env_dict: dict[str, str] | None = None,
+        max_wait_seconds: float | None = 2800,
+    ) -> None:
         self._proc_list = []
         self.env_dict: dict[str, str] = {}
         if env_dict is not None:
             self.env_dict.update(env_dict)
-        self.env_dict['VLLM_ALLOW_LONG_MAX_MODEL_LEN'] = "1"
-        self.env_dict['VLLM_USE_V1'] = "1"
-        self.env_dict['PYTORCH_NPU_ALLOC_CONF'] = "expandable_segments:True"
-        self.env_dict['VLLM_WORKER_MULTIPROC_METHOD'] = 'spawn'
+        self.env_dict["VLLM_ALLOW_LONG_MAX_MODEL_LEN"] = "1"
+        self.env_dict["VLLM_USE_V1"] = "1"
+        self.env_dict["PYTORCH_NPU_ALLOC_CONF"] = "expandable_segments:True"
+        self.env_dict["VLLM_WORKER_MULTIPROC_METHOD"] = "spawn"
 
         self.vllm_serve_args_list = []
         self.health_url_list = []
@@ -484,8 +464,7 @@ class RemoteEPDServer(RemoteOpenAIServer):
                 self.vllm_serve_args_list.append([str(arg) for arg in args_copy])
             else:
                 self.vllm_serve_args_list = [
-                    [str(arg) for arg in sublist]
-                    for sublist in copy.deepcopy(vllm_serve_args)
+                    [str(arg) for arg in sublist] for sublist in copy.deepcopy(vllm_serve_args)
                 ]
         else:
             raise RuntimeError("vllm_serves_args must be a list")
@@ -493,7 +472,7 @@ class RemoteEPDServer(RemoteOpenAIServer):
         serve_arg_cmd = ["vllm", "serve"]
         for i, vllm_serve_arg in enumerate(self.vllm_serve_args_list):
-            self.env_dict['ASCEND_RT_VISIBLE_DEVICES'] = str(i)
+            self.env_dict["ASCEND_RT_VISIBLE_DEVICES"] = str(i)
             if isinstance(vllm_serve_arg, list):
                 if "--port" not in vllm_serve_arg:
                     raise ValueError("You have manually specified the port ")
@@ -514,16 +493,13 @@ class RemoteEPDServer(RemoteOpenAIServer):
                 self.health_url_list.append(super().url_for("health"))
 
             vllm_serve_arg = [*serve_arg_cmd, *vllm_serve_arg]
-            proc = self._start_server_with_prefix(vllm_serve_arg, self.env_dict,
-                                                  f"[VLLM_{i}] ")
+            proc = self._start_server_with_prefix(vllm_serve_arg, self.env_dict, f"[VLLM_{i}] ")
             self._proc_list.append(proc)
 
         timeout_value = float(max_wait_seconds) if max_wait_seconds is not None else 2800.0
-        super()._wait_for_multiple_servers([(self.host, url)
-                                            for url in self.health_url_list],
-                                           timeout=timeout_value)
+        super()._wait_for_multiple_servers([(self.host, url) for url in self.health_url_list], timeout=timeout_value)
 
-    def _poll(self) -> Optional[int]:
+    def _poll(self) -> int | None:
         return None
 
     def _delete_shm(self) -> None:
@@ -542,31 +518,23 @@ class RemoteEPDServer(RemoteOpenAIServer):
     def _read_output(self, pipe, prefix):
         try:
             with pipe:
-                for line in iter(pipe.readline, ''):
+                for line in iter(pipe.readline, ""):
                     if line:
-                        print(f"{prefix}: {line}", end='')
+                        print(f"{prefix}: {line}", end="")
         except Exception as e:
             print(f"error: {e}")
             traceback.print_exc()
 
-    def _start_server_with_prefix(self, server_cmd: list[str],
-                                  env_dict: Optional[dict[str, str]], log_prefix: str):
+    def _start_server_with_prefix(self, server_cmd: list[str], env_dict: dict[str, str] | None, log_prefix: str):
         env = os.environ.copy()
         if env_dict is not None:
             env.update(env_dict)
-        proc = subprocess.Popen(server_cmd,
-                                env=env,
-                                stdout=subprocess.PIPE,
-                                stderr=subprocess.PIPE,
-                                universal_newlines=True,
-                                bufsize=1)
-        stdout_thread = threading.Thread(target=self._read_output,
-                                         args=(proc.stdout, log_prefix),
-                                         daemon=True)
-        stderr_thread = threading.Thread(target=self._read_output,
-                                         args=(proc.stderr, log_prefix),
-                                         daemon=True)
+        proc = subprocess.Popen(
+            server_cmd, env=env, stdout=subprocess.PIPE, stderr=subprocess.PIPE, universal_newlines=True, bufsize=1
+        )
+        stdout_thread = threading.Thread(target=self._read_output, args=(proc.stdout, log_prefix), daemon=True)
+        stderr_thread = threading.Thread(target=self._read_output, args=(proc.stderr, log_prefix), daemon=True)
         stdout_thread.start()
         stderr_thread.start()
@@ -579,27 +547,21 @@ class RemoteEPDServer(RemoteOpenAIServer):
         parent = psutil.Process(proc.pid)
         children = parent.children(recursive=True)
         for child in children:
-            try:
+            with contextlib.suppress(psutil.NoSuchProcess):
                 child.terminate()
-            except psutil.NoSuchProcess:
-                pass
 
         gone, still_alive = psutil.wait_procs(children, timeout=10)
         for child in still_alive:
-            try:
+            with contextlib.suppress(psutil.NoSuchProcess):
                 child.kill()
-            except psutil.NoSuchProcess:
-                pass
 
         try:
             parent.terminate()
             parent.wait(timeout=10)
         except (psutil.NoSuchProcess, psutil.TimeoutExpired):
-            try:
+            with contextlib.suppress(psutil.NoSuchProcess):
                 parent.kill()
-            except psutil.NoSuchProcess:
-                pass
 
     def __enter__(self):
         """Context manager entry point."""
@@ -611,13 +573,13 @@ class RemoteEPDServer(RemoteOpenAIServer):
 class DisaggEpdProxy(RemoteEPDServer):
-    def __init__(self,
-                 proxy_args: Optional[Union[list[str], str]] = None,
-                 env_dict: Optional[dict[str, str]] = None,
-                 server_host: str = '0.0.0.0',
-                 max_wait_seconds: Optional[float] = 2800) -> None:
+    def __init__(
+        self,
+        proxy_args: list[str] | str | None = None,
+        env_dict: dict[str, str] | None = None,
+        server_host: str = "0.0.0.0",
+        max_wait_seconds: float | None = 2800,
+    ) -> None:
         if proxy_args is None:
             proxy_args_list: list[str] = []
         elif isinstance(proxy_args, str):
@@ -648,8 +610,7 @@ class DisaggEpdProxy(RemoteEPDServer):
         self.port = int(port_str)
 
         timeout_value = float(max_wait_seconds) if max_wait_seconds is not None else 2800.0
-        super()._wait_for_multiple_servers(
-            [(self.host, super().url_for("health"))], timeout=timeout_value)
+        super()._wait_for_multiple_servers([(self.host, super().url_for("health"))], timeout=timeout_value)
 
     def __enter__(self):
         """Context manager entry point."""
@@ -661,23 +622,22 @@ class DisaggEpdProxy(RemoteEPDServer):
 class VllmRunner:
-
     def __init__(
         self,
         model_name: str,
         runner: RunnerOption = "auto",
         convert: ConvertOption = "auto",
-        tokenizer_name: Optional[str] = None,
+        tokenizer_name: str | None = None,
         tokenizer_mode: str = "auto",
-        max_model_len: Optional[int] = 1024,
+        max_model_len: int | None = 1024,
         dtype: str = "auto",
         disable_log_stats: bool = True,
         tensor_parallel_size: int = 1,
         block_size: int = 16,
        enable_chunked_prefill: bool = True,
         swap_space: int = 4,
-        enforce_eager: Optional[bool] = False,
-        quantization: Optional[str] = None,
+        enforce_eager: bool | None = False,
+        quantization: str | None = None,
         **kwargs,
     ) -> None:
         self.model = LLM(
@@ -701,17 +661,13 @@ class VllmRunner:
     def get_inputs(
         self,
-        prompts: Union[list[str], list[torch.Tensor], list[int]],
-        images: Optional[PromptImageInput] = None,
-        videos: Optional[PromptVideoInput] = None,
-        audios: Optional[PromptAudioInput] = None,
+        prompts: list[str] | list[torch.Tensor] | list[int],
+        images: PromptImageInput | None = None,
+        videos: PromptVideoInput | None = None,
+        audios: PromptAudioInput | None = None,
     ) -> list[TextPrompt]:
-        if any(x is not None and len(x) != len(prompts)
-               for x in [images, videos, audios]):
-            raise ValueError(
-                "All non-None multimodal inputs must have the same length as "
-                "prompts")
+        if any(x is not None and len(x) != len(prompts) for x in [images, videos, audios]):
+            raise ValueError("All non-None multimodal inputs must have the same length as prompts")
 
         inputs = []
         for i, prompt in enumerate(prompts):
@@ -723,9 +679,7 @@ class VllmRunner:
             if audios is not None and (audio := audios[i]) is not None:
                 multi_modal_data["audio"] = audio  # type: ignore
 
-            text_prompt_kwargs: dict[str, Any] = {
-                "multi_modal_data": multi_modal_data or None
-            }
+            text_prompt_kwargs: dict[str, Any] = {"multi_modal_data": multi_modal_data or None}
             if isinstance(prompt, str):
                 text_prompt_kwargs["prompt"] = prompt
             elif isinstance(prompt, list):
@@ -739,21 +693,16 @@ class VllmRunner:
     def generate(
         self,
-        prompts: Union[list[str], list[torch.Tensor]],
+        prompts: list[str] | list[torch.Tensor],
         sampling_params: SamplingParams,
-        images: Optional[PromptImageInput] = None,
-        videos: Optional[PromptVideoInput] = None,
-        audios: Optional[PromptAudioInput] = None,
+        images: PromptImageInput | None = None,
+        videos: PromptVideoInput | None = None,
+        audios: PromptAudioInput | None = None,
         **kwargs: Any,
     ) -> list[tuple[list[list[int]], list[str]]]:
-        inputs = self.get_inputs(prompts,
-                                 images=images,
-                                 videos=videos,
-                                 audios=audios)
+        inputs = self.get_inputs(prompts, images=images, videos=videos, audios=audios)
 
-        req_outputs = self.model.generate(inputs,
-                                          sampling_params=sampling_params,
-                                          **kwargs)
+        req_outputs = self.model.generate(inputs, sampling_params=sampling_params, **kwargs)
 
         outputs: list[tuple[list[list[int]], list[str]]] = []
         for req_output in req_outputs:
@@ -780,99 +729,83 @@ class VllmRunner:
                 output_str = sample.text
                 output_ids = list(sample.token_ids)
                 output_logprobs = sample.logprobs
-            outputs.append((output_ids, output_str, output_logprobs,
-                            req_output.prompt_logprobs))
+            outputs.append((output_ids, output_str, output_logprobs, req_output.prompt_logprobs))
 
         return outputs
 
     def generate_w_logprobs(
         self,
         prompts: list[str],
         sampling_params: SamplingParams,
-        images: Optional[PromptImageInput] = None,
-        audios: Optional[PromptAudioInput] = None,
-        videos: Optional[PromptVideoInput] = None,
+        images: PromptImageInput | None = None,
+        audios: PromptAudioInput | None = None,
+        videos: PromptVideoInput | None = None,
         **kwargs: Any,
-    ) -> Union[list[TokensTextLogprobs],
-               list[TokensTextLogprobsPromptLogprobs]]:
-        inputs = self.get_inputs(prompts,
-                                 images=images,
-                                 videos=videos,
-                                 audios=audios)
+    ) -> list[TokensTextLogprobs] | list[TokensTextLogprobsPromptLogprobs]:
+        inputs = self.get_inputs(prompts, images=images, videos=videos, audios=audios)
 
-        req_outputs = self.model.generate(inputs,
-                                          sampling_params=sampling_params,
-                                          **kwargs)
+        req_outputs = self.model.generate(inputs, sampling_params=sampling_params, **kwargs)
 
-        toks_str_logsprobs_prompt_logprobs = (
-            self._final_steps_generate_w_logprobs(req_outputs))
+        toks_str_logsprobs_prompt_logprobs = self._final_steps_generate_w_logprobs(req_outputs)
         # Omit prompt logprobs if not required by sampling params
-        return ([x[0:-1] for x in toks_str_logsprobs_prompt_logprobs]
-                if sampling_params.prompt_logprobs is None else
-                toks_str_logsprobs_prompt_logprobs)
+        return (
+            [x[0:-1] for x in toks_str_logsprobs_prompt_logprobs]
+            if sampling_params.prompt_logprobs is None
+            else toks_str_logsprobs_prompt_logprobs
+        )
 
     def generate_greedy(
         self,
-        prompts: Union[list[str], list[torch.Tensor]],
+        prompts: list[str] | list[torch.Tensor],
         max_tokens: int,
-        images: Optional[PromptImageInput] = None,
-        videos: Optional[PromptVideoInput] = None,
-        audios: Optional[PromptAudioInput] = None,
+        images: PromptImageInput | None = None,
+        videos: PromptVideoInput | None = None,
+        audios: PromptAudioInput | None = None,
         **kwargs: Any,
     ) -> list[tuple[list[int], str]]:
         greedy_params = SamplingParams(temperature=0.0, max_tokens=max_tokens)
-        outputs = self.generate(prompts,
-                                greedy_params,
-                                images=images,
-                                videos=videos,
-                                audios=audios,
-                                **kwargs)
-        return [(output_ids[0], output_str[0])
-                for output_ids, output_str in outputs]
+        outputs = self.generate(prompts, greedy_params, images=images, videos=videos, audios=audios, **kwargs)
+        return [(output_ids[0], output_str[0]) for output_ids, output_str in outputs]
 
     def generate_greedy_logprobs(
         self,
         prompts: list[str],
         max_tokens: int,
-        num_logprobs: Optional[int],
-        num_prompt_logprobs: Optional[int] = None,
-        images: Optional[PromptImageInput] = None,
-        audios: Optional[PromptAudioInput] = None,
-        videos: Optional[PromptVideoInput] = None,
-        stop_token_ids: Optional[list[int]] = None,
-        stop: Optional[list[str]] = None,
+        num_logprobs: int | None,
+        num_prompt_logprobs: int | None = None,
+        images: PromptImageInput | None = None,
+        audios: PromptAudioInput | None = None,
+        videos: PromptVideoInput | None = None,
+        stop_token_ids: list[int] | None = None,
+        stop: list[str] | None = None,
         **kwargs: Any,
-    ) -> Union[list[TokensTextLogprobs],
-               list[TokensTextLogprobsPromptLogprobs]]:
+    ) -> list[TokensTextLogprobs] | list[TokensTextLogprobsPromptLogprobs]:
         greedy_logprobs_params = SamplingParams(
             temperature=0.0,
             max_tokens=max_tokens,
             logprobs=num_logprobs,
            prompt_logprobs=num_prompt_logprobs,
             stop_token_ids=stop_token_ids,
-            stop=stop)
+            stop=stop,
+        )
 
-        return self.generate_w_logprobs(prompts,
-                                        greedy_logprobs_params,
-                                        images=images,
-                                        audios=audios,
-                                        videos=videos,
-                                        **kwargs)
+        return self.generate_w_logprobs(
+            prompts, greedy_logprobs_params, images=images, audios=audios, videos=videos, **kwargs
+        )
 
     def classify(self, prompts: list[str]) -> list[list[float]]:
         req_outputs = self.model.classify(prompts)
         return [req_output.outputs.probs for req_output in req_outputs]
 
-    def embed(self,
-              prompts: list[str],
-              images: Optional[PromptImageInput] = None,
-              videos: Optional[PromptVideoInput] = None,
-              audios: Optional[PromptAudioInput] = None,
-              *args,
-              **kwargs) -> list[list[float]]:
-        inputs = self.get_inputs(prompts,
-                                 images=images,
-                                 videos=videos,
-                                 audios=audios)
+    def embed(
+        self,
+        prompts: list[str],
+        images: PromptImageInput | None = None,
+        videos: PromptVideoInput | None = None,
+        audios: PromptAudioInput | None = None,
+        *args,
+        **kwargs,
+    ) -> list[list[float]]:
+        inputs = self.get_inputs(prompts, images=images, videos=videos, audios=audios)
 
         req_outputs = self.model.embed(inputs, *args, **kwargs)
         return [req_output.outputs.embedding for req_output in req_outputs]
@@ -887,8 +820,8 @@ class VllmRunner:
     def score(
         self,
-        text_1: Union[str, list[str]],
-        text_2: Union[str, list[str]],
+        text_1: str | list[str],
+        text_2: str | list[str],
         *args,
         **kwargs,
     ) -> list[float]:
@@ -905,14 +838,11 @@ class VllmRunner:
 class HfRunner:
     def get_default_device(self):
-        return ("cpu"
-                if current_platform.is_cpu() else current_platform.device_type)
+        return "cpu" if current_platform.is_cpu() else current_platform.device_type
 
-    def wrap_device(self, x: _T, device: Optional[str] = None) -> _T:
-        if x is None or isinstance(x, (bool, )):
+    def wrap_device(self, x: _T, device: str | None = None) -> _T:
+        if x is None or isinstance(x, (bool,)):
             return x
 
         if device is None:
@@ -931,7 +861,7 @@ class HfRunner:
         model_name: str,
         dtype: str = "auto",
         *,
-        model_kwargs: Optional[dict[str, Any]] = None,
+        model_kwargs: dict[str, Any] | None = None,
         trust_remote_code: bool = True,
         is_sentence_transformer: bool = False,
         is_cross_encoder: bool = False,
@@ -984,14 +914,15 @@ class HfRunner:
             )
 
         # in case some unquantized custom models are not in same dtype
-        if (getattr(model, "quantization_method", None) is None
-                and any(p.dtype != self.dtype
-                        for p in model.parameters())):
+        if getattr(model, "quantization_method", None) is None and any(
+            p.dtype != self.dtype for p in model.parameters()
+        ):
             model = model.to(dtype=self.dtype)
 
-        if (getattr(model, "quantization_method", None) != "bitsandbytes"
-                and len({p.device
-                         for p in model.parameters()}) < 2):
+        if (
+            getattr(model, "quantization_method", None) != "bitsandbytes"
+            and len({p.device for p in model.parameters()}) < 2
+        ):
             model = model.to(device=self.device)
 
         self.model = model
@@ -1006,6 +937,7 @@ class HfRunner:
             # don't put this import at the top level
             # it will call torch.cuda.device_count()
             from transformers import AutoProcessor  # noqa: F401
+
             self.processor = AutoProcessor.from_pretrained(
                 model_name,
                 torch_dtype=torch_dtype,
@@ -1017,10 +949,10 @@ class HfRunner:
     def get_inputs(
         self,
         prompts: list[str],
-        images: Optional[PromptImageInput] = None,
-        videos: Optional[PromptVideoInput] = None,
-        audios: Optional[PromptAudioInput] = None,
-    ) -> list[Union[BatchFeature, BatchEncoding]]:
+        images: PromptImageInput | None = None,
+        videos: PromptVideoInput | None = None,
+        audios: PromptAudioInput | None = None,
+    ) -> list[BatchFeature | BatchEncoding]:
         if images is not None:
             assert len(prompts) == len(images)
@@ -1030,7 +962,7 @@ class HfRunner:
         if audios is not None:
             assert len(prompts) == len(audios)
 
-        all_inputs: list[Union[BatchFeature, BatchEncoding]] = []
+        all_inputs: list[BatchFeature | BatchEncoding] = []
         for i, prompt in enumerate(prompts):
             processor_kwargs: dict[str, Any] = {
                 "text": prompt,
@@ -1076,16 +1008,11 @@ class HfRunner:
         return outputs
 
-    def encode(self, prompts: list[str], *args,
-               **kwargs) -> list[list[torch.Tensor]]:
+    def encode(self, prompts: list[str], *args, **kwargs) -> list[list[torch.Tensor]]:
         return self.model.encode(prompts, *args, **kwargs)
 
-    def predict(self, prompts: list[list[str]], *args,
-                **kwargs) -> torch.Tensor:
-        return self.model.predict(prompts,
-                                  *args,
-                                  convert_to_tensor=True,
-                                  **kwargs)
+    def predict(self, prompts: list[list[str]], *args, **kwargs) -> torch.Tensor:
+        return self.model.predict(prompts, *args, convert_to_tensor=True, **kwargs)
 
     def __enter__(self):
         return self
@@ -1103,22 +1030,25 @@ def ilama_lora_files():
 @pytest.fixture(scope="session")
 def llama32_lora_files():
     from huggingface_hub import snapshot_download as hf_snapshot_download
+
     return hf_snapshot_download(repo_id="jeeejeee/llama32-3b-text2sql-spider", local_files_only=True)
 
 
 def qwen_prompt(questions: list[str]) -> list[str]:
     placeholder = "<|image_pad|>"
-    return [("<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n"
-             f"<|im_start|>user\n<|vision_start|>{placeholder}<|vision_end|>"
-             f"{q}<|im_end|>\n<|im_start|>assistant\n") for q in questions]
+    return [
+        (
+            "<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n"
+            f"<|im_start|>user\n<|vision_start|>{placeholder}<|vision_end|>"
+            f"{q}<|im_end|>\n<|im_start|>assistant\n"
+        )
+        for q in questions
+    ]
 
 
 def hunyuan_prompt(questions: list[str]) -> list[str]:
     placeholder = "<hy_place▁holder▁no▁100><hy_place▁holder▁no▁102><hy_place▁holder▁no▁101>"  # noqa: E501
-    return [
-        f"<hy_begin▁of▁sentence>{placeholder}{question}<hy_User>"
-        for question in questions
-    ]
+    return [f"<hy_begin▁of▁sentence>{placeholder}{question}<hy_User>" for question in questions]
 
 
 PROMPT_CONFIGS = {
```

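Beyond whitespace, the conftest hunks apply pyupgrade-style rewrites: `Optional[X]` and `Union[X, Y]` become PEP 604 unions (`X | None`, `X | Y`), `Tuple`/`List`/`Dict` become the PEP 585 builtins, and `try/except psutil.NoSuchProcess: pass` collapses into `contextlib.suppress`. A self-contained sketch of the equivalences (illustrative names only; runtime `X | None` annotations assume Python >= 3.10):

```python
import contextlib

# Optional[float] -> float | None; Optional[int] return -> int | None
def poll(timeout: float | None = None) -> int | None:
    return None if timeout is None else 0

# try: ... except KeyError: pass  ->  contextlib.suppress(KeyError)
with contextlib.suppress(KeyError):
    {}["missing"]  # raises KeyError, which suppress() swallows

print(poll(), poll(1.0))  # -> None 0
```
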
#### `tests/e2e/model_utils.py`

```diff
@@ -17,11 +17,11 @@
 # Adapted from vllm-project/vllm/blob/main/tests/models/utils.py
 #
-from typing import Dict, List, Optional, Sequence, Tuple, Union
+from collections.abc import Sequence
 
 from vllm.logprobs import PromptLogprobs, SampleLogprobs
 
-TokensText = Tuple[List[int], str]
+TokensText = tuple[list[int], str]
 
 
 def check_outputs_equal(
@@ -37,18 +37,18 @@ def check_outputs_equal(
     """
     assert len(outputs_0_lst) == len(outputs_1_lst)
 
-    for prompt_idx, (outputs_0,
-                     outputs_1) in enumerate(zip(outputs_0_lst,
-                                                 outputs_1_lst)):
+    for prompt_idx, (outputs_0, outputs_1) in enumerate(zip(outputs_0_lst, outputs_1_lst)):
         output_ids_0, output_str_0 = outputs_0
         output_ids_1, output_str_1 = outputs_1
 
         # The text and token outputs should exactly match
-        fail_msg = (f"Test{prompt_idx}:"
-                    f"\n{name_0}:\t{output_str_0!r}"
-                    f"\n{name_1}:\t{output_str_1!r}"
-                    f"\n{name_0}:\t{output_ids_0!r}"
-                    f"\n{name_1}:\t{output_ids_1!r}")
+        fail_msg = (
+            f"Test{prompt_idx}:"
+            f"\n{name_0}:\t{output_str_0!r}"
+            f"\n{name_1}:\t{output_str_1!r}"
+            f"\n{name_0}:\t{output_ids_0!r}"
+            f"\n{name_1}:\t{output_ids_1!r}"
+        )
 
         assert output_str_0 == output_str_1, fail_msg
         assert output_ids_0 == output_ids_1, fail_msg
@@ -60,9 +60,7 @@ def check_outputs_equal(
 # * List of top sample logprobs for each sampled token
 #
 # Assumes prompt logprobs were not requested.
-TokensTextLogprobs = Tuple[List[int], str, Optional[Union[List[Dict[int,
-                                                                    float]],
-                                                          SampleLogprobs]]]
+TokensTextLogprobs = tuple[list[int], str, list[dict[int, float]] | SampleLogprobs | None]
 
 # Representation of generated sequence as a tuple of
 # * Token ID list
@@ -71,6 +69,9 @@ TokensTextLogprobs = Tuple[List[int], str, Optional[Union[List[Dict[int,
 # * Optional list of top prompt logprobs for each prompt token
 #
 # Allows prompt logprobs to be requested.
-TokensTextLogprobsPromptLogprobs = Tuple[
-    List[int], str, Optional[Union[List[Dict[int, float]], SampleLogprobs]],
-    Optional[Union[List[Optional[Dict[int, float]]], PromptLogprobs]]]
+TokensTextLogprobsPromptLogprobs = tuple[
+    list[int],
+    str,
+    list[dict[int, float]] | SampleLogprobs | None,
+    list[dict[int, float] | None] | PromptLogprobs | None,
+]
```

#### `tests/e2e/models/conftest.py`

```diff
@@ -55,16 +55,12 @@ def report_dir(pytestconfig):
 def pytest_generate_tests(metafunc):
     if "config_filename" in metafunc.fixturenames:
         if metafunc.config.getoption("--config-list-file"):
             rel_path = metafunc.config.getoption("--config-list-file")
             config_list_file = Path(rel_path).resolve()
             config_dir = config_list_file.parent
             with open(config_list_file, encoding="utf-8") as f:
-                configs = [
-                    config_dir / line.strip() for line in f
-                    if line.strip() and not line.startswith("#")
-                ]
+                configs = [config_dir / line.strip() for line in f if line.strip() and not line.startswith("#")]
             metafunc.parametrize("config_filename", configs)
         else:
             single_config = metafunc.config.getoption("--config")
```

#### `tests/e2e/models/test_lm_eval_correctness.py`

```diff
@@ -24,16 +24,15 @@ class EnvConfig:
 @pytest.fixture
 def env_config() -> EnvConfig:
-    return EnvConfig(vllm_version=os.getenv('VLLM_VERSION', 'unknown'),
-                     vllm_commit=os.getenv('VLLM_COMMIT', 'unknown'),
-                     vllm_ascend_version=os.getenv('VLLM_ASCEND_VERSION',
-                                                   'unknown'),
-                     vllm_ascend_commit=os.getenv('VLLM_ASCEND_COMMIT',
-                                                  'unknown'),
-                     cann_version=os.getenv('CANN_VERSION', 'unknown'),
-                     torch_version=os.getenv('TORCH_VERSION', 'unknown'),
-                     torch_npu_version=os.getenv('TORCH_NPU_VERSION',
-                                                 'unknown'))
+    return EnvConfig(
+        vllm_version=os.getenv("VLLM_VERSION", "unknown"),
+        vllm_commit=os.getenv("VLLM_COMMIT", "unknown"),
+        vllm_ascend_version=os.getenv("VLLM_ASCEND_VERSION", "unknown"),
+        vllm_ascend_commit=os.getenv("VLLM_ASCEND_COMMIT", "unknown"),
+        cann_version=os.getenv("CANN_VERSION", "unknown"),
+        torch_version=os.getenv("TORCH_VERSION", "unknown"),
+        torch_npu_version=os.getenv("TORCH_NPU_VERSION", "unknown"),
+    )
 
 
 def build_model_args(eval_config, tp_size):
@@ -48,9 +47,13 @@ def build_model_args(eval_config, tp_size):
         "max_model_len": max_model_len,
     }
     for s in [
-        "max_images", "gpu_memory_utilization", "enable_expert_parallel",
-        "tensor_parallel_size", "enforce_eager", "enable_thinking",
-        "quantization"
+        "max_images",
+        "gpu_memory_utilization",
+        "enable_expert_parallel",
+        "tensor_parallel_size",
+        "enforce_eager",
+        "enable_thinking",
+        "quantization",
     ]:
         val = eval_config.get(s, None)
         if val is not None:
@@ -68,7 +71,7 @@ def generate_report(tp_size, eval_config, report_data, report_dir, env_config):
     model_args = build_model_args(eval_config, tp_size)
 
     parallel_mode = f"TP{model_args.get('tensor_parallel_size', 1)}"
-    if model_args.get('enable_expert_parallel', False):
+    if model_args.get("enable_expert_parallel", False):
         parallel_mode += " + EP"
     execution_model = f"{'Eager' if model_args.get('enforce_eager', False) else 'ACLGraph'}"
@@ -93,17 +96,16 @@ def generate_report(tp_size, eval_config, report_data, report_dir, env_config):
         num_fewshot=eval_config.get("num_fewshot", "N/A"),
         rows=report_data["rows"],
         parallel_mode=parallel_mode,
-        execution_model=execution_model)
+        execution_model=execution_model,
+    )
 
-    report_output = os.path.join(
-        report_dir, f"{os.path.basename(eval_config['model_name'])}.md")
+    report_output = os.path.join(report_dir, f"{os.path.basename(eval_config['model_name'])}.md")
     os.makedirs(os.path.dirname(report_output), exist_ok=True)
 
-    with open(report_output, 'w', encoding='utf-8') as f:
+    with open(report_output, "w", encoding="utf-8") as f:
         f.write(report_content)
 
-def test_lm_eval_correctness_param(config_filename, tp_size, report_dir,
-                                   env_config):
+def test_lm_eval_correctness_param(config_filename, tp_size, report_dir, env_config):
     eval_config = yaml.safe_load(config_filename.read_text(encoding="utf-8"))
     model_args = build_model_args(eval_config, tp_size)
     success = True
@@ -135,25 +137,26 @@ def test_lm_eval_correctness_param(config_filename, tp_size, report_dir,
             metric_name = metric["name"]
             ground_truth = metric["value"]
             measured_value = round(task_result[metric_name], 4)
-            task_success = bool(
-                np.isclose(ground_truth, measured_value, rtol=RTOL))
+            task_success = bool(np.isclose(ground_truth, measured_value, rtol=RTOL))
             success = success and task_success
 
-            print(f"{task_name} | {metric_name}: "
-                  f"ground_truth={ground_truth} | measured={measured_value} | "
-                  f"success={'' if task_success else ''}")
+            print(
+                f"{task_name} | {metric_name}: "
+                f"ground_truth={ground_truth} | measured={measured_value} | "
+                f"success={'' if task_success else ''}"
+            )
 
-            report_data["rows"].append({
-                "task":
-                task_name,
-                "metric":
-                metric_name,
-                "value":
-                f"{measured_value}" if success else f"{measured_value}",
-                "stderr":
-                task_result[
-                    metric_name.replace(',', '_stderr,') if metric_name ==
-                    "acc,none" else metric_name.replace(',', '_stderr,')]
-            })
+            report_data["rows"].append(
+                {
+                    "task": task_name,
+                    "metric": metric_name,
+                    "value": f"{measured_value}" if success else f"{measured_value}",
+                    "stderr": task_result[
+                        metric_name.replace(",", "_stderr,")
+                        if metric_name == "acc,none"
+                        else metric_name.replace(",", "_stderr,")
+                    ],
+                }
+            )
 
     generate_report(tp_size, eval_config, report_data, report_dir, env_config)
     assert success
```

#### `tests/e2e/multicard/2-cards/spec_decode/test_spec_decode.py`

```diff
@@ -18,15 +18,12 @@
 from __future__ import annotations
 
-import math
 import os
-import random
-from typing import Any, Union
 from unittest.mock import patch
 
 import pytest
 from transformers import AutoTokenizer
-from vllm import LLM, SamplingParams
+from vllm import SamplingParams
 from vllm.config import CompilationConfig
 from vllm.v1.metrics.reader import Counter, Vector
@@ -101,7 +98,8 @@ def test_eagle3_sp_acceptance(
             [prompt],
             tokenize=False,
             add_generation_prompt=True,
-        ) for prompt in prompts
+        )
+        for prompt in prompts
     ]
 
     speculative_config = {
@@ -112,8 +110,7 @@ def test_eagle3_sp_acceptance(
         "model": spec_model_name,
     }
 
-    compilation_config = CompilationConfig(cudagraph_mode="FULL_DECODE_ONLY",
-                                           cudagraph_capture_sizes=[12])
+    compilation_config = CompilationConfig(cudagraph_mode="FULL_DECODE_ONLY", cudagraph_capture_sizes=[12])
 
     with VllmRunner(
         main_model_name,
@@ -142,10 +139,7 @@ def test_eagle3_sp_acceptance(
             for pos in range(len(metric.values)):
                 num_accepted_tokens_per_pos[pos] += metric.values[pos]
 
-    acceptance_per_pos = [
-        num_accepted_tokens / num_drafts
-        for num_accepted_tokens in num_accepted_tokens_per_pos
-    ]
+    acceptance_per_pos = [num_accepted_tokens / num_drafts for num_accepted_tokens in num_accepted_tokens_per_pos]
 
     golden = BASELINES_SP[method]
     match = all(abs(a - b) < 0.06 for a, b in zip(acceptance_per_pos, golden))
```

#### `tests/e2e/multicard/2-cards/test_aclgraph_capture_replay.py`

@@ -25,8 +25,8 @@ import pytest
import torch import torch
from vllm.utils.network_utils import get_open_port from vllm.utils.network_utils import get_open_port
from vllm_ascend.utils import AscendDeviceType, get_ascend_device_type
from tests.e2e.conftest import wait_until_npu_memory_free from tests.e2e.conftest import wait_until_npu_memory_free
from vllm_ascend.utils import AscendDeviceType, get_ascend_device_type
MODELS = [ MODELS = [
# Offline data parallel mode will be not supported/useful for dense models # Offline data parallel mode will be not supported/useful for dense models
@@ -58,8 +58,7 @@ def _install_spies(counters: dict[str, Any]) -> contextlib.ExitStack:
] ]
for cls, method, counter in hooks: for cls, method, counter in hooks:
stack.enter_context( stack.enter_context(patch.object(cls, method, make_spy(cls, method, counter)))
patch.object(cls, method, make_spy(cls, method, counter)))
return stack return stack
@@ -75,18 +74,19 @@ def _run_worker_process(
     max_tokens: int,
 ):
     """Main entry point for the worker process."""
-    os.environ.update({
-        "VLLM_DP_RANK": str(rank),
-        "VLLM_DP_RANK_LOCAL": str(local_rank),
-        "VLLM_DP_SIZE": str(world_size),
-        "VLLM_DP_MASTER_IP": master_ip,
-        "VLLM_DP_MASTER_PORT": str(master_port),
-    })
+    os.environ.update(
+        {
+            "VLLM_DP_RANK": str(rank),
+            "VLLM_DP_RANK_LOCAL": str(local_rank),
+            "VLLM_DP_SIZE": str(world_size),
+            "VLLM_DP_MASTER_IP": master_ip,
+            "VLLM_DP_MASTER_PORT": str(master_port),
+        }
+    )

     # Import vLLM only after environment setup
     from vllm import LLM, SamplingParams
-    from vllm.distributed.parallel_state import (
-        destroy_distributed_environment, destroy_model_parallel)
+    from vllm.distributed.parallel_state import destroy_distributed_environment, destroy_model_parallel

     # Apply hooks and run inference
     with _install_spies(counters):
@@ -100,23 +100,20 @@ def _run_worker_process(
         # Simple data sharding
         chunk_size = len(prompts) // world_size
         start_idx = rank * chunk_size
-        end_idx = start_idx + chunk_size if rank < world_size - 1 else len(
-            prompts)
+        end_idx = start_idx + chunk_size if rank < world_size - 1 else len(prompts)
         local_prompts = prompts[start_idx:end_idx]

         llm = LLM(
             model=model_path,
             quantization="ascend" if "W8A8" in model_path else None,
-            enable_expert_parallel=True if "DeepSeek" in model_path else False,
+            enable_expert_parallel="DeepSeek" in model_path,
             trust_remote_code=True,
         )
         # Expose model config to the main test process
-        counters["hidden_layers"].value = (
-            llm.llm_engine.model_config.hf_text_config.num_hidden_layers)
-        llm.generate(local_prompts,
-                     SamplingParams(max_tokens=max_tokens, temperature=0.0))
+        counters["hidden_layers"].value = llm.llm_engine.model_config.hf_text_config.num_hidden_layers
+        llm.generate(local_prompts, SamplingParams(max_tokens=max_tokens, temperature=0.0))

         # Explicit cleanup is mandatory in multi-process vLLM tests
         del llm
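The sharding above gives each rank a contiguous chunk and lets the last rank absorb the remainder. A quick illustration with assumed values:

```python
# 10 prompts over 3 ranks: chunk_size = 10 // 3 = 3, last rank takes the leftover.
prompts = list(range(10))
world_size = 3
chunk_size = len(prompts) // world_size

for rank in range(world_size):
    start_idx = rank * chunk_size
    end_idx = start_idx + chunk_size if rank < world_size - 1 else len(prompts)
    print(rank, prompts[start_idx:end_idx])
# 0 [0, 1, 2]
# 1 [3, 4, 5]
# 2 [6, 7, 8, 9]  <- last rank absorbs the 10 % 3 remainder
```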
@@ -162,8 +159,7 @@ def test_models_aclgraph_capture_replay_metrics_dp2(
     for rank in range(dp_size):
         p = multiprocessing.Process(
             target=_run_worker_process,
-            args=(rank, rank, dp_size, "127.0.0.1", port, counters, model,
-                  max_tokens),
+            args=(rank, rank, dp_size, "127.0.0.1", port, counters, model, max_tokens),
         )
         p.start()
         workers.append(p)
@@ -175,8 +171,7 @@ def test_models_aclgraph_capture_replay_metrics_dp2(
             for k in workers:
                 if k.is_alive():
                     k.kill()
-            raise RuntimeError(
-                f"Worker {p.pid} failed with exit code {p.exitcode}")
+            raise RuntimeError(f"Worker {p.pid} failed with exit code {p.exitcode}")

     actual_capture = counters["capture"].value
     actual_replay = counters["replay"].value
@@ -185,18 +180,16 @@ def test_models_aclgraph_capture_replay_metrics_dp2(
     num_layers = counters["hidden_layers"].value
     num_acl_graphs = num_layers + 1
-    num_comm_groups = sum(1 for s in [dp_size, 1]
-                          if s > 1)  # dp_size=2, tp_size=1
+    num_comm_groups = sum(1 for s in [dp_size, 1] if s > 1)  # dp_size=2, tp_size=1

     # Metric 1: Graph Capture (ACL Graph Construction)
     # Ref: vllm_ascend.utils.update_aclgraph_sizes
-    max_batch_sizes = math.floor((1800 - num_comm_groups * 40) /
-                                 num_acl_graphs / (1 + num_comm_groups * 2))
+    max_batch_sizes = math.floor((1800 - num_comm_groups * 40) / num_acl_graphs / (1 + num_comm_groups * 2))
     expected_capture = max_batch_sizes * num_acl_graphs * dp_size
-    assert (
-        actual_capture == expected_capture
-    ), f"Capture count mismatch. Expected: {expected_capture}, Got: {actual_capture}"
+    assert actual_capture == expected_capture, (
+        f"Capture count mismatch. Expected: {expected_capture}, Got: {actual_capture}"
+    )

     # Metric 2: Model Execution (NPUModelRunner.execute_model)
     # vLLM Step Breakdown:
@@ -207,9 +200,9 @@ def test_models_aclgraph_capture_replay_metrics_dp2(
     # vllm default enables Async scheduler, this will take 1 more steps
     expected_exec_model = (total_steps + 1 + 1) * dp_size
-    assert (
-        num_execute_model == expected_exec_model
-    ), f"Model execution count mismatch. Expected: {expected_exec_model}, Got: {num_execute_model}"
+    assert num_execute_model == expected_exec_model, (
+        f"Model execution count mismatch. Expected: {expected_exec_model}, Got: {num_execute_model}"
+    )

     # Metric 3: Dummy Runs (Warmup & Alignment)
     # vLLM synchronizes globally every 32 steps.
@@ -228,14 +221,12 @@ def test_models_aclgraph_capture_replay_metrics_dp2(
     expected_dummy_run = (warmup_runs + padding_runs) * dp_size
-    assert (
-        num_dummy_run == expected_dummy_run
-    ), f"Dummy run count mismatch. Expected: {expected_dummy_run}, Got: {num_dummy_run}"
+    assert num_dummy_run == expected_dummy_run, (
+        f"Dummy run count mismatch. Expected: {expected_dummy_run}, Got: {num_dummy_run}"
+    )

     # Metric 4: Graph Replay (Inference Execution)
     # Replays happen for every aligned step across all graphs.
     expected_replay = num_acl_graphs * aligned_steps * dp_size
-    assert (
-        actual_replay == expected_replay
-    ), f"Replay count mismatch. Expected: {expected_replay}, Got: {actual_replay}"
+    assert actual_replay == expected_replay, f"Replay count mismatch. Expected: {expected_replay}, Got: {actual_replay}"

View File

@@ -64,12 +64,8 @@ def test_qwen3_inference_dp2(model, max_tokens):
     cmd.append("ascend")

     print(f"Running subprocess: {' '.join(cmd)}")
-    proc = subprocess.run(cmd,
-                          env=env,
-                          stdout=subprocess.PIPE,
-                          stderr=subprocess.STDOUT,
-                          timeout=600)
-    output = proc.stdout.decode(errors='ignore')
+    proc = subprocess.run(cmd, env=env, stdout=subprocess.PIPE, stderr=subprocess.STDOUT, timeout=600)
+    output = proc.stdout.decode(errors="ignore")
     print(output)

View File

@@ -27,6 +27,7 @@ MODELS = [
 SHARED_STORAGE_PATH = "/dev/shm/epd/storage"
 TENSOR_PARALLELS = [1]

 @pytest.mark.asyncio
 @pytest.mark.parametrize("model", MODELS)
 @pytest.mark.parametrize("tp_size", TENSOR_PARALLELS)
@@ -36,36 +37,61 @@ async def test_models(model: str, tp_size: int) -> None:
     vllm_server_args = [
         [
             "--port",
-            str(encode_port), "--model", model, "--gpu-memory-utilization",
-            "0.01", "--tensor-parallel-size",
-            str(tp_size), "--enforce-eager", "--no-enable-prefix-caching",
-            "--max-model-len", "10000", "--max-num-batched-tokens", "10000",
-            "--max-num-seqs", "1", "--ec-transfer-config",
-            '{"ec_connector_extra_config":{"shared_storage_path":"' +
-            SHARED_STORAGE_PATH +
-            '"},"ec_connector":"ECExampleConnector","ec_role": "ec_producer"}'
+            str(encode_port),
+            "--model",
+            model,
+            "--gpu-memory-utilization",
+            "0.01",
+            "--tensor-parallel-size",
+            str(tp_size),
+            "--enforce-eager",
+            "--no-enable-prefix-caching",
+            "--max-model-len",
+            "10000",
+            "--max-num-batched-tokens",
+            "10000",
+            "--max-num-seqs",
+            "1",
+            "--ec-transfer-config",
+            '{"ec_connector_extra_config":{"shared_storage_path":"'
+            + SHARED_STORAGE_PATH
+            + '"},"ec_connector":"ECExampleConnector","ec_role": "ec_producer"}',
         ],
         [
             "--port",
-            str(pd_port), "--model", model, "--gpu-memory-utilization", "0.95",
+            str(pd_port),
+            "--model",
+            model,
+            "--gpu-memory-utilization",
+            "0.95",
             "--tensor-parallel-size",
-            str(tp_size), "--enforce-eager", "--max-model-len", "10000",
-            "--max-num-batched-tokens", "10000", "--max-num-seqs", "128",
+            str(tp_size),
+            "--enforce-eager",
+            "--max-model-len",
+            "10000",
+            "--max-num-batched-tokens",
+            "10000",
+            "--max-num-seqs",
+            "128",
             "--ec-transfer-config",
-            '{"ec_connector_extra_config":{"shared_storage_path":"' +
-            SHARED_STORAGE_PATH +
-            '"},"ec_connector":"ECExampleConnector","ec_role": "ec_consumer"}'
-        ]
+            '{"ec_connector_extra_config":{"shared_storage_path":"'
+            + SHARED_STORAGE_PATH
+            + '"},"ec_connector":"ECExampleConnector","ec_role": "ec_consumer"}',
+        ],
     ]
     proxy_port = get_open_port()
     proxy_args = [
-        "--host", "127.0.0.1", "--port",
-        str(proxy_port), "--encode-servers-urls",
-        f"http://localhost:{encode_port}", "--decode-servers-urls",
-        f"http://localhost:{pd_port}", "--prefill-servers-urls", "disable"
+        "--host",
+        "127.0.0.1",
+        "--port",
+        str(proxy_port),
+        "--encode-servers-urls",
+        f"http://localhost:{encode_port}",
+        "--decode-servers-urls",
+        f"http://localhost:{pd_port}",
+        "--prefill-servers-urls",
+        "disable",
     ]
-    with RemoteEPDServer(vllm_serve_args=vllm_server_args) as _:
-        with DisaggEpdProxy(proxy_args=proxy_args) as proxy:
-            send_image_request(model, proxy)
+    with RemoteEPDServer(vllm_serve_args=vllm_server_args) as _, DisaggEpdProxy(proxy_args=proxy_args) as proxy:
+        send_image_request(model, proxy)
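The `--ec-transfer-config` value is JSON assembled by string splicing around `SHARED_STORAGE_PATH`. As a sketch (not part of the diff), the same object could be produced with `json.dumps`; the encoded whitespace differs slightly from the hand-built string, but the parsed config is identical:

```python
import json

SHARED_STORAGE_PATH = "/dev/shm/epd/storage"

# Key names follow the test's hand-spliced string.
ec_transfer_config = json.dumps(
    {
        "ec_connector_extra_config": {"shared_storage_path": SHARED_STORAGE_PATH},
        "ec_connector": "ECExampleConnector",
        "ec_role": "ec_producer",
    }
)
print(ec_transfer_config)
```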

View File

@@ -15,15 +15,12 @@ def test_deepseek_correctness_ep(model_name):
     max_tokens = 5

     # FIXME: Really strange that chunked prefill might lead to different results, investigate further
-    with VllmRunner(model_name,
-                    cudagraph_capture_sizes=[1, 2, 4, 8],
-                    tensor_parallel_size=2) as vllm_model:
+    with VllmRunner(model_name, cudagraph_capture_sizes=[1, 2, 4, 8], tensor_parallel_size=2) as vllm_model:
         tp_output = vllm_model.generate_greedy(example_prompts, max_tokens)

-    with VllmRunner(model_name,
-                    tensor_parallel_size=2,
-                    cudagraph_capture_sizes=[1, 2, 4, 8],
-                    enable_expert_parallel=True) as vllm_model:
+    with VllmRunner(
+        model_name, tensor_parallel_size=2, cudagraph_capture_sizes=[1, 2, 4, 8], enable_expert_parallel=True
+    ) as vllm_model:
         ep_output = vllm_model.generate_greedy(example_prompts, max_tokens)

     check_outputs_equal(

View File

@@ -29,6 +29,7 @@
 from unittest.mock import patch

 import pytest
 import torch_npu
 from modelscope import snapshot_download  # type: ignore

 from tests.e2e.conftest import wait_until_npu_memory_free

 MODELS = ["Qwen/Qwen3-0.6B"]
@@ -39,9 +40,7 @@ DEVICE_NAME = torch_npu.npu.get_device_name(0)[:10]
 @pytest.mark.parametrize("model", MODELS)
 @patch.dict(os.environ, {"HCCL_BUFFSIZE": "500"})
 def test_qwen3_external_launcher(model):
-    script = Path(
-        __file__
-    ).parent.parent.parent.parent.parent / "examples" / "offline_external_launcher.py"
+    script = Path(__file__).parent.parent.parent.parent.parent / "examples" / "offline_external_launcher.py"
     env = os.environ.copy()
     # TODO: Change to 2 when ci machine has 4 cards
     cmd = [
@@ -68,7 +67,7 @@ def test_qwen3_external_launcher(model):
         stderr=subprocess.STDOUT,
         timeout=600,
     )
-    output = proc.stdout.decode(errors='ignore')
+    output = proc.stdout.decode(errors="ignore")
     print(output)
@@ -81,16 +80,24 @@ def test_qwen3_external_launcher(model):
 @pytest.mark.parametrize("model", MOE_MODELS)
 @wait_until_npu_memory_free()
 def test_qwen3_moe_external_launcher_ep_tp2(model):
-    script = Path(
-        __file__
-    ).parent.parent.parent.parent.parent / "examples" / "offline_external_launcher.py"
+    script = Path(__file__).parent.parent.parent.parent.parent / "examples" / "offline_external_launcher.py"
     env = os.environ.copy()
     # TODO: Change to 2 when ci machine has 4 cards
     cmd = [
         sys.executable,
-        str(script), "--model", model, "--tp-size", "2", "--node-size", "1",
-        "--node-rank", "0", "--proc-per-node", "2", "--trust-remote-code",
-        "--enable-expert-parallel"
+        str(script),
+        "--model",
+        model,
+        "--tp-size",
+        "2",
+        "--node-size",
+        "1",
+        "--node-rank",
+        "0",
+        "--proc-per-node",
+        "2",
+        "--trust-remote-code",
+        "--enable-expert-parallel",
     ]

     print(f"Running subprocess: {' '.join(cmd)}")
@@ -101,7 +108,7 @@ def test_qwen3_moe_external_launcher_ep_tp2(model):
         stderr=subprocess.STDOUT,
         timeout=600,
     )
-    output = proc.stdout.decode(errors='ignore')
+    output = proc.stdout.decode(errors="ignore")
     print(output)
@@ -113,9 +120,7 @@ def test_qwen3_moe_external_launcher_ep_tp2(model):
 @patch.dict(os.environ, {"VLLM_ASCEND_ENABLE_NZ": "0"})
 @wait_until_npu_memory_free()
 def test_qwen3_external_launcher_with_sleepmode():
-    script = Path(
-        __file__
-    ).parent.parent.parent.parent.parent / "examples" / "offline_external_launcher.py"
+    script = Path(__file__).parent.parent.parent.parent.parent / "examples" / "offline_external_launcher.py"
     env = os.environ.copy()
     # TODO: Change to 2 when ci machine has 4 cards
     cmd = [
@@ -147,7 +152,7 @@ def test_qwen3_external_launcher_with_sleepmode():
         stderr=subprocess.STDOUT,
         timeout=300,
     )
-    output = proc.stdout.decode(errors='ignore')
+    output = proc.stdout.decode(errors="ignore")
     print(output)
@@ -158,9 +163,7 @@ def test_qwen3_external_launcher_with_sleepmode():
 @patch.dict(os.environ, {"VLLM_ASCEND_ENABLE_NZ": "0"})
 def test_qwen3_external_launcher_with_sleepmode_level2():
-    script = Path(
-        __file__
-    ).parent.parent.parent.parent.parent / "examples" / "offline_external_launcher.py"
+    script = Path(__file__).parent.parent.parent.parent.parent / "examples" / "offline_external_launcher.py"
     env = os.environ.copy()
     model_path = snapshot_download("Qwen/Qwen3-8B")
     # TODO: Add moe model test
@@ -195,7 +198,7 @@ def test_qwen3_external_launcher_with_sleepmode_level2():
         stderr=subprocess.STDOUT,
         timeout=300,
     )
-    output = proc.stdout.decode(errors='ignore')
+    output = proc.stdout.decode(errors="ignore")
     print(output)
@@ -210,14 +213,9 @@ def test_qwen3_external_launcher_with_sleepmode_level2():
 )
 @pytest.mark.parametrize("model", MODELS)
 @wait_until_npu_memory_free()
-@patch.dict(os.environ, {
-    "VLLM_ASCEND_ENABLE_MATMUL_ALLREDUCE": "1",
-    "HCCL_BUFFSIZE": "500"
-})
+@patch.dict(os.environ, {"VLLM_ASCEND_ENABLE_MATMUL_ALLREDUCE": "1", "HCCL_BUFFSIZE": "500"})
 def test_qwen3_external_launcher_with_matmul_allreduce(model):
-    script = Path(
-        __file__
-    ).parent.parent.parent.parent.parent / "examples" / "offline_external_launcher.py"
+    script = Path(__file__).parent.parent.parent.parent.parent / "examples" / "offline_external_launcher.py"
     env = os.environ.copy()
     cmd = [
         sys.executable,
@@ -236,7 +234,7 @@ def test_qwen3_external_launcher_with_matmul_allreduce(model):
         timeout=600,
     )
-    output = proc.stdout.decode(errors='ignore')
+    output = proc.stdout.decode(errors="ignore")
     print(output)

     assert "Generated text:" in output

View File

@@ -26,23 +26,23 @@ from tests.e2e.model_utils import check_outputs_equal
 def test_qwen3_moe_full_decode_only_tp2():
-    if 'HCCL_OP_EXPANSION_MODE' in os.environ:
-        del os.environ['HCCL_OP_EXPANSION_MODE']
+    if "HCCL_OP_EXPANSION_MODE" in os.environ:
+        del os.environ["HCCL_OP_EXPANSION_MODE"]
     prompts = [
-        "Hello, my name is", "The president of the United States is",
-        "The capital of France is", "The future of AI is"
+        "Hello, my name is",
+        "The president of the United States is",
+        "The capital of France is",
+        "The future of AI is",
     ]
     model = "Qwen/Qwen3-30B-A3B"
     sampling_params = SamplingParams(max_tokens=32, temperature=0.0)
-    with VllmRunner(model,
-                    max_model_len=1024,
-                    tensor_parallel_size=2,
-                    compilation_config={
-                        "cudagraph_mode": "FULL_DECODE_ONLY",
-                        "cudagraph_capture_sizes": [4, 8, 24, 48, 60]
-                    }) as runner:
-        vllm_fullgraph_outputs = runner.model.generate(prompts,
-                                                       sampling_params)
+    with VllmRunner(
+        model,
+        max_model_len=1024,
+        tensor_parallel_size=2,
+        compilation_config={"cudagraph_mode": "FULL_DECODE_ONLY", "cudagraph_capture_sizes": [4, 8, 24, 48, 60]},
+    ) as runner:
+        vllm_fullgraph_outputs = runner.model.generate(prompts, sampling_params)

     with VllmRunner(
         model,
@@ -54,13 +54,11 @@ def test_qwen3_moe_full_decode_only_tp2():
     vllm_fullgraph_outputs_list = []
     for output in vllm_fullgraph_outputs:
-        vllm_fullgraph_outputs_list.append(
-            (output.outputs[0].index, output.outputs[0].text))
+        vllm_fullgraph_outputs_list.append((output.outputs[0].index, output.outputs[0].text))

     vllm_eager_outputs_list = []
     for output in vllm_eager_outputs:
-        vllm_eager_outputs_list.append(
-            (output.outputs[0].index, output.outputs[0].text))
+        vllm_eager_outputs_list.append((output.outputs[0].index, output.outputs[0].text))

     check_outputs_equal(
         outputs_0_lst=vllm_eager_outputs_list,
@@ -72,23 +70,23 @@ def test_qwen3_moe_full_decode_only_tp2():
 @pytest.mark.skip(reason="CANN8.5 failed with this test, fix me")
 def test_qwen3_moe_full_graph_tp2():
-    if 'HCCL_OP_EXPANSION_MODE' in os.environ:
-        del os.environ['HCCL_OP_EXPANSION_MODE']
+    if "HCCL_OP_EXPANSION_MODE" in os.environ:
+        del os.environ["HCCL_OP_EXPANSION_MODE"]
     prompts = [
-        "Hello, my name is", "The president of the United States is",
-        "The capital of France is", "The future of AI is"
+        "Hello, my name is",
+        "The president of the United States is",
+        "The capital of France is",
+        "The future of AI is",
     ]
     model = "Qwen/Qwen3-30B-A3B"
     sampling_params = SamplingParams(max_tokens=32, temperature=0.0)
-    with VllmRunner(model,
-                    max_model_len=1024,
-                    tensor_parallel_size=2,
-                    compilation_config={
-                        "cudagraph_mode": "FULL",
-                        "cudagraph_capture_sizes": [4, 8, 24, 48, 60]
-                    }) as runner:
-        vllm_fullgraph_outputs = runner.model.generate(prompts,
-                                                       sampling_params)
+    with VllmRunner(
+        model,
+        max_model_len=1024,
+        tensor_parallel_size=2,
+        compilation_config={"cudagraph_mode": "FULL", "cudagraph_capture_sizes": [4, 8, 24, 48, 60]},
+    ) as runner:
+        vllm_fullgraph_outputs = runner.model.generate(prompts, sampling_params)

     with VllmRunner(
         model,
@@ -100,13 +98,11 @@ def test_qwen3_moe_full_graph_tp2():
     vllm_fullgraph_outputs_list = []
     for output in vllm_fullgraph_outputs:
-        vllm_fullgraph_outputs_list.append(
-            (output.outputs[0].index, output.outputs[0].text))
+        vllm_fullgraph_outputs_list.append((output.outputs[0].index, output.outputs[0].text))

     vllm_eager_outputs_list = []
     for output in vllm_eager_outputs:
-        vllm_eager_outputs_list.append(
-            (output.outputs[0].index, output.outputs[0].text))
+        vllm_eager_outputs_list.append((output.outputs[0].index, output.outputs[0].text))

     check_outputs_equal(
         outputs_0_lst=vllm_eager_outputs_list,

View File

@@ -1,8 +1,7 @@
 import pytest

 from tests.e2e.conftest import VllmRunner
-from tests.e2e.singlecard.test_ilama_lora import (EXPECTED_LORA_OUTPUT,
-                                                  MODEL_PATH, do_sample)
+from tests.e2e.singlecard.test_ilama_lora import EXPECTED_LORA_OUTPUT, MODEL_PATH, do_sample

 @pytest.mark.parametrize("distributed_executor_backend", ["mp"])

View File

@@ -20,8 +20,10 @@
 Run `pytest tests/test_offline_inference.py`.
 """

 import os
 from unittest.mock import patch

 import pytest
 from vllm import SamplingParams
@@ -51,6 +53,7 @@ GPT_OSS_MODELS = [
     "unsloth/gpt-oss-20b-BF16",
 ]

 def test_deepseek_multistream_moe_tp2():
     example_prompts = [
         "Hello, my name is",
@@ -92,20 +95,17 @@ def test_qwen3_moe_sp_tp2() -> None:
     example_prompts = [
         "Hello, my name is",
     ]
-    sampling_params = SamplingParams(max_tokens=5,
-                                     temperature=0.0,
-                                     top_k=50,
-                                     top_p=0.9)
+    sampling_params = SamplingParams(max_tokens=5, temperature=0.0, top_k=50, top_p=0.9)

-    with VllmRunner("Qwen/Qwen3-30B-A3B",
-                    dtype="auto",
-                    tensor_parallel_size=2,
-                    distributed_executor_backend="mp",
-                    compilation_config={"pass_config": {
-                        "enable_sp": True
-                    }},
-                    enable_expert_parallel=True,
-                    enforce_eager=True) as vllm_model:
+    with VllmRunner(
+        "Qwen/Qwen3-30B-A3B",
+        dtype="auto",
+        tensor_parallel_size=2,
+        distributed_executor_backend="mp",
+        compilation_config={"pass_config": {"enable_sp": True}},
+        enable_expert_parallel=True,
+        enforce_eager=True,
+    ) as vllm_model:
         vllm_model.generate(example_prompts, sampling_params)
@@ -113,33 +113,34 @@ def test_qwen3_moe_sp_tp2() -> None:
 @patch.dict(os.environ, {"HCCL_BUFFSIZE": "2048"})
 def test_deepseek_w4a8_accuracy_tp2(model):
     prompts = [
-        "Hello, my name is", "The president of the United States is",
-        "vLLM is a high-throughput and memory-efficient inference and serving engine for LLMs"
-    ]
-    vllm_ds_w4a8_answers = [
-        '逍遙而至地去 accrued', '平行于我udo madreHelen', 'ysteepaolis backwards Kj'
-    ]
+        "Hello, my name is",
+        "The president of the United States is",
+        "vLLM is a high-throughput and memory-efficient inference and serving engine for LLMs",
+    ]
+    vllm_ds_w4a8_answers = ["逍遙而至地去 accrued", "平行于我udo madreHelen", "ysteepaolis backwards Kj"]
     sampling_params = SamplingParams(max_tokens=5, temperature=0.0)
-    with VllmRunner(model,
-                    dtype="auto",
-                    tensor_parallel_size=2,
-                    cudagraph_capture_sizes=[1, 2, 4, 8],
-                    quantization="ascend",
-                    enable_expert_parallel=True) as vllm_model:
-        vllm_quant_outputs = vllm_model.model.generate(prompts,
-                                                       sampling_params)
+    with VllmRunner(
+        model,
+        dtype="auto",
+        tensor_parallel_size=2,
+        cudagraph_capture_sizes=[1, 2, 4, 8],
+        quantization="ascend",
+        enable_expert_parallel=True,
+    ) as vllm_model:
+        vllm_quant_outputs = vllm_model.model.generate(prompts, sampling_params)
     vllm_quant_outputs_list = []
     for output in vllm_quant_outputs:
-        vllm_quant_outputs_list.append(
-            ([output.outputs[0].index], output.outputs[0].text))
+        vllm_quant_outputs_list.append(([output.outputs[0].index], output.outputs[0].text))
     vllm_answer_list = []
-    vllm_answer_list = ([([0], answer) for answer in vllm_ds_w4a8_answers])
-    check_outputs_equal(outputs_0_lst=vllm_answer_list,
-                        outputs_1_lst=vllm_quant_outputs_list,
-                        name_0="vllm_quant_outputs",
-                        name_1="vllm_answer_outputs")
+    vllm_answer_list = [([0], answer) for answer in vllm_ds_w4a8_answers]
+    check_outputs_equal(
+        outputs_0_lst=vllm_answer_list,
+        outputs_1_lst=vllm_quant_outputs_list,
+        name_0="vllm_quant_outputs",
+        name_1="vllm_answer_outputs",
+    )

 @patch.dict(os.environ, {"VLLM_ASCEND_ENABLE_FLASHCOMM1": "1"})
@@ -148,17 +149,16 @@ def test_qwen3_moe_fc2_tp2() -> None:
     example_prompts = [
         "Hello, my name is",
     ]
-    sampling_params = SamplingParams(max_tokens=5,
-                                     temperature=0.0,
-                                     top_k=50,
-                                     top_p=0.9)
+    sampling_params = SamplingParams(max_tokens=5, temperature=0.0, top_k=50, top_p=0.9)

-    with VllmRunner("Qwen/Qwen3-30B-A3B",
-                    dtype="auto",
-                    tensor_parallel_size=2,
-                    distributed_executor_backend="mp",
-                    enable_expert_parallel=True,
-                    enforce_eager=True) as vllm_model:
+    with VllmRunner(
+        "Qwen/Qwen3-30B-A3B",
+        dtype="auto",
+        tensor_parallel_size=2,
+        distributed_executor_backend="mp",
+        enable_expert_parallel=True,
+        enforce_eager=True,
+    ) as vllm_model:
         vllm_model.generate(example_prompts, sampling_params)
@@ -168,10 +168,7 @@ def test_qwen3_moe_fc2_oshard_tp2() -> None:
     example_prompts = [
         "Hello, my name is",
     ]
-    sampling_params = SamplingParams(max_tokens=5,
-                                     temperature=0.0,
-                                     top_k=50,
-                                     top_p=0.9)
+    sampling_params = SamplingParams(max_tokens=5, temperature=0.0, top_k=50, top_p=0.9)

     with VllmRunner(
         "Qwen/Qwen3-30B-A3B",
@@ -179,9 +176,9 @@ def test_qwen3_moe_fc2_oshard_tp2() -> None:
         tensor_parallel_size=2,
         distributed_executor_backend="mp",
         enable_expert_parallel=True,
-        enforce_eager=
-        True,  # TODO(Levi-JQ): support graph mode for fc2 in Qwen
-        additional_config={"layer_sharding": ["o_proj"]}) as vllm_model:
+        enforce_eager=True,  # TODO(Levi-JQ): support graph mode for fc2 in Qwen
+        additional_config={"layer_sharding": ["o_proj"]},
+    ) as vllm_model:
         vllm_model.generate(example_prompts, sampling_params)
@@ -190,17 +187,16 @@ def test_deepseek_v2_lite_fc1_tp2() -> None:
     example_prompts = [
         "test" * 1001,
     ]
-    sampling_params = SamplingParams(max_tokens=5,
-                                     temperature=0.0,
-                                     top_k=50,
-                                     top_p=0.9)
-    with VllmRunner("vllm-ascend/DeepSeek-V2-Lite-W8A8",
-                    dtype="auto",
-                    tensor_parallel_size=2,
-                    distributed_executor_backend="mp",
-                    enable_expert_parallel=True,
-                    enforce_eager=True,
-                    quantization="ascend") as vllm_model:
+    sampling_params = SamplingParams(max_tokens=5, temperature=0.0, top_k=50, top_p=0.9)
+    with VllmRunner(
+        "vllm-ascend/DeepSeek-V2-Lite-W8A8",
+        dtype="auto",
+        tensor_parallel_size=2,
+        distributed_executor_backend="mp",
+        enable_expert_parallel=True,
+        enforce_eager=True,
+        quantization="ascend",
+    ) as vllm_model:
         vllm_model.generate(example_prompts, sampling_params)
@@ -252,28 +248,20 @@ def test_deepseek3_2_w8a8_pruning_mtp_tp2_ep():
         "Hello ",
     ]
     # "max_position_embeddings": 163840,
-    long_example_prompts = [
-        "Hello " * (163839 - 500) + "Hello"
-    ]
+    long_example_prompts = ["Hello " * (163839 - 500) + "Hello"]
     max_tokens = 500
-    with VllmRunner("vllm-ascend/DeepSeek-V3.2-W8A8-Pruning",
-                    tensor_parallel_size=2,
-                    quantization="ascend",
-                    enable_expert_parallel=True,
-                    max_model_len=163840,
-                    compilation_config={
-                        "cudagraph_capture_sizes": [2, 4, 6, 8, 10, 12],
-                        "cudagraph_mode": "FULL_DECODE_ONLY"
-                    },
-                    speculative_config={
-                        "num_speculative_tokens": 1,
-                        "method": "deepseek_mtp"
-                    },
-                    additional_config={
-                        "layer_sharding": ["q_b_proj", "o_proj"]
-                    },
-                    reasoning_parser="deepseek_v3",
-                    tokenizer_mode="deepseek_v32") as vllm_model:
+    with VllmRunner(
+        "vllm-ascend/DeepSeek-V3.2-W8A8-Pruning",
+        tensor_parallel_size=2,
+        quantization="ascend",
+        enable_expert_parallel=True,
+        max_model_len=163840,
+        compilation_config={"cudagraph_capture_sizes": [2, 4, 6, 8, 10, 12], "cudagraph_mode": "FULL_DECODE_ONLY"},
+        speculative_config={"num_speculative_tokens": 1, "method": "deepseek_mtp"},
+        additional_config={"layer_sharding": ["q_b_proj", "o_proj"]},
+        reasoning_parser="deepseek_v3",
+        tokenizer_mode="deepseek_v32",
+    ) as vllm_model:
         vllm_model.generate_greedy(short_example_prompts, max_tokens)
         vllm_model.generate_greedy(long_example_prompts, max_tokens)

View File

@@ -32,9 +32,7 @@ MODELS = ["Qwen/Qwen3-30B-A3B"]
 @pytest.mark.parametrize("model", MODELS)
 @patch.dict(os.environ, {"VLLM_ASCEND_ENABLE_NZ": "0"})
 def test_qwen3_offline_load_and_sleepmode_tp2(model):
-    script = Path(
-        __file__
-    ).parent.parent.parent.parent.parent / "examples" / "offline_external_launcher.py"
+    script = Path(__file__).parent.parent.parent.parent.parent / "examples" / "offline_external_launcher.py"
     env = os.environ.copy()
     cmd = [
         sys.executable,
@@ -65,7 +63,7 @@ def test_qwen3_offline_load_and_sleepmode_tp2(model):
         stderr=subprocess.STDOUT,
         timeout=600,
     )
-    output = proc.stdout.decode(errors='ignore')
+    output = proc.stdout.decode(errors="ignore")
     print(output)

View File

@@ -37,12 +37,13 @@ prompts = [
 @pytest.mark.parametrize("tp_size", TENSOR_PARALLELS)
 @pytest.mark.parametrize("pp_size", PIPELINE_PARALLELS)
 @pytest.mark.parametrize("distributed_executor_backend", DIST_EXECUTOR_BACKEND)
-def test_models_pp2(model: str, tp_size: int, pp_size: int,
-                    distributed_executor_backend: str) -> None:
-    with VllmRunner(model,
-                    tensor_parallel_size=tp_size,
-                    pipeline_parallel_size=pp_size,
-                    cudagraph_capture_sizes=[1, 2, 4, 8],
-                    distributed_executor_backend=distributed_executor_backend,
-                    gpu_memory_utilization=0.7) as vllm_model:
+def test_models_pp2(model: str, tp_size: int, pp_size: int, distributed_executor_backend: str) -> None:
+    with VllmRunner(
+        model,
+        tensor_parallel_size=tp_size,
+        pipeline_parallel_size=pp_size,
+        cudagraph_capture_sizes=[1, 2, 4, 8],
+        distributed_executor_backend=distributed_executor_backend,
+        gpu_memory_utilization=0.7,
+    ) as vllm_model:
         vllm_model.generate_greedy(prompts, 64)

View File

@@ -11,11 +11,14 @@ MODELS = [
     # for MHA
     "Qwen/Qwen3-8B",
     # for MLA
-    "deepseek-ai/DeepSeek-V2-Lite-Chat"
+    "deepseek-ai/DeepSeek-V2-Lite-Chat",
 ]

 # A prompt containing a large markdown table. The table is randomly generated by GPT-4.
-LONG_PROMPT = "You are a helpful assistant in recognizes the content of tables in markdown format. Here is a table as follows.\n# Table\n" + """
+# ruff: noqa: E501
+LONG_PROMPT = (
+    "You are a helpful assistant in recognizes the content of tables in markdown format. Here is a table as follows.\n# Table\n"
+    + """
 | ID | Name | Age | Occupation | Country | Email | Phone Number | Address |
 |-----|---------------|-----|---------------|---------------|------------------------|----------------|------------------------------|
 | 1 | John Doe | 29 | Engineer | USA | john.doe@example.com | 555-1234 | 123 Elm St, Springfield, IL |
@@ -49,32 +52,34 @@ LONG_PROMPT = "You are a helpful assistant in recognizes the content of tables i
 | 29 | Amy White | 33 | Musician | New Zealand | amy.w@example.com | 555-5658 | 159 Maple St, Wellington, NZ |
 | 30 | Ben Black | 38 | Chef | Ireland | ben.b@example.com | 555-7870 | 246 Fir St, Waterford, IE |
 """
+)

 INPUT_PROMPTS = [
-    LONG_PROMPT +
-    "Question: what is the age of John Doe? Your answer: The age of John Doe is ",
-    LONG_PROMPT +
-    "Question: what is the age of Zack Blue? Your answer: The age of Zack Blue is "
+    LONG_PROMPT + "Question: what is the age of John Doe? Your answer: The age of John Doe is ",
+    LONG_PROMPT + "Question: what is the age of Zack Blue? Your answer: The age of Zack Blue is ",
 ]

 @pytest.mark.parametrize("model", MODELS)
 @pytest.mark.parametrize("max_tokens", [50])
 def test_models_prefix_cache_tp2(model: str, max_tokens: int) -> None:
-    with VllmRunner(model,
-                    max_model_len=2048,
-                    tensor_parallel_size=2,
-                    cudagraph_capture_sizes=[1, 2, 4, 8],
-                    gpu_memory_utilization=0.7) as vllm_model:
-        prefix_cache_output = vllm_model.generate_greedy(
-            INPUT_PROMPTS, max_tokens)
+    with VllmRunner(
+        model,
+        max_model_len=2048,
+        tensor_parallel_size=2,
+        cudagraph_capture_sizes=[1, 2, 4, 8],
+        gpu_memory_utilization=0.7,
+    ) as vllm_model:
+        prefix_cache_output = vllm_model.generate_greedy(INPUT_PROMPTS, max_tokens)

-    with VllmRunner(model,
-                    enable_prefix_caching=False,
-                    max_model_len=2048,
-                    tensor_parallel_size=2,
-                    cudagraph_capture_sizes=[1, 2, 4, 8],
-                    gpu_memory_utilization=0.7) as vllm_model:
+    with VllmRunner(
+        model,
+        enable_prefix_caching=False,
+        max_model_len=2048,
+        tensor_parallel_size=2,
+        cudagraph_capture_sizes=[1, 2, 4, 8],
+        gpu_memory_utilization=0.7,
+    ) as vllm_model:
         vllm_output = vllm_model.generate_greedy(INPUT_PROMPTS, max_tokens)

     check_outputs_equal(

View File

@@ -16,7 +16,6 @@
 # This file is a part of the vllm-ascend project.
 # Adapted from vllm/tests/basic_correctness/test_basic_correctness.py
 #
-import pytest

 from tests.e2e.conftest import VllmRunner
@@ -36,7 +35,7 @@ def test_qwen2_5_w8a8_external_quantized_tp2():
         vllm_output = vllm_model.generate_greedy(example_prompts, max_tokens)

     golden_results = [
-        'The president of the United States is the head of state and',
+        "The president of the United States is the head of state and",
     ]

     for i in range(len(vllm_output)):
@@ -58,13 +57,14 @@ def test_qwen3_moe_w8a8_dynamic_llm_compressor():
         vllm_output = vllm_model.generate_greedy(example_prompts, max_tokens)

     golden_results = [
-        'The president of the United States is the head of state and',
+        "The president of the United States is the head of state and",
     ]

     for i in range(len(vllm_output)):
         assert golden_results[i] == vllm_output[i][1]
         print(f"Generated text: {vllm_output[i][1]!r}")

 def test_qwen3_moe_w4a8_dynamic_llm_compressor():
     example_prompts = [
         "The president of the United States is",
@@ -79,7 +79,7 @@ def test_qwen3_moe_w4a8_dynamic_llm_compressor():
         vllm_output = vllm_model.generate_greedy(example_prompts, max_tokens)

     golden_results = [
-        'The president of the United States is the head of state and',
+        "The president of the United States is the head of state and",
     ]

     for i in range(len(vllm_output)):

View File

@@ -59,7 +59,7 @@ def test_qwen3_moe_w8a8_distributed_tp2():
 def test_qwen3_moe_distributed_aiv_tp2():
-    os.environ['HCCL_OP_EXPANSION_MODE'] = 'AIV'
+    os.environ["HCCL_OP_EXPANSION_MODE"] = "AIV"
     example_prompts = [
         "Hello, my name is",
     ]
@@ -80,23 +80,24 @@ async def test_qwen3_moe_w8a8_distributed_tp2_ep_dynamic_eplb():
     port = get_open_port()
     compilation_config = json.dumps({"cudagraph_capture_sizes": [8]})
     server_args = [
-        "--max_model_len", "8192", "--tensor_parallel_size", "2",
-        "--enable_expert_parallel", "--quantization", "ascend", "--port",
-        str(port), "--compilation-config", compilation_config
+        "--max_model_len",
+        "8192",
+        "--tensor_parallel_size",
+        "2",
+        "--enable_expert_parallel",
+        "--quantization",
+        "ascend",
+        "--port",
+        str(port),
+        "--compilation-config",
+        compilation_config,
     ]
     env_dict = {"HCCL_BUFFSIZE": "1024"}
-    with RemoteOpenAIServer(model,
-                            server_args,
-                            server_port=port,
-                            auto_port=False,
-                            env_dict=env_dict) as server:
+    with RemoteOpenAIServer(model, server_args, server_port=port, auto_port=False, env_dict=env_dict) as server:
         client = server.get_async_client()
-        batch = await client.completions.create(model=model,
-                                                prompt="What is deeplearning?",
-                                                max_tokens=400,
-                                                temperature=0,
-                                                top_p=1.0,
-                                                n=1)
+        batch = await client.completions.create(
+            model=model, prompt="What is deeplearning?", max_tokens=400, temperature=0, top_p=1.0, n=1
+        )
         gt_choices: list[openai.types.CompletionChoice] = batch.choices

     # dynamic eplb test
@@ -108,22 +109,14 @@ async def test_qwen3_moe_w8a8_distributed_tp2_ep_dynamic_eplb():
             "dynamic_eplb": True,
             "expert_heat_collection_interval": 100,
             "algorithm_execution_interval": 20,
-            "num_redundant_experts": 2
+            "num_redundant_experts": 2,
         }
     }
     server_args.extend(["--additional-config", json.dumps(additional_config)])
-    with RemoteOpenAIServer(model,
-                            server_args,
-                            server_port=port,
-                            auto_port=False,
-                            env_dict=env_dict) as server:
+    with RemoteOpenAIServer(model, server_args, server_port=port, auto_port=False, env_dict=env_dict) as server:
         client = server.get_async_client()
-        batch = await client.completions.create(model=model,
-                                                prompt="What is deeplearning?",
-                                                max_tokens=400,
-                                                temperature=0,
-                                                top_p=1.0,
-                                                n=1)
+        batch = await client.completions.create(
+            model=model, prompt="What is deeplearning?", max_tokens=400, temperature=0, top_p=1.0, n=1
+        )
         eplb_choices: list[openai.types.CompletionChoice] = batch.choices
-    assert gt_choices[0].text == eplb_choices[
-        0].text, f"{gt_choices[0].text=} \n {eplb_choices[0].text=}"
+    assert gt_choices[0].text == eplb_choices[0].text, f"{gt_choices[0].text=} \n {eplb_choices[0].text=}"

View File

@@ -1,10 +1,11 @@
 import os
 from unittest.mock import patch

-from tests.e2e.conftest import VllmRunner
 from vllm import SamplingParams
 from vllm.sampling_params import RequestOutputKind

+from tests.e2e.conftest import VllmRunner

 @patch.dict(os.environ, {"OMP_NUM_THREADS": "1"})
 def test_qwen3_moe_routing_replay():
@@ -20,10 +21,7 @@ def test_qwen3_moe_routing_replay():
         enable_return_routed_experts=True,
     ) as vllm_model:
         sampling_params = SamplingParams(
-            max_tokens=5,
-            temperature=0.8,
-            top_p=0.95,
-            output_kind=RequestOutputKind.FINAL_ONLY
+            max_tokens=5, temperature=0.8, top_p=0.95, output_kind=RequestOutputKind.FINAL_ONLY
         )
         inputs = vllm_model.get_inputs(prompts=prompts)
         outputs = vllm_model.model.generate(prompts=inputs, sampling_params=sampling_params)

View File

@@ -84,11 +84,7 @@ async def test_models(model: str) -> None:
     request_keyword_args: dict[str, Any] = {
         **api_keyword_args,
     }
-    with RemoteOpenAIServer(model,
-                            server_args,
-                            server_port=port,
-                            env_dict=env_dict,
-                            auto_port=False) as server:
+    with RemoteOpenAIServer(model, server_args, server_port=port, env_dict=env_dict, auto_port=False) as server:
         client = server.get_async_client()
         batch = await client.completions.create(
             model=model,

View File

@@ -13,13 +13,14 @@ MODELS = [
 @pytest.mark.parametrize("model", MODELS)
 def test_deepseek_v2_lite_enable_shared_expert_dp_tp2(model: str) -> None:
-    if 'HCCL_OP_EXPANSION_MODE' in os.environ:
-        del os.environ['HCCL_OP_EXPANSION_MODE']
+    if "HCCL_OP_EXPANSION_MODE" in os.environ:
+        del os.environ["HCCL_OP_EXPANSION_MODE"]

     prompts = [
-        "Hello, my name is", "The capital of the United States is",
-        "The capital of France is", "The future of AI is"
+        "Hello, my name is",
+        "The capital of the United States is",
+        "The capital of France is",
+        "The future of AI is",
     ]
     sampling_params = SamplingParams(max_tokens=32, temperature=0.0)
@@ -43,8 +44,7 @@ def test_deepseek_v2_lite_enable_shared_expert_dp_tp2(model: str) -> None:
             "enable_shared_expert_dp": True,
         },
     ) as runner:
-        shared_expert_dp_eager_outputs = runner.model.generate(
-            prompts, sampling_params)
+        shared_expert_dp_eager_outputs = runner.model.generate(prompts, sampling_params)

     with VllmRunner(
         model,
@@ -59,23 +59,19 @@ def test_deepseek_v2_lite_enable_shared_expert_dp_tp2(model: str) -> None:
             "enable_shared_expert_dp": True,
         },
     ) as runner:
-        shared_expert_dp_aclgraph_outputs = runner.model.generate(
-            prompts, sampling_params)
+        shared_expert_dp_aclgraph_outputs = runner.model.generate(prompts, sampling_params)

     vllm_eager_outputs_list = []
     for output in vllm_eager_outputs:
-        vllm_eager_outputs_list.append(
-            (output.outputs[0].index, output.outputs[0].text))
+        vllm_eager_outputs_list.append((output.outputs[0].index, output.outputs[0].text))

     shared_expert_dp_eager_outputs_list = []
     for output in shared_expert_dp_eager_outputs:
-        shared_expert_dp_eager_outputs_list.append(
-            (output.outputs[0].index, output.outputs[0].text))
+        shared_expert_dp_eager_outputs_list.append((output.outputs[0].index, output.outputs[0].text))

     shared_expert_dp_aclgraph_outputs_list = []
     for output in shared_expert_dp_aclgraph_outputs:
-        shared_expert_dp_aclgraph_outputs_list.append(
-            (output.outputs[0].index, output.outputs[0].text))
+        shared_expert_dp_aclgraph_outputs_list.append((output.outputs[0].index, output.outputs[0].text))

     check_outputs_equal(
         outputs_0_lst=vllm_eager_outputs_list,

View File

@@ -39,8 +39,7 @@ api_keyword_args = {
 @pytest.mark.asyncio
 @pytest.mark.parametrize("model", MODELS)
 @pytest.mark.parametrize("dp_size", DATA_PARALLELS)
-async def test_models_single_request_aclgraph_dp2(model: str,
-                                                  dp_size: int) -> None:
+async def test_models_single_request_aclgraph_dp2(model: str, dp_size: int) -> None:
     port = get_open_port()
     env_dict = {
         "TASK_QUEUE_ENABLE": "1",
@@ -48,36 +47,51 @@ async def test_models_single_request_aclgraph_dp2(model: str,
     }
     if model == "vllm-ascend/DeepSeek-V2-Lite-W8A8":
         server_args = [
-            "--no-enable-prefix-caching", "--tensor-parallel-size", "1",
+            "--no-enable-prefix-caching",
+            "--tensor-parallel-size",
+            "1",
             "--data-parallel-size",
-            str(dp_size), "--quantization", "ascend", "--max-model-len",
-            "1024", "--port",
-            str(port), "--trust-remote-code", "--gpu-memory-utilization", "0.9"
+            str(dp_size),
+            "--quantization",
+            "ascend",
+            "--max-model-len",
+            "1024",
+            "--port",
+            str(port),
+            "--trust-remote-code",
+            "--gpu-memory-utilization",
+            "0.9",
         ]
     else:
         server_args = [
-            "--no-enable-prefix-caching", "--tensor-parallel-size", "1",
+            "--no-enable-prefix-caching",
+            "--tensor-parallel-size",
+            "1",
             "--data-parallel-size",
-            str(dp_size), "--port",
-            str(port), "--trust-remote-code", "--gpu-memory-utilization", "0.9"
+            str(dp_size),
+            "--port",
+            str(port),
+            "--trust-remote-code",
+            "--gpu-memory-utilization",
+            "0.9",
         ]
     request_keyword_args: dict[str, Any] = {
         **api_keyword_args,
     }
-    with RemoteOpenAIServer(model,
-                            vllm_serve_args=server_args,
-                            server_port=port,
-                            env_dict=env_dict,
-                            auto_port=False) as server:
+    with RemoteOpenAIServer(
+        model, vllm_serve_args=server_args, server_port=port, env_dict=env_dict, auto_port=False
+    ) as server:
         client = server.get_async_client()
         try:
-            batch = await asyncio.wait_for(client.completions.create(
-                model=model,
-                prompt=prompts,
-                **request_keyword_args,
-            ),
-                                           timeout=10.0)
+            batch = await asyncio.wait_for(
+                client.completions.create(
+                    model=model,
+                    prompt=prompts,
+                    **request_keyword_args,
+                ),
+                timeout=10.0,
+            )
         except asyncio.TimeoutError:
             pytest.fail("Model did not return response within 10 seconds")
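The timeout guard wraps the client call in `asyncio.wait_for` and converts a timeout into a test failure. A self-contained sketch of the pattern, with `fake_completion` standing in for the OpenAI client call and a plain `AssertionError` standing in for `pytest.fail`:

```python
import asyncio


async def fake_completion() -> str:
    await asyncio.sleep(0.1)  # pretend to talk to the server
    return "ok"


async def main() -> None:
    try:
        result = await asyncio.wait_for(fake_completion(), timeout=10.0)
    except asyncio.TimeoutError:
        raise AssertionError("Model did not return response within 10 seconds")
    print(result)


asyncio.run(main())
```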

View File

@@ -1,5 +1,3 @@
-import os
-
 import pytest
 from vllm import SamplingParams
@@ -14,8 +12,10 @@ MODELS = [
 @pytest.mark.parametrize("model", MODELS)
 def test_qwen3_vl_sp_tp2(model: str) -> None:
     prompts = [
-        "Hello, my name is", "The capital of the United States is",
-        "The capital of France is", "The future of AI is"
+        "Hello, my name is",
+        "The capital of the United States is",
+        "The capital of France is",
+        "The future of AI is",
     ]
     sampling_params = SamplingParams(max_tokens=10, temperature=0.0)
@@ -26,9 +26,9 @@ def test_qwen3_vl_sp_tp2(model: str) -> None:
         compilation_config={
             "cudagraph_capture_sizes": [2, 4],
             "cudagraph_mode": "FULL_DECODE_ONLY",
-            "pass_config": {"enable_sp": False}
+            "pass_config": {"enable_sp": False},
         },
-        additional_config={"ascend_compilation_config": {"enable_npugraph_ex": False}}
+        additional_config={"ascend_compilation_config": {"enable_npugraph_ex": False}},
     ) as runner:
         no_sp_outputs = runner.model.generate(prompts, sampling_params)
@@ -39,22 +39,19 @@ def test_qwen3_vl_sp_tp2(model: str) -> None:
         compilation_config={
             "cudagraph_capture_sizes": [2, 4],
             "cudagraph_mode": "FULL_DECODE_ONLY",
-            "pass_config": {"enable_sp": True}
+            "pass_config": {"enable_sp": True},
         },
-        additional_config={"sp_threshold": 10, "ascend_compilation_config": {"enable_npugraph_ex": False}}
+        additional_config={"sp_threshold": 10, "ascend_compilation_config": {"enable_npugraph_ex": False}},
     ) as runner:
-        sp_outputs = runner.model.generate(
-            prompts, sampling_params)
+        sp_outputs = runner.model.generate(prompts, sampling_params)

     no_sp_outputs_list = []
     for output in no_sp_outputs:
-        no_sp_outputs_list.append(
-            (output.outputs[0].index, output.outputs[0].text))
+        no_sp_outputs_list.append((output.outputs[0].index, output.outputs[0].text))

     sp_outputs_list = []
     for output in sp_outputs:
-        sp_outputs_list.append(
-            (output.outputs[0].index, output.outputs[0].text))
+        sp_outputs_list.append((output.outputs[0].index, output.outputs[0].text))

     check_outputs_equal(
         outputs_0_lst=no_sp_outputs_list,