### What this PR does / why we need it?
This PR migrates the first batch of e2e test files to the project's ruff formatting rules: the batch-1 paths are removed from the ruff `exclude` list and the files themselves are reformatted accordingly (see the diff below).

**Scope of Changes**:
| File Path |
| :--- |
| `tests/e2e/310p/multicard/test_vl_model_multicard.py` |
| `tests/e2e/310p/singlecard/test_vl_model_singlecard.py` |
| `tests/e2e/310p/test_utils.py` |
| `tests/e2e/conftest.py` |
| `tests/e2e/model_utils.py` |
| `tests/e2e/models/conftest.py` |
| `tests/e2e/models/test_lm_eval_correctness.py` |
| `tests/e2e/multicard/2-cards/spec_decode/test_spec_decode.py` |
| `tests/e2e/multicard/2-cards/test_aclgraph_capture_replay.py` |
| `tests/e2e/multicard/2-cards/test_data_parallel.py` |
| `tests/e2e/multicard/2-cards/test_disaggregated_encoder.py` |
| `tests/e2e/multicard/2-cards/test_expert_parallel.py` |
| `tests/e2e/multicard/2-cards/test_external_launcher.py` |
| `tests/e2e/multicard/2-cards/test_full_graph_mode.py` |
| `tests/e2e/multicard/2-cards/test_ilama_lora_tp2.py` |
| `tests/e2e/multicard/2-cards/test_offline_inference_distributed.py` |
| `tests/e2e/multicard/2-cards/test_offline_weight_load.py` |
| `tests/e2e/multicard/2-cards/test_pipeline_parallel.py` |
| `tests/e2e/multicard/2-cards/test_prefix_caching.py` |
| `tests/e2e/multicard/2-cards/test_quantization.py` |
| `tests/e2e/multicard/2-cards/test_qwen3_moe.py` |
| `tests/e2e/multicard/2-cards/test_qwen3_moe_routing_replay.py` |
| `tests/e2e/multicard/2-cards/test_qwen3_performance.py` |
| `tests/e2e/multicard/2-cards/test_shared_expert_dp.py` |
| `tests/e2e/multicard/2-cards/test_single_request_aclgraph.py` |
| `tests/e2e/multicard/2-cards/test_sp_pass.py` |
### Does this PR introduce _any_ user-facing change?
### How was this patch tested?
- vLLM version: v0.15.0
- vLLM main: 9562912cea
Signed-off-by: MrZ20 <2609716663@qq.com>
Co-authored-by: wangxiyuan <wangxiyuan1007@gmail.com>
```
@@ -51,34 +51,25 @@ line-length = 120
# Folder to be modified
exclude = [
    # Batch (1)
    "tests/e2e/__init__.py",
    "tests/e2e/310p/",
    "tests/e2e/conftest.py",
    "tests/e2e/doctests/",
    "tests/e2e/model_utils.py",
    "tests/e2e/models/",
    "tests/e2e/multicard/2-cards/",
    # "tests/e2e/__init__.py",
    # "tests/e2e/310p/",
    # "tests/e2e/conftest.py",
    # "tests/e2e/doctests/",
    # "tests/e2e/model_utils.py",
    # "tests/e2e/models/",
    # "tests/e2e/multicard/2-cards/",

    # Batch (2)
    "tests/e2e/multicard/4-cards/",
    "tests/e2e/nightly/multi_node/",

    # Batch (3)
    "tests/e2e/nightly/single_node/models/",

    # Batch (4)
    "tests/e2e/nightly/single_node/ops/",

    # Batch (5)
    # "tests/e2e/singlecard/",

    # Batch (6)
    "tests/e2e/nightly/single_node/ops/singlecard_ops/triton/",
    "tests/e2e/singlecard/pooling/",
    "tests/e2e/singlecard/spec_decode/",
    "tests/e2e/utils.py",
    "tests/e2e/vllm_interface/",
    "tests/e2e/weekly/",

    # Batch (3)
    "tests/e2e/nightly/single_node/",

    "tests/ut/",
]
```
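The exclusions are relaxed batch by batch as each group of test files is reformatted; this PR handles batch 1, commenting its paths out of the list above. As a rough illustration (not part of the PR; the helper below is hypothetical), the batch-1 paths can be reformatted with the standard ruff commands:

```python
# Hypothetical driver script, not part of this PR: reformat the batch-1
# paths that this change removes from the ruff exclude list.
import subprocess

BATCH_1_PATHS = [
    "tests/e2e/__init__.py",
    "tests/e2e/310p/",
    "tests/e2e/conftest.py",
    "tests/e2e/doctests/",
    "tests/e2e/model_utils.py",
    "tests/e2e/models/",
    "tests/e2e/multicard/2-cards/",
]

for path in BATCH_1_PATHS:
    # "ruff format" rewrites layout; "ruff check --fix" applies autofixable lint rules.
    subprocess.run(["ruff", "format", path], check=True)
    subprocess.run(["ruff", "check", "--fix", path], check=True)
```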
```
@@ -15,28 +15,23 @@
# limitations under the License.
# This file is a part of the vllm-ascend project.

import sys
import os
import sys

# Add 310p directory to sys.path
current_dir = os.path.dirname(os.path.abspath(__file__))
parent_dir = os.path.dirname(current_dir) # 310p directory
sys.path.insert(0, parent_dir)

# ruff: noqa: E402
from test_utils import run_vl_model_test


def test_qwen3_vl_8b_tp2_fp16():
    """Qwen3-VL-8B dual-card FP16 test"""
    run_vl_model_test(
        model_name="Qwen/Qwen3-VL-8B-Instruct",
        tensor_parallel_size=2,
        max_tokens=5
    )
    run_vl_model_test(model_name="Qwen/Qwen3-VL-8B-Instruct", tensor_parallel_size=2, max_tokens=5)


def test_qwen3_vl_32b_tp1_fp16():
    """Qwen3-VL-32B 4-card FP16 test"""
    run_vl_model_test(
        model_name="Qwen/Qwen3-VL-32B-Instruct",
        tensor_parallel_size=4,
        max_tokens=5
    )
    run_vl_model_test(model_name="Qwen/Qwen3-VL-32B-Instruct", tensor_parallel_size=4, max_tokens=5)

@@ -15,20 +15,18 @@
# limitations under the License.
# This file is a part of the vllm-ascend project.

import sys
import os
import sys

# Add 310p directory to sys.path
current_dir = os.path.dirname(os.path.abspath(__file__))
parent_dir = os.path.dirname(current_dir) # 310p directory
sys.path.insert(0, parent_dir)

# ruff: noqa: E402
from test_utils import run_vl_model_test


def test_qwen3_vl_8b_tp1_fp16():
    """Qwen3-VL-8B single-card FP16 test"""
    run_vl_model_test(
        model_name="Qwen/Qwen3-VL-8B-Instruct",
        tensor_parallel_size=1,
        max_tokens=5
    )
    run_vl_model_test(model_name="Qwen/Qwen3-VL-8B-Instruct", tensor_parallel_size=1, max_tokens=5)

@@ -15,10 +15,12 @@
# limitations under the License.
# This file is a part of the vllm-ascend project.

from tests.e2e.conftest import VllmRunner
from PIL import Image
import os

from PIL import Image

from tests.e2e.conftest import VllmRunner


def get_test_image():
    """Get the image object for testing"""
@@ -32,14 +34,12 @@ def get_test_prompts():
    return ["<|image_pad|>Describe this image in detail."]


def run_vl_model_test(model_name: str,
                      tensor_parallel_size: int,
                      max_tokens: int,
                      dtype: str = "float16",
                      enforce_eager: bool = True):
def run_vl_model_test(
    model_name: str, tensor_parallel_size: int, max_tokens: int, dtype: str = "float16", enforce_eager: bool = True
):
    """
    Generic visual language model test function

    Args:
        model_name: Model name, e.g., "Qwen/Qwen3-VL-4B"
        tensor_parallel_size: Tensor parallel size
@@ -52,9 +52,6 @@ def run_vl_model_test(model_name: str,
    prompts = get_test_prompts()

    with VllmRunner(
            model_name,
            tensor_parallel_size=tensor_parallel_size,
            enforce_eager=enforce_eager,
            dtype=dtype
        model_name, tensor_parallel_size=tensor_parallel_size, enforce_eager=enforce_eager, dtype=dtype
    ) as vllm_model:
        vllm_model.generate_greedy(prompts, max_tokens, images=images)
        vllm_model.generate_greedy(prompts, max_tokens, images=images)
```
```
@@ -32,7 +32,7 @@ import threading
import time
import traceback
from pathlib import Path
from typing import Any, Optional, Tuple, TypeVar, Union
from typing import Any, TypeVar

import numpy as np
import openai
@@ -44,23 +44,20 @@ from modelscope import snapshot_download # type: ignore[import-untyped]
from PIL import Image
from requests.exceptions import RequestException
from torch import nn
from transformers import (AutoConfig, AutoModelForCausalLM, AutoTokenizer,
                          BatchEncoding, BatchFeature)
from transformers import AutoConfig, AutoModelForCausalLM, AutoTokenizer, BatchEncoding, BatchFeature
from transformers.models.auto.auto_factory import _BaseAutoModelClass
from vllm import LLM, SamplingParams
from vllm.config.model import (ConvertOption, RunnerOption,
                               _get_and_verify_dtype)
from vllm.config.model import ConvertOption, RunnerOption, _get_and_verify_dtype
from vllm.inputs import TextPrompt
from vllm.outputs import RequestOutput
from vllm.platforms import current_platform
from vllm.transformers_utils.utils import maybe_model_redirect
from vllm.utils.network_utils import get_open_port

from tests.e2e.model_utils import (TokensTextLogprobs,
                                   TokensTextLogprobsPromptLogprobs)
from tests.e2e.nightly.multi_node.scripts.multi_node_config import (
    DisaggregatedPrefillCfg, NodeInfo)
from tests.e2e.model_utils import TokensTextLogprobs, TokensTextLogprobsPromptLogprobs
from tests.e2e.nightly.multi_node.scripts.multi_node_config import DisaggregatedPrefillCfg, NodeInfo
from vllm_ascend.ascend_config import clear_ascend_config

# TODO: remove this part after the patch merged into vllm, if
# we not explicitly patch here, some of them might be effectiveless
# in pytest scenario
@@ -70,41 +67,41 @@ adapt_patch(True)
adapt_patch(False)

from vllm.distributed.parallel_state import ( # noqa E402
    destroy_distributed_environment, destroy_model_parallel)
    destroy_distributed_environment,
    destroy_model_parallel,
)

_T = TypeVar("_T", nn.Module, torch.Tensor, BatchEncoding, BatchFeature, dict)
_M = TypeVar("_M")

_PromptMultiModalInput = Union[list[_M], list[list[_M]]]
_PromptMultiModalInput = list[_M] | list[list[_M]]

PromptImageInput = _PromptMultiModalInput[Image.Image]
PromptAudioInput = _PromptMultiModalInput[Tuple[np.ndarray, int]]
PromptAudioInput = _PromptMultiModalInput[tuple[np.ndarray, int]]
PromptVideoInput = _PromptMultiModalInput[np.ndarray]
logger = logging.getLogger(__name__)

_TEST_DIR = os.path.dirname(__file__)
_LONG_PROMPTS = [os.path.join(_TEST_DIR, "prompts", "long_prompt.txt")]

DISAGG_EPD_PROXY_SCRIPT = Path(
    __file__
).parent.parent.parent / "examples" / "disaggregated_encoder" / "disagg_epd_proxy.py"
DISAGG_EPD_PROXY_SCRIPT = (
    Path(__file__).parent.parent.parent / "examples" / "disaggregated_encoder" / "disagg_epd_proxy.py"
)


def _check_npu_memory_worker(target_free_percentage: float, max_wait_seconds: float):
    import torch_npu  # type: ignore

    # We can try to clean up memory in this subprocess, though it mostly affects this process.
    # But if there are any lingering contexts in this process (unlikely for a fresh spawn), it helps.
    gc.collect()
    torch.npu.empty_cache()

    _, total_npu_memory = torch.npu.mem_get_info()
    start_time = time.time()

    while True:
        free_bytes, _ = torch.npu.mem_get_info()
        if free_bytes / total_npu_memory >= target_free_percentage:
            print(f'check_npu_memory_worker: npu free memory decreased target value.')
            print("check_npu_memory_worker: npu free memory decreased target value.")
            return  # Success

        elapsed = time.time() - start_time
@@ -113,7 +110,7 @@ def _check_npu_memory_worker(target_free_percentage: float, max_wait_seconds: fl
            print(
                f"Timeout: NPU memory free size did not reach "
                f"{target_free_percentage} of total npu memory within {max_wait_seconds} seconds.",
                file=sys.stderr
                file=sys.stderr,
            )
            sys.exit(1)  # Failure

@@ -135,21 +132,19 @@ def wait_until_npu_memory_free(target_free_percentage: float = 0.5, max_wait_sec
        target_free_percentage (float): Target free memory percentage of total.
        max_wait_seconds (float): Maximum wait time in seconds.
    """

    def decorator(func):
        @functools.wraps(func)
        def wrapper(*args, **kwargs):
            # Clean up non-NPU resources in the main process
            cleanup_dist_env_and_memory()

            # Use a spawned subprocess to check NPU memory to avoid initializing NPU in the main process
            ctx = multiprocessing.get_context("spawn")
            p = ctx.Process(
                target=_check_npu_memory_worker,
                args=(target_free_percentage, max_wait_seconds)
            )
            p = ctx.Process(target=_check_npu_memory_worker, args=(target_free_percentage, max_wait_seconds))
            p.start()
            p.join()

            if p.exitcode != 0:
                raise TimeoutError(
                    f"Timeout: NPU memory free size did not reach "
@@ -157,7 +152,9 @@ def wait_until_npu_memory_free(target_free_percentage: float = 0.5, max_wait_sec
                )

            return func(*args, **kwargs)

        return wrapper

    return decorator
```
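For context, a minimal usage sketch of the `wait_until_npu_memory_free` decorator defined above (the threshold and timeout values are illustrative assumptions, not taken from the PR; the import path matches the one used later in the diff):

```python
# Minimal usage sketch; 0.8 and 120.0 are illustrative values, not from the PR.
from tests.e2e.conftest import wait_until_npu_memory_free


@wait_until_npu_memory_free(target_free_percentage=0.8, max_wait_seconds=120.0)
def test_large_model():
    ...  # body runs only once the spawned checker confirms enough free NPU memory
```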
```
@@ -168,9 +165,10 @@ def cleanup_dist_env_and_memory(shutdown_ray: bool = False):
        torch.distributed.destroy_process_group()
    if shutdown_ray:
        import ray  # Lazy import Ray

        ray.shutdown()
    gc.collect()

    # Only clean NPU cache if NPU is already initialized/available in this process.
    # This prevents accidental initialization of NPU context in the main process,
    # which would break subsequent forks.
@@ -180,7 +178,6 @@ def cleanup_dist_env_and_memory(shutdown_ray: bool = False):


class MooncakeLauncher:

    def __init__(
        self,
        mooncake_port,
@@ -228,14 +225,12 @@ class MooncakeLauncher:
class RemoteOpenAIServer:
    DUMMY_API_KEY = "token-abc123"  # vLLM's OpenAI server does not need API key

    def _start_server(self, model: str, server_cmd: list[str],
                      env_dict: Optional[dict[str, str]]) -> None:
        """Subclasses override this method to customize server process launch
        """
    def _start_server(self, model: str, server_cmd: list[str], env_dict: dict[str, str] | None) -> None:
        """Subclasses override this method to customize server process launch"""
        env = os.environ.copy()
        # the current process might initialize npu,
        # to be safe, we should use spawn method
        env['VLLM_WORKER_MULTIPROC_METHOD'] = 'spawn'
        env["VLLM_WORKER_MULTIPROC_METHOD"] = "spawn"
        if env_dict is not None:
            env.update(env_dict)
        logger.info(f"Starting server with command: {' '.join(server_cmd)}")
@@ -247,47 +242,41 @@ class RemoteOpenAIServer:
        )

    def __init__(
            self,
            model: str,
            vllm_serve_args: Union[list[str], str],
            *,
            server_host: str = '0.0.0.0',
            server_port: int = 8080,
            env_dict: Optional[dict[str, str]] = None,
            seed: Optional[int] = None,
            auto_port: bool = True,
            nodes_info: Optional[list[NodeInfo]] = None,
            disaggregated_prefill: Optional[DisaggregatedPrefillCfg] = None,
            proxy_port: Optional[int] = None,
            max_wait_seconds: Optional[float] = None,
            override_hf_configs: Optional[dict[str, Any]] = None) -> None:
        self,
        model: str,
        vllm_serve_args: list[str] | str,
        *,
        server_host: str = "0.0.0.0",
        server_port: int = 8080,
        env_dict: dict[str, str] | None = None,
        seed: int | None = None,
        auto_port: bool = True,
        nodes_info: list[NodeInfo] | None = None,
        disaggregated_prefill: DisaggregatedPrefillCfg | None = None,
        proxy_port: int | None = None,
        max_wait_seconds: float | None = None,
        override_hf_configs: dict[str, Any] | None = None,
    ) -> None:
        if isinstance(vllm_serve_args, str):
            vllm_serve_args = shlex.split(vllm_serve_args)
        else:
            vllm_serve_args = ["vllm", "serve", model, *vllm_serve_args]
        if auto_port:
            if "-p" in vllm_serve_args or "--port" in vllm_serve_args:
                raise ValueError("You have manually specified the port "
                                 "when `auto_port=True`.")
                raise ValueError("You have manually specified the port when `auto_port=True`.")

            # No need for a port if using unix sockets
            if "--uds" not in vllm_serve_args:
                # Don't mutate the input args
                vllm_serve_args = vllm_serve_args + [
                    "--port", str(get_open_port())
                ]
                vllm_serve_args = vllm_serve_args + ["--port", str(get_open_port())]
        if seed is not None:
            if "--seed" in vllm_serve_args:
                raise ValueError("You have manually specified the seed "
                                 f"when `seed={seed}`.")
                raise ValueError(f"You have manually specified the seed when `seed={seed}`.")

            vllm_serve_args = vllm_serve_args + ["--seed", str(seed)]

        if override_hf_configs is not None:
            vllm_serve_args = vllm_serve_args + [
                "--hf-overrides",
                json.dumps(override_hf_configs)
            ]
            vllm_serve_args = vllm_serve_args + ["--hf-overrides", json.dumps(override_hf_configs)]

        self.host = str(server_host)
        self.port = int(server_port)
@@ -303,9 +292,7 @@ class RemoteOpenAIServer:
            assert proxy_port is not None, "for disaggregated_prefill, proxy port must be provided"
            self._wait_for_server_pd(timeout=max_wait_seconds)
        else:
            self._wait_for_multiple_servers(
                [(self.host, self.url_for("health"))],
                timeout=max_wait_seconds)
            self._wait_for_multiple_servers([(self.host, self.url_for("health"))], timeout=max_wait_seconds)

    def __enter__(self):
        return self
@@ -313,7 +300,7 @@ class RemoteOpenAIServer:
    def __exit__(self, exc_type, exc_value, traceback):
        self._terminate_server()

    def _poll(self) -> Optional[int]:
    def _poll(self) -> int | None:
        """Subclasses override this method to customize process polling"""
        return self.proc.poll()

@@ -345,24 +332,23 @@ class RemoteOpenAIServer:
        def url_health(ip: str, port: int) -> str:
            return f"http://{ip}:{port}/health"

        targets = [(node_info.ip, url_health(node_info.ip, self.port))
                   for node_info in self.nodes_info if not node_info.headless]
        targets = [
            (node_info.ip, url_health(node_info.ip, self.port))
            for node_info in self.nodes_info
            if not node_info.headless
        ]

        # Wait for proxy ready
        master_node = self.nodes_info[0]
        url_proxy = f"http://{master_node.ip}:{proxy_port}/healthcheck"

        # Wait for master node proxy first
        self._wait_for_multiple_servers([(master_node.ip, url_proxy)],
                                        timeout=timeout)
        self._wait_for_multiple_servers([(master_node.ip, url_proxy)], timeout=timeout)

        # Then wait for all api_server nodes
        self._wait_for_multiple_servers(targets=targets, timeout=timeout)

    def _wait_for_multiple_servers(self,
                                   targets,
                                   timeout: float,
                                   log_interval: float = 30.0):
    def _wait_for_multiple_servers(self, targets, timeout: float, log_interval: float = 30.0):
        """
        targets: List[(node_ip, url)]
        log_interval
@@ -396,9 +382,7 @@ class RemoteOpenAIServer:
                # check unexpected exit
                result = self._poll()
                if result is not None and result != 0:
                    raise RuntimeError(
                        f"Server at {node_ip} exited unexpectedly."
                    ) from None
                    raise RuntimeError(f"Server at {node_ip} exited unexpectedly.") from None

                if should_log:
                    last_log_time = now
@@ -444,35 +428,31 @@ class RemoteOpenAIServer:
    def get_async_client(self, **kwargs):
        if "timeout" not in kwargs:
            kwargs["timeout"] = 600
        return openai.AsyncOpenAI(base_url=self.url_for("v1"),
                                  api_key=self.DUMMY_API_KEY,
                                  max_retries=0,
                                  **kwargs)
        return openai.AsyncOpenAI(base_url=self.url_for("v1"), api_key=self.DUMMY_API_KEY, max_retries=0, **kwargs)
```
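A hedged sketch of how `RemoteOpenAIServer` is typically driven (the model name and serve arguments are illustrative assumptions, not taken from the PR; `get_async_client` and `url_for` are shown in the diff above):

```python
# Illustrative usage of RemoteOpenAIServer as defined above; model name and
# serve args are assumptions, not from the PR.
from tests.e2e.conftest import RemoteOpenAIServer

with RemoteOpenAIServer("Qwen/Qwen2.5-0.5B-Instruct", ["--max-model-len", "4096"]) as server:
    client = server.get_async_client()
    # issue OpenAI-compatible requests against server.url_for("v1") here
```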
```
class RemoteEPDServer(RemoteOpenAIServer):
    def _start_server(self, model: str, server_cmd: list[str],
                      env_dict: Optional[dict[str, str]]) -> None:
        """Subclasses override this method to customize server process launch
        """
    def _start_server(self, model: str, server_cmd: list[str], env_dict: dict[str, str] | None) -> None:
        """Subclasses override this method to customize server process launch"""
        raise NotImplementedError("RemoteEPDServer should use _start_server_with_prefix instead")

    def __init__(self,
                 vllm_serve_args: Union[list[str], list[list[str]]],
                 server_host: str = '0.0.0.0',
                 env_dict: Optional[dict[str, str]] = None,
                 max_wait_seconds: Optional[float] = 2800) -> None:

    def __init__(
        self,
        vllm_serve_args: list[str] | list[list[str]],
        server_host: str = "0.0.0.0",
        env_dict: dict[str, str] | None = None,
        max_wait_seconds: float | None = 2800,
    ) -> None:
        self._proc_list = []

        self.env_dict: dict[str, str] = {}
        if env_dict is not None:
            self.env_dict.update(env_dict)

        self.env_dict['VLLM_ALLOW_LONG_MAX_MODEL_LEN'] = "1"
        self.env_dict['VLLM_USE_V1'] = "1"
        self.env_dict['PYTORCH_NPU_ALLOC_CONF'] = "expandable_segments:True"
        self.env_dict['VLLM_WORKER_MULTIPROC_METHOD'] = 'spawn'
        self.env_dict["VLLM_ALLOW_LONG_MAX_MODEL_LEN"] = "1"
        self.env_dict["VLLM_USE_V1"] = "1"
        self.env_dict["PYTORCH_NPU_ALLOC_CONF"] = "expandable_segments:True"
        self.env_dict["VLLM_WORKER_MULTIPROC_METHOD"] = "spawn"

        self.vllm_serve_args_list = []
        self.health_url_list = []
@@ -484,8 +464,7 @@ class RemoteEPDServer(RemoteOpenAIServer):
            self.vllm_serve_args_list.append([str(arg) for arg in args_copy])
        else:
            self.vllm_serve_args_list = [
                [str(arg) for arg in sublist]
                for sublist in copy.deepcopy(vllm_serve_args)
                [str(arg) for arg in sublist] for sublist in copy.deepcopy(vllm_serve_args)
            ]
        else:
            raise RuntimeError("vllm_serves_args must be a list")
@@ -493,7 +472,7 @@ class RemoteEPDServer(RemoteOpenAIServer):
        serve_arg_cmd = ["vllm", "serve"]

        for i, vllm_serve_arg in enumerate(self.vllm_serve_args_list):
            self.env_dict['ASCEND_RT_VISIBLE_DEVICES'] = str(i)
            self.env_dict["ASCEND_RT_VISIBLE_DEVICES"] = str(i)
            if isinstance(vllm_serve_arg, list):
                if "--port" not in vllm_serve_arg:
                    raise ValueError("You have manually specified the port ")
@@ -514,16 +493,13 @@ class RemoteEPDServer(RemoteOpenAIServer):

            self.health_url_list.append(super().url_for("health"))
            vllm_serve_arg = [*serve_arg_cmd, *vllm_serve_arg]
            proc = self._start_server_with_prefix(vllm_serve_arg, self.env_dict,
                                                  f"[VLLM_{i}] ")
            proc = self._start_server_with_prefix(vllm_serve_arg, self.env_dict, f"[VLLM_{i}] ")
            self._proc_list.append(proc)

        timeout_value = float(max_wait_seconds) if max_wait_seconds is not None else 2800.0
        super()._wait_for_multiple_servers([(self.host, url)
                                            for url in self.health_url_list],
                                           timeout=timeout_value)
        super()._wait_for_multiple_servers([(self.host, url) for url in self.health_url_list], timeout=timeout_value)

    def _poll(self) -> Optional[int]:
    def _poll(self) -> int | None:
        return None

    def _delete_shm(self) -> None:
@@ -542,31 +518,23 @@ class RemoteEPDServer(RemoteOpenAIServer):
    def _read_output(self, pipe, prefix):
        try:
            with pipe:
                for line in iter(pipe.readline, ''):
                for line in iter(pipe.readline, ""):
                    if line:
                        print(f"{prefix}: {line}", end='')
                        print(f"{prefix}: {line}", end="")

        except Exception as e:
            print(f"error: {e}")
            traceback.print_exc()

    def _start_server_with_prefix(self, server_cmd: list[str],
                                  env_dict: Optional[dict[str, str]], log_prefix: str):
    def _start_server_with_prefix(self, server_cmd: list[str], env_dict: dict[str, str] | None, log_prefix: str):
        env = os.environ.copy()
        if env_dict is not None:
            env.update(env_dict)
        proc = subprocess.Popen(server_cmd,
                                env=env,
                                stdout=subprocess.PIPE,
                                stderr=subprocess.PIPE,
                                universal_newlines=True,
                                bufsize=1)
        stdout_thread = threading.Thread(target=self._read_output,
                                         args=(proc.stdout, log_prefix),
                                         daemon=True)
        stderr_thread = threading.Thread(target=self._read_output,
                                         args=(proc.stderr, log_prefix),
                                         daemon=True)
        proc = subprocess.Popen(
            server_cmd, env=env, stdout=subprocess.PIPE, stderr=subprocess.PIPE, universal_newlines=True, bufsize=1
        )
        stdout_thread = threading.Thread(target=self._read_output, args=(proc.stdout, log_prefix), daemon=True)
        stderr_thread = threading.Thread(target=self._read_output, args=(proc.stderr, log_prefix), daemon=True)

        stdout_thread.start()
        stderr_thread.start()
@@ -579,27 +547,21 @@ class RemoteEPDServer(RemoteOpenAIServer):
        parent = psutil.Process(proc.pid)
        children = parent.children(recursive=True)
        for child in children:
            try:
            with contextlib.suppress(psutil.NoSuchProcess):
                child.terminate()
            except psutil.NoSuchProcess:
                pass

        gone, still_alive = psutil.wait_procs(children, timeout=10)

        for child in still_alive:
            try:
            with contextlib.suppress(psutil.NoSuchProcess):
                child.kill()
            except psutil.NoSuchProcess:
                pass

        try:
            parent.terminate()
            parent.wait(timeout=10)
        except (psutil.NoSuchProcess, psutil.TimeoutExpired):
            try:
            with contextlib.suppress(psutil.NoSuchProcess):
                parent.kill()
            except psutil.NoSuchProcess:
                pass

    def __enter__(self):
        """Context manager entry point."""
@@ -611,13 +573,13 @@ class RemoteEPDServer(RemoteOpenAIServer):


class DisaggEpdProxy(RemoteEPDServer):

    def __init__(self,
                 proxy_args: Optional[Union[list[str], str]] = None,
                 env_dict: Optional[dict[str, str]] = None,
                 server_host: str = '0.0.0.0',
                 max_wait_seconds: Optional[float] = 2800) -> None:

    def __init__(
        self,
        proxy_args: list[str] | str | None = None,
        env_dict: dict[str, str] | None = None,
        server_host: str = "0.0.0.0",
        max_wait_seconds: float | None = 2800,
    ) -> None:
        if proxy_args is None:
            proxy_args_list: list[str] = []
        elif isinstance(proxy_args, str):
@@ -648,8 +610,7 @@ class DisaggEpdProxy(RemoteEPDServer):
        self.port = int(port_str)

        timeout_value = float(max_wait_seconds) if max_wait_seconds is not None else 2800.0
        super()._wait_for_multiple_servers(
            [(self.host, super().url_for("health"))], timeout=timeout_value)
        super()._wait_for_multiple_servers([(self.host, super().url_for("health"))], timeout=timeout_value)

    def __enter__(self):
        """Context manager entry point."""
@@ -661,23 +622,22 @@ class DisaggEpdProxy(RemoteEPDServer):

```
```
class VllmRunner:

    def __init__(
        self,
        model_name: str,
        runner: RunnerOption = "auto",
        convert: ConvertOption = "auto",
        tokenizer_name: Optional[str] = None,
        tokenizer_name: str | None = None,
        tokenizer_mode: str = "auto",
        max_model_len: Optional[int] = 1024,
        max_model_len: int | None = 1024,
        dtype: str = "auto",
        disable_log_stats: bool = True,
        tensor_parallel_size: int = 1,
        block_size: int = 16,
        enable_chunked_prefill: bool = True,
        swap_space: int = 4,
        enforce_eager: Optional[bool] = False,
        quantization: Optional[str] = None,
        enforce_eager: bool | None = False,
        quantization: str | None = None,
        **kwargs,
    ) -> None:
        self.model = LLM(
@@ -701,17 +661,13 @@ class VllmRunner:

    def get_inputs(
        self,
        prompts: Union[list[str], list[torch.Tensor], list[int]],
        images: Optional[PromptImageInput] = None,
        videos: Optional[PromptVideoInput] = None,
        audios: Optional[PromptAudioInput] = None,
        prompts: list[str] | list[torch.Tensor] | list[int],
        images: PromptImageInput | None = None,
        videos: PromptVideoInput | None = None,
        audios: PromptAudioInput | None = None,
    ) -> list[TextPrompt]:

        if any(x is not None and len(x) != len(prompts)
               for x in [images, videos, audios]):
            raise ValueError(
                "All non-None multimodal inputs must have the same length as "
                "prompts")
        if any(x is not None and len(x) != len(prompts) for x in [images, videos, audios]):
            raise ValueError("All non-None multimodal inputs must have the same length as prompts")

        inputs = []
        for i, prompt in enumerate(prompts):
@@ -719,13 +675,11 @@ class VllmRunner:
            if images is not None and (image := images[i]) is not None:
                multi_modal_data["image"] = image
            if videos is not None and (video := videos[i]) is not None:
                multi_modal_data["video"] = video # type: ignore
                multi_modal_data["video"] = video  # type: ignore
            if audios is not None and (audio := audios[i]) is not None:
                multi_modal_data["audio"] = audio # type: ignore
                multi_modal_data["audio"] = audio  # type: ignore

            text_prompt_kwargs: dict[str, Any] = {
                "multi_modal_data": multi_modal_data or None
            }
            text_prompt_kwargs: dict[str, Any] = {"multi_modal_data": multi_modal_data or None}
            if isinstance(prompt, str):
                text_prompt_kwargs["prompt"] = prompt
            elif isinstance(prompt, list):
@@ -739,21 +693,16 @@ class VllmRunner:

    def generate(
        self,
        prompts: Union[list[str], list[torch.Tensor]],
        prompts: list[str] | list[torch.Tensor],
        sampling_params: SamplingParams,
        images: Optional[PromptImageInput] = None,
        videos: Optional[PromptVideoInput] = None,
        audios: Optional[PromptAudioInput] = None,
        images: PromptImageInput | None = None,
        videos: PromptVideoInput | None = None,
        audios: PromptAudioInput | None = None,
        **kwargs: Any,
    ) -> list[tuple[list[list[int]], list[str]]]:
        inputs = self.get_inputs(prompts,
                                 images=images,
                                 videos=videos,
                                 audios=audios)
        inputs = self.get_inputs(prompts, images=images, videos=videos, audios=audios)

        req_outputs = self.model.generate(inputs,
                                          sampling_params=sampling_params,
                                          **kwargs)
        req_outputs = self.model.generate(inputs, sampling_params=sampling_params, **kwargs)

        outputs: list[tuple[list[list[int]], list[str]]] = []
        for req_output in req_outputs:
@@ -780,99 +729,83 @@ class VllmRunner:
            output_str = sample.text
            output_ids = list(sample.token_ids)
            output_logprobs = sample.logprobs
            outputs.append((output_ids, output_str, output_logprobs,
                            req_output.prompt_logprobs))
            outputs.append((output_ids, output_str, output_logprobs, req_output.prompt_logprobs))
        return outputs

    def generate_w_logprobs(
        self,
        prompts: list[str],
        sampling_params: SamplingParams,
        images: Optional[PromptImageInput] = None,
        audios: Optional[PromptAudioInput] = None,
        videos: Optional[PromptVideoInput] = None,
        images: PromptImageInput | None = None,
        audios: PromptAudioInput | None = None,
        videos: PromptVideoInput | None = None,
        **kwargs: Any,
    ) -> Union[list[TokensTextLogprobs],
               list[TokensTextLogprobsPromptLogprobs]]:
        inputs = self.get_inputs(prompts,
                                 images=images,
                                 videos=videos,
                                 audios=audios)
    ) -> list[TokensTextLogprobs] | list[TokensTextLogprobsPromptLogprobs]:
        inputs = self.get_inputs(prompts, images=images, videos=videos, audios=audios)

        req_outputs = self.model.generate(inputs,
                                          sampling_params=sampling_params,
                                          **kwargs)
        req_outputs = self.model.generate(inputs, sampling_params=sampling_params, **kwargs)

        toks_str_logsprobs_prompt_logprobs = (
            self._final_steps_generate_w_logprobs(req_outputs))
        toks_str_logsprobs_prompt_logprobs = self._final_steps_generate_w_logprobs(req_outputs)
        # Omit prompt logprobs if not required by sampling params
        return ([x[0:-1] for x in toks_str_logsprobs_prompt_logprobs]
                if sampling_params.prompt_logprobs is None else
                toks_str_logsprobs_prompt_logprobs)
        return (
            [x[0:-1] for x in toks_str_logsprobs_prompt_logprobs]
            if sampling_params.prompt_logprobs is None
            else toks_str_logsprobs_prompt_logprobs
        )

    def generate_greedy(
        self,
        prompts: Union[list[str], list[torch.Tensor]],
        prompts: list[str] | list[torch.Tensor],
        max_tokens: int,
        images: Optional[PromptImageInput] = None,
        videos: Optional[PromptVideoInput] = None,
        audios: Optional[PromptAudioInput] = None,
        images: PromptImageInput | None = None,
        videos: PromptVideoInput | None = None,
        audios: PromptAudioInput | None = None,
        **kwargs: Any,
    ) -> list[tuple[list[int], str]]:
        greedy_params = SamplingParams(temperature=0.0, max_tokens=max_tokens)
        outputs = self.generate(prompts,
                                greedy_params,
                                images=images,
                                videos=videos,
                                audios=audios,
                                **kwargs)
        return [(output_ids[0], output_str[0])
                for output_ids, output_str in outputs]
        outputs = self.generate(prompts, greedy_params, images=images, videos=videos, audios=audios, **kwargs)
        return [(output_ids[0], output_str[0]) for output_ids, output_str in outputs]

    def generate_greedy_logprobs(
        self,
        prompts: list[str],
        max_tokens: int,
        num_logprobs: Optional[int],
        num_prompt_logprobs: Optional[int] = None,
        images: Optional[PromptImageInput] = None,
        audios: Optional[PromptAudioInput] = None,
        videos: Optional[PromptVideoInput] = None,
        stop_token_ids: Optional[list[int]] = None,
        stop: Optional[list[str]] = None,
        num_logprobs: int | None,
        num_prompt_logprobs: int | None = None,
        images: PromptImageInput | None = None,
        audios: PromptAudioInput | None = None,
        videos: PromptVideoInput | None = None,
        stop_token_ids: list[int] | None = None,
        stop: list[str] | None = None,
        **kwargs: Any,
    ) -> Union[list[TokensTextLogprobs],
               list[TokensTextLogprobsPromptLogprobs]]:
    ) -> list[TokensTextLogprobs] | list[TokensTextLogprobsPromptLogprobs]:
        greedy_logprobs_params = SamplingParams(
            temperature=0.0,
            max_tokens=max_tokens,
            logprobs=num_logprobs,
            prompt_logprobs=num_prompt_logprobs,
            stop_token_ids=stop_token_ids,
            stop=stop)
            stop=stop,
        )

        return self.generate_w_logprobs(prompts,
                                        greedy_logprobs_params,
                                        images=images,
                                        audios=audios,
                                        videos=videos,
                                        **kwargs)
        return self.generate_w_logprobs(
            prompts, greedy_logprobs_params, images=images, audios=audios, videos=videos, **kwargs
        )

    def classify(self, prompts: list[str]) -> list[list[float]]:
        req_outputs = self.model.classify(prompts)
        return [req_output.outputs.probs for req_output in req_outputs]

    def embed(self,
              prompts: list[str],
              images: Optional[PromptImageInput] = None,
              videos: Optional[PromptVideoInput] = None,
              audios: Optional[PromptAudioInput] = None,
              *args,
              **kwargs) -> list[list[float]]:
        inputs = self.get_inputs(prompts,
                                 images=images,
                                 videos=videos,
                                 audios=audios)
    def embed(
        self,
        prompts: list[str],
        images: PromptImageInput | None = None,
        videos: PromptVideoInput | None = None,
        audios: PromptAudioInput | None = None,
        *args,
        **kwargs,
    ) -> list[list[float]]:
        inputs = self.get_inputs(prompts, images=images, videos=videos, audios=audios)

        req_outputs = self.model.embed(inputs, *args, **kwargs)
        return [req_output.outputs.embedding for req_output in req_outputs]
@@ -887,8 +820,8 @@ class VllmRunner:

    def score(
        self,
        text_1: Union[str, list[str]],
        text_2: Union[str, list[str]],
        text_1: str | list[str],
        text_2: str | list[str],
        *args,
        **kwargs,
    ) -> list[float]:
```
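A minimal sketch of `VllmRunner` as reformatted above (the model name and prompt are illustrative assumptions, not from the PR; the context-manager pattern mirrors its use in `run_vl_model_test` earlier in the diff):

```python
# Minimal usage sketch of VllmRunner; the model name and prompt are
# illustrative assumptions, not from the PR.
from tests.e2e.conftest import VllmRunner

with VllmRunner("Qwen/Qwen2.5-0.5B-Instruct", tensor_parallel_size=1, enforce_eager=True) as runner:
    # generate_greedy returns a list of (token_ids, text) pairs, one per prompt
    for token_ids, text in runner.generate_greedy(["Hello, world"], max_tokens=8):
        print(text)
```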
```
@@ -905,14 +838,11 @@


class HfRunner:

    def get_default_device(self):
        return "cpu" if current_platform.is_cpu() else current_platform.device_type

        return ("cpu"
                if current_platform.is_cpu() else current_platform.device_type)

    def wrap_device(self, x: _T, device: Optional[str] = None) -> _T:
        if x is None or isinstance(x, (bool, )):
    def wrap_device(self, x: _T, device: str | None = None) -> _T:
        if x is None or isinstance(x, (bool,)):
            return x

        if device is None:
@@ -931,7 +861,7 @@ class HfRunner:
        model_name: str,
        dtype: str = "auto",
        *,
        model_kwargs: Optional[dict[str, Any]] = None,
        model_kwargs: dict[str, Any] | None = None,
        trust_remote_code: bool = True,
        is_sentence_transformer: bool = False,
        is_cross_encoder: bool = False,
@@ -984,14 +914,15 @@ class HfRunner:
        )

        # in case some unquantized custom models are not in same dtype
        if (getattr(model, "quantization_method", None) is None
                and any(p.dtype != self.dtype
                        for p in model.parameters())):
        if getattr(model, "quantization_method", None) is None and any(
            p.dtype != self.dtype for p in model.parameters()
        ):
            model = model.to(dtype=self.dtype)

        if (getattr(model, "quantization_method", None) != "bitsandbytes"
                and len({p.device
                         for p in model.parameters()}) < 2):
        if (
            getattr(model, "quantization_method", None) != "bitsandbytes"
            and len({p.device for p in model.parameters()}) < 2
        ):
            model = model.to(device=self.device)

        self.model = model
@@ -1006,6 +937,7 @@ class HfRunner:
        # don't put this import at the top level
        # it will call torch.cuda.device_count()
        from transformers import AutoProcessor  # noqa: F401

        self.processor = AutoProcessor.from_pretrained(
            model_name,
            torch_dtype=torch_dtype,
@@ -1017,10 +949,10 @@ class HfRunner:
    def get_inputs(
        self,
        prompts: list[str],
        images: Optional[PromptImageInput] = None,
        videos: Optional[PromptVideoInput] = None,
        audios: Optional[PromptAudioInput] = None,
    ) -> list[Union[BatchFeature, BatchEncoding]]:
        images: PromptImageInput | None = None,
        videos: PromptVideoInput | None = None,
        audios: PromptAudioInput | None = None,
    ) -> list[BatchFeature | BatchEncoding]:
        if images is not None:
            assert len(prompts) == len(images)

@@ -1030,7 +962,7 @@ class HfRunner:
        if audios is not None:
            assert len(prompts) == len(audios)

        all_inputs: list[Union[BatchFeature, BatchEncoding]] = []
        all_inputs: list[BatchFeature | BatchEncoding] = []
        for i, prompt in enumerate(prompts):
            processor_kwargs: dict[str, Any] = {
                "text": prompt,
@@ -1076,16 +1008,11 @@ class HfRunner:

        return outputs

    def encode(self, prompts: list[str], *args,
               **kwargs) -> list[list[torch.Tensor]]:
    def encode(self, prompts: list[str], *args, **kwargs) -> list[list[torch.Tensor]]:
        return self.model.encode(prompts, *args, **kwargs)

    def predict(self, prompts: list[list[str]], *args,
                **kwargs) -> torch.Tensor:
        return self.model.predict(prompts,
                                  *args,
                                  convert_to_tensor=True,
                                  **kwargs)
    def predict(self, prompts: list[list[str]], *args, **kwargs) -> torch.Tensor:
        return self.model.predict(prompts, *args, convert_to_tensor=True, **kwargs)

    def __enter__(self):
        return self
@@ -1103,22 +1030,25 @@ def ilama_lora_files():
@pytest.fixture(scope="session")
def llama32_lora_files():
    from huggingface_hub import snapshot_download as hf_snapshot_download

    return hf_snapshot_download(repo_id="jeeejeee/llama32-3b-text2sql-spider", local_files_only=True)


def qwen_prompt(questions: list[str]) -> list[str]:
    placeholder = "<|image_pad|>"
    return [("<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n"
             f"<|im_start|>user\n<|vision_start|>{placeholder}<|vision_end|>"
             f"{q}<|im_end|>\n<|im_start|>assistant\n") for q in questions]
    return [
        (
            "<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n"
            f"<|im_start|>user\n<|vision_start|>{placeholder}<|vision_end|>"
            f"{q}<|im_end|>\n<|im_start|>assistant\n"
        )
        for q in questions
    ]


def hunyuan_prompt(questions: list[str]) -> list[str]:
    placeholder = "<|hy_place▁holder▁no▁100|><|hy_place▁holder▁no▁102|><|hy_place▁holder▁no▁101|>"  # noqa: E501
    return [
        f"<|hy_begin▁of▁sentence|>{placeholder}{question}<|hy_User|>"
        for question in questions
    ]
    return [f"<|hy_begin▁of▁sentence|>{placeholder}{question}<|hy_User|>" for question in questions]


PROMPT_CONFIGS = {
```
```
@@ -17,11 +17,11 @@
# Adapted from vllm-project/vllm/blob/main/tests/models/utils.py
#

from typing import Dict, List, Optional, Sequence, Tuple, Union
from collections.abc import Sequence

from vllm.logprobs import PromptLogprobs, SampleLogprobs

TokensText = Tuple[List[int], str]
TokensText = tuple[list[int], str]


def check_outputs_equal(
@@ -37,18 +37,18 @@ def check_outputs_equal(
    """
    assert len(outputs_0_lst) == len(outputs_1_lst)

    for prompt_idx, (outputs_0,
                     outputs_1) in enumerate(zip(outputs_0_lst,
                                                 outputs_1_lst)):
    for prompt_idx, (outputs_0, outputs_1) in enumerate(zip(outputs_0_lst, outputs_1_lst)):
        output_ids_0, output_str_0 = outputs_0
        output_ids_1, output_str_1 = outputs_1

        # The text and token outputs should exactly match
        fail_msg = (f"Test{prompt_idx}:"
                    f"\n{name_0}:\t{output_str_0!r}"
                    f"\n{name_1}:\t{output_str_1!r}"
                    f"\n{name_0}:\t{output_ids_0!r}"
                    f"\n{name_1}:\t{output_ids_1!r}")
        fail_msg = (
            f"Test{prompt_idx}:"
            f"\n{name_0}:\t{output_str_0!r}"
            f"\n{name_1}:\t{output_str_1!r}"
            f"\n{name_0}:\t{output_ids_0!r}"
            f"\n{name_1}:\t{output_ids_1!r}"
        )

        assert output_str_0 == output_str_1, fail_msg
        assert output_ids_0 == output_ids_1, fail_msg
@@ -60,9 +60,7 @@ def check_outputs_equal(
# * List of top sample logprobs for each sampled token
#
# Assumes prompt logprobs were not requested.
TokensTextLogprobs = Tuple[List[int], str, Optional[Union[List[Dict[int,
                                                                    float]],
                                                          SampleLogprobs]]]
TokensTextLogprobs = tuple[list[int], str, list[dict[int, float]] | SampleLogprobs | None]

# Representation of generated sequence as a tuple of
# * Token ID list
@@ -71,6 +69,9 @@ TokensTextLogprobs = Tuple[List[int], str, Optional[Union[List[Dict[int,
# * Optional list of top prompt logprobs for each prompt token
#
# Allows prompt logprobs to be requested.
TokensTextLogprobsPromptLogprobs = Tuple[
    List[int], str, Optional[Union[List[Dict[int, float]], SampleLogprobs]],
    Optional[Union[List[Optional[Dict[int, float]]], PromptLogprobs]]]
TokensTextLogprobsPromptLogprobs = tuple[
    list[int],
    str,
    list[dict[int, float]] | SampleLogprobs | None,
    list[dict[int, float] | None] | PromptLogprobs | None,
]
```
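To make the reworked alias shapes concrete, a small illustration with hypothetical values:

```python
# Hypothetical values illustrating the TokensTextLogprobs shape defined above.
sample: TokensTextLogprobs = (
    [101, 7592],  # token IDs
    "Hello",      # generated text
    None,         # per-token top logprobs (None when not requested)
)
```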
```
@@ -55,16 +55,12 @@ def report_dir(pytestconfig):

def pytest_generate_tests(metafunc):
    if "config_filename" in metafunc.fixturenames:

        if metafunc.config.getoption("--config-list-file"):
            rel_path = metafunc.config.getoption("--config-list-file")
            config_list_file = Path(rel_path).resolve()
            config_dir = config_list_file.parent
            with open(config_list_file, encoding="utf-8") as f:
                configs = [
                    config_dir / line.strip() for line in f
                    if line.strip() and not line.startswith("#")
                ]
                configs = [config_dir / line.strip() for line in f if line.strip() and not line.startswith("#")]
            metafunc.parametrize("config_filename", configs)
        else:
            single_config = metafunc.config.getoption("--config")

@@ -24,16 +24,15 @@ class EnvConfig:

@pytest.fixture
def env_config() -> EnvConfig:
    return EnvConfig(vllm_version=os.getenv('VLLM_VERSION', 'unknown'),
                     vllm_commit=os.getenv('VLLM_COMMIT', 'unknown'),
                     vllm_ascend_version=os.getenv('VLLM_ASCEND_VERSION',
                                                   'unknown'),
                     vllm_ascend_commit=os.getenv('VLLM_ASCEND_COMMIT',
                                                  'unknown'),
                     cann_version=os.getenv('CANN_VERSION', 'unknown'),
                     torch_version=os.getenv('TORCH_VERSION', 'unknown'),
                     torch_npu_version=os.getenv('TORCH_NPU_VERSION',
                                                 'unknown'))
    return EnvConfig(
        vllm_version=os.getenv("VLLM_VERSION", "unknown"),
        vllm_commit=os.getenv("VLLM_COMMIT", "unknown"),
        vllm_ascend_version=os.getenv("VLLM_ASCEND_VERSION", "unknown"),
        vllm_ascend_commit=os.getenv("VLLM_ASCEND_COMMIT", "unknown"),
        cann_version=os.getenv("CANN_VERSION", "unknown"),
        torch_version=os.getenv("TORCH_VERSION", "unknown"),
        torch_npu_version=os.getenv("TORCH_NPU_VERSION", "unknown"),
    )


def build_model_args(eval_config, tp_size):
@@ -48,9 +47,13 @@ def build_model_args(eval_config, tp_size):
        "max_model_len": max_model_len,
    }
    for s in [
        "max_images", "gpu_memory_utilization", "enable_expert_parallel",
        "tensor_parallel_size", "enforce_eager", "enable_thinking",
        "quantization"
        "max_images",
        "gpu_memory_utilization",
        "enable_expert_parallel",
        "tensor_parallel_size",
        "enforce_eager",
        "enable_thinking",
        "quantization",
    ]:
        val = eval_config.get(s, None)
        if val is not None:
@@ -68,7 +71,7 @@ def generate_report(tp_size, eval_config, report_data, report_dir, env_config):
    model_args = build_model_args(eval_config, tp_size)

    parallel_mode = f"TP{model_args.get('tensor_parallel_size', 1)}"
    if model_args.get('enable_expert_parallel', False):
    if model_args.get("enable_expert_parallel", False):
        parallel_mode += " + EP"

    execution_model = f"{'Eager' if model_args.get('enforce_eager', False) else 'ACLGraph'}"
@@ -93,17 +96,16 @@ def generate_report(tp_size, eval_config, report_data, report_dir, env_config):
        num_fewshot=eval_config.get("num_fewshot", "N/A"),
        rows=report_data["rows"],
        parallel_mode=parallel_mode,
        execution_model=execution_model)
        execution_model=execution_model,
    )

    report_output = os.path.join(
        report_dir, f"{os.path.basename(eval_config['model_name'])}.md")
    report_output = os.path.join(report_dir, f"{os.path.basename(eval_config['model_name'])}.md")
    os.makedirs(os.path.dirname(report_output), exist_ok=True)
    with open(report_output, 'w', encoding='utf-8') as f:
    with open(report_output, "w", encoding="utf-8") as f:
        f.write(report_content)


def test_lm_eval_correctness_param(config_filename, tp_size, report_dir,
                                   env_config):
def test_lm_eval_correctness_param(config_filename, tp_size, report_dir, env_config):
    eval_config = yaml.safe_load(config_filename.read_text(encoding="utf-8"))
    model_args = build_model_args(eval_config, tp_size)
    success = True
@@ -135,25 +137,26 @@ def test_lm_eval_correctness_param(config_filename, tp_size, report_dir,
            metric_name = metric["name"]
            ground_truth = metric["value"]
            measured_value = round(task_result[metric_name], 4)
            task_success = bool(
                np.isclose(ground_truth, measured_value, rtol=RTOL))
            task_success = bool(np.isclose(ground_truth, measured_value, rtol=RTOL))
            success = success and task_success

            print(f"{task_name} | {metric_name}: "
                  f"ground_truth={ground_truth} | measured={measured_value} | "
                  f"success={'✅' if task_success else '❌'}")
            print(
                f"{task_name} | {metric_name}: "
                f"ground_truth={ground_truth} | measured={measured_value} | "
                f"success={'✅' if task_success else '❌'}"
            )

            report_data["rows"].append({
                "task":
                task_name,
                "metric":
                metric_name,
                "value":
                f"✅{measured_value}" if success else f"❌{measured_value}",
                "stderr":
                task_result[
                    metric_name.replace(',', '_stderr,') if metric_name ==
                    "acc,none" else metric_name.replace(',', '_stderr,')]
            })
            report_data["rows"].append(
                {
                    "task": task_name,
                    "metric": metric_name,
                    "value": f"✅{measured_value}" if success else f"❌{measured_value}",
                    "stderr": task_result[
                        metric_name.replace(",", "_stderr,")
                        if metric_name == "acc,none"
                        else metric_name.replace(",", "_stderr,")
                    ],
                }
            )
    generate_report(tp_size, eval_config, report_data, report_dir, env_config)
    assert success
```
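For reference, the stderr-key derivation above maps a metric name to its lm-eval standard-error key; note that both branches of the conditional apply the same replacement, so the key is always `metric_name.replace(",", "_stderr,")`:

```python
# Illustration of the stderr-key derivation above (metric name is hypothetical).
metric_name = "acc,none"
stderr_key = metric_name.replace(",", "_stderr,")
assert stderr_key == "acc_stderr,none"
```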
```
@@ -18,15 +18,12 @@

from __future__ import annotations

import math
import os
import random
from typing import Any, Union
from unittest.mock import patch

import pytest
from transformers import AutoTokenizer
from vllm import LLM, SamplingParams
from vllm import SamplingParams
from vllm.config import CompilationConfig
from vllm.v1.metrics.reader import Counter, Vector

@@ -101,7 +98,8 @@ def test_eagle3_sp_acceptance(
            [prompt],
            tokenize=False,
            add_generation_prompt=True,
        ) for prompt in prompts
        )
        for prompt in prompts
    ]

    speculative_config = {
@@ -112,21 +110,20 @@ def test_eagle3_sp_acceptance(
        "model": spec_model_name,
    }

    compilation_config = CompilationConfig(cudagraph_mode="FULL_DECODE_ONLY",
                                           cudagraph_capture_sizes=[12])
    compilation_config = CompilationConfig(cudagraph_mode="FULL_DECODE_ONLY", cudagraph_capture_sizes=[12])

    with VllmRunner(
            main_model_name,
            enforce_eager=True,
            max_model_len=8192,
            disable_log_stats=False,
            tensor_parallel_size=2,
            max_num_seqs=256,
            distributed_executor_backend="mp",
            gpu_memory_utilization=0.7,
            speculative_config=speculative_config,
            compilation_config=compilation_config,
            async_scheduling=async_scheduling,
        main_model_name,
        enforce_eager=True,
        max_model_len=8192,
        disable_log_stats=False,
        tensor_parallel_size=2,
        max_num_seqs=256,
        distributed_executor_backend="mp",
        gpu_memory_utilization=0.7,
        speculative_config=speculative_config,
        compilation_config=compilation_config,
        async_scheduling=async_scheduling,
    ) as llm:
        _ = llm.generate(prompts, sampling_params)
        metrics = llm.model.get_metrics()
@@ -142,10 +139,7 @@ def test_eagle3_sp_acceptance(
        for pos in range(len(metric.values)):
            num_accepted_tokens_per_pos[pos] += metric.values[pos]

    acceptance_per_pos = [
        num_accepted_tokens / num_drafts
        for num_accepted_tokens in num_accepted_tokens_per_pos
    ]
    acceptance_per_pos = [num_accepted_tokens / num_drafts for num_accepted_tokens in num_accepted_tokens_per_pos]
    golden = BASELINES_SP[method]

    match = all(abs(a - b) < 0.06 for a, b in zip(acceptance_per_pos, golden))
```
@@ -25,8 +25,8 @@ import pytest
|
||||
import torch
|
||||
from vllm.utils.network_utils import get_open_port
|
||||
|
||||
from vllm_ascend.utils import AscendDeviceType, get_ascend_device_type
|
||||
from tests.e2e.conftest import wait_until_npu_memory_free
|
||||
from vllm_ascend.utils import AscendDeviceType, get_ascend_device_type
|
||||
|
||||
MODELS = [
|
||||
# Offline data parallel mode will be not supported/useful for dense models
|
||||
@@ -58,8 +58,7 @@ def _install_spies(counters: dict[str, Any]) -> contextlib.ExitStack:
|
||||
]
|
||||
|
||||
for cls, method, counter in hooks:
|
||||
stack.enter_context(
|
||||
patch.object(cls, method, make_spy(cls, method, counter)))
|
||||
stack.enter_context(patch.object(cls, method, make_spy(cls, method, counter)))
|
||||
|
||||
return stack
|
||||
|
||||
@@ -75,18 +74,19 @@ def _run_worker_process(
|
||||
max_tokens: int,
|
||||
):
|
||||
"""Main entry point for the worker process."""
|
||||
os.environ.update({
|
||||
"VLLM_DP_RANK": str(rank),
|
||||
"VLLM_DP_RANK_LOCAL": str(local_rank),
|
||||
"VLLM_DP_SIZE": str(world_size),
|
||||
"VLLM_DP_MASTER_IP": master_ip,
|
||||
"VLLM_DP_MASTER_PORT": str(master_port),
|
||||
})
|
||||
os.environ.update(
|
||||
{
|
||||
"VLLM_DP_RANK": str(rank),
|
||||
"VLLM_DP_RANK_LOCAL": str(local_rank),
|
||||
"VLLM_DP_SIZE": str(world_size),
|
||||
"VLLM_DP_MASTER_IP": master_ip,
|
||||
"VLLM_DP_MASTER_PORT": str(master_port),
|
||||
}
|
||||
)
|
||||
|
||||
# Import vLLM only after environment setup
|
||||
from vllm import LLM, SamplingParams
|
||||
from vllm.distributed.parallel_state import (
|
||||
destroy_distributed_environment, destroy_model_parallel)
|
||||
from vllm.distributed.parallel_state import destroy_distributed_environment, destroy_model_parallel
|
||||
|
||||
# Apply hooks and run inference
|
||||
with _install_spies(counters):
|
||||
@@ -100,23 +100,20 @@ def _run_worker_process(
|
||||
# Simple data sharding
|
||||
chunk_size = len(prompts) // world_size
|
||||
start_idx = rank * chunk_size
|
||||
end_idx = start_idx + chunk_size if rank < world_size - 1 else len(
|
||||
prompts)
|
||||
end_idx = start_idx + chunk_size if rank < world_size - 1 else len(prompts)
|
||||
local_prompts = prompts[start_idx:end_idx]
|
||||
|
||||
llm = LLM(
|
||||
model=model_path,
|
||||
quantization="ascend" if "W8A8" in model_path else None,
|
||||
enable_expert_parallel=True if "DeepSeek" in model_path else False,
|
||||
enable_expert_parallel="DeepSeek" in model_path,
|
||||
trust_remote_code=True,
|
||||
)
|
||||
|
||||
# Expose model config to the main test process
|
||||
counters["hidden_layers"].value = (
|
||||
llm.llm_engine.model_config.hf_text_config.num_hidden_layers)
|
||||
counters["hidden_layers"].value = llm.llm_engine.model_config.hf_text_config.num_hidden_layers
|
||||
|
||||
llm.generate(local_prompts,
|
||||
SamplingParams(max_tokens=max_tokens, temperature=0.0))
|
||||
llm.generate(local_prompts, SamplingParams(max_tokens=max_tokens, temperature=0.0))
|
||||
|
||||
# Explicit cleanup is mandatory in multi-process vLLM tests
|
||||
del llm
|
||||
@@ -162,8 +159,7 @@ def test_models_aclgraph_capture_replay_metrics_dp2(
|
||||
for rank in range(dp_size):
|
||||
p = multiprocessing.Process(
|
||||
target=_run_worker_process,
|
||||
args=(rank, rank, dp_size, "127.0.0.1", port, counters, model,
|
||||
max_tokens),
|
||||
args=(rank, rank, dp_size, "127.0.0.1", port, counters, model, max_tokens),
|
||||
)
|
||||
p.start()
|
||||
workers.append(p)
|
||||
@@ -175,8 +171,7 @@ def test_models_aclgraph_capture_replay_metrics_dp2(
|
||||
for k in workers:
|
||||
if k.is_alive():
|
||||
k.kill()
|
||||
raise RuntimeError(
|
||||
f"Worker {p.pid} failed with exit code {p.exitcode}")
|
||||
raise RuntimeError(f"Worker {p.pid} failed with exit code {p.exitcode}")
|
||||
|
||||
actual_capture = counters["capture"].value
|
||||
actual_replay = counters["replay"].value
@@ -185,18 +180,16 @@ def test_models_aclgraph_capture_replay_metrics_dp2(
     num_layers = counters["hidden_layers"].value

     num_acl_graphs = num_layers + 1
-    num_comm_groups = sum(1 for s in [dp_size, 1]
-                          if s > 1)  # dp_size=2, tp_size=1
+    num_comm_groups = sum(1 for s in [dp_size, 1] if s > 1)  # dp_size=2, tp_size=1

     # Metric 1: Graph Capture (ACL Graph Construction)
     # Ref: vllm_ascend.utils.update_aclgraph_sizes
-    max_batch_sizes = math.floor((1800 - num_comm_groups * 40) /
-                                 num_acl_graphs / (1 + num_comm_groups * 2))
+    max_batch_sizes = math.floor((1800 - num_comm_groups * 40) / num_acl_graphs / (1 + num_comm_groups * 2))

     expected_capture = max_batch_sizes * num_acl_graphs * dp_size
-    assert (
-        actual_capture == expected_capture
-    ), f"Capture count mismatch. Expected: {expected_capture}, Got: {actual_capture}"
+    assert actual_capture == expected_capture, (
+        f"Capture count mismatch. Expected: {expected_capture}, Got: {actual_capture}"
+    )

     # Metric 2: Model Execution (NPUModelRunner.execute_model)
     # vLLM Step Breakdown:
@@ -207,9 +200,9 @@ def test_models_aclgraph_capture_replay_metrics_dp2(
     # vllm default enables Async scheduler, this will take 1 more step
     expected_exec_model = (total_steps + 1 + 1) * dp_size

-    assert (
-        num_execute_model == expected_exec_model
-    ), f"Model execution count mismatch. Expected: {expected_exec_model}, Got: {num_execute_model}"
+    assert num_execute_model == expected_exec_model, (
+        f"Model execution count mismatch. Expected: {expected_exec_model}, Got: {num_execute_model}"
+    )

     # Metric 3: Dummy Runs (Warmup & Alignment)
     # vLLM synchronizes globally every 32 steps.
@@ -228,14 +221,12 @@ def test_models_aclgraph_capture_replay_metrics_dp2(

     expected_dummy_run = (warmup_runs + padding_runs) * dp_size

-    assert (
-        num_dummy_run == expected_dummy_run
-    ), f"Dummy run count mismatch. Expected: {expected_dummy_run}, Got: {num_dummy_run}"
+    assert num_dummy_run == expected_dummy_run, (
+        f"Dummy run count mismatch. Expected: {expected_dummy_run}, Got: {num_dummy_run}"
+    )

     # Metric 4: Graph Replay (Inference Execution)
     # Replays happen for every aligned step across all graphs.
     expected_replay = num_acl_graphs * aligned_steps * dp_size

-    assert (
-        actual_replay == expected_replay
-    ), f"Replay count mismatch. Expected: {expected_replay}, Got: {actual_replay}"
+    assert actual_replay == expected_replay, f"Replay count mismatch. Expected: {expected_replay}, Got: {actual_replay}"
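To make the four metric formulas above concrete, here is a self-contained rerun of the capture arithmetic. The layer count of 28 is a hypothetical stand-in for counters["hidden_layers"].value; only dp_size=2 and tp_size=1 are given by the test itself.

import math

num_layers = 28  # hypothetical; the test reads this from the worker's model config
dp_size = 2
num_acl_graphs = num_layers + 1
num_comm_groups = sum(1 for s in [dp_size, 1] if s > 1)  # only the DP group exceeds 1
max_batch_sizes = math.floor((1800 - num_comm_groups * 40) / num_acl_graphs / (1 + num_comm_groups * 2))
# (1800 - 40) / 29 / 3 = 20.22... -> 20 capture sizes per graph
expected_capture = max_batch_sizes * num_acl_graphs * dp_size  # 20 * 29 * 2 = 1160
print(max_batch_sizes, expected_capture)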
@@ -64,12 +64,8 @@ def test_qwen3_inference_dp2(model, max_tokens):
         cmd.append("ascend")

     print(f"Running subprocess: {' '.join(cmd)}")
-    proc = subprocess.run(cmd,
-                          env=env,
-                          stdout=subprocess.PIPE,
-                          stderr=subprocess.STDOUT,
-                          timeout=600)
-    output = proc.stdout.decode(errors='ignore')
+    proc = subprocess.run(cmd, env=env, stdout=subprocess.PIPE, stderr=subprocess.STDOUT, timeout=600)
+    output = proc.stdout.decode(errors="ignore")

     print(output)
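The subprocess pattern this hunk (and several below) collapses onto one line merges stderr into stdout so the child's logs stay in order. A minimal sketch with a trivial child command standing in for the real test script:

import subprocess
import sys

cmd = [sys.executable, "-c", "print('hello from the child process')"]  # stand-in command
proc = subprocess.run(cmd, stdout=subprocess.PIPE, stderr=subprocess.STDOUT, timeout=600)
output = proc.stdout.decode(errors="ignore")  # tolerate any non-UTF-8 bytes in the log
print(output)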
@@ -27,6 +27,7 @@ MODELS = [
 SHARED_STORAGE_PATH = "/dev/shm/epd/storage"
 TENSOR_PARALLELS = [1]

+
 @pytest.mark.asyncio
 @pytest.mark.parametrize("model", MODELS)
 @pytest.mark.parametrize("tp_size", TENSOR_PARALLELS)
@@ -36,36 +37,61 @@ async def test_models(model: str, tp_size: int) -> None:
     vllm_server_args = [
         [
             "--port",
-            str(encode_port), "--model", model, "--gpu-memory-utilization",
-            "0.01", "--tensor-parallel-size",
-            str(tp_size), "--enforce-eager", "--no-enable-prefix-caching",
-            "--max-model-len", "10000", "--max-num-batched-tokens", "10000",
-            "--max-num-seqs", "1", "--ec-transfer-config",
-            '{"ec_connector_extra_config":{"shared_storage_path":"' +
-            SHARED_STORAGE_PATH +
-            '"},"ec_connector":"ECExampleConnector","ec_role": "ec_producer"}'
+            str(encode_port),
+            "--model",
+            model,
+            "--gpu-memory-utilization",
+            "0.01",
+            "--tensor-parallel-size",
+            str(tp_size),
+            "--enforce-eager",
+            "--no-enable-prefix-caching",
+            "--max-model-len",
+            "10000",
+            "--max-num-batched-tokens",
+            "10000",
+            "--max-num-seqs",
+            "1",
+            "--ec-transfer-config",
+            '{"ec_connector_extra_config":{"shared_storage_path":"'
+            + SHARED_STORAGE_PATH
+            + '"},"ec_connector":"ECExampleConnector","ec_role": "ec_producer"}',
         ],
         [
             "--port",
-            str(pd_port), "--model", model, "--gpu-memory-utilization", "0.95",
+            str(pd_port),
+            "--model",
+            model,
+            "--gpu-memory-utilization",
+            "0.95",
             "--tensor-parallel-size",
-            str(tp_size), "--enforce-eager", "--max-model-len", "10000",
-            "--max-num-batched-tokens", "10000", "--max-num-seqs", "128",
+            str(tp_size),
+            "--enforce-eager",
+            "--max-model-len",
+            "10000",
+            "--max-num-batched-tokens",
+            "10000",
+            "--max-num-seqs",
+            "128",
             "--ec-transfer-config",
-            '{"ec_connector_extra_config":{"shared_storage_path":"' +
-            SHARED_STORAGE_PATH +
-            '"},"ec_connector":"ECExampleConnector","ec_role": "ec_consumer"}'
-        ]
+            '{"ec_connector_extra_config":{"shared_storage_path":"'
+            + SHARED_STORAGE_PATH
+            + '"},"ec_connector":"ECExampleConnector","ec_role": "ec_consumer"}',
+        ],
     ]
     proxy_port = get_open_port()
     proxy_args = [
-        "--host", "127.0.0.1", "--port",
-        str(proxy_port), "--encode-servers-urls",
-        f"http://localhost:{encode_port}", "--decode-servers-urls",
-        f"http://localhost:{pd_port}", "--prefill-servers-urls", "disable"
+        "--host",
+        "127.0.0.1",
+        "--port",
+        str(proxy_port),
+        "--encode-servers-urls",
+        f"http://localhost:{encode_port}",
+        "--decode-servers-urls",
+        f"http://localhost:{pd_port}",
+        "--prefill-servers-urls",
+        "disable",
     ]

-    with RemoteEPDServer(vllm_serve_args=vllm_server_args) as _:
-        with DisaggEpdProxy(proxy_args=proxy_args) as proxy:
-            send_image_request(model, proxy)
+    with RemoteEPDServer(vllm_serve_args=vllm_server_args) as _, DisaggEpdProxy(proxy_args=proxy_args) as proxy:
+        send_image_request(model, proxy)
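The --ec-transfer-config value above is a JSON document built by string concatenation. Reassembled with json.dumps the structure is easier to see; this sketch only uses keys visible in the hunk:

import json

SHARED_STORAGE_PATH = "/dev/shm/epd/storage"  # same constant as in the test module

ec_transfer_config = json.dumps(
    {
        "ec_connector_extra_config": {"shared_storage_path": SHARED_STORAGE_PATH},
        "ec_connector": "ECExampleConnector",
        "ec_role": "ec_producer",  # the decode server passes "ec_consumer" instead
    }
)
print(ec_transfer_config)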
@@ -15,15 +15,12 @@ def test_deepseek_correctness_ep(model_name):
     max_tokens = 5

     # FIXME: Really strange that chunked prefill might lead to different results, investigate further
-    with VllmRunner(model_name,
-                    cudagraph_capture_sizes=[1, 2, 4, 8],
-                    tensor_parallel_size=2) as vllm_model:
+    with VllmRunner(model_name, cudagraph_capture_sizes=[1, 2, 4, 8], tensor_parallel_size=2) as vllm_model:
         tp_output = vllm_model.generate_greedy(example_prompts, max_tokens)

-    with VllmRunner(model_name,
-                    tensor_parallel_size=2,
-                    cudagraph_capture_sizes=[1, 2, 4, 8],
-                    enable_expert_parallel=True) as vllm_model:
+    with VllmRunner(
+        model_name, tensor_parallel_size=2, cudagraph_capture_sizes=[1, 2, 4, 8], enable_expert_parallel=True
+    ) as vllm_model:
         ep_output = vllm_model.generate_greedy(example_prompts, max_tokens)

     check_outputs_equal(
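For readers outside the repo: check_outputs_equal (from tests/e2e/model_utils.py) pairs up two output lists and asserts element-wise equality. A simplified sketch of that contract, not the project's actual helper, assuming (index, text) style tuples as built in these tests:

def check_outputs_equal_sketch(outputs_0_lst, outputs_1_lst, name_0, name_1):
    # Compare two runs prompt by prompt and fail with a readable message.
    assert len(outputs_0_lst) == len(outputs_1_lst)
    for i, (out_0, out_1) in enumerate(zip(outputs_0_lst, outputs_1_lst)):
        assert out_0 == out_1, f"prompt {i}: {name_0}={out_0!r} != {name_1}={out_1!r}"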
@@ -29,6 +29,7 @@ from unittest.mock import patch
 import pytest
 import torch_npu
 from modelscope import snapshot_download  # type: ignore
+
 from tests.e2e.conftest import wait_until_npu_memory_free

 MODELS = ["Qwen/Qwen3-0.6B"]
@@ -39,9 +40,7 @@ DEVICE_NAME = torch_npu.npu.get_device_name(0)[:10]
 @pytest.mark.parametrize("model", MODELS)
 @patch.dict(os.environ, {"HCCL_BUFFSIZE": "500"})
 def test_qwen3_external_launcher(model):
-    script = Path(
-        __file__
-    ).parent.parent.parent.parent.parent / "examples" / "offline_external_launcher.py"
+    script = Path(__file__).parent.parent.parent.parent.parent / "examples" / "offline_external_launcher.py"
     env = os.environ.copy()
     # TODO: Change to 2 when ci machine has 4 cards
     cmd = [
@@ -68,7 +67,7 @@ def test_qwen3_external_launcher(model):
         stderr=subprocess.STDOUT,
         timeout=600,
     )
-    output = proc.stdout.decode(errors='ignore')
+    output = proc.stdout.decode(errors="ignore")

     print(output)
@@ -81,16 +80,24 @@ def test_qwen3_external_launcher(model):
 @pytest.mark.parametrize("model", MOE_MODELS)
 @wait_until_npu_memory_free()
 def test_qwen3_moe_external_launcher_ep_tp2(model):
-    script = Path(
-        __file__
-    ).parent.parent.parent.parent.parent / "examples" / "offline_external_launcher.py"
+    script = Path(__file__).parent.parent.parent.parent.parent / "examples" / "offline_external_launcher.py"
     env = os.environ.copy()
     # TODO: Change to 2 when ci machine has 4 cards
     cmd = [
         sys.executable,
-        str(script), "--model", model, "--tp-size", "2", "--node-size", "1",
-        "--node-rank", "0", "--proc-per-node", "2", "--trust-remote-code",
-        "--enable-expert-parallel"
+        str(script),
+        "--model",
+        model,
+        "--tp-size",
+        "2",
+        "--node-size",
+        "1",
+        "--node-rank",
+        "0",
+        "--proc-per-node",
+        "2",
+        "--trust-remote-code",
+        "--enable-expert-parallel",
     ]

     print(f"Running subprocess: {' '.join(cmd)}")
@@ -101,7 +108,7 @@ def test_qwen3_moe_external_launcher_ep_tp2(model):
         stderr=subprocess.STDOUT,
         timeout=600,
     )
-    output = proc.stdout.decode(errors='ignore')
+    output = proc.stdout.decode(errors="ignore")

     print(output)
@@ -113,9 +120,7 @@ def test_qwen3_moe_external_launcher_ep_tp2(model):
 @patch.dict(os.environ, {"VLLM_ASCEND_ENABLE_NZ": "0"})
 @wait_until_npu_memory_free()
 def test_qwen3_external_launcher_with_sleepmode():
-    script = Path(
-        __file__
-    ).parent.parent.parent.parent.parent / "examples" / "offline_external_launcher.py"
+    script = Path(__file__).parent.parent.parent.parent.parent / "examples" / "offline_external_launcher.py"
     env = os.environ.copy()
     # TODO: Change to 2 when ci machine has 4 cards
     cmd = [
@@ -147,7 +152,7 @@ def test_qwen3_external_launcher_with_sleepmode():
         stderr=subprocess.STDOUT,
         timeout=300,
     )
-    output = proc.stdout.decode(errors='ignore')
+    output = proc.stdout.decode(errors="ignore")

     print(output)
@@ -158,9 +163,7 @@ def test_qwen3_external_launcher_with_sleepmode():

 @patch.dict(os.environ, {"VLLM_ASCEND_ENABLE_NZ": "0"})
 def test_qwen3_external_launcher_with_sleepmode_level2():
-    script = Path(
-        __file__
-    ).parent.parent.parent.parent.parent / "examples" / "offline_external_launcher.py"
+    script = Path(__file__).parent.parent.parent.parent.parent / "examples" / "offline_external_launcher.py"
     env = os.environ.copy()
     model_path = snapshot_download("Qwen/Qwen3-8B")
     # TODO: Add moe model test
@@ -195,7 +198,7 @@ def test_qwen3_external_launcher_with_sleepmode_level2():
         stderr=subprocess.STDOUT,
         timeout=300,
     )
-    output = proc.stdout.decode(errors='ignore')
+    output = proc.stdout.decode(errors="ignore")

     print(output)
@@ -210,14 +213,9 @@ def test_qwen3_external_launcher_with_sleepmode_level2():
 )
 @pytest.mark.parametrize("model", MODELS)
 @wait_until_npu_memory_free()
-@patch.dict(os.environ, {
-    "VLLM_ASCEND_ENABLE_MATMUL_ALLREDUCE": "1",
-    "HCCL_BUFFSIZE": "500"
-})
+@patch.dict(os.environ, {"VLLM_ASCEND_ENABLE_MATMUL_ALLREDUCE": "1", "HCCL_BUFFSIZE": "500"})
 def test_qwen3_external_launcher_with_matmul_allreduce(model):
-    script = Path(
-        __file__
-    ).parent.parent.parent.parent.parent / "examples" / "offline_external_launcher.py"
+    script = Path(__file__).parent.parent.parent.parent.parent / "examples" / "offline_external_launcher.py"
     env = os.environ.copy()
     cmd = [
         sys.executable,
@@ -236,7 +234,7 @@ def test_qwen3_external_launcher_with_matmul_allreduce(model):
         timeout=600,
     )

-    output = proc.stdout.decode(errors='ignore')
+    output = proc.stdout.decode(errors="ignore")
     print(output)

     assert "Generated text:" in output
@@ -26,41 +26,39 @@ from tests.e2e.model_utils import check_outputs_equal


 def test_qwen3_moe_full_decode_only_tp2():
-    if 'HCCL_OP_EXPANSION_MODE' in os.environ:
-        del os.environ['HCCL_OP_EXPANSION_MODE']
+    if "HCCL_OP_EXPANSION_MODE" in os.environ:
+        del os.environ["HCCL_OP_EXPANSION_MODE"]
     prompts = [
-        "Hello, my name is", "The president of the United States is",
-        "The capital of France is", "The future of AI is"
+        "Hello, my name is",
+        "The president of the United States is",
+        "The capital of France is",
+        "The future of AI is",
     ]
     model = "Qwen/Qwen3-30B-A3B"
     sampling_params = SamplingParams(max_tokens=32, temperature=0.0)
-    with VllmRunner(model,
-                    max_model_len=1024,
-                    tensor_parallel_size=2,
-                    compilation_config={
-                        "cudagraph_mode": "FULL_DECODE_ONLY",
-                        "cudagraph_capture_sizes": [4, 8, 24, 48, 60]
-                    }) as runner:
-        vllm_fullgraph_outputs = runner.model.generate(prompts,
-                                                       sampling_params)
+    with VllmRunner(
+        model,
+        max_model_len=1024,
+        tensor_parallel_size=2,
+        compilation_config={"cudagraph_mode": "FULL_DECODE_ONLY", "cudagraph_capture_sizes": [4, 8, 24, 48, 60]},
+    ) as runner:
+        vllm_fullgraph_outputs = runner.model.generate(prompts, sampling_params)

     with VllmRunner(
-            model,
-            max_model_len=1024,
-            cudagraph_capture_sizes=[4, 8, 24, 48, 60],
-            tensor_parallel_size=2,
+        model,
+        max_model_len=1024,
+        cudagraph_capture_sizes=[4, 8, 24, 48, 60],
+        tensor_parallel_size=2,
     ) as runner:
         vllm_eager_outputs = runner.model.generate(prompts, sampling_params)

     vllm_fullgraph_outputs_list = []
     for output in vllm_fullgraph_outputs:
-        vllm_fullgraph_outputs_list.append(
-            (output.outputs[0].index, output.outputs[0].text))
+        vllm_fullgraph_outputs_list.append((output.outputs[0].index, output.outputs[0].text))

     vllm_eager_outputs_list = []
     for output in vllm_eager_outputs:
-        vllm_eager_outputs_list.append(
-            (output.outputs[0].index, output.outputs[0].text))
+        vllm_eager_outputs_list.append((output.outputs[0].index, output.outputs[0].text))

     check_outputs_equal(
         outputs_0_lst=vllm_eager_outputs_list,
@@ -72,41 +70,39 @@ def test_qwen3_moe_full_decode_only_tp2():

 @pytest.mark.skip(reason="CANN8.5 failed with this test, fix me")
 def test_qwen3_moe_full_graph_tp2():
-    if 'HCCL_OP_EXPANSION_MODE' in os.environ:
-        del os.environ['HCCL_OP_EXPANSION_MODE']
+    if "HCCL_OP_EXPANSION_MODE" in os.environ:
+        del os.environ["HCCL_OP_EXPANSION_MODE"]
     prompts = [
-        "Hello, my name is", "The president of the United States is",
-        "The capital of France is", "The future of AI is"
+        "Hello, my name is",
+        "The president of the United States is",
+        "The capital of France is",
+        "The future of AI is",
     ]
     model = "Qwen/Qwen3-30B-A3B"
     sampling_params = SamplingParams(max_tokens=32, temperature=0.0)
-    with VllmRunner(model,
-                    max_model_len=1024,
-                    tensor_parallel_size=2,
-                    compilation_config={
-                        "cudagraph_mode": "FULL",
-                        "cudagraph_capture_sizes": [4, 8, 24, 48, 60]
-                    }) as runner:
-        vllm_fullgraph_outputs = runner.model.generate(prompts,
-                                                       sampling_params)
+    with VllmRunner(
+        model,
+        max_model_len=1024,
+        tensor_parallel_size=2,
+        compilation_config={"cudagraph_mode": "FULL", "cudagraph_capture_sizes": [4, 8, 24, 48, 60]},
+    ) as runner:
+        vllm_fullgraph_outputs = runner.model.generate(prompts, sampling_params)

     with VllmRunner(
-            model,
-            max_model_len=1024,
-            cudagraph_capture_sizes=[4, 8, 24, 48, 60],
-            tensor_parallel_size=2,
+        model,
+        max_model_len=1024,
+        cudagraph_capture_sizes=[4, 8, 24, 48, 60],
+        tensor_parallel_size=2,
     ) as runner:
         vllm_eager_outputs = runner.model.generate(prompts, sampling_params)

     vllm_fullgraph_outputs_list = []
     for output in vllm_fullgraph_outputs:
-        vllm_fullgraph_outputs_list.append(
-            (output.outputs[0].index, output.outputs[0].text))
+        vllm_fullgraph_outputs_list.append((output.outputs[0].index, output.outputs[0].text))

     vllm_eager_outputs_list = []
     for output in vllm_eager_outputs:
-        vllm_eager_outputs_list.append(
-            (output.outputs[0].index, output.outputs[0].text))
+        vllm_eager_outputs_list.append((output.outputs[0].index, output.outputs[0].text))

     check_outputs_equal(
         outputs_0_lst=vllm_eager_outputs_list,
@@ -1,23 +1,22 @@
 import pytest

 from tests.e2e.conftest import VllmRunner
-from tests.e2e.singlecard.test_ilama_lora import (EXPECTED_LORA_OUTPUT,
-                                                  MODEL_PATH, do_sample)
+from tests.e2e.singlecard.test_ilama_lora import EXPECTED_LORA_OUTPUT, MODEL_PATH, do_sample


 @pytest.mark.parametrize("distributed_executor_backend", ["mp"])
 def test_ilama_lora_tp2(distributed_executor_backend, ilama_lora_files):
     with VllmRunner(
-            MODEL_PATH,
-            enable_lora=True,
-            max_loras=4,
-            dtype="half",
-            max_model_len=1024,
-            max_num_seqs=16,
-            tensor_parallel_size=2,
-            cudagraph_capture_sizes=[1, 2, 4, 8],
-            distributed_executor_backend=distributed_executor_backend,
-            enforce_eager=True,
+        MODEL_PATH,
+        enable_lora=True,
+        max_loras=4,
+        dtype="half",
+        max_model_len=1024,
+        max_num_seqs=16,
+        tensor_parallel_size=2,
+        cudagraph_capture_sizes=[1, 2, 4, 8],
+        distributed_executor_backend=distributed_executor_backend,
+        enforce_eager=True,
     ) as vllm_model:
         output = do_sample(vllm_model.model, ilama_lora_files, lora_id=2)
@@ -20,8 +20,10 @@

 Run `pytest tests/test_offline_inference.py`.
 """

+import os
+from unittest.mock import patch

 import pytest
 from vllm import SamplingParams
@@ -51,6 +53,7 @@ GPT_OSS_MODELS = [
     "unsloth/gpt-oss-20b-BF16",
 ]

+
 def test_deepseek_multistream_moe_tp2():
     example_prompts = [
         "Hello, my name is",
@@ -58,15 +61,15 @@ def test_deepseek_multistream_moe_tp2():
     dtype = "half"
     max_tokens = 5
     with VllmRunner(
-            "vllm-ascend/DeepSeek-V3-Pruning",
-            dtype=dtype,
-            tensor_parallel_size=2,
-            cudagraph_capture_sizes=[1, 2, 4, 8],
-            distributed_executor_backend="mp",
-            additional_config={
-                "enable_multistream_moe": True,
-                "refresh": True,
-            },
+        "vllm-ascend/DeepSeek-V3-Pruning",
+        dtype=dtype,
+        tensor_parallel_size=2,
+        cudagraph_capture_sizes=[1, 2, 4, 8],
+        distributed_executor_backend="mp",
+        additional_config={
+            "enable_multistream_moe": True,
+            "refresh": True,
+        },
     ) as vllm_model:
         vllm_model.generate_greedy(example_prompts, max_tokens)
@@ -78,12 +81,12 @@ def test_qwen3_w4a8_dynamic_tp2(model):
     ]
     max_tokens = 5
     with VllmRunner(
-            model,
-            max_model_len=8192,
-            dtype="auto",
-            tensor_parallel_size=2,
-            cudagraph_capture_sizes=[1, 2, 4, 8],
-            quantization="ascend",
+        model,
+        max_model_len=8192,
+        dtype="auto",
+        tensor_parallel_size=2,
+        cudagraph_capture_sizes=[1, 2, 4, 8],
+        quantization="ascend",
     ) as vllm_model:
         vllm_model.generate_greedy(prompts, max_tokens)
@@ -92,20 +95,17 @@ def test_qwen3_moe_sp_tp2() -> None:
     example_prompts = [
         "Hello, my name is",
     ]
-    sampling_params = SamplingParams(max_tokens=5,
-                                     temperature=0.0,
-                                     top_k=50,
-                                     top_p=0.9)
+    sampling_params = SamplingParams(max_tokens=5, temperature=0.0, top_k=50, top_p=0.9)

-    with VllmRunner("Qwen/Qwen3-30B-A3B",
-                    dtype="auto",
-                    tensor_parallel_size=2,
-                    distributed_executor_backend="mp",
-                    compilation_config={"pass_config": {
-                        "enable_sp": True
-                    }},
-                    enable_expert_parallel=True,
-                    enforce_eager=True) as vllm_model:
+    with VllmRunner(
+        "Qwen/Qwen3-30B-A3B",
+        dtype="auto",
+        tensor_parallel_size=2,
+        distributed_executor_backend="mp",
+        compilation_config={"pass_config": {"enable_sp": True}},
+        enable_expert_parallel=True,
+        enforce_eager=True,
+    ) as vllm_model:
         vllm_model.generate(example_prompts, sampling_params)
@@ -113,33 +113,34 @@ def test_qwen3_moe_sp_tp2() -> None:
 @patch.dict(os.environ, {"HCCL_BUFFSIZE": "2048"})
 def test_deepseek_w4a8_accuracy_tp2(model):
     prompts = [
-        "Hello, my name is", "The president of the United States is",
-        "vLLM is a high-throughput and memory-efficient inference and serving engine for LLMs"
-    ]
-    vllm_ds_w4a8_answers = [
-        '逍遙而至地去 accrued', '平行于我udo madreHelen', 'ysteepaolis backwards Kj'
+        "Hello, my name is",
+        "The president of the United States is",
+        "vLLM is a high-throughput and memory-efficient inference and serving engine for LLMs",
     ]
+    vllm_ds_w4a8_answers = ["逍遙而至地去 accrued", "平行于我udo madreHelen", "ysteepaolis backwards Kj"]
     sampling_params = SamplingParams(max_tokens=5, temperature=0.0)
-    with VllmRunner(model,
-                    dtype="auto",
-                    tensor_parallel_size=2,
-                    cudagraph_capture_sizes=[1, 2, 4, 8],
-                    quantization="ascend",
-                    enable_expert_parallel=True) as vllm_model:
-        vllm_quant_outputs = vllm_model.model.generate(prompts,
-                                                       sampling_params)
+    with VllmRunner(
+        model,
+        dtype="auto",
+        tensor_parallel_size=2,
+        cudagraph_capture_sizes=[1, 2, 4, 8],
+        quantization="ascend",
+        enable_expert_parallel=True,
+    ) as vllm_model:
+        vllm_quant_outputs = vllm_model.model.generate(prompts, sampling_params)

     vllm_quant_outputs_list = []
     for output in vllm_quant_outputs:
-        vllm_quant_outputs_list.append(
-            ([output.outputs[0].index], output.outputs[0].text))
+        vllm_quant_outputs_list.append(([output.outputs[0].index], output.outputs[0].text))
     vllm_answer_list = []
-    vllm_answer_list = ([([0], answer) for answer in vllm_ds_w4a8_answers])
+    vllm_answer_list = [([0], answer) for answer in vllm_ds_w4a8_answers]

-    check_outputs_equal(outputs_0_lst=vllm_answer_list,
-                        outputs_1_lst=vllm_quant_outputs_list,
-                        name_0="vllm_quant_outputs",
-                        name_1="vllm_answer_outputs")
+    check_outputs_equal(
+        outputs_0_lst=vllm_answer_list,
+        outputs_1_lst=vllm_quant_outputs_list,
+        name_0="vllm_quant_outputs",
+        name_1="vllm_answer_outputs",
+    )


 @patch.dict(os.environ, {"VLLM_ASCEND_ENABLE_FLASHCOMM1": "1"})
@@ -148,17 +149,16 @@ def test_qwen3_moe_fc2_tp2() -> None:
     example_prompts = [
         "Hello, my name is",
     ]
-    sampling_params = SamplingParams(max_tokens=5,
-                                     temperature=0.0,
-                                     top_k=50,
-                                     top_p=0.9)
+    sampling_params = SamplingParams(max_tokens=5, temperature=0.0, top_k=50, top_p=0.9)

-    with VllmRunner("Qwen/Qwen3-30B-A3B",
-                    dtype="auto",
-                    tensor_parallel_size=2,
-                    distributed_executor_backend="mp",
-                    enable_expert_parallel=True,
-                    enforce_eager=True) as vllm_model:
+    with VllmRunner(
+        "Qwen/Qwen3-30B-A3B",
+        dtype="auto",
+        tensor_parallel_size=2,
+        distributed_executor_backend="mp",
+        enable_expert_parallel=True,
+        enforce_eager=True,
+    ) as vllm_model:
         vllm_model.generate(example_prompts, sampling_params)
@@ -168,20 +168,17 @@ def test_qwen3_moe_fc2_oshard_tp2() -> None:
     example_prompts = [
         "Hello, my name is",
     ]
-    sampling_params = SamplingParams(max_tokens=5,
-                                     temperature=0.0,
-                                     top_k=50,
-                                     top_p=0.9)
+    sampling_params = SamplingParams(max_tokens=5, temperature=0.0, top_k=50, top_p=0.9)

     with VllmRunner(
-            "Qwen/Qwen3-30B-A3B",
-            dtype="auto",
-            tensor_parallel_size=2,
-            distributed_executor_backend="mp",
-            enable_expert_parallel=True,
-            enforce_eager=
-            True,  # TODO(Levi-JQ): support graph mode for fc2 in Qwen
-            additional_config={"layer_sharding": ["o_proj"]}) as vllm_model:
+        "Qwen/Qwen3-30B-A3B",
+        dtype="auto",
+        tensor_parallel_size=2,
+        distributed_executor_backend="mp",
+        enable_expert_parallel=True,
+        enforce_eager=True,  # TODO(Levi-JQ): support graph mode for fc2 in Qwen
+        additional_config={"layer_sharding": ["o_proj"]},
+    ) as vllm_model:
         vllm_model.generate(example_prompts, sampling_params)
@@ -190,17 +187,16 @@ def test_deepseek_v2_lite_fc1_tp2() -> None:
     example_prompts = [
         "test" * 1001,
     ]
-    sampling_params = SamplingParams(max_tokens=5,
-                                     temperature=0.0,
-                                     top_k=50,
-                                     top_p=0.9)
-    with VllmRunner("vllm-ascend/DeepSeek-V2-Lite-W8A8",
-                    dtype="auto",
-                    tensor_parallel_size=2,
-                    distributed_executor_backend="mp",
-                    enable_expert_parallel=True,
-                    enforce_eager=True,
-                    quantization="ascend") as vllm_model:
+    sampling_params = SamplingParams(max_tokens=5, temperature=0.0, top_k=50, top_p=0.9)
+    with VllmRunner(
+        "vllm-ascend/DeepSeek-V2-Lite-W8A8",
+        dtype="auto",
+        tensor_parallel_size=2,
+        distributed_executor_backend="mp",
+        enable_expert_parallel=True,
+        enforce_eager=True,
+        quantization="ascend",
+    ) as vllm_model:
         vllm_model.generate(example_prompts, sampling_params)
@@ -213,12 +209,12 @@ def test_qwen3_dense_fc1_tp2(model):
     max_tokens = 5

     with VllmRunner(
-            model,
-            max_model_len=8192,
-            dtype="auto",
-            tensor_parallel_size=2,
-            cudagraph_capture_sizes=[1, 2, 4, 8],
-            quantization="ascend",
+        model,
+        max_model_len=8192,
+        dtype="auto",
+        tensor_parallel_size=2,
+        cudagraph_capture_sizes=[1, 2, 4, 8],
+        quantization="ascend",
     ) as vllm_model:
         vllm_model.generate_greedy(example_prompts, max_tokens)
@@ -232,13 +228,13 @@ def test_qwen3_dense_prefetch_mlp_weight_tp2(model):
     max_tokens = 5

     with VllmRunner(
-            model,
-            max_model_len=8192,
-            dtype="auto",
-            tensor_parallel_size=2,
-            cudagraph_capture_sizes=[1, 2, 4, 8],
-            quantization="ascend",
-            additional_config={"weight_prefetch_config": {"enabled": True}},
+        model,
+        max_model_len=8192,
+        dtype="auto",
+        tensor_parallel_size=2,
+        cudagraph_capture_sizes=[1, 2, 4, 8],
+        quantization="ascend",
+        additional_config={"weight_prefetch_config": {"enabled": True}},
     ) as vllm_model:
         vllm_model.generate_greedy(example_prompts, max_tokens)
@@ -252,28 +248,20 @@ def test_deepseek3_2_w8a8_pruning_mtp_tp2_ep():
         "Hello ",
     ]
     # "max_position_embeddings": 163840,
-    long_example_prompts = [
-        "Hello " * (163839 - 500) + "Hello"
-    ]
+    long_example_prompts = ["Hello " * (163839 - 500) + "Hello"]
     max_tokens = 500
-    with VllmRunner("vllm-ascend/DeepSeek-V3.2-W8A8-Pruning",
-                    tensor_parallel_size=2,
-                    quantization="ascend",
-                    enable_expert_parallel=True,
-                    max_model_len=163840,
-                    compilation_config={
-                        "cudagraph_capture_sizes": [2, 4, 6, 8, 10, 12],
-                        "cudagraph_mode": "FULL_DECODE_ONLY"
-                    },
-                    speculative_config={
-                        "num_speculative_tokens": 1,
-                        "method": "deepseek_mtp"
-                    },
-                    additional_config={
-                        "layer_sharding":["q_b_proj", "o_proj"]
-                    },
-                    reasoning_parser="deepseek_v3",
-                    tokenizer_mode="deepseek_v32") as vllm_model:
+    with VllmRunner(
+        "vllm-ascend/DeepSeek-V3.2-W8A8-Pruning",
+        tensor_parallel_size=2,
+        quantization="ascend",
+        enable_expert_parallel=True,
+        max_model_len=163840,
+        compilation_config={"cudagraph_capture_sizes": [2, 4, 6, 8, 10, 12], "cudagraph_mode": "FULL_DECODE_ONLY"},
+        speculative_config={"num_speculative_tokens": 1, "method": "deepseek_mtp"},
+        additional_config={"layer_sharding": ["q_b_proj", "o_proj"]},
+        reasoning_parser="deepseek_v3",
+        tokenizer_mode="deepseek_v32",
+    ) as vllm_model:
         vllm_model.generate_greedy(short_example_prompts, max_tokens)
         vllm_model.generate_greedy(long_example_prompts, max_tokens)
@@ -285,10 +273,10 @@ def test_qwen3_w4a4_distributed_tp2(model):
     ]
     max_tokens = 5
     with VllmRunner(
-            model,
-            tensor_parallel_size=2,
-            cudagraph_capture_sizes=[1, 2, 4, 8],
-            quantization="ascend",
+        model,
+        tensor_parallel_size=2,
+        cudagraph_capture_sizes=[1, 2, 4, 8],
+        quantization="ascend",
     ) as vllm_model:
         vllm_model.generate_greedy(example_prompts, max_tokens)
@@ -300,8 +288,8 @@ def test_gpt_oss_distributed_tp2(model):
     ]
     max_tokens = 5
     with VllmRunner(
-            model,
-            tensor_parallel_size=2,
-            enforce_eager=True,
+        model,
+        tensor_parallel_size=2,
+        enforce_eager=True,
     ) as vllm_model:
         vllm_model.generate_greedy(example_prompts, max_tokens)
@@ -32,9 +32,7 @@ MODELS = ["Qwen/Qwen3-30B-A3B"]
 @pytest.mark.parametrize("model", MODELS)
 @patch.dict(os.environ, {"VLLM_ASCEND_ENABLE_NZ": "0"})
 def test_qwen3_offline_load_and_sleepmode_tp2(model):
-    script = Path(
-        __file__
-    ).parent.parent.parent.parent.parent / "examples" / "offline_external_launcher.py"
+    script = Path(__file__).parent.parent.parent.parent.parent / "examples" / "offline_external_launcher.py"
     env = os.environ.copy()
     cmd = [
         sys.executable,
@@ -65,7 +63,7 @@ def test_qwen3_offline_load_and_sleepmode_tp2(model):
         stderr=subprocess.STDOUT,
         timeout=600,
     )
-    output = proc.stdout.decode(errors='ignore')
+    output = proc.stdout.decode(errors="ignore")

     print(output)
@@ -37,12 +37,13 @@ prompts = [
 @pytest.mark.parametrize("tp_size", TENSOR_PARALLELS)
 @pytest.mark.parametrize("pp_size", PIPELINE_PARALLELS)
 @pytest.mark.parametrize("distributed_executor_backend", DIST_EXECUTOR_BACKEND)
-def test_models_pp2(model: str, tp_size: int, pp_size: int,
-                    distributed_executor_backend: str) -> None:
-    with VllmRunner(model,
-                    tensor_parallel_size=tp_size,
-                    pipeline_parallel_size=pp_size,
-                    cudagraph_capture_sizes=[1, 2, 4, 8],
-                    distributed_executor_backend=distributed_executor_backend,
-                    gpu_memory_utilization=0.7) as vllm_model:
+def test_models_pp2(model: str, tp_size: int, pp_size: int, distributed_executor_backend: str) -> None:
+    with VllmRunner(
+        model,
+        tensor_parallel_size=tp_size,
+        pipeline_parallel_size=pp_size,
+        cudagraph_capture_sizes=[1, 2, 4, 8],
+        distributed_executor_backend=distributed_executor_backend,
+        gpu_memory_utilization=0.7,
+    ) as vllm_model:
         vllm_model.generate_greedy(prompts, 64)
@@ -11,11 +11,14 @@ MODELS = [
     # for MHA
     "Qwen/Qwen3-8B",
     # for MLA
-    "deepseek-ai/DeepSeek-V2-Lite-Chat"
+    "deepseek-ai/DeepSeek-V2-Lite-Chat",
 ]

 # A prompt containing a large markdown table. The table is randomly generated by GPT-4.
-LONG_PROMPT = "You are a helpful assistant in recognizes the content of tables in markdown format. Here is a table as follows.\n# Table\n" + """
+# ruff: noqa: E501
+LONG_PROMPT = (
+    "You are a helpful assistant in recognizes the content of tables in markdown format. Here is a table as follows.\n# Table\n"
+    + """
 | ID | Name | Age | Occupation | Country | Email | Phone Number | Address |
 |-----|---------------|-----|---------------|---------------|------------------------|----------------|------------------------------|
 | 1 | John Doe | 29 | Engineer | USA | john.doe@example.com | 555-1234 | 123 Elm St, Springfield, IL |
@@ -49,32 +52,34 @@ LONG_PROMPT = "You are a helpful assistant in recognizes the content of tables i
 | 29 | Amy White | 33 | Musician | New Zealand | amy.w@example.com | 555-5658 | 159 Maple St, Wellington, NZ |
 | 30 | Ben Black | 38 | Chef | Ireland | ben.b@example.com | 555-7870 | 246 Fir St, Waterford, IE |
 """
+)

 INPUT_PROMPTS = [
-    LONG_PROMPT +
-    "Question: what is the age of John Doe? Your answer: The age of John Doe is ",
-    LONG_PROMPT +
-    "Question: what is the age of Zack Blue? Your answer: The age of Zack Blue is "
+    LONG_PROMPT + "Question: what is the age of John Doe? Your answer: The age of John Doe is ",
+    LONG_PROMPT + "Question: what is the age of Zack Blue? Your answer: The age of Zack Blue is ",
 ]


 @pytest.mark.parametrize("model", MODELS)
 @pytest.mark.parametrize("max_tokens", [50])
 def test_models_prefix_cache_tp2(model: str, max_tokens: int) -> None:
-    with VllmRunner(model,
-                    max_model_len=2048,
-                    tensor_parallel_size=2,
-                    cudagraph_capture_sizes=[1, 2, 4, 8],
-                    gpu_memory_utilization=0.7) as vllm_model:
-        prefix_cache_output = vllm_model.generate_greedy(
-            INPUT_PROMPTS, max_tokens)
+    with VllmRunner(
+        model,
+        max_model_len=2048,
+        tensor_parallel_size=2,
+        cudagraph_capture_sizes=[1, 2, 4, 8],
+        gpu_memory_utilization=0.7,
+    ) as vllm_model:
+        prefix_cache_output = vllm_model.generate_greedy(INPUT_PROMPTS, max_tokens)

-    with VllmRunner(model,
-                    enable_prefix_caching=False,
-                    max_model_len=2048,
-                    tensor_parallel_size=2,
-                    cudagraph_capture_sizes=[1, 2, 4, 8],
-                    gpu_memory_utilization=0.7) as vllm_model:
+    with VllmRunner(
+        model,
+        enable_prefix_caching=False,
+        max_model_len=2048,
+        tensor_parallel_size=2,
+        cudagraph_capture_sizes=[1, 2, 4, 8],
+        gpu_memory_utilization=0.7,
+    ) as vllm_model:
         vllm_output = vllm_model.generate_greedy(INPUT_PROMPTS, max_tokens)

     check_outputs_equal(
@@ -16,7 +16,6 @@
 # This file is a part of the vllm-ascend project.
 # Adapted from vllm/tests/basic_correctness/test_basic_correctness.py
 #
 import pytest
-
 from tests.e2e.conftest import VllmRunner
@@ -27,16 +26,16 @@ def test_qwen2_5_w8a8_external_quantized_tp2():
     ]
     max_tokens = 5
     with VllmRunner(
-            "neuralmagic/Qwen2.5-3B-quantized.w8a8",
-            tensor_parallel_size=2,
-            cudagraph_capture_sizes=[1, 2, 4, 8],
-            max_model_len=4096,
-            gpu_memory_utilization=0.8,
+        "neuralmagic/Qwen2.5-3B-quantized.w8a8",
+        tensor_parallel_size=2,
+        cudagraph_capture_sizes=[1, 2, 4, 8],
+        max_model_len=4096,
+        gpu_memory_utilization=0.8,
     ) as vllm_model:
         vllm_output = vllm_model.generate_greedy(example_prompts, max_tokens)

     golden_results = [
-        'The president of the United States is the head of state and',
+        "The president of the United States is the head of state and",
     ]

     for i in range(len(vllm_output)):
@@ -50,36 +49,37 @@ def test_qwen3_moe_w8a8_dynamic_llm_compressor():
     ]
     max_tokens = 5
     with VllmRunner(
-            "vllm-ascend/Qwen3-30B-A3B-Instruct-2507-quantized.w8a8",
-            tensor_parallel_size=2,
-            max_model_len=4096,
-            gpu_memory_utilization=0.8,
+        "vllm-ascend/Qwen3-30B-A3B-Instruct-2507-quantized.w8a8",
+        tensor_parallel_size=2,
+        max_model_len=4096,
+        gpu_memory_utilization=0.8,
     ) as vllm_model:
         vllm_output = vllm_model.generate_greedy(example_prompts, max_tokens)

     golden_results = [
-        'The president of the United States is the head of state and',
+        "The president of the United States is the head of state and",
     ]

     for i in range(len(vllm_output)):
         assert golden_results[i] == vllm_output[i][1]
         print(f"Generated text: {vllm_output[i][1]!r}")

+
 def test_qwen3_moe_w4a8_dynamic_llm_compressor():
     example_prompts = [
         "The president of the United States is",
     ]
     max_tokens = 5
     with VllmRunner(
-            "vllm-ascend/Qwen3-30B-A3B-Instruct-2507-quantized.w4a8",
-            tensor_parallel_size=2,
-            max_model_len=4096,
-            gpu_memory_utilization=0.8,
+        "vllm-ascend/Qwen3-30B-A3B-Instruct-2507-quantized.w4a8",
+        tensor_parallel_size=2,
+        max_model_len=4096,
+        gpu_memory_utilization=0.8,
     ) as vllm_model:
         vllm_output = vllm_model.generate_greedy(example_prompts, max_tokens)

     golden_results = [
-        'The president of the United States is the head of state and',
+        "The president of the United States is the head of state and",
     ]

     for i in range(len(vllm_output)):
@@ -34,11 +34,11 @@ def test_qwen3_moe_distributed_mp_tp2_ep():
     ]
     max_tokens = 5
     with VllmRunner(
-            "Qwen/Qwen3-30B-A3B",
-            tensor_parallel_size=2,
-            enable_expert_parallel=True,
-            cudagraph_capture_sizes=[1, 2, 4, 8],
-            distributed_executor_backend="mp",
+        "Qwen/Qwen3-30B-A3B",
+        tensor_parallel_size=2,
+        enable_expert_parallel=True,
+        cudagraph_capture_sizes=[1, 2, 4, 8],
+        distributed_executor_backend="mp",
     ) as vllm_model:
         vllm_model.generate_greedy(example_prompts, max_tokens)
@@ -49,27 +49,27 @@ def test_qwen3_moe_w8a8_distributed_tp2():
     ]
     max_tokens = 5
     with VllmRunner(
-            "vllm-ascend/Qwen3-30B-A3B-W8A8",
-            max_model_len=8192,
-            tensor_parallel_size=2,
-            cudagraph_capture_sizes=[1, 2, 4, 8],
-            quantization="ascend",
+        "vllm-ascend/Qwen3-30B-A3B-W8A8",
+        max_model_len=8192,
+        tensor_parallel_size=2,
+        cudagraph_capture_sizes=[1, 2, 4, 8],
+        quantization="ascend",
     ) as vllm_model:
         vllm_model.generate_greedy(example_prompts, max_tokens)


 def test_qwen3_moe_distributed_aiv_tp2():
-    os.environ['HCCL_OP_EXPANSION_MODE'] = 'AIV'
+    os.environ["HCCL_OP_EXPANSION_MODE"] = "AIV"
     example_prompts = [
         "Hello, my name is",
     ]
     dtype = "auto"
     max_tokens = 5
     with VllmRunner(
-            "Qwen/Qwen3-30B-A3B",
-            dtype=dtype,
-            tensor_parallel_size=2,
-            cudagraph_capture_sizes=[1, 2, 4, 8],
+        "Qwen/Qwen3-30B-A3B",
+        dtype=dtype,
+        tensor_parallel_size=2,
+        cudagraph_capture_sizes=[1, 2, 4, 8],
     ) as vllm_model:
         vllm_model.generate_greedy(example_prompts, max_tokens)
@@ -80,23 +80,24 @@ async def test_qwen3_moe_w8a8_distributed_tp2_ep_dynamic_eplb():
     port = get_open_port()
     compilation_config = json.dumps({"cudagraph_capture_sizes": [8]})
     server_args = [
-        "--max_model_len", "8192", "--tensor_parallel_size", "2",
-        "--enable_expert_parallel", "--quantization", "ascend", "--port",
-        str(port), "--compilation-config", compilation_config
+        "--max_model_len",
+        "8192",
+        "--tensor_parallel_size",
+        "2",
+        "--enable_expert_parallel",
+        "--quantization",
+        "ascend",
+        "--port",
+        str(port),
+        "--compilation-config",
+        compilation_config,
     ]
     env_dict = {"HCCL_BUFFSIZE": "1024"}
-    with RemoteOpenAIServer(model,
-                            server_args,
-                            server_port=port,
-                            auto_port=False,
-                            env_dict=env_dict) as server:
+    with RemoteOpenAIServer(model, server_args, server_port=port, auto_port=False, env_dict=env_dict) as server:
         client = server.get_async_client()
-        batch = await client.completions.create(model=model,
-                                                prompt="What is deeplearning?",
-                                                max_tokens=400,
-                                                temperature=0,
-                                                top_p=1.0,
-                                                n=1)
+        batch = await client.completions.create(
+            model=model, prompt="What is deeplearning?", max_tokens=400, temperature=0, top_p=1.0, n=1
+        )
         gt_choices: list[openai.types.CompletionChoice] = batch.choices

     # dynamic eplb test
@@ -108,22 +109,14 @@ async def test_qwen3_moe_w8a8_distributed_tp2_ep_dynamic_eplb():
             "dynamic_eplb": True,
             "expert_heat_collection_interval": 100,
             "algorithm_execution_interval": 20,
-            "num_redundant_experts": 2
+            "num_redundant_experts": 2,
         }
     }
     server_args.extend(["--additional-config", json.dumps(additional_config)])
-    with RemoteOpenAIServer(model,
-                            server_args,
-                            server_port=port,
-                            auto_port=False,
-                            env_dict=env_dict) as server:
+    with RemoteOpenAIServer(model, server_args, server_port=port, auto_port=False, env_dict=env_dict) as server:
         client = server.get_async_client()
-        batch = await client.completions.create(model=model,
-                                                prompt="What is deeplearning?",
-                                                max_tokens=400,
-                                                temperature=0,
-                                                top_p=1.0,
-                                                n=1)
+        batch = await client.completions.create(
+            model=model, prompt="What is deeplearning?", max_tokens=400, temperature=0, top_p=1.0, n=1
+        )
         eplb_choices: list[openai.types.CompletionChoice] = batch.choices
-    assert gt_choices[0].text == eplb_choices[
-        0].text, f"{gt_choices[0].text=} \n {eplb_choices[0].text=}"
+    assert gt_choices[0].text == eplb_choices[0].text, f"{gt_choices[0].text=} \n {eplb_choices[0].text=}"
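The dynamic-EPLB settings above travel to the server as a JSON string on --additional-config. A hedged reconstruction of just the keys shown in this hunk; the outer wrapper key of additional_config is cut off by the diff and deliberately not guessed here:

import json

eplb_settings = {
    "dynamic_eplb": True,
    "expert_heat_collection_interval": 100,
    "algorithm_execution_interval": 20,
    "num_redundant_experts": 2,
}
# Mirrors: server_args.extend(["--additional-config", json.dumps(additional_config)])
print(json.dumps(eplb_settings))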
@@ -1,10 +1,11 @@
 import os
 from unittest.mock import patch

-from tests.e2e.conftest import VllmRunner
 from vllm import SamplingParams
 from vllm.sampling_params import RequestOutputKind

+from tests.e2e.conftest import VllmRunner
+

 @patch.dict(os.environ, {"OMP_NUM_THREADS": "1"})
 def test_qwen3_moe_routing_replay():
@@ -12,18 +13,15 @@ def test_qwen3_moe_routing_replay():
         "Hello, please introduce yourself.",
     ]
     with VllmRunner(
-            "Qwen/Qwen3-30B-A3B",
-            tensor_parallel_size=2,
-            enable_expert_parallel=True,
-            cudagraph_capture_sizes=[1, 2, 4, 8],
-            distributed_executor_backend="mp",
-            enable_return_routed_experts=True,
+        "Qwen/Qwen3-30B-A3B",
+        tensor_parallel_size=2,
+        enable_expert_parallel=True,
+        cudagraph_capture_sizes=[1, 2, 4, 8],
+        distributed_executor_backend="mp",
+        enable_return_routed_experts=True,
     ) as vllm_model:
         sampling_params = SamplingParams(
-            max_tokens=5,
-            temperature=0.8,
-            top_p=0.95,
-            output_kind=RequestOutputKind.FINAL_ONLY
+            max_tokens=5, temperature=0.8, top_p=0.95, output_kind=RequestOutputKind.FINAL_ONLY
         )
         inputs = vllm_model.get_inputs(prompts=prompts)
         outputs = vllm_model.model.generate(prompts=inputs, sampling_params=sampling_params)
@@ -84,11 +84,7 @@ async def test_models(model: str) -> None:
     request_keyword_args: dict[str, Any] = {
         **api_keyword_args,
     }
-    with RemoteOpenAIServer(model,
-                            server_args,
-                            server_port=port,
-                            env_dict=env_dict,
-                            auto_port=False) as server:
+    with RemoteOpenAIServer(model, server_args, server_port=port, env_dict=env_dict, auto_port=False) as server:
         client = server.get_async_client()
         batch = await client.completions.create(
             model=model,
@@ -13,69 +13,65 @@ MODELS = [

 @pytest.mark.parametrize("model", MODELS)
 def test_deepseek_v2_lite_enable_shared_expert_dp_tp2(model: str) -> None:
-
-    if 'HCCL_OP_EXPANSION_MODE' in os.environ:
-        del os.environ['HCCL_OP_EXPANSION_MODE']
+    if "HCCL_OP_EXPANSION_MODE" in os.environ:
+        del os.environ["HCCL_OP_EXPANSION_MODE"]

     prompts = [
-        "Hello, my name is", "The capital of the United States is",
-        "The capital of France is", "The future of AI is"
+        "Hello, my name is",
+        "The capital of the United States is",
+        "The capital of France is",
+        "The future of AI is",
     ]
     sampling_params = SamplingParams(max_tokens=32, temperature=0.0)

     with VllmRunner(
-            model,
-            max_model_len=1024,
-            enforce_eager=True,
-            tensor_parallel_size=2,
-            enable_expert_parallel=True,
+        model,
+        max_model_len=1024,
+        enforce_eager=True,
+        tensor_parallel_size=2,
+        enable_expert_parallel=True,
     ) as runner:
         vllm_eager_outputs = runner.model.generate(prompts, sampling_params)

     os.environ["VLLM_ASCEND_ENABLE_FLASHCOMM1"] = "1"
     with VllmRunner(
-            model,
-            max_model_len=1024,
-            enforce_eager=True,
-            tensor_parallel_size=2,
-            enable_expert_parallel=True,
-            additional_config={
-                "enable_shared_expert_dp": True,
-            },
+        model,
+        max_model_len=1024,
+        enforce_eager=True,
+        tensor_parallel_size=2,
+        enable_expert_parallel=True,
+        additional_config={
+            "enable_shared_expert_dp": True,
+        },
     ) as runner:
-        shared_expert_dp_eager_outputs = runner.model.generate(
-            prompts, sampling_params)
+        shared_expert_dp_eager_outputs = runner.model.generate(prompts, sampling_params)

     with VllmRunner(
-            model,
-            max_model_len=1024,
-            tensor_parallel_size=2,
-            enable_expert_parallel=True,
-            compilation_config={
-                "cudagraph_capture_sizes": [1, 4, 8, 16],
-                "cudagraph_mode": "FULL_DECODE_ONLY",
-            },
-            additional_config={
-                "enable_shared_expert_dp": True,
-            },
+        model,
+        max_model_len=1024,
+        tensor_parallel_size=2,
+        enable_expert_parallel=True,
+        compilation_config={
+            "cudagraph_capture_sizes": [1, 4, 8, 16],
+            "cudagraph_mode": "FULL_DECODE_ONLY",
+        },
+        additional_config={
+            "enable_shared_expert_dp": True,
+        },
     ) as runner:
-        shared_expert_dp_aclgraph_outputs = runner.model.generate(
-            prompts, sampling_params)
+        shared_expert_dp_aclgraph_outputs = runner.model.generate(prompts, sampling_params)

     vllm_eager_outputs_list = []
     for output in vllm_eager_outputs:
-        vllm_eager_outputs_list.append(
-            (output.outputs[0].index, output.outputs[0].text))
+        vllm_eager_outputs_list.append((output.outputs[0].index, output.outputs[0].text))

     shared_expert_dp_eager_outputs_list = []
     for output in shared_expert_dp_eager_outputs:
-        shared_expert_dp_eager_outputs_list.append(
-            (output.outputs[0].index, output.outputs[0].text))
+        shared_expert_dp_eager_outputs_list.append((output.outputs[0].index, output.outputs[0].text))

     shared_expert_dp_aclgraph_outputs_list = []
     for output in shared_expert_dp_aclgraph_outputs:
-        shared_expert_dp_aclgraph_outputs_list.append(
-            (output.outputs[0].index, output.outputs[0].text))
+        shared_expert_dp_aclgraph_outputs_list.append((output.outputs[0].index, output.outputs[0].text))

     check_outputs_equal(
         outputs_0_lst=vllm_eager_outputs_list,
@@ -39,8 +39,7 @@ api_keyword_args = {
 @pytest.mark.asyncio
 @pytest.mark.parametrize("model", MODELS)
 @pytest.mark.parametrize("dp_size", DATA_PARALLELS)
-async def test_models_single_request_aclgraph_dp2(model: str,
-                                                  dp_size: int) -> None:
+async def test_models_single_request_aclgraph_dp2(model: str, dp_size: int) -> None:
     port = get_open_port()
     env_dict = {
         "TASK_QUEUE_ENABLE": "1",
@@ -48,36 +47,51 @@ async def test_models_single_request_aclgraph_dp2(model: str,
     }
     if model == "vllm-ascend/DeepSeek-V2-Lite-W8A8":
         server_args = [
-            "--no-enable-prefix-caching", "--tensor-parallel-size", "1",
+            "--no-enable-prefix-caching",
+            "--tensor-parallel-size",
+            "1",
             "--data-parallel-size",
-            str(dp_size), "--quantization", "ascend", "--max-model-len",
-            "1024", "--port",
-            str(port), "--trust-remote-code", "--gpu-memory-utilization", "0.9"
+            str(dp_size),
+            "--quantization",
+            "ascend",
+            "--max-model-len",
+            "1024",
+            "--port",
+            str(port),
+            "--trust-remote-code",
+            "--gpu-memory-utilization",
+            "0.9",
         ]
     else:
         server_args = [
-            "--no-enable-prefix-caching", "--tensor-parallel-size", "1",
+            "--no-enable-prefix-caching",
+            "--tensor-parallel-size",
+            "1",
             "--data-parallel-size",
-            str(dp_size), "--port",
-            str(port), "--trust-remote-code", "--gpu-memory-utilization", "0.9"
+            str(dp_size),
+            "--port",
+            str(port),
+            "--trust-remote-code",
+            "--gpu-memory-utilization",
+            "0.9",
        ]
     request_keyword_args: dict[str, Any] = {
         **api_keyword_args,
     }
-    with RemoteOpenAIServer(model,
-                            vllm_serve_args=server_args,
-                            server_port=port,
-                            env_dict=env_dict,
-                            auto_port=False) as server:
+    with RemoteOpenAIServer(
+        model, vllm_serve_args=server_args, server_port=port, env_dict=env_dict, auto_port=False
+    ) as server:
         client = server.get_async_client()

         try:
-            batch = await asyncio.wait_for(client.completions.create(
-                model=model,
-                prompt=prompts,
-                **request_keyword_args,
-            ),
-                                           timeout=10.0)
+            batch = await asyncio.wait_for(
+                client.completions.create(
+                    model=model,
+                    prompt=prompts,
+                    **request_keyword_args,
+                ),
+                timeout=10.0,
+            )
         except asyncio.TimeoutError:
             pytest.fail("Model did not return response within 10 seconds")
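The asyncio.wait_for call reshaped above is the standard way to bound a single awaited request; on timeout it cancels the task and raises asyncio.TimeoutError. A runnable sketch with a sleep standing in for the OpenAI client call:

import asyncio


async def slow_call() -> str:
    await asyncio.sleep(0.1)  # stand-in for client.completions.create(...)
    return "ok"


async def main() -> None:
    try:
        result = await asyncio.wait_for(slow_call(), timeout=10.0)
        print(result)
    except asyncio.TimeoutError:
        print("Model did not return a response within 10 seconds")


asyncio.run(main())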
@@ -1,5 +1,3 @@
-import os
-
 import pytest
 from vllm import SamplingParams
@@ -14,47 +12,46 @@ MODELS = [
 @pytest.mark.parametrize("model", MODELS)
 def test_qwen3_vl_sp_tp2(model: str) -> None:
     prompts = [
-        "Hello, my name is", "The capital of the United States is",
-        "The capital of France is", "The future of AI is"
+        "Hello, my name is",
+        "The capital of the United States is",
+        "The capital of France is",
+        "The future of AI is",
     ]
     sampling_params = SamplingParams(max_tokens=10, temperature=0.0)

     with VllmRunner(
-            model,
-            max_model_len=1024,
-            tensor_parallel_size=2,
-            compilation_config={
-                "cudagraph_capture_sizes": [2, 4],
-                "cudagraph_mode": "FULL_DECODE_ONLY",
-                "pass_config": {"enable_sp": False}
-            },
-            additional_config={"ascend_compilation_config": {"enable_npugraph_ex": False}}
+        model,
+        max_model_len=1024,
+        tensor_parallel_size=2,
+        compilation_config={
+            "cudagraph_capture_sizes": [2, 4],
+            "cudagraph_mode": "FULL_DECODE_ONLY",
+            "pass_config": {"enable_sp": False},
+        },
+        additional_config={"ascend_compilation_config": {"enable_npugraph_ex": False}},
     ) as runner:
         no_sp_outputs = runner.model.generate(prompts, sampling_params)

     with VllmRunner(
-            model,
-            max_model_len=1024,
-            tensor_parallel_size=2,
-            compilation_config={
-                "cudagraph_capture_sizes": [2, 4],
-                "cudagraph_mode": "FULL_DECODE_ONLY",
-                "pass_config": {"enable_sp": True}
-            },
-            additional_config={"sp_threshold": 10, "ascend_compilation_config": {"enable_npugraph_ex": False}}
+        model,
+        max_model_len=1024,
+        tensor_parallel_size=2,
+        compilation_config={
+            "cudagraph_capture_sizes": [2, 4],
+            "cudagraph_mode": "FULL_DECODE_ONLY",
+            "pass_config": {"enable_sp": True},
+        },
+        additional_config={"sp_threshold": 10, "ascend_compilation_config": {"enable_npugraph_ex": False}},
     ) as runner:
-        sp_outputs = runner.model.generate(
-            prompts, sampling_params)
+        sp_outputs = runner.model.generate(prompts, sampling_params)

     no_sp_outputs_list = []
     for output in no_sp_outputs:
-        no_sp_outputs_list.append(
-            (output.outputs[0].index, output.outputs[0].text))
+        no_sp_outputs_list.append((output.outputs[0].index, output.outputs[0].text))

     sp_outputs_list = []
     for output in sp_outputs:
-        sp_outputs_list.append(
-            (output.outputs[0].index, output.outputs[0].text))
+        sp_outputs_list.append((output.outputs[0].index, output.outputs[0].text))

     check_outputs_equal(
         outputs_0_lst=no_sp_outputs_list,