### What this PR does / why we need it?
**Scope of Changes**:
| File Path |
| :--- |
| `tests/e2e/310p/multicard/test_vl_model_multicard.py` |
| `tests/e2e/310p/singlecard/test_vl_model_singlecard.py` |
| `tests/e2e/310p/test_utils.py` |
| `tests/e2e/conftest.py` |
| `tests/e2e/model_utils.py` |
| `tests/e2e/models/conftest.py` |
| `tests/e2e/models/test_lm_eval_correctness.py` |
| `tests/e2e/multicard/2-cards/spec_decode/test_spec_decode.py` |
| `tests/e2e/multicard/2-cards/test_aclgraph_capture_replay.py` |
| `tests/e2e/multicard/2-cards/test_data_parallel.py` |
| `tests/e2e/multicard/2-cards/test_disaggregated_encoder.py` |
| `tests/e2e/multicard/2-cards/test_expert_parallel.py` |
| `tests/e2e/multicard/2-cards/test_external_launcher.py` |
| `tests/e2e/multicard/2-cards/test_full_graph_mode.py` |
| `tests/e2e/multicard/2-cards/test_ilama_lora_tp2.py` |
| `tests/e2e/multicard/2-cards/test_offline_inference_distributed.py` |
| `tests/e2e/multicard/2-cards/test_offline_weight_load.py` |
| `tests/e2e/multicard/2-cards/test_pipeline_parallel.py` |
| `tests/e2e/multicard/2-cards/test_prefix_caching.py` |
| `tests/e2e/multicard/2-cards/test_quantization.py` |
| `tests/e2e/multicard/2-cards/test_qwen3_moe.py` |
| `tests/e2e/multicard/2-cards/test_qwen3_moe_routing_replay.py` |
| `tests/e2e/multicard/2-cards/test_qwen3_performance.py` |
| `tests/e2e/multicard/2-cards/test_shared_expert_dp.py` |
| `tests/e2e/multicard/2-cards/test_single_request_aclgraph.py` |
| `tests/e2e/multicard/2-cards/test_sp_pass.py` |
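
As a local sanity check, the reformatted batch can be re-verified before review. This is a minimal sketch (it assumes `ruff` is installed and picks up the repository's `pyproject.toml`; the path list below is illustrative, not the full scope of this PR):

```python
# Minimal sketch: re-check formatting/lint of the Batch (1) e2e paths locally.
# Assumes ruff is installed and configured by the repository's pyproject.toml;
# the path list here is illustrative, not exhaustive.
import subprocess

paths = [
    "tests/e2e/conftest.py",
    "tests/e2e/model_utils.py",
    "tests/e2e/310p/",
    "tests/e2e/models/",
    "tests/e2e/multicard/2-cards/",
]
subprocess.run(["ruff", "format", "--check", *paths], check=True)  # formatting only, no rewrites
subprocess.run(["ruff", "check", *paths], check=True)  # lint rules
```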
### Does this PR introduce _any_ user-facing change?
### How was this patch tested?
- vLLM version: v0.15.0
- vLLM main: 9562912cea
Signed-off-by: MrZ20 <2609716663@qq.com>
Co-authored-by: wangxiyuan <wangxiyuan1007@gmail.com>
```diff
@@ -51,34 +51,25 @@ line-length = 120
 # Folder to be modified
 exclude = [
     # Batch (1)
-    "tests/e2e/__init__.py",
-    "tests/e2e/310p/",
-    "tests/e2e/conftest.py",
-    "tests/e2e/doctests/",
-    "tests/e2e/model_utils.py",
-    "tests/e2e/models/",
-    "tests/e2e/multicard/2-cards/",
+    # "tests/e2e/__init__.py",
+    # "tests/e2e/310p/",
+    # "tests/e2e/conftest.py",
+    # "tests/e2e/doctests/",
+    # "tests/e2e/model_utils.py",
+    # "tests/e2e/models/",
+    # "tests/e2e/multicard/2-cards/",

     # Batch (2)
     "tests/e2e/multicard/4-cards/",
     "tests/e2e/nightly/multi_node/",

-    # Batch (3)
-    "tests/e2e/nightly/single_node/models/",
-
-    # Batch (4)
-    "tests/e2e/nightly/single_node/ops/",
-
-    # Batch (5)
-    # "tests/e2e/singlecard/",
-
-    # Batch (6)
-    "tests/e2e/nightly/single_node/ops/singlecard_ops/triton/",
     "tests/e2e/singlecard/pooling/",
     "tests/e2e/singlecard/spec_decode/",
     "tests/e2e/utils.py",
     "tests/e2e/vllm_interface/",
     "tests/e2e/weekly/",

+    # Batch (3)
+    "tests/e2e/nightly/single_node/",
+
     "tests/ut/",
 ]
```
```diff
@@ -15,28 +15,23 @@
 # limitations under the License.
 # This file is a part of the vllm-ascend project.

-import sys
 import os
+import sys

 # Add 310p directory to sys.path
 current_dir = os.path.dirname(os.path.abspath(__file__))
 parent_dir = os.path.dirname(current_dir)  # 310p directory
 sys.path.insert(0, parent_dir)

+# ruff: noqa: E402
 from test_utils import run_vl_model_test


 def test_qwen3_vl_8b_tp2_fp16():
     """Qwen3-VL-8B dual-card FP16 test"""
-    run_vl_model_test(
-        model_name="Qwen/Qwen3-VL-8B-Instruct",
-        tensor_parallel_size=2,
-        max_tokens=5
-    )
+    run_vl_model_test(model_name="Qwen/Qwen3-VL-8B-Instruct", tensor_parallel_size=2, max_tokens=5)


 def test_qwen3_vl_32b_tp1_fp16():
     """Qwen3-VL-32B 4-card FP16 test"""
-    run_vl_model_test(
-        model_name="Qwen/Qwen3-VL-32B-Instruct",
-        tensor_parallel_size=4,
-        max_tokens=5
-    )
+    run_vl_model_test(model_name="Qwen/Qwen3-VL-32B-Instruct", tensor_parallel_size=4, max_tokens=5)
```
```diff
@@ -15,20 +15,18 @@
 # limitations under the License.
 # This file is a part of the vllm-ascend project.

-import sys
 import os
+import sys

 # Add 310p directory to sys.path
 current_dir = os.path.dirname(os.path.abspath(__file__))
 parent_dir = os.path.dirname(current_dir)  # 310p directory
 sys.path.insert(0, parent_dir)

+# ruff: noqa: E402
 from test_utils import run_vl_model_test


 def test_qwen3_vl_8b_tp1_fp16():
     """Qwen3-VL-8B single-card FP16 test"""
-    run_vl_model_test(
-        model_name="Qwen/Qwen3-VL-8B-Instruct",
-        tensor_parallel_size=1,
-        max_tokens=5
-    )
+    run_vl_model_test(model_name="Qwen/Qwen3-VL-8B-Instruct", tensor_parallel_size=1, max_tokens=5)
```
```diff
@@ -15,10 +15,12 @@
 # limitations under the License.
 # This file is a part of the vllm-ascend project.

-from tests.e2e.conftest import VllmRunner
-from PIL import Image
 import os
+
+from PIL import Image
+
+from tests.e2e.conftest import VllmRunner


 def get_test_image():
     """Get the image object for testing"""
@@ -32,14 +34,12 @@ def get_test_prompts():
     return ["<|image_pad|>Describe this image in detail."]


-def run_vl_model_test(model_name: str,
-                      tensor_parallel_size: int,
-                      max_tokens: int,
-                      dtype: str = "float16",
-                      enforce_eager: bool = True):
+def run_vl_model_test(
+    model_name: str, tensor_parallel_size: int, max_tokens: int, dtype: str = "float16", enforce_eager: bool = True
+):
     """
     Generic visual language model test function

     Args:
         model_name: Model name, e.g., "Qwen/Qwen3-VL-4B"
         tensor_parallel_size: Tensor parallel size
@@ -52,9 +52,6 @@ def run_vl_model_test(model_name: str,
     prompts = get_test_prompts()

     with VllmRunner(
-            model_name,
-            tensor_parallel_size=tensor_parallel_size,
-            enforce_eager=enforce_eager,
-            dtype=dtype
+        model_name, tensor_parallel_size=tensor_parallel_size, enforce_eager=enforce_eager, dtype=dtype
     ) as vllm_model:
         vllm_model.generate_greedy(prompts, max_tokens, images=images)
```
```diff
@@ -32,7 +32,7 @@ import threading
 import time
 import traceback
 from pathlib import Path
-from typing import Any, Optional, Tuple, TypeVar, Union
+from typing import Any, TypeVar

 import numpy as np
 import openai
@@ -44,23 +44,20 @@ from modelscope import snapshot_download # type: ignore[import-untyped]
 from PIL import Image
 from requests.exceptions import RequestException
 from torch import nn
-from transformers import (AutoConfig, AutoModelForCausalLM, AutoTokenizer,
-                          BatchEncoding, BatchFeature)
+from transformers import AutoConfig, AutoModelForCausalLM, AutoTokenizer, BatchEncoding, BatchFeature
 from transformers.models.auto.auto_factory import _BaseAutoModelClass
 from vllm import LLM, SamplingParams
-from vllm.config.model import (ConvertOption, RunnerOption,
-                               _get_and_verify_dtype)
+from vllm.config.model import ConvertOption, RunnerOption, _get_and_verify_dtype
 from vllm.inputs import TextPrompt
 from vllm.outputs import RequestOutput
 from vllm.platforms import current_platform
 from vllm.transformers_utils.utils import maybe_model_redirect
 from vllm.utils.network_utils import get_open_port

-from tests.e2e.model_utils import (TokensTextLogprobs,
-                                   TokensTextLogprobsPromptLogprobs)
-from tests.e2e.nightly.multi_node.scripts.multi_node_config import (
-    DisaggregatedPrefillCfg, NodeInfo)
+from tests.e2e.model_utils import TokensTextLogprobs, TokensTextLogprobsPromptLogprobs
+from tests.e2e.nightly.multi_node.scripts.multi_node_config import DisaggregatedPrefillCfg, NodeInfo
 from vllm_ascend.ascend_config import clear_ascend_config

 # TODO: remove this part after the patch merged into vllm, if
 # we not explicitly patch here, some of them might be effectiveless
 # in pytest scenario
@@ -70,41 +67,41 @@ adapt_patch(True)
 adapt_patch(False)

 from vllm.distributed.parallel_state import (  # noqa E402
-    destroy_distributed_environment, destroy_model_parallel)
+    destroy_distributed_environment,
+    destroy_model_parallel,
+)

 _T = TypeVar("_T", nn.Module, torch.Tensor, BatchEncoding, BatchFeature, dict)
 _M = TypeVar("_M")

-_PromptMultiModalInput = Union[list[_M], list[list[_M]]]
+_PromptMultiModalInput = list[_M] | list[list[_M]]

 PromptImageInput = _PromptMultiModalInput[Image.Image]
-PromptAudioInput = _PromptMultiModalInput[Tuple[np.ndarray, int]]
+PromptAudioInput = _PromptMultiModalInput[tuple[np.ndarray, int]]
 PromptVideoInput = _PromptMultiModalInput[np.ndarray]
 logger = logging.getLogger(__name__)

 _TEST_DIR = os.path.dirname(__file__)
 _LONG_PROMPTS = [os.path.join(_TEST_DIR, "prompts", "long_prompt.txt")]

-DISAGG_EPD_PROXY_SCRIPT = Path(
-    __file__
-).parent.parent.parent / "examples" / "disaggregated_encoder" / "disagg_epd_proxy.py"
+DISAGG_EPD_PROXY_SCRIPT = (
+    Path(__file__).parent.parent.parent / "examples" / "disaggregated_encoder" / "disagg_epd_proxy.py"
+)


 def _check_npu_memory_worker(target_free_percentage: float, max_wait_seconds: float):
-    import torch_npu  # type: ignore

     # We can try to clean up memory in this subprocess, though it mostly affects this process.
     # But if there are any lingering contexts in this process (unlikely for a fresh spawn), it helps.
     gc.collect()
     torch.npu.empty_cache()

     _, total_npu_memory = torch.npu.mem_get_info()
     start_time = time.time()

     while True:
         free_bytes, _ = torch.npu.mem_get_info()
         if free_bytes / total_npu_memory >= target_free_percentage:
-            print(f'check_npu_memory_worker: npu free memory decreased target value.')
+            print("check_npu_memory_worker: npu free memory decreased target value.")
             return  # Success

         elapsed = time.time() - start_time
```
```diff
@@ -113,7 +110,7 @@ def _check_npu_memory_worker(target_free_percentage: float, max_wait_seconds: fl
             print(
                 f"Timeout: NPU memory free size did not reach "
                 f"{target_free_percentage} of total npu memory within {max_wait_seconds} seconds.",
-                file=sys.stderr
+                file=sys.stderr,
             )
             sys.exit(1)  # Failure

@@ -135,21 +132,19 @@ def wait_until_npu_memory_free(target_free_percentage: float = 0.5, max_wait_sec
         target_free_percentage (float): Target free memory percentage of total.
         max_wait_seconds (float): Maximum wait time in seconds.
     """

     def decorator(func):
         @functools.wraps(func)
         def wrapper(*args, **kwargs):
             # Clean up non-NPU resources in the main process
             cleanup_dist_env_and_memory()

             # Use a spawned subprocess to check NPU memory to avoid initializing NPU in the main process
             ctx = multiprocessing.get_context("spawn")
-            p = ctx.Process(
-                target=_check_npu_memory_worker,
-                args=(target_free_percentage, max_wait_seconds)
-            )
+            p = ctx.Process(target=_check_npu_memory_worker, args=(target_free_percentage, max_wait_seconds))
             p.start()
             p.join()

             if p.exitcode != 0:
                 raise TimeoutError(
                     f"Timeout: NPU memory free size did not reach "
@@ -157,7 +152,9 @@ def wait_until_npu_memory_free(target_free_percentage: float = 0.5, max_wait_sec
                 )

             return func(*args, **kwargs)

         return wrapper

     return decorator


@@ -168,9 +165,10 @@ def cleanup_dist_env_and_memory(shutdown_ray: bool = False):
         torch.distributed.destroy_process_group()
     if shutdown_ray:
         import ray  # Lazy import Ray

         ray.shutdown()
     gc.collect()

     # Only clean NPU cache if NPU is already initialized/available in this process.
     # This prevents accidental initialization of NPU context in the main process,
     # which would break subsequent forks.
@@ -180,7 +178,6 @@ def cleanup_dist_env_and_memory(shutdown_ray: bool = False):


 class MooncakeLauncher:

     def __init__(
         self,
         mooncake_port,
@@ -228,14 +225,12 @@ class MooncakeLauncher:
 class RemoteOpenAIServer:
     DUMMY_API_KEY = "token-abc123"  # vLLM's OpenAI server does not need API key

-    def _start_server(self, model: str, server_cmd: list[str],
-                      env_dict: Optional[dict[str, str]]) -> None:
-        """Subclasses override this method to customize server process launch
-        """
+    def _start_server(self, model: str, server_cmd: list[str], env_dict: dict[str, str] | None) -> None:
+        """Subclasses override this method to customize server process launch"""
         env = os.environ.copy()
         # the current process might initialize npu,
         # to be safe, we should use spawn method
-        env['VLLM_WORKER_MULTIPROC_METHOD'] = 'spawn'
+        env["VLLM_WORKER_MULTIPROC_METHOD"] = "spawn"
         if env_dict is not None:
             env.update(env_dict)
         logger.info(f"Starting server with command: {' '.join(server_cmd)}")
```
```diff
@@ -247,47 +242,41 @@ class RemoteOpenAIServer:
         )

     def __init__(
         self,
         model: str,
-        vllm_serve_args: Union[list[str], str],
+        vllm_serve_args: list[str] | str,
         *,
-        server_host: str = '0.0.0.0',
+        server_host: str = "0.0.0.0",
         server_port: int = 8080,
-        env_dict: Optional[dict[str, str]] = None,
-        seed: Optional[int] = None,
+        env_dict: dict[str, str] | None = None,
+        seed: int | None = None,
         auto_port: bool = True,
-        nodes_info: Optional[list[NodeInfo]] = None,
-        disaggregated_prefill: Optional[DisaggregatedPrefillCfg] = None,
-        proxy_port: Optional[int] = None,
-        max_wait_seconds: Optional[float] = None,
-        override_hf_configs: Optional[dict[str, Any]] = None) -> None:
+        nodes_info: list[NodeInfo] | None = None,
+        disaggregated_prefill: DisaggregatedPrefillCfg | None = None,
+        proxy_port: int | None = None,
+        max_wait_seconds: float | None = None,
+        override_hf_configs: dict[str, Any] | None = None,
+    ) -> None:
         if isinstance(vllm_serve_args, str):
             vllm_serve_args = shlex.split(vllm_serve_args)
         else:
             vllm_serve_args = ["vllm", "serve", model, *vllm_serve_args]
         if auto_port:
             if "-p" in vllm_serve_args or "--port" in vllm_serve_args:
-                raise ValueError("You have manually specified the port "
-                                 "when `auto_port=True`.")
+                raise ValueError("You have manually specified the port when `auto_port=True`.")

             # No need for a port if using unix sockets
             if "--uds" not in vllm_serve_args:
                 # Don't mutate the input args
-                vllm_serve_args = vllm_serve_args + [
-                    "--port", str(get_open_port())
-                ]
+                vllm_serve_args = vllm_serve_args + ["--port", str(get_open_port())]
         if seed is not None:
             if "--seed" in vllm_serve_args:
-                raise ValueError("You have manually specified the seed "
-                                 f"when `seed={seed}`.")
+                raise ValueError(f"You have manually specified the seed when `seed={seed}`.")

             vllm_serve_args = vllm_serve_args + ["--seed", str(seed)]

         if override_hf_configs is not None:
-            vllm_serve_args = vllm_serve_args + [
-                "--hf-overrides",
-                json.dumps(override_hf_configs)
-            ]
+            vllm_serve_args = vllm_serve_args + ["--hf-overrides", json.dumps(override_hf_configs)]

         self.host = str(server_host)
         self.port = int(server_port)
@@ -303,9 +292,7 @@ class RemoteOpenAIServer:
             assert proxy_port is not None, "for disaggregated_prefill, proxy port must be provided"
             self._wait_for_server_pd(timeout=max_wait_seconds)
         else:
-            self._wait_for_multiple_servers(
-                [(self.host, self.url_for("health"))],
-                timeout=max_wait_seconds)
+            self._wait_for_multiple_servers([(self.host, self.url_for("health"))], timeout=max_wait_seconds)

     def __enter__(self):
         return self
@@ -313,7 +300,7 @@ class RemoteOpenAIServer:
     def __exit__(self, exc_type, exc_value, traceback):
         self._terminate_server()

-    def _poll(self) -> Optional[int]:
+    def _poll(self) -> int | None:
         """Subclasses override this method to customize process polling"""
         return self.proc.poll()

```
```diff
@@ -345,24 +332,23 @@ class RemoteOpenAIServer:
         def url_health(ip: str, port: int) -> str:
             return f"http://{ip}:{port}/health"

-        targets = [(node_info.ip, url_health(node_info.ip, self.port))
-                   for node_info in self.nodes_info if not node_info.headless]
+        targets = [
+            (node_info.ip, url_health(node_info.ip, self.port))
+            for node_info in self.nodes_info
+            if not node_info.headless
+        ]

         # Wait for proxy ready
         master_node = self.nodes_info[0]
         url_proxy = f"http://{master_node.ip}:{proxy_port}/healthcheck"

         # Wait for master node proxy first
-        self._wait_for_multiple_servers([(master_node.ip, url_proxy)],
-                                        timeout=timeout)
+        self._wait_for_multiple_servers([(master_node.ip, url_proxy)], timeout=timeout)

         # Then wait for all api_server nodes
         self._wait_for_multiple_servers(targets=targets, timeout=timeout)

-    def _wait_for_multiple_servers(self,
-                                   targets,
-                                   timeout: float,
-                                   log_interval: float = 30.0):
+    def _wait_for_multiple_servers(self, targets, timeout: float, log_interval: float = 30.0):
         """
         targets: List[(node_ip, url)]
         log_interval
@@ -396,9 +382,7 @@ class RemoteOpenAIServer:
                 # check unexpected exit
                 result = self._poll()
                 if result is not None and result != 0:
-                    raise RuntimeError(
-                        f"Server at {node_ip} exited unexpectedly."
-                    ) from None
+                    raise RuntimeError(f"Server at {node_ip} exited unexpectedly.") from None

                 if should_log:
                     last_log_time = now
@@ -444,35 +428,31 @@ class RemoteOpenAIServer:
     def get_async_client(self, **kwargs):
         if "timeout" not in kwargs:
             kwargs["timeout"] = 600
-        return openai.AsyncOpenAI(base_url=self.url_for("v1"),
-                                  api_key=self.DUMMY_API_KEY,
-                                  max_retries=0,
-                                  **kwargs)
+        return openai.AsyncOpenAI(base_url=self.url_for("v1"), api_key=self.DUMMY_API_KEY, max_retries=0, **kwargs)


 class RemoteEPDServer(RemoteOpenAIServer):
-    def _start_server(self, model: str, server_cmd: list[str],
-                      env_dict: Optional[dict[str, str]]) -> None:
-        """Subclasses override this method to customize server process launch
-        """
+    def _start_server(self, model: str, server_cmd: list[str], env_dict: dict[str, str] | None) -> None:
+        """Subclasses override this method to customize server process launch"""
         raise NotImplementedError("RemoteEPDServer should use _start_server_with_prefix instead")

-    def __init__(self,
-                 vllm_serve_args: Union[list[str], list[list[str]]],
-                 server_host: str = '0.0.0.0',
-                 env_dict: Optional[dict[str, str]] = None,
-                 max_wait_seconds: Optional[float] = 2800) -> None:
+    def __init__(
+        self,
+        vllm_serve_args: list[str] | list[list[str]],
+        server_host: str = "0.0.0.0",
+        env_dict: dict[str, str] | None = None,
+        max_wait_seconds: float | None = 2800,
+    ) -> None:
         self._proc_list = []

         self.env_dict: dict[str, str] = {}
         if env_dict is not None:
             self.env_dict.update(env_dict)

-        self.env_dict['VLLM_ALLOW_LONG_MAX_MODEL_LEN'] = "1"
-        self.env_dict['VLLM_USE_V1'] = "1"
-        self.env_dict['PYTORCH_NPU_ALLOC_CONF'] = "expandable_segments:True"
-        self.env_dict['VLLM_WORKER_MULTIPROC_METHOD'] = 'spawn'
+        self.env_dict["VLLM_ALLOW_LONG_MAX_MODEL_LEN"] = "1"
+        self.env_dict["VLLM_USE_V1"] = "1"
+        self.env_dict["PYTORCH_NPU_ALLOC_CONF"] = "expandable_segments:True"
+        self.env_dict["VLLM_WORKER_MULTIPROC_METHOD"] = "spawn"

         self.vllm_serve_args_list = []
         self.health_url_list = []
@@ -484,8 +464,7 @@ class RemoteEPDServer(RemoteOpenAIServer):
                 self.vllm_serve_args_list.append([str(arg) for arg in args_copy])
             else:
                 self.vllm_serve_args_list = [
-                    [str(arg) for arg in sublist]
-                    for sublist in copy.deepcopy(vllm_serve_args)
+                    [str(arg) for arg in sublist] for sublist in copy.deepcopy(vllm_serve_args)
                 ]
         else:
             raise RuntimeError("vllm_serves_args must be a list")
@@ -493,7 +472,7 @@ class RemoteEPDServer(RemoteOpenAIServer):
         serve_arg_cmd = ["vllm", "serve"]

         for i, vllm_serve_arg in enumerate(self.vllm_serve_args_list):
-            self.env_dict['ASCEND_RT_VISIBLE_DEVICES'] = str(i)
+            self.env_dict["ASCEND_RT_VISIBLE_DEVICES"] = str(i)
             if isinstance(vllm_serve_arg, list):
                 if "--port" not in vllm_serve_arg:
                     raise ValueError("You have manually specified the port ")
```
```diff
@@ -514,16 +493,13 @@ class RemoteEPDServer(RemoteOpenAIServer):

             self.health_url_list.append(super().url_for("health"))
             vllm_serve_arg = [*serve_arg_cmd, *vllm_serve_arg]
-            proc = self._start_server_with_prefix(vllm_serve_arg, self.env_dict,
-                                                  f"[VLLM_{i}] ")
+            proc = self._start_server_with_prefix(vllm_serve_arg, self.env_dict, f"[VLLM_{i}] ")
             self._proc_list.append(proc)

         timeout_value = float(max_wait_seconds) if max_wait_seconds is not None else 2800.0
-        super()._wait_for_multiple_servers([(self.host, url)
-                                            for url in self.health_url_list],
-                                           timeout=timeout_value)
+        super()._wait_for_multiple_servers([(self.host, url) for url in self.health_url_list], timeout=timeout_value)

-    def _poll(self) -> Optional[int]:
+    def _poll(self) -> int | None:
         return None

     def _delete_shm(self) -> None:
@@ -542,31 +518,23 @@ class RemoteEPDServer(RemoteOpenAIServer):
     def _read_output(self, pipe, prefix):
         try:
             with pipe:
-                for line in iter(pipe.readline, ''):
+                for line in iter(pipe.readline, ""):
                     if line:
-                        print(f"{prefix}: {line}", end='')
+                        print(f"{prefix}: {line}", end="")

         except Exception as e:
             print(f"error: {e}")
             traceback.print_exc()

-    def _start_server_with_prefix(self, server_cmd: list[str],
-                                  env_dict: Optional[dict[str, str]], log_prefix: str):
+    def _start_server_with_prefix(self, server_cmd: list[str], env_dict: dict[str, str] | None, log_prefix: str):
         env = os.environ.copy()
         if env_dict is not None:
             env.update(env_dict)
-        proc = subprocess.Popen(server_cmd,
-                                env=env,
-                                stdout=subprocess.PIPE,
-                                stderr=subprocess.PIPE,
-                                universal_newlines=True,
-                                bufsize=1)
-        stdout_thread = threading.Thread(target=self._read_output,
-                                         args=(proc.stdout, log_prefix),
-                                         daemon=True)
-        stderr_thread = threading.Thread(target=self._read_output,
-                                         args=(proc.stderr, log_prefix),
-                                         daemon=True)
+        proc = subprocess.Popen(
+            server_cmd, env=env, stdout=subprocess.PIPE, stderr=subprocess.PIPE, universal_newlines=True, bufsize=1
+        )
+        stdout_thread = threading.Thread(target=self._read_output, args=(proc.stdout, log_prefix), daemon=True)
+        stderr_thread = threading.Thread(target=self._read_output, args=(proc.stderr, log_prefix), daemon=True)

         stdout_thread.start()
         stderr_thread.start()
@@ -579,27 +547,21 @@ class RemoteEPDServer(RemoteOpenAIServer):
         parent = psutil.Process(proc.pid)
         children = parent.children(recursive=True)
         for child in children:
-            try:
+            with contextlib.suppress(psutil.NoSuchProcess):
                 child.terminate()
-            except psutil.NoSuchProcess:
-                pass

         gone, still_alive = psutil.wait_procs(children, timeout=10)

         for child in still_alive:
-            try:
+            with contextlib.suppress(psutil.NoSuchProcess):
                 child.kill()
-            except psutil.NoSuchProcess:
-                pass

         try:
             parent.terminate()
             parent.wait(timeout=10)
         except (psutil.NoSuchProcess, psutil.TimeoutExpired):
-            try:
+            with contextlib.suppress(psutil.NoSuchProcess):
                 parent.kill()
-            except psutil.NoSuchProcess:
-                pass

     def __enter__(self):
         """Context manager entry point."""
@@ -611,13 +573,13 @@ class RemoteEPDServer(RemoteOpenAIServer):


 class DisaggEpdProxy(RemoteEPDServer):
-    def __init__(self,
-                 proxy_args: Optional[Union[list[str], str]] = None,
-                 env_dict: Optional[dict[str, str]] = None,
-                 server_host: str = '0.0.0.0',
-                 max_wait_seconds: Optional[float] = 2800) -> None:
+    def __init__(
+        self,
+        proxy_args: list[str] | str | None = None,
+        env_dict: dict[str, str] | None = None,
+        server_host: str = "0.0.0.0",
+        max_wait_seconds: float | None = 2800,
+    ) -> None:
         if proxy_args is None:
             proxy_args_list: list[str] = []
         elif isinstance(proxy_args, str):
@@ -648,8 +610,7 @@ class DisaggEpdProxy(RemoteEPDServer):
         self.port = int(port_str)

         timeout_value = float(max_wait_seconds) if max_wait_seconds is not None else 2800.0
-        super()._wait_for_multiple_servers(
-            [(self.host, super().url_for("health"))], timeout=timeout_value)
+        super()._wait_for_multiple_servers([(self.host, super().url_for("health"))], timeout=timeout_value)

     def __enter__(self):
         """Context manager entry point."""
```
```diff
@@ -661,23 +622,22 @@ class DisaggEpdProxy(RemoteEPDServer):


 class VllmRunner:

     def __init__(
         self,
         model_name: str,
         runner: RunnerOption = "auto",
         convert: ConvertOption = "auto",
-        tokenizer_name: Optional[str] = None,
+        tokenizer_name: str | None = None,
         tokenizer_mode: str = "auto",
-        max_model_len: Optional[int] = 1024,
+        max_model_len: int | None = 1024,
         dtype: str = "auto",
         disable_log_stats: bool = True,
         tensor_parallel_size: int = 1,
         block_size: int = 16,
         enable_chunked_prefill: bool = True,
         swap_space: int = 4,
-        enforce_eager: Optional[bool] = False,
-        quantization: Optional[str] = None,
+        enforce_eager: bool | None = False,
+        quantization: str | None = None,
         **kwargs,
     ) -> None:
         self.model = LLM(
@@ -701,17 +661,13 @@ class VllmRunner:

     def get_inputs(
         self,
-        prompts: Union[list[str], list[torch.Tensor], list[int]],
-        images: Optional[PromptImageInput] = None,
-        videos: Optional[PromptVideoInput] = None,
-        audios: Optional[PromptAudioInput] = None,
+        prompts: list[str] | list[torch.Tensor] | list[int],
+        images: PromptImageInput | None = None,
+        videos: PromptVideoInput | None = None,
+        audios: PromptAudioInput | None = None,
     ) -> list[TextPrompt]:
-        if any(x is not None and len(x) != len(prompts)
-               for x in [images, videos, audios]):
-            raise ValueError(
-                "All non-None multimodal inputs must have the same length as "
-                "prompts")
+        if any(x is not None and len(x) != len(prompts) for x in [images, videos, audios]):
+            raise ValueError("All non-None multimodal inputs must have the same length as prompts")

         inputs = []
         for i, prompt in enumerate(prompts):
@@ -719,13 +675,11 @@ class VllmRunner:
             if images is not None and (image := images[i]) is not None:
                 multi_modal_data["image"] = image
             if videos is not None and (video := videos[i]) is not None:
                 multi_modal_data["video"] = video  # type: ignore
             if audios is not None and (audio := audios[i]) is not None:
                 multi_modal_data["audio"] = audio  # type: ignore

-            text_prompt_kwargs: dict[str, Any] = {
-                "multi_modal_data": multi_modal_data or None
-            }
+            text_prompt_kwargs: dict[str, Any] = {"multi_modal_data": multi_modal_data or None}
             if isinstance(prompt, str):
                 text_prompt_kwargs["prompt"] = prompt
             elif isinstance(prompt, list):
@@ -739,21 +693,16 @@ class VllmRunner:

     def generate(
         self,
-        prompts: Union[list[str], list[torch.Tensor]],
+        prompts: list[str] | list[torch.Tensor],
         sampling_params: SamplingParams,
-        images: Optional[PromptImageInput] = None,
-        videos: Optional[PromptVideoInput] = None,
-        audios: Optional[PromptAudioInput] = None,
+        images: PromptImageInput | None = None,
+        videos: PromptVideoInput | None = None,
+        audios: PromptAudioInput | None = None,
         **kwargs: Any,
     ) -> list[tuple[list[list[int]], list[str]]]:
-        inputs = self.get_inputs(prompts,
-                                 images=images,
-                                 videos=videos,
-                                 audios=audios)
+        inputs = self.get_inputs(prompts, images=images, videos=videos, audios=audios)

-        req_outputs = self.model.generate(inputs,
-                                          sampling_params=sampling_params,
-                                          **kwargs)
+        req_outputs = self.model.generate(inputs, sampling_params=sampling_params, **kwargs)

         outputs: list[tuple[list[list[int]], list[str]]] = []
         for req_output in req_outputs:
```
```diff
@@ -780,99 +729,83 @@ class VllmRunner:
                 output_str = sample.text
                 output_ids = list(sample.token_ids)
                 output_logprobs = sample.logprobs
-            outputs.append((output_ids, output_str, output_logprobs,
-                            req_output.prompt_logprobs))
+            outputs.append((output_ids, output_str, output_logprobs, req_output.prompt_logprobs))
         return outputs

     def generate_w_logprobs(
         self,
         prompts: list[str],
         sampling_params: SamplingParams,
-        images: Optional[PromptImageInput] = None,
-        audios: Optional[PromptAudioInput] = None,
-        videos: Optional[PromptVideoInput] = None,
+        images: PromptImageInput | None = None,
+        audios: PromptAudioInput | None = None,
+        videos: PromptVideoInput | None = None,
         **kwargs: Any,
-    ) -> Union[list[TokensTextLogprobs],
-               list[TokensTextLogprobsPromptLogprobs]]:
-        inputs = self.get_inputs(prompts,
-                                 images=images,
-                                 videos=videos,
-                                 audios=audios)
+    ) -> list[TokensTextLogprobs] | list[TokensTextLogprobsPromptLogprobs]:
+        inputs = self.get_inputs(prompts, images=images, videos=videos, audios=audios)

-        req_outputs = self.model.generate(inputs,
-                                          sampling_params=sampling_params,
-                                          **kwargs)
+        req_outputs = self.model.generate(inputs, sampling_params=sampling_params, **kwargs)

-        toks_str_logsprobs_prompt_logprobs = (
-            self._final_steps_generate_w_logprobs(req_outputs))
+        toks_str_logsprobs_prompt_logprobs = self._final_steps_generate_w_logprobs(req_outputs)
         # Omit prompt logprobs if not required by sampling params
-        return ([x[0:-1] for x in toks_str_logsprobs_prompt_logprobs]
-                if sampling_params.prompt_logprobs is None else
-                toks_str_logsprobs_prompt_logprobs)
+        return (
+            [x[0:-1] for x in toks_str_logsprobs_prompt_logprobs]
+            if sampling_params.prompt_logprobs is None
+            else toks_str_logsprobs_prompt_logprobs
+        )

     def generate_greedy(
         self,
-        prompts: Union[list[str], list[torch.Tensor]],
+        prompts: list[str] | list[torch.Tensor],
         max_tokens: int,
-        images: Optional[PromptImageInput] = None,
-        videos: Optional[PromptVideoInput] = None,
-        audios: Optional[PromptAudioInput] = None,
+        images: PromptImageInput | None = None,
+        videos: PromptVideoInput | None = None,
+        audios: PromptAudioInput | None = None,
         **kwargs: Any,
     ) -> list[tuple[list[int], str]]:
         greedy_params = SamplingParams(temperature=0.0, max_tokens=max_tokens)
-        outputs = self.generate(prompts,
-                                greedy_params,
-                                images=images,
-                                videos=videos,
-                                audios=audios,
-                                **kwargs)
-        return [(output_ids[0], output_str[0])
-                for output_ids, output_str in outputs]
+        outputs = self.generate(prompts, greedy_params, images=images, videos=videos, audios=audios, **kwargs)
+        return [(output_ids[0], output_str[0]) for output_ids, output_str in outputs]

     def generate_greedy_logprobs(
         self,
         prompts: list[str],
         max_tokens: int,
-        num_logprobs: Optional[int],
-        num_prompt_logprobs: Optional[int] = None,
-        images: Optional[PromptImageInput] = None,
-        audios: Optional[PromptAudioInput] = None,
-        videos: Optional[PromptVideoInput] = None,
-        stop_token_ids: Optional[list[int]] = None,
-        stop: Optional[list[str]] = None,
+        num_logprobs: int | None,
+        num_prompt_logprobs: int | None = None,
+        images: PromptImageInput | None = None,
+        audios: PromptAudioInput | None = None,
+        videos: PromptVideoInput | None = None,
+        stop_token_ids: list[int] | None = None,
+        stop: list[str] | None = None,
         **kwargs: Any,
-    ) -> Union[list[TokensTextLogprobs],
-               list[TokensTextLogprobsPromptLogprobs]]:
+    ) -> list[TokensTextLogprobs] | list[TokensTextLogprobsPromptLogprobs]:
         greedy_logprobs_params = SamplingParams(
             temperature=0.0,
             max_tokens=max_tokens,
             logprobs=num_logprobs,
             prompt_logprobs=num_prompt_logprobs,
             stop_token_ids=stop_token_ids,
-            stop=stop)
+            stop=stop,
+        )

-        return self.generate_w_logprobs(prompts,
-                                        greedy_logprobs_params,
-                                        images=images,
-                                        audios=audios,
-                                        videos=videos,
-                                        **kwargs)
+        return self.generate_w_logprobs(
+            prompts, greedy_logprobs_params, images=images, audios=audios, videos=videos, **kwargs
+        )

     def classify(self, prompts: list[str]) -> list[list[float]]:
         req_outputs = self.model.classify(prompts)
         return [req_output.outputs.probs for req_output in req_outputs]

-    def embed(self,
-              prompts: list[str],
-              images: Optional[PromptImageInput] = None,
-              videos: Optional[PromptVideoInput] = None,
-              audios: Optional[PromptAudioInput] = None,
-              *args,
-              **kwargs) -> list[list[float]]:
-        inputs = self.get_inputs(prompts,
-                                 images=images,
-                                 videos=videos,
-                                 audios=audios)
+    def embed(
+        self,
+        prompts: list[str],
+        images: PromptImageInput | None = None,
+        videos: PromptVideoInput | None = None,
+        audios: PromptAudioInput | None = None,
+        *args,
+        **kwargs,
+    ) -> list[list[float]]:
+        inputs = self.get_inputs(prompts, images=images, videos=videos, audios=audios)

         req_outputs = self.model.embed(inputs, *args, **kwargs)
         return [req_output.outputs.embedding for req_output in req_outputs]
@@ -887,8 +820,8 @@ class VllmRunner:

     def score(
         self,
-        text_1: Union[str, list[str]],
-        text_2: Union[str, list[str]],
+        text_1: str | list[str],
+        text_2: str | list[str],
         *args,
         **kwargs,
     ) -> list[float]:
@@ -905,14 +838,11 @@ class VllmRunner:


 class HfRunner:

     def get_default_device(self):
-
-        return ("cpu"
-                if current_platform.is_cpu() else current_platform.device_type)
+        return "cpu" if current_platform.is_cpu() else current_platform.device_type

-    def wrap_device(self, x: _T, device: Optional[str] = None) -> _T:
-        if x is None or isinstance(x, (bool, )):
+    def wrap_device(self, x: _T, device: str | None = None) -> _T:
+        if x is None or isinstance(x, (bool,)):
             return x

         if device is None:
```
```diff
@@ -931,7 +861,7 @@ class HfRunner:
         model_name: str,
         dtype: str = "auto",
         *,
-        model_kwargs: Optional[dict[str, Any]] = None,
+        model_kwargs: dict[str, Any] | None = None,
         trust_remote_code: bool = True,
         is_sentence_transformer: bool = False,
         is_cross_encoder: bool = False,
@@ -984,14 +914,15 @@ class HfRunner:
         )

         # in case some unquantized custom models are not in same dtype
-        if (getattr(model, "quantization_method", None) is None
-                and any(p.dtype != self.dtype
-                        for p in model.parameters())):
+        if getattr(model, "quantization_method", None) is None and any(
+            p.dtype != self.dtype for p in model.parameters()
+        ):
             model = model.to(dtype=self.dtype)

-        if (getattr(model, "quantization_method", None) != "bitsandbytes"
-                and len({p.device
-                         for p in model.parameters()}) < 2):
+        if (
+            getattr(model, "quantization_method", None) != "bitsandbytes"
+            and len({p.device for p in model.parameters()}) < 2
+        ):
             model = model.to(device=self.device)

         self.model = model
@@ -1006,6 +937,7 @@ class HfRunner:
             # don't put this import at the top level
             # it will call torch.cuda.device_count()
             from transformers import AutoProcessor  # noqa: F401

             self.processor = AutoProcessor.from_pretrained(
                 model_name,
                 torch_dtype=torch_dtype,
@@ -1017,10 +949,10 @@ class HfRunner:
     def get_inputs(
         self,
         prompts: list[str],
-        images: Optional[PromptImageInput] = None,
-        videos: Optional[PromptVideoInput] = None,
-        audios: Optional[PromptAudioInput] = None,
-    ) -> list[Union[BatchFeature, BatchEncoding]]:
+        images: PromptImageInput | None = None,
+        videos: PromptVideoInput | None = None,
+        audios: PromptAudioInput | None = None,
+    ) -> list[BatchFeature | BatchEncoding]:
         if images is not None:
             assert len(prompts) == len(images)

@@ -1030,7 +962,7 @@ class HfRunner:
         if audios is not None:
             assert len(prompts) == len(audios)

-        all_inputs: list[Union[BatchFeature, BatchEncoding]] = []
+        all_inputs: list[BatchFeature | BatchEncoding] = []
         for i, prompt in enumerate(prompts):
             processor_kwargs: dict[str, Any] = {
                 "text": prompt,
@@ -1076,16 +1008,11 @@ class HfRunner:

         return outputs

-    def encode(self, prompts: list[str], *args,
-               **kwargs) -> list[list[torch.Tensor]]:
+    def encode(self, prompts: list[str], *args, **kwargs) -> list[list[torch.Tensor]]:
         return self.model.encode(prompts, *args, **kwargs)

-    def predict(self, prompts: list[list[str]], *args,
-                **kwargs) -> torch.Tensor:
-        return self.model.predict(prompts,
-                                  *args,
-                                  convert_to_tensor=True,
-                                  **kwargs)
+    def predict(self, prompts: list[list[str]], *args, **kwargs) -> torch.Tensor:
+        return self.model.predict(prompts, *args, convert_to_tensor=True, **kwargs)

     def __enter__(self):
         return self
```
```diff
@@ -1103,22 +1030,25 @@ def ilama_lora_files():
 @pytest.fixture(scope="session")
 def llama32_lora_files():
     from huggingface_hub import snapshot_download as hf_snapshot_download

     return hf_snapshot_download(repo_id="jeeejeee/llama32-3b-text2sql-spider", local_files_only=True)


 def qwen_prompt(questions: list[str]) -> list[str]:
     placeholder = "<|image_pad|>"
-    return [("<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n"
-             f"<|im_start|>user\n<|vision_start|>{placeholder}<|vision_end|>"
-             f"{q}<|im_end|>\n<|im_start|>assistant\n") for q in questions]
+    return [
+        (
+            "<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n"
+            f"<|im_start|>user\n<|vision_start|>{placeholder}<|vision_end|>"
+            f"{q}<|im_end|>\n<|im_start|>assistant\n"
+        )
+        for q in questions
+    ]


 def hunyuan_prompt(questions: list[str]) -> list[str]:
     placeholder = "<|hy_place▁holder▁no▁100|><|hy_place▁holder▁no▁102|><|hy_place▁holder▁no▁101|>"  # noqa: E501
-    return [
-        f"<|hy_begin▁of▁sentence|>{placeholder}{question}<|hy_User|>"
-        for question in questions
-    ]
+    return [f"<|hy_begin▁of▁sentence|>{placeholder}{question}<|hy_User|>" for question in questions]


 PROMPT_CONFIGS = {
```
```diff
@@ -17,11 +17,11 @@
 # Adapted from vllm-project/vllm/blob/main/tests/models/utils.py
 #

-from typing import Dict, List, Optional, Sequence, Tuple, Union
+from collections.abc import Sequence

 from vllm.logprobs import PromptLogprobs, SampleLogprobs

-TokensText = Tuple[List[int], str]
+TokensText = tuple[list[int], str]


 def check_outputs_equal(
@@ -37,18 +37,18 @@ def check_outputs_equal(
     """
     assert len(outputs_0_lst) == len(outputs_1_lst)

-    for prompt_idx, (outputs_0,
-                     outputs_1) in enumerate(zip(outputs_0_lst,
-                                                 outputs_1_lst)):
+    for prompt_idx, (outputs_0, outputs_1) in enumerate(zip(outputs_0_lst, outputs_1_lst)):
         output_ids_0, output_str_0 = outputs_0
         output_ids_1, output_str_1 = outputs_1

         # The text and token outputs should exactly match
-        fail_msg = (f"Test{prompt_idx}:"
-                    f"\n{name_0}:\t{output_str_0!r}"
-                    f"\n{name_1}:\t{output_str_1!r}"
-                    f"\n{name_0}:\t{output_ids_0!r}"
-                    f"\n{name_1}:\t{output_ids_1!r}")
+        fail_msg = (
+            f"Test{prompt_idx}:"
+            f"\n{name_0}:\t{output_str_0!r}"
+            f"\n{name_1}:\t{output_str_1!r}"
+            f"\n{name_0}:\t{output_ids_0!r}"
+            f"\n{name_1}:\t{output_ids_1!r}"
+        )

         assert output_str_0 == output_str_1, fail_msg
         assert output_ids_0 == output_ids_1, fail_msg
@@ -60,9 +60,7 @@ def check_outputs_equal(
 # * List of top sample logprobs for each sampled token
 #
 # Assumes prompt logprobs were not requested.
-TokensTextLogprobs = Tuple[List[int], str, Optional[Union[List[Dict[int,
-                                                                    float]],
-                                                           SampleLogprobs]]]
+TokensTextLogprobs = tuple[list[int], str, list[dict[int, float]] | SampleLogprobs | None]

 # Representation of generated sequence as a tuple of
 # * Token ID list
@@ -71,6 +69,9 @@ TokensTextLogprobs = Tuple[List[int], str, Optional[Union[List[Dict[int,
 # * Optional list of top prompt logprobs for each prompt token
 #
 # Allows prompt logprobs to be requested.
-TokensTextLogprobsPromptLogprobs = Tuple[
-    List[int], str, Optional[Union[List[Dict[int, float]], SampleLogprobs]],
-    Optional[Union[List[Optional[Dict[int, float]]], PromptLogprobs]]]
+TokensTextLogprobsPromptLogprobs = tuple[
+    list[int],
+    str,
+    list[dict[int, float]] | SampleLogprobs | None,
+    list[dict[int, float] | None] | PromptLogprobs | None,
+]
```
@@ -55,16 +55,12 @@ def report_dir(pytestconfig):

 def pytest_generate_tests(metafunc):
     if "config_filename" in metafunc.fixturenames:

         if metafunc.config.getoption("--config-list-file"):
             rel_path = metafunc.config.getoption("--config-list-file")
             config_list_file = Path(rel_path).resolve()
             config_dir = config_list_file.parent
             with open(config_list_file, encoding="utf-8") as f:
-                configs = [
-                    config_dir / line.strip() for line in f
-                    if line.strip() and not line.startswith("#")
-                ]
+                configs = [config_dir / line.strip() for line in f if line.strip() and not line.startswith("#")]
             metafunc.parametrize("config_filename", configs)
         else:
             single_config = metafunc.config.getoption("--config")
@@ -24,16 +24,15 @@ class EnvConfig:

 @pytest.fixture
 def env_config() -> EnvConfig:
-    return EnvConfig(vllm_version=os.getenv('VLLM_VERSION', 'unknown'),
-                     vllm_commit=os.getenv('VLLM_COMMIT', 'unknown'),
-                     vllm_ascend_version=os.getenv('VLLM_ASCEND_VERSION',
-                                                   'unknown'),
-                     vllm_ascend_commit=os.getenv('VLLM_ASCEND_COMMIT',
-                                                  'unknown'),
-                     cann_version=os.getenv('CANN_VERSION', 'unknown'),
-                     torch_version=os.getenv('TORCH_VERSION', 'unknown'),
-                     torch_npu_version=os.getenv('TORCH_NPU_VERSION',
-                                                 'unknown'))
+    return EnvConfig(
+        vllm_version=os.getenv("VLLM_VERSION", "unknown"),
+        vllm_commit=os.getenv("VLLM_COMMIT", "unknown"),
+        vllm_ascend_version=os.getenv("VLLM_ASCEND_VERSION", "unknown"),
+        vllm_ascend_commit=os.getenv("VLLM_ASCEND_COMMIT", "unknown"),
+        cann_version=os.getenv("CANN_VERSION", "unknown"),
+        torch_version=os.getenv("TORCH_VERSION", "unknown"),
+        torch_npu_version=os.getenv("TORCH_NPU_VERSION", "unknown"),
+    )


 def build_model_args(eval_config, tp_size):
@@ -48,9 +47,13 @@ def build_model_args(eval_config, tp_size):
         "max_model_len": max_model_len,
     }
     for s in [
-            "max_images", "gpu_memory_utilization", "enable_expert_parallel",
-            "tensor_parallel_size", "enforce_eager", "enable_thinking",
-            "quantization"
+        "max_images",
+        "gpu_memory_utilization",
+        "enable_expert_parallel",
+        "tensor_parallel_size",
+        "enforce_eager",
+        "enable_thinking",
+        "quantization",
     ]:
         val = eval_config.get(s, None)
         if val is not None:
@@ -68,7 +71,7 @@ def generate_report(tp_size, eval_config, report_data, report_dir, env_config):
     model_args = build_model_args(eval_config, tp_size)

     parallel_mode = f"TP{model_args.get('tensor_parallel_size', 1)}"
-    if model_args.get('enable_expert_parallel', False):
+    if model_args.get("enable_expert_parallel", False):
         parallel_mode += " + EP"

     execution_model = f"{'Eager' if model_args.get('enforce_eager', False) else 'ACLGraph'}"
@@ -93,17 +96,16 @@ def generate_report(tp_size, eval_config, report_data, report_dir, env_config):
         num_fewshot=eval_config.get("num_fewshot", "N/A"),
         rows=report_data["rows"],
         parallel_mode=parallel_mode,
-        execution_model=execution_model)
+        execution_model=execution_model,
+    )

-    report_output = os.path.join(
-        report_dir, f"{os.path.basename(eval_config['model_name'])}.md")
+    report_output = os.path.join(report_dir, f"{os.path.basename(eval_config['model_name'])}.md")
     os.makedirs(os.path.dirname(report_output), exist_ok=True)
-    with open(report_output, 'w', encoding='utf-8') as f:
+    with open(report_output, "w", encoding="utf-8") as f:
         f.write(report_content)


-def test_lm_eval_correctness_param(config_filename, tp_size, report_dir,
-                                   env_config):
+def test_lm_eval_correctness_param(config_filename, tp_size, report_dir, env_config):
     eval_config = yaml.safe_load(config_filename.read_text(encoding="utf-8"))
     model_args = build_model_args(eval_config, tp_size)
     success = True
@@ -135,25 +137,26 @@ def test_lm_eval_correctness_param(config_filename, tp_size, report_dir,
             metric_name = metric["name"]
             ground_truth = metric["value"]
             measured_value = round(task_result[metric_name], 4)
-            task_success = bool(
-                np.isclose(ground_truth, measured_value, rtol=RTOL))
+            task_success = bool(np.isclose(ground_truth, measured_value, rtol=RTOL))
             success = success and task_success

-            print(f"{task_name} | {metric_name}: "
-                  f"ground_truth={ground_truth} | measured={measured_value} | "
-                  f"success={'✅' if task_success else '❌'}")
+            print(
+                f"{task_name} | {metric_name}: "
+                f"ground_truth={ground_truth} | measured={measured_value} | "
+                f"success={'✅' if task_success else '❌'}"
+            )

-            report_data["rows"].append({
-                "task":
-                task_name,
-                "metric":
-                metric_name,
-                "value":
-                f"✅{measured_value}" if success else f"❌{measured_value}",
-                "stderr":
-                task_result[
-                    metric_name.replace(',', '_stderr,') if metric_name ==
-                    "acc,none" else metric_name.replace(',', '_stderr,')]
-            })
+            report_data["rows"].append(
+                {
+                    "task": task_name,
+                    "metric": metric_name,
+                    "value": f"✅{measured_value}" if success else f"❌{measured_value}",
+                    "stderr": task_result[
+                        metric_name.replace(",", "_stderr,")
+                        if metric_name == "acc,none"
+                        else metric_name.replace(",", "_stderr,")
+                    ],
+                }
+            )
     generate_report(tp_size, eval_config, report_data, report_dir, env_config)
     assert success
@@ -18,15 +18,12 @@

 from __future__ import annotations

-import math
 import os
-import random
-from typing import Any, Union
 from unittest.mock import patch

 import pytest
 from transformers import AutoTokenizer
-from vllm import LLM, SamplingParams
+from vllm import SamplingParams
 from vllm.config import CompilationConfig
 from vllm.v1.metrics.reader import Counter, Vector

@@ -101,7 +98,8 @@ def test_eagle3_sp_acceptance(
|
|||||||
[prompt],
|
[prompt],
|
||||||
tokenize=False,
|
tokenize=False,
|
||||||
add_generation_prompt=True,
|
add_generation_prompt=True,
|
||||||
) for prompt in prompts
|
)
|
||||||
|
for prompt in prompts
|
||||||
]
|
]
|
||||||
|
|
||||||
speculative_config = {
|
speculative_config = {
|
||||||
@@ -112,21 +110,20 @@ def test_eagle3_sp_acceptance(
|
|||||||
"model": spec_model_name,
|
"model": spec_model_name,
|
||||||
}
|
}
|
||||||
|
|
||||||
compilation_config = CompilationConfig(cudagraph_mode="FULL_DECODE_ONLY",
|
compilation_config = CompilationConfig(cudagraph_mode="FULL_DECODE_ONLY", cudagraph_capture_sizes=[12])
|
||||||
cudagraph_capture_sizes=[12])
|
|
||||||
|
|
||||||
with VllmRunner(
|
with VllmRunner(
|
||||||
main_model_name,
|
main_model_name,
|
||||||
enforce_eager=True,
|
enforce_eager=True,
|
||||||
max_model_len=8192,
|
max_model_len=8192,
|
||||||
disable_log_stats=False,
|
disable_log_stats=False,
|
||||||
tensor_parallel_size=2,
|
tensor_parallel_size=2,
|
||||||
max_num_seqs=256,
|
max_num_seqs=256,
|
||||||
distributed_executor_backend="mp",
|
distributed_executor_backend="mp",
|
||||||
gpu_memory_utilization=0.7,
|
gpu_memory_utilization=0.7,
|
||||||
speculative_config=speculative_config,
|
speculative_config=speculative_config,
|
||||||
compilation_config=compilation_config,
|
compilation_config=compilation_config,
|
||||||
async_scheduling=async_scheduling,
|
async_scheduling=async_scheduling,
|
||||||
) as llm:
|
) as llm:
|
||||||
_ = llm.generate(prompts, sampling_params)
|
_ = llm.generate(prompts, sampling_params)
|
||||||
metrics = llm.model.get_metrics()
|
metrics = llm.model.get_metrics()
|
||||||
@@ -142,10 +139,7 @@ def test_eagle3_sp_acceptance(
|
|||||||
for pos in range(len(metric.values)):
|
for pos in range(len(metric.values)):
|
||||||
num_accepted_tokens_per_pos[pos] += metric.values[pos]
|
num_accepted_tokens_per_pos[pos] += metric.values[pos]
|
||||||
|
|
||||||
acceptance_per_pos = [
|
acceptance_per_pos = [num_accepted_tokens / num_drafts for num_accepted_tokens in num_accepted_tokens_per_pos]
|
||||||
num_accepted_tokens / num_drafts
|
|
||||||
for num_accepted_tokens in num_accepted_tokens_per_pos
|
|
||||||
]
|
|
||||||
golden = BASELINES_SP[method]
|
golden = BASELINES_SP[method]
|
||||||
|
|
||||||
match = all(abs(a - b) < 0.06 for a, b in zip(acceptance_per_pos, golden))
|
match = all(abs(a - b) < 0.06 for a, b in zip(acceptance_per_pos, golden))
|
||||||
|
|||||||
@@ -25,8 +25,8 @@ import pytest
|
|||||||
import torch
|
import torch
|
||||||
from vllm.utils.network_utils import get_open_port
|
from vllm.utils.network_utils import get_open_port
|
||||||
|
|
||||||
from vllm_ascend.utils import AscendDeviceType, get_ascend_device_type
|
|
||||||
from tests.e2e.conftest import wait_until_npu_memory_free
|
from tests.e2e.conftest import wait_until_npu_memory_free
|
||||||
|
from vllm_ascend.utils import AscendDeviceType, get_ascend_device_type
|
||||||
|
|
||||||
MODELS = [
|
MODELS = [
|
||||||
# Offline data parallel mode will be not supported/useful for dense models
|
# Offline data parallel mode will be not supported/useful for dense models
|
||||||
@@ -58,8 +58,7 @@ def _install_spies(counters: dict[str, Any]) -> contextlib.ExitStack:
|
|||||||
]
|
]
|
||||||
|
|
||||||
for cls, method, counter in hooks:
|
for cls, method, counter in hooks:
|
||||||
stack.enter_context(
|
stack.enter_context(patch.object(cls, method, make_spy(cls, method, counter)))
|
||||||
patch.object(cls, method, make_spy(cls, method, counter)))
|
|
||||||
|
|
||||||
return stack
|
return stack
|
||||||
|
|
||||||
@@ -75,18 +74,19 @@ def _run_worker_process(
|
|||||||
max_tokens: int,
|
max_tokens: int,
|
||||||
):
|
):
|
||||||
"""Main entry point for the worker process."""
|
"""Main entry point for the worker process."""
|
||||||
os.environ.update({
|
os.environ.update(
|
||||||
"VLLM_DP_RANK": str(rank),
|
{
|
||||||
"VLLM_DP_RANK_LOCAL": str(local_rank),
|
"VLLM_DP_RANK": str(rank),
|
||||||
"VLLM_DP_SIZE": str(world_size),
|
"VLLM_DP_RANK_LOCAL": str(local_rank),
|
||||||
"VLLM_DP_MASTER_IP": master_ip,
|
"VLLM_DP_SIZE": str(world_size),
|
||||||
"VLLM_DP_MASTER_PORT": str(master_port),
|
"VLLM_DP_MASTER_IP": master_ip,
|
||||||
})
|
"VLLM_DP_MASTER_PORT": str(master_port),
|
||||||
|
}
|
||||||
|
)
|
||||||
|
|
||||||
# Import vLLM only after environment setup
|
# Import vLLM only after environment setup
|
||||||
from vllm import LLM, SamplingParams
|
from vllm import LLM, SamplingParams
|
||||||
from vllm.distributed.parallel_state import (
|
from vllm.distributed.parallel_state import destroy_distributed_environment, destroy_model_parallel
|
||||||
destroy_distributed_environment, destroy_model_parallel)
|
|
||||||
|
|
||||||
# Apply hooks and run inference
|
# Apply hooks and run inference
|
||||||
with _install_spies(counters):
|
with _install_spies(counters):
|
||||||
@@ -100,23 +100,20 @@ def _run_worker_process(
|
|||||||
# Simple data sharding
|
# Simple data sharding
|
||||||
chunk_size = len(prompts) // world_size
|
chunk_size = len(prompts) // world_size
|
||||||
start_idx = rank * chunk_size
|
start_idx = rank * chunk_size
|
||||||
end_idx = start_idx + chunk_size if rank < world_size - 1 else len(
|
end_idx = start_idx + chunk_size if rank < world_size - 1 else len(prompts)
|
||||||
prompts)
|
|
||||||
local_prompts = prompts[start_idx:end_idx]
|
local_prompts = prompts[start_idx:end_idx]
|
||||||
|
|
||||||
llm = LLM(
|
llm = LLM(
|
||||||
model=model_path,
|
model=model_path,
|
||||||
quantization="ascend" if "W8A8" in model_path else None,
|
quantization="ascend" if "W8A8" in model_path else None,
|
||||||
enable_expert_parallel=True if "DeepSeek" in model_path else False,
|
enable_expert_parallel="DeepSeek" in model_path,
|
||||||
trust_remote_code=True,
|
trust_remote_code=True,
|
||||||
)
|
)
|
||||||
|
|
||||||
# Expose model config to the main test process
|
# Expose model config to the main test process
|
||||||
counters["hidden_layers"].value = (
|
counters["hidden_layers"].value = llm.llm_engine.model_config.hf_text_config.num_hidden_layers
|
||||||
llm.llm_engine.model_config.hf_text_config.num_hidden_layers)
|
|
||||||
|
|
||||||
llm.generate(local_prompts,
|
llm.generate(local_prompts, SamplingParams(max_tokens=max_tokens, temperature=0.0))
|
||||||
SamplingParams(max_tokens=max_tokens, temperature=0.0))
|
|
||||||
|
|
||||||
# Explicit cleanup is mandatory in multi-process vLLM tests
|
# Explicit cleanup is mandatory in multi-process vLLM tests
|
||||||
del llm
|
del llm
|
||||||
@@ -162,8 +159,7 @@ def test_models_aclgraph_capture_replay_metrics_dp2(
|
|||||||
for rank in range(dp_size):
|
for rank in range(dp_size):
|
||||||
p = multiprocessing.Process(
|
p = multiprocessing.Process(
|
||||||
target=_run_worker_process,
|
target=_run_worker_process,
|
||||||
args=(rank, rank, dp_size, "127.0.0.1", port, counters, model,
|
args=(rank, rank, dp_size, "127.0.0.1", port, counters, model, max_tokens),
|
||||||
max_tokens),
|
|
||||||
)
|
)
|
||||||
p.start()
|
p.start()
|
||||||
workers.append(p)
|
workers.append(p)
|
||||||
@@ -175,8 +171,7 @@ def test_models_aclgraph_capture_replay_metrics_dp2(
|
|||||||
for k in workers:
|
for k in workers:
|
||||||
if k.is_alive():
|
if k.is_alive():
|
||||||
k.kill()
|
k.kill()
|
||||||
raise RuntimeError(
|
raise RuntimeError(f"Worker {p.pid} failed with exit code {p.exitcode}")
|
||||||
f"Worker {p.pid} failed with exit code {p.exitcode}")
|
|
||||||
|
|
||||||
actual_capture = counters["capture"].value
|
actual_capture = counters["capture"].value
|
||||||
actual_replay = counters["replay"].value
|
actual_replay = counters["replay"].value
|
||||||
@@ -185,18 +180,16 @@ def test_models_aclgraph_capture_replay_metrics_dp2(
|
|||||||
num_layers = counters["hidden_layers"].value
|
num_layers = counters["hidden_layers"].value
|
||||||
|
|
||||||
num_acl_graphs = num_layers + 1
|
num_acl_graphs = num_layers + 1
|
||||||
num_comm_groups = sum(1 for s in [dp_size, 1]
|
num_comm_groups = sum(1 for s in [dp_size, 1] if s > 1) # dp_size=2, tp_size=1
|
||||||
if s > 1) # dp_size=2, tp_size=1
|
|
||||||
|
|
||||||
# Metric 1: Graph Capture (ACL Graph Construction)
|
# Metric 1: Graph Capture (ACL Graph Construction)
|
||||||
# Ref: vllm_ascend.utils.update_aclgraph_sizes
|
# Ref: vllm_ascend.utils.update_aclgraph_sizes
|
||||||
max_batch_sizes = math.floor((1800 - num_comm_groups * 40) /
|
max_batch_sizes = math.floor((1800 - num_comm_groups * 40) / num_acl_graphs / (1 + num_comm_groups * 2))
|
||||||
num_acl_graphs / (1 + num_comm_groups * 2))
|
|
||||||
|
|
||||||
expected_capture = max_batch_sizes * num_acl_graphs * dp_size
|
expected_capture = max_batch_sizes * num_acl_graphs * dp_size
|
||||||
assert (
|
assert actual_capture == expected_capture, (
|
||||||
actual_capture == expected_capture
|
f"Capture count mismatch. Expected: {expected_capture}, Got: {actual_capture}"
|
||||||
), f"Capture count mismatch. Expected: {expected_capture}, Got: {actual_capture}"
|
)
|
||||||
|
|
||||||
# Metric 2: Model Execution (NPUModelRunner.execute_model)
|
# Metric 2: Model Execution (NPUModelRunner.execute_model)
|
||||||
# vLLM Step Breakdown:
|
# vLLM Step Breakdown:
|
||||||
@@ -207,9 +200,9 @@ def test_models_aclgraph_capture_replay_metrics_dp2(
|
|||||||
# vllm default enables Async scheduler, this will take 1 more steps
|
# vllm default enables Async scheduler, this will take 1 more steps
|
||||||
expected_exec_model = (total_steps + 1 + 1) * dp_size
|
expected_exec_model = (total_steps + 1 + 1) * dp_size
|
||||||
|
|
||||||
assert (
|
assert num_execute_model == expected_exec_model, (
|
||||||
num_execute_model == expected_exec_model
|
f"Model execution count mismatch. Expected: {expected_exec_model}, Got: {num_execute_model}"
|
||||||
), f"Model execution count mismatch. Expected: {expected_exec_model}, Got: {num_execute_model}"
|
)
|
||||||
|
|
||||||
# Metric 3: Dummy Runs (Warmup & Alignment)
|
# Metric 3: Dummy Runs (Warmup & Alignment)
|
||||||
# vLLM synchronizes globally every 32 steps.
|
# vLLM synchronizes globally every 32 steps.
|
||||||
@@ -228,14 +221,12 @@ def test_models_aclgraph_capture_replay_metrics_dp2(
|
|||||||
|
|
||||||
expected_dummy_run = (warmup_runs + padding_runs) * dp_size
|
expected_dummy_run = (warmup_runs + padding_runs) * dp_size
|
||||||
|
|
||||||
assert (
|
assert num_dummy_run == expected_dummy_run, (
|
||||||
num_dummy_run == expected_dummy_run
|
f"Dummy run count mismatch. Expected: {expected_dummy_run}, Got: {num_dummy_run}"
|
||||||
), f"Dummy run count mismatch. Expected: {expected_dummy_run}, Got: {num_dummy_run}"
|
)
|
||||||
|
|
||||||
# Metric 4: Graph Replay (Inference Execution)
|
# Metric 4: Graph Replay (Inference Execution)
|
||||||
# Replays happen for every aligned step across all graphs.
|
# Replays happen for every aligned step across all graphs.
|
||||||
expected_replay = num_acl_graphs * aligned_steps * dp_size
|
expected_replay = num_acl_graphs * aligned_steps * dp_size
|
||||||
|
|
||||||
assert (
|
assert actual_replay == expected_replay, f"Replay count mismatch. Expected: {expected_replay}, Got: {actual_replay}"
|
||||||
actual_replay == expected_replay
|
|
||||||
), f"Replay count mismatch. Expected: {expected_replay}, Got: {actual_replay}"
|
|
||||||
|
|||||||
@@ -64,12 +64,8 @@ def test_qwen3_inference_dp2(model, max_tokens):
         cmd.append("ascend")

     print(f"Running subprocess: {' '.join(cmd)}")
-    proc = subprocess.run(cmd,
-                          env=env,
-                          stdout=subprocess.PIPE,
-                          stderr=subprocess.STDOUT,
-                          timeout=600)
-    output = proc.stdout.decode(errors='ignore')
+    proc = subprocess.run(cmd, env=env, stdout=subprocess.PIPE, stderr=subprocess.STDOUT, timeout=600)
+    output = proc.stdout.decode(errors="ignore")

     print(output)

@@ -27,6 +27,7 @@ MODELS = [
|
|||||||
SHARED_STORAGE_PATH = "/dev/shm/epd/storage"
|
SHARED_STORAGE_PATH = "/dev/shm/epd/storage"
|
||||||
TENSOR_PARALLELS = [1]
|
TENSOR_PARALLELS = [1]
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.asyncio
|
@pytest.mark.asyncio
|
||||||
@pytest.mark.parametrize("model", MODELS)
|
@pytest.mark.parametrize("model", MODELS)
|
||||||
@pytest.mark.parametrize("tp_size", TENSOR_PARALLELS)
|
@pytest.mark.parametrize("tp_size", TENSOR_PARALLELS)
|
||||||
@@ -36,36 +37,61 @@ async def test_models(model: str, tp_size: int) -> None:
|
|||||||
vllm_server_args = [
|
vllm_server_args = [
|
||||||
[
|
[
|
||||||
"--port",
|
"--port",
|
||||||
str(encode_port), "--model", model, "--gpu-memory-utilization",
|
str(encode_port),
|
||||||
"0.01", "--tensor-parallel-size",
|
"--model",
|
||||||
str(tp_size), "--enforce-eager", "--no-enable-prefix-caching",
|
model,
|
||||||
"--max-model-len", "10000", "--max-num-batched-tokens", "10000",
|
"--gpu-memory-utilization",
|
||||||
"--max-num-seqs", "1", "--ec-transfer-config",
|
"0.01",
|
||||||
'{"ec_connector_extra_config":{"shared_storage_path":"' +
|
"--tensor-parallel-size",
|
||||||
SHARED_STORAGE_PATH +
|
str(tp_size),
|
||||||
'"},"ec_connector":"ECExampleConnector","ec_role": "ec_producer"}'
|
"--enforce-eager",
|
||||||
|
"--no-enable-prefix-caching",
|
||||||
|
"--max-model-len",
|
||||||
|
"10000",
|
||||||
|
"--max-num-batched-tokens",
|
||||||
|
"10000",
|
||||||
|
"--max-num-seqs",
|
||||||
|
"1",
|
||||||
|
"--ec-transfer-config",
|
||||||
|
'{"ec_connector_extra_config":{"shared_storage_path":"'
|
||||||
|
+ SHARED_STORAGE_PATH
|
||||||
|
+ '"},"ec_connector":"ECExampleConnector","ec_role": "ec_producer"}',
|
||||||
],
|
],
|
||||||
[
|
[
|
||||||
"--port",
|
"--port",
|
||||||
str(pd_port), "--model", model, "--gpu-memory-utilization", "0.95",
|
str(pd_port),
|
||||||
|
"--model",
|
||||||
|
model,
|
||||||
|
"--gpu-memory-utilization",
|
||||||
|
"0.95",
|
||||||
"--tensor-parallel-size",
|
"--tensor-parallel-size",
|
||||||
str(tp_size), "--enforce-eager", "--max-model-len", "10000",
|
str(tp_size),
|
||||||
"--max-num-batched-tokens", "10000", "--max-num-seqs", "128",
|
"--enforce-eager",
|
||||||
|
"--max-model-len",
|
||||||
|
"10000",
|
||||||
|
"--max-num-batched-tokens",
|
||||||
|
"10000",
|
||||||
|
"--max-num-seqs",
|
||||||
|
"128",
|
||||||
"--ec-transfer-config",
|
"--ec-transfer-config",
|
||||||
'{"ec_connector_extra_config":{"shared_storage_path":"' +
|
'{"ec_connector_extra_config":{"shared_storage_path":"'
|
||||||
SHARED_STORAGE_PATH +
|
+ SHARED_STORAGE_PATH
|
||||||
'"},"ec_connector":"ECExampleConnector","ec_role": "ec_consumer"}'
|
+ '"},"ec_connector":"ECExampleConnector","ec_role": "ec_consumer"}',
|
||||||
]
|
],
|
||||||
]
|
]
|
||||||
proxy_port = get_open_port()
|
proxy_port = get_open_port()
|
||||||
proxy_args = [
|
proxy_args = [
|
||||||
"--host", "127.0.0.1", "--port",
|
"--host",
|
||||||
str(proxy_port), "--encode-servers-urls",
|
"127.0.0.1",
|
||||||
f"http://localhost:{encode_port}", "--decode-servers-urls",
|
"--port",
|
||||||
f"http://localhost:{pd_port}", "--prefill-servers-urls", "disable"
|
str(proxy_port),
|
||||||
|
"--encode-servers-urls",
|
||||||
|
f"http://localhost:{encode_port}",
|
||||||
|
"--decode-servers-urls",
|
||||||
|
f"http://localhost:{pd_port}",
|
||||||
|
"--prefill-servers-urls",
|
||||||
|
"disable",
|
||||||
]
|
]
|
||||||
|
|
||||||
with RemoteEPDServer(vllm_serve_args=vllm_server_args) as _:
|
with RemoteEPDServer(vllm_serve_args=vllm_server_args) as _, DisaggEpdProxy(proxy_args=proxy_args) as proxy:
|
||||||
with DisaggEpdProxy(proxy_args=proxy_args) as proxy:
|
send_image_request(model, proxy)
|
||||||
send_image_request(model, proxy)
|
|
||||||
|
|
||||||
|
|||||||
@@ -15,15 +15,12 @@ def test_deepseek_correctness_ep(model_name):
|
|||||||
max_tokens = 5
|
max_tokens = 5
|
||||||
|
|
||||||
# FIXME: Really strange that chunked prefill might lead to different results, investigate further
|
# FIXME: Really strange that chunked prefill might lead to different results, investigate further
|
||||||
with VllmRunner(model_name,
|
with VllmRunner(model_name, cudagraph_capture_sizes=[1, 2, 4, 8], tensor_parallel_size=2) as vllm_model:
|
||||||
cudagraph_capture_sizes=[1, 2, 4, 8],
|
|
||||||
tensor_parallel_size=2) as vllm_model:
|
|
||||||
tp_output = vllm_model.generate_greedy(example_prompts, max_tokens)
|
tp_output = vllm_model.generate_greedy(example_prompts, max_tokens)
|
||||||
|
|
||||||
with VllmRunner(model_name,
|
with VllmRunner(
|
||||||
tensor_parallel_size=2,
|
model_name, tensor_parallel_size=2, cudagraph_capture_sizes=[1, 2, 4, 8], enable_expert_parallel=True
|
||||||
cudagraph_capture_sizes=[1, 2, 4, 8],
|
) as vllm_model:
|
||||||
enable_expert_parallel=True) as vllm_model:
|
|
||||||
ep_output = vllm_model.generate_greedy(example_prompts, max_tokens)
|
ep_output = vllm_model.generate_greedy(example_prompts, max_tokens)
|
||||||
|
|
||||||
check_outputs_equal(
|
check_outputs_equal(
|
||||||
|
|||||||
@@ -29,6 +29,7 @@ from unittest.mock import patch
|
|||||||
import pytest
|
import pytest
|
||||||
import torch_npu
|
import torch_npu
|
||||||
from modelscope import snapshot_download # type: ignore
|
from modelscope import snapshot_download # type: ignore
|
||||||
|
|
||||||
from tests.e2e.conftest import wait_until_npu_memory_free
|
from tests.e2e.conftest import wait_until_npu_memory_free
|
||||||
|
|
||||||
MODELS = ["Qwen/Qwen3-0.6B"]
|
MODELS = ["Qwen/Qwen3-0.6B"]
|
||||||
@@ -39,9 +40,7 @@ DEVICE_NAME = torch_npu.npu.get_device_name(0)[:10]
|
|||||||
@pytest.mark.parametrize("model", MODELS)
|
@pytest.mark.parametrize("model", MODELS)
|
||||||
@patch.dict(os.environ, {"HCCL_BUFFSIZE": "500"})
|
@patch.dict(os.environ, {"HCCL_BUFFSIZE": "500"})
|
||||||
def test_qwen3_external_launcher(model):
|
def test_qwen3_external_launcher(model):
|
||||||
script = Path(
|
script = Path(__file__).parent.parent.parent.parent.parent / "examples" / "offline_external_launcher.py"
|
||||||
__file__
|
|
||||||
).parent.parent.parent.parent.parent / "examples" / "offline_external_launcher.py"
|
|
||||||
env = os.environ.copy()
|
env = os.environ.copy()
|
||||||
# TODO: Change to 2 when ci machine has 4 cards
|
# TODO: Change to 2 when ci machine has 4 cards
|
||||||
cmd = [
|
cmd = [
|
||||||
@@ -68,7 +67,7 @@ def test_qwen3_external_launcher(model):
|
|||||||
stderr=subprocess.STDOUT,
|
stderr=subprocess.STDOUT,
|
||||||
timeout=600,
|
timeout=600,
|
||||||
)
|
)
|
||||||
output = proc.stdout.decode(errors='ignore')
|
output = proc.stdout.decode(errors="ignore")
|
||||||
|
|
||||||
print(output)
|
print(output)
|
||||||
|
|
||||||
@@ -81,16 +80,24 @@ def test_qwen3_external_launcher(model):
|
|||||||
@pytest.mark.parametrize("model", MOE_MODELS)
|
@pytest.mark.parametrize("model", MOE_MODELS)
|
||||||
@wait_until_npu_memory_free()
|
@wait_until_npu_memory_free()
|
||||||
def test_qwen3_moe_external_launcher_ep_tp2(model):
|
def test_qwen3_moe_external_launcher_ep_tp2(model):
|
||||||
script = Path(
|
script = Path(__file__).parent.parent.parent.parent.parent / "examples" / "offline_external_launcher.py"
|
||||||
__file__
|
|
||||||
).parent.parent.parent.parent.parent / "examples" / "offline_external_launcher.py"
|
|
||||||
env = os.environ.copy()
|
env = os.environ.copy()
|
||||||
# TODO: Change to 2 when ci machine has 4 cards
|
# TODO: Change to 2 when ci machine has 4 cards
|
||||||
cmd = [
|
cmd = [
|
||||||
sys.executable,
|
sys.executable,
|
||||||
str(script), "--model", model, "--tp-size", "2", "--node-size", "1",
|
str(script),
|
||||||
"--node-rank", "0", "--proc-per-node", "2", "--trust-remote-code",
|
"--model",
|
||||||
"--enable-expert-parallel"
|
model,
|
||||||
|
"--tp-size",
|
||||||
|
"2",
|
||||||
|
"--node-size",
|
||||||
|
"1",
|
||||||
|
"--node-rank",
|
||||||
|
"0",
|
||||||
|
"--proc-per-node",
|
||||||
|
"2",
|
||||||
|
"--trust-remote-code",
|
||||||
|
"--enable-expert-parallel",
|
||||||
]
|
]
|
||||||
|
|
||||||
print(f"Running subprocess: {' '.join(cmd)}")
|
print(f"Running subprocess: {' '.join(cmd)}")
|
||||||
@@ -101,7 +108,7 @@ def test_qwen3_moe_external_launcher_ep_tp2(model):
|
|||||||
stderr=subprocess.STDOUT,
|
stderr=subprocess.STDOUT,
|
||||||
timeout=600,
|
timeout=600,
|
||||||
)
|
)
|
||||||
output = proc.stdout.decode(errors='ignore')
|
output = proc.stdout.decode(errors="ignore")
|
||||||
|
|
||||||
print(output)
|
print(output)
|
||||||
|
|
||||||
@@ -113,9 +120,7 @@ def test_qwen3_moe_external_launcher_ep_tp2(model):
|
|||||||
@patch.dict(os.environ, {"VLLM_ASCEND_ENABLE_NZ": "0"})
|
@patch.dict(os.environ, {"VLLM_ASCEND_ENABLE_NZ": "0"})
|
||||||
@wait_until_npu_memory_free()
|
@wait_until_npu_memory_free()
|
||||||
def test_qwen3_external_launcher_with_sleepmode():
|
def test_qwen3_external_launcher_with_sleepmode():
|
||||||
script = Path(
|
script = Path(__file__).parent.parent.parent.parent.parent / "examples" / "offline_external_launcher.py"
|
||||||
__file__
|
|
||||||
).parent.parent.parent.parent.parent / "examples" / "offline_external_launcher.py"
|
|
||||||
env = os.environ.copy()
|
env = os.environ.copy()
|
||||||
# TODO: Change to 2 when ci machine has 4 cards
|
# TODO: Change to 2 when ci machine has 4 cards
|
||||||
cmd = [
|
cmd = [
|
||||||
@@ -147,7 +152,7 @@ def test_qwen3_external_launcher_with_sleepmode():
|
|||||||
stderr=subprocess.STDOUT,
|
stderr=subprocess.STDOUT,
|
||||||
timeout=300,
|
timeout=300,
|
||||||
)
|
)
|
||||||
output = proc.stdout.decode(errors='ignore')
|
output = proc.stdout.decode(errors="ignore")
|
||||||
|
|
||||||
print(output)
|
print(output)
|
||||||
|
|
||||||
@@ -158,9 +163,7 @@ def test_qwen3_external_launcher_with_sleepmode():
|
|||||||
|
|
||||||
@patch.dict(os.environ, {"VLLM_ASCEND_ENABLE_NZ": "0"})
|
@patch.dict(os.environ, {"VLLM_ASCEND_ENABLE_NZ": "0"})
|
||||||
def test_qwen3_external_launcher_with_sleepmode_level2():
|
def test_qwen3_external_launcher_with_sleepmode_level2():
|
||||||
script = Path(
|
script = Path(__file__).parent.parent.parent.parent.parent / "examples" / "offline_external_launcher.py"
|
||||||
__file__
|
|
||||||
).parent.parent.parent.parent.parent / "examples" / "offline_external_launcher.py"
|
|
||||||
env = os.environ.copy()
|
env = os.environ.copy()
|
||||||
model_path = snapshot_download("Qwen/Qwen3-8B")
|
model_path = snapshot_download("Qwen/Qwen3-8B")
|
||||||
# TODO: Add moe model test
|
# TODO: Add moe model test
|
||||||
@@ -195,7 +198,7 @@ def test_qwen3_external_launcher_with_sleepmode_level2():
|
|||||||
stderr=subprocess.STDOUT,
|
stderr=subprocess.STDOUT,
|
||||||
timeout=300,
|
timeout=300,
|
||||||
)
|
)
|
||||||
output = proc.stdout.decode(errors='ignore')
|
output = proc.stdout.decode(errors="ignore")
|
||||||
|
|
||||||
print(output)
|
print(output)
|
||||||
|
|
||||||
@@ -210,14 +213,9 @@ def test_qwen3_external_launcher_with_sleepmode_level2():
|
|||||||
)
|
)
|
||||||
@pytest.mark.parametrize("model", MODELS)
|
@pytest.mark.parametrize("model", MODELS)
|
||||||
@wait_until_npu_memory_free()
|
@wait_until_npu_memory_free()
|
||||||
@patch.dict(os.environ, {
|
@patch.dict(os.environ, {"VLLM_ASCEND_ENABLE_MATMUL_ALLREDUCE": "1", "HCCL_BUFFSIZE": "500"})
|
||||||
"VLLM_ASCEND_ENABLE_MATMUL_ALLREDUCE": "1",
|
|
||||||
"HCCL_BUFFSIZE": "500"
|
|
||||||
})
|
|
||||||
def test_qwen3_external_launcher_with_matmul_allreduce(model):
|
def test_qwen3_external_launcher_with_matmul_allreduce(model):
|
||||||
script = Path(
|
script = Path(__file__).parent.parent.parent.parent.parent / "examples" / "offline_external_launcher.py"
|
||||||
__file__
|
|
||||||
).parent.parent.parent.parent.parent / "examples" / "offline_external_launcher.py"
|
|
||||||
env = os.environ.copy()
|
env = os.environ.copy()
|
||||||
cmd = [
|
cmd = [
|
||||||
sys.executable,
|
sys.executable,
|
||||||
@@ -236,7 +234,7 @@ def test_qwen3_external_launcher_with_matmul_allreduce(model):
|
|||||||
timeout=600,
|
timeout=600,
|
||||||
)
|
)
|
||||||
|
|
||||||
output = proc.stdout.decode(errors='ignore')
|
output = proc.stdout.decode(errors="ignore")
|
||||||
print(output)
|
print(output)
|
||||||
|
|
||||||
assert "Generated text:" in output
|
assert "Generated text:" in output
|
||||||
|
|||||||
@@ -26,41 +26,39 @@ from tests.e2e.model_utils import check_outputs_equal
|
|||||||
|
|
||||||
|
|
||||||
def test_qwen3_moe_full_decode_only_tp2():
|
def test_qwen3_moe_full_decode_only_tp2():
|
||||||
if 'HCCL_OP_EXPANSION_MODE' in os.environ:
|
if "HCCL_OP_EXPANSION_MODE" in os.environ:
|
||||||
del os.environ['HCCL_OP_EXPANSION_MODE']
|
del os.environ["HCCL_OP_EXPANSION_MODE"]
|
||||||
prompts = [
|
prompts = [
|
||||||
"Hello, my name is", "The president of the United States is",
|
"Hello, my name is",
|
||||||
"The capital of France is", "The future of AI is"
|
"The president of the United States is",
|
||||||
|
"The capital of France is",
|
||||||
|
"The future of AI is",
|
||||||
]
|
]
|
||||||
model = "Qwen/Qwen3-30B-A3B"
|
model = "Qwen/Qwen3-30B-A3B"
|
||||||
sampling_params = SamplingParams(max_tokens=32, temperature=0.0)
|
sampling_params = SamplingParams(max_tokens=32, temperature=0.0)
|
||||||
with VllmRunner(model,
|
with VllmRunner(
|
||||||
max_model_len=1024,
|
model,
|
||||||
tensor_parallel_size=2,
|
max_model_len=1024,
|
||||||
compilation_config={
|
tensor_parallel_size=2,
|
||||||
"cudagraph_mode": "FULL_DECODE_ONLY",
|
compilation_config={"cudagraph_mode": "FULL_DECODE_ONLY", "cudagraph_capture_sizes": [4, 8, 24, 48, 60]},
|
||||||
"cudagraph_capture_sizes": [4, 8, 24, 48, 60]
|
) as runner:
|
||||||
}) as runner:
|
vllm_fullgraph_outputs = runner.model.generate(prompts, sampling_params)
|
||||||
vllm_fullgraph_outputs = runner.model.generate(prompts,
|
|
||||||
sampling_params)
|
|
||||||
|
|
||||||
with VllmRunner(
|
with VllmRunner(
|
||||||
model,
|
model,
|
||||||
max_model_len=1024,
|
max_model_len=1024,
|
||||||
cudagraph_capture_sizes=[4, 8, 24, 48, 60],
|
cudagraph_capture_sizes=[4, 8, 24, 48, 60],
|
||||||
tensor_parallel_size=2,
|
tensor_parallel_size=2,
|
||||||
) as runner:
|
) as runner:
|
||||||
vllm_eager_outputs = runner.model.generate(prompts, sampling_params)
|
vllm_eager_outputs = runner.model.generate(prompts, sampling_params)
|
||||||
|
|
||||||
vllm_fullgraph_outputs_list = []
|
vllm_fullgraph_outputs_list = []
|
||||||
for output in vllm_fullgraph_outputs:
|
for output in vllm_fullgraph_outputs:
|
||||||
vllm_fullgraph_outputs_list.append(
|
vllm_fullgraph_outputs_list.append((output.outputs[0].index, output.outputs[0].text))
|
||||||
(output.outputs[0].index, output.outputs[0].text))
|
|
||||||
|
|
||||||
vllm_eager_outputs_list = []
|
vllm_eager_outputs_list = []
|
||||||
for output in vllm_eager_outputs:
|
for output in vllm_eager_outputs:
|
||||||
vllm_eager_outputs_list.append(
|
vllm_eager_outputs_list.append((output.outputs[0].index, output.outputs[0].text))
|
||||||
(output.outputs[0].index, output.outputs[0].text))
|
|
||||||
|
|
||||||
check_outputs_equal(
|
check_outputs_equal(
|
||||||
outputs_0_lst=vllm_eager_outputs_list,
|
outputs_0_lst=vllm_eager_outputs_list,
|
||||||
@@ -72,41 +70,39 @@ def test_qwen3_moe_full_decode_only_tp2():
|
|||||||
|
|
||||||
@pytest.mark.skip(reason="CANN8.5 failed with this test, fix me")
|
@pytest.mark.skip(reason="CANN8.5 failed with this test, fix me")
|
||||||
def test_qwen3_moe_full_graph_tp2():
|
def test_qwen3_moe_full_graph_tp2():
|
||||||
if 'HCCL_OP_EXPANSION_MODE' in os.environ:
|
if "HCCL_OP_EXPANSION_MODE" in os.environ:
|
||||||
del os.environ['HCCL_OP_EXPANSION_MODE']
|
del os.environ["HCCL_OP_EXPANSION_MODE"]
|
||||||
prompts = [
|
prompts = [
|
||||||
"Hello, my name is", "The president of the United States is",
|
"Hello, my name is",
|
||||||
"The capital of France is", "The future of AI is"
|
"The president of the United States is",
|
||||||
|
"The capital of France is",
|
||||||
|
"The future of AI is",
|
||||||
]
|
]
|
||||||
model = "Qwen/Qwen3-30B-A3B"
|
model = "Qwen/Qwen3-30B-A3B"
|
||||||
sampling_params = SamplingParams(max_tokens=32, temperature=0.0)
|
sampling_params = SamplingParams(max_tokens=32, temperature=0.0)
|
||||||
with VllmRunner(model,
|
with VllmRunner(
|
||||||
max_model_len=1024,
|
model,
|
||||||
tensor_parallel_size=2,
|
max_model_len=1024,
|
||||||
compilation_config={
|
tensor_parallel_size=2,
|
||||||
"cudagraph_mode": "FULL",
|
compilation_config={"cudagraph_mode": "FULL", "cudagraph_capture_sizes": [4, 8, 24, 48, 60]},
|
||||||
"cudagraph_capture_sizes": [4, 8, 24, 48, 60]
|
) as runner:
|
||||||
}) as runner:
|
vllm_fullgraph_outputs = runner.model.generate(prompts, sampling_params)
|
||||||
vllm_fullgraph_outputs = runner.model.generate(prompts,
|
|
||||||
sampling_params)
|
|
||||||
|
|
||||||
with VllmRunner(
|
with VllmRunner(
|
||||||
model,
|
model,
|
||||||
max_model_len=1024,
|
max_model_len=1024,
|
||||||
cudagraph_capture_sizes=[4, 8, 24, 48, 60],
|
cudagraph_capture_sizes=[4, 8, 24, 48, 60],
|
||||||
tensor_parallel_size=2,
|
tensor_parallel_size=2,
|
||||||
) as runner:
|
) as runner:
|
||||||
vllm_eager_outputs = runner.model.generate(prompts, sampling_params)
|
vllm_eager_outputs = runner.model.generate(prompts, sampling_params)
|
||||||
|
|
||||||
vllm_fullgraph_outputs_list = []
|
vllm_fullgraph_outputs_list = []
|
||||||
for output in vllm_fullgraph_outputs:
|
for output in vllm_fullgraph_outputs:
|
||||||
vllm_fullgraph_outputs_list.append(
|
vllm_fullgraph_outputs_list.append((output.outputs[0].index, output.outputs[0].text))
|
||||||
(output.outputs[0].index, output.outputs[0].text))
|
|
||||||
|
|
||||||
vllm_eager_outputs_list = []
|
vllm_eager_outputs_list = []
|
||||||
for output in vllm_eager_outputs:
|
for output in vllm_eager_outputs:
|
||||||
vllm_eager_outputs_list.append(
|
vllm_eager_outputs_list.append((output.outputs[0].index, output.outputs[0].text))
|
||||||
(output.outputs[0].index, output.outputs[0].text))
|
|
||||||
|
|
||||||
check_outputs_equal(
|
check_outputs_equal(
|
||||||
outputs_0_lst=vllm_eager_outputs_list,
|
outputs_0_lst=vllm_eager_outputs_list,
|
||||||
|
|||||||
@@ -1,23 +1,22 @@
 import pytest

 from tests.e2e.conftest import VllmRunner
-from tests.e2e.singlecard.test_ilama_lora import (EXPECTED_LORA_OUTPUT,
-                                                  MODEL_PATH, do_sample)
+from tests.e2e.singlecard.test_ilama_lora import EXPECTED_LORA_OUTPUT, MODEL_PATH, do_sample


 @pytest.mark.parametrize("distributed_executor_backend", ["mp"])
 def test_ilama_lora_tp2(distributed_executor_backend, ilama_lora_files):
     with VllmRunner(
         MODEL_PATH,
         enable_lora=True,
         max_loras=4,
         dtype="half",
         max_model_len=1024,
         max_num_seqs=16,
         tensor_parallel_size=2,
         cudagraph_capture_sizes=[1, 2, 4, 8],
         distributed_executor_backend=distributed_executor_backend,
         enforce_eager=True,
     ) as vllm_model:
         output = do_sample(vllm_model.model, ilama_lora_files, lora_id=2)

@@ -20,8 +20,10 @@
|
|||||||
|
|
||||||
Run `pytest tests/test_offline_inference.py`.
|
Run `pytest tests/test_offline_inference.py`.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
import os
|
import os
|
||||||
from unittest.mock import patch
|
from unittest.mock import patch
|
||||||
|
|
||||||
import pytest
|
import pytest
|
||||||
from vllm import SamplingParams
|
from vllm import SamplingParams
|
||||||
|
|
||||||
@@ -51,6 +53,7 @@ GPT_OSS_MODELS = [
|
|||||||
"unsloth/gpt-oss-20b-BF16",
|
"unsloth/gpt-oss-20b-BF16",
|
||||||
]
|
]
|
||||||
|
|
||||||
|
|
||||||
def test_deepseek_multistream_moe_tp2():
|
def test_deepseek_multistream_moe_tp2():
|
||||||
example_prompts = [
|
example_prompts = [
|
||||||
"Hello, my name is",
|
"Hello, my name is",
|
||||||
@@ -58,15 +61,15 @@ def test_deepseek_multistream_moe_tp2():
|
|||||||
dtype = "half"
|
dtype = "half"
|
||||||
max_tokens = 5
|
max_tokens = 5
|
||||||
with VllmRunner(
|
with VllmRunner(
|
||||||
"vllm-ascend/DeepSeek-V3-Pruning",
|
"vllm-ascend/DeepSeek-V3-Pruning",
|
||||||
dtype=dtype,
|
dtype=dtype,
|
||||||
tensor_parallel_size=2,
|
tensor_parallel_size=2,
|
||||||
cudagraph_capture_sizes=[1, 2, 4, 8],
|
cudagraph_capture_sizes=[1, 2, 4, 8],
|
||||||
distributed_executor_backend="mp",
|
distributed_executor_backend="mp",
|
||||||
additional_config={
|
additional_config={
|
||||||
"enable_multistream_moe": True,
|
"enable_multistream_moe": True,
|
||||||
"refresh": True,
|
"refresh": True,
|
||||||
},
|
},
|
||||||
) as vllm_model:
|
) as vllm_model:
|
||||||
vllm_model.generate_greedy(example_prompts, max_tokens)
|
vllm_model.generate_greedy(example_prompts, max_tokens)
|
||||||
|
|
||||||
@@ -78,12 +81,12 @@ def test_qwen3_w4a8_dynamic_tp2(model):
|
|||||||
]
|
]
|
||||||
max_tokens = 5
|
max_tokens = 5
|
||||||
with VllmRunner(
|
with VllmRunner(
|
||||||
model,
|
model,
|
||||||
max_model_len=8192,
|
max_model_len=8192,
|
||||||
dtype="auto",
|
dtype="auto",
|
||||||
tensor_parallel_size=2,
|
tensor_parallel_size=2,
|
||||||
cudagraph_capture_sizes=[1, 2, 4, 8],
|
cudagraph_capture_sizes=[1, 2, 4, 8],
|
||||||
quantization="ascend",
|
quantization="ascend",
|
||||||
) as vllm_model:
|
) as vllm_model:
|
||||||
vllm_model.generate_greedy(prompts, max_tokens)
|
vllm_model.generate_greedy(prompts, max_tokens)
|
||||||
|
|
||||||
@@ -92,20 +95,17 @@ def test_qwen3_moe_sp_tp2() -> None:
|
|||||||
example_prompts = [
|
example_prompts = [
|
||||||
"Hello, my name is",
|
"Hello, my name is",
|
||||||
]
|
]
|
||||||
sampling_params = SamplingParams(max_tokens=5,
|
sampling_params = SamplingParams(max_tokens=5, temperature=0.0, top_k=50, top_p=0.9)
|
||||||
temperature=0.0,
|
|
||||||
top_k=50,
|
|
||||||
top_p=0.9)
|
|
||||||
|
|
||||||
with VllmRunner("Qwen/Qwen3-30B-A3B",
|
with VllmRunner(
|
||||||
dtype="auto",
|
"Qwen/Qwen3-30B-A3B",
|
||||||
tensor_parallel_size=2,
|
dtype="auto",
|
||||||
distributed_executor_backend="mp",
|
tensor_parallel_size=2,
|
||||||
compilation_config={"pass_config": {
|
distributed_executor_backend="mp",
|
||||||
"enable_sp": True
|
compilation_config={"pass_config": {"enable_sp": True}},
|
||||||
}},
|
enable_expert_parallel=True,
|
||||||
enable_expert_parallel=True,
|
enforce_eager=True,
|
||||||
enforce_eager=True) as vllm_model:
|
) as vllm_model:
|
||||||
vllm_model.generate(example_prompts, sampling_params)
|
vllm_model.generate(example_prompts, sampling_params)
|
||||||
|
|
||||||
|
|
||||||
@@ -113,33 +113,34 @@ def test_qwen3_moe_sp_tp2() -> None:
|
|||||||
@patch.dict(os.environ, {"HCCL_BUFFSIZE": "2048"})
|
@patch.dict(os.environ, {"HCCL_BUFFSIZE": "2048"})
|
||||||
def test_deepseek_w4a8_accuracy_tp2(model):
|
def test_deepseek_w4a8_accuracy_tp2(model):
|
||||||
prompts = [
|
prompts = [
|
||||||
"Hello, my name is", "The president of the United States is",
|
"Hello, my name is",
|
||||||
"vLLM is a high-throughput and memory-efficient inference and serving engine for LLMs"
|
"The president of the United States is",
|
||||||
]
|
"vLLM is a high-throughput and memory-efficient inference and serving engine for LLMs",
|
||||||
vllm_ds_w4a8_answers = [
|
|
||||||
'逍遙而至地去 accrued', '平行于我udo madreHelen', 'ysteepaolis backwards Kj'
|
|
||||||
]
|
]
|
||||||
|
vllm_ds_w4a8_answers = ["逍遙而至地去 accrued", "平行于我udo madreHelen", "ysteepaolis backwards Kj"]
|
||||||
sampling_params = SamplingParams(max_tokens=5, temperature=0.0)
|
sampling_params = SamplingParams(max_tokens=5, temperature=0.0)
|
||||||
with VllmRunner(model,
|
with VllmRunner(
|
||||||
dtype="auto",
|
model,
|
||||||
tensor_parallel_size=2,
|
dtype="auto",
|
||||||
cudagraph_capture_sizes=[1, 2, 4, 8],
|
tensor_parallel_size=2,
|
||||||
quantization="ascend",
|
cudagraph_capture_sizes=[1, 2, 4, 8],
|
||||||
enable_expert_parallel=True) as vllm_model:
|
quantization="ascend",
|
||||||
vllm_quant_outputs = vllm_model.model.generate(prompts,
|
enable_expert_parallel=True,
|
||||||
sampling_params)
|
) as vllm_model:
|
||||||
|
vllm_quant_outputs = vllm_model.model.generate(prompts, sampling_params)
|
||||||
|
|
||||||
vllm_quant_outputs_list = []
|
vllm_quant_outputs_list = []
|
||||||
for output in vllm_quant_outputs:
|
for output in vllm_quant_outputs:
|
||||||
vllm_quant_outputs_list.append(
|
vllm_quant_outputs_list.append(([output.outputs[0].index], output.outputs[0].text))
|
||||||
([output.outputs[0].index], output.outputs[0].text))
|
|
||||||
vllm_answer_list = []
|
vllm_answer_list = []
|
||||||
vllm_answer_list = ([([0], answer) for answer in vllm_ds_w4a8_answers])
|
vllm_answer_list = [([0], answer) for answer in vllm_ds_w4a8_answers]
|
||||||
|
|
||||||
check_outputs_equal(outputs_0_lst=vllm_answer_list,
|
check_outputs_equal(
|
||||||
outputs_1_lst=vllm_quant_outputs_list,
|
outputs_0_lst=vllm_answer_list,
|
||||||
name_0="vllm_quant_outputs",
|
outputs_1_lst=vllm_quant_outputs_list,
|
||||||
name_1="vllm_answer_outputs")
|
name_0="vllm_quant_outputs",
|
||||||
|
name_1="vllm_answer_outputs",
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
@patch.dict(os.environ, {"VLLM_ASCEND_ENABLE_FLASHCOMM1": "1"})
|
@patch.dict(os.environ, {"VLLM_ASCEND_ENABLE_FLASHCOMM1": "1"})
|
||||||
@@ -148,17 +149,16 @@ def test_qwen3_moe_fc2_tp2() -> None:
|
|||||||
example_prompts = [
|
example_prompts = [
|
||||||
"Hello, my name is",
|
"Hello, my name is",
|
||||||
]
|
]
|
||||||
sampling_params = SamplingParams(max_tokens=5,
|
sampling_params = SamplingParams(max_tokens=5, temperature=0.0, top_k=50, top_p=0.9)
|
||||||
temperature=0.0,
|
|
||||||
top_k=50,
|
|
||||||
top_p=0.9)
|
|
||||||
|
|
||||||
with VllmRunner("Qwen/Qwen3-30B-A3B",
|
with VllmRunner(
|
||||||
dtype="auto",
|
"Qwen/Qwen3-30B-A3B",
|
||||||
tensor_parallel_size=2,
|
dtype="auto",
|
||||||
distributed_executor_backend="mp",
|
tensor_parallel_size=2,
|
||||||
enable_expert_parallel=True,
|
distributed_executor_backend="mp",
|
||||||
enforce_eager=True) as vllm_model:
|
enable_expert_parallel=True,
|
||||||
|
enforce_eager=True,
|
||||||
|
) as vllm_model:
|
||||||
vllm_model.generate(example_prompts, sampling_params)
|
vllm_model.generate(example_prompts, sampling_params)
|
||||||
|
|
||||||
|
|
||||||
@@ -168,20 +168,17 @@ def test_qwen3_moe_fc2_oshard_tp2() -> None:
|
|||||||
example_prompts = [
|
example_prompts = [
|
||||||
"Hello, my name is",
|
"Hello, my name is",
|
||||||
]
|
]
|
||||||
sampling_params = SamplingParams(max_tokens=5,
|
sampling_params = SamplingParams(max_tokens=5, temperature=0.0, top_k=50, top_p=0.9)
|
||||||
temperature=0.0,
|
|
||||||
top_k=50,
|
|
||||||
top_p=0.9)
|
|
||||||
|
|
||||||
with VllmRunner(
|
with VllmRunner(
|
||||||
"Qwen/Qwen3-30B-A3B",
|
"Qwen/Qwen3-30B-A3B",
|
||||||
dtype="auto",
|
dtype="auto",
|
||||||
tensor_parallel_size=2,
|
tensor_parallel_size=2,
|
||||||
distributed_executor_backend="mp",
|
distributed_executor_backend="mp",
|
||||||
enable_expert_parallel=True,
|
enable_expert_parallel=True,
|
||||||
enforce_eager=
|
enforce_eager=True, # TODO(Levi-JQ): support graph mode for fc2 in Qwen
|
||||||
True, # TODO(Levi-JQ): support graph mode for fc2 in Qwen
|
additional_config={"layer_sharding": ["o_proj"]},
|
||||||
additional_config={"layer_sharding": ["o_proj"]}) as vllm_model:
|
) as vllm_model:
|
||||||
vllm_model.generate(example_prompts, sampling_params)
|
vllm_model.generate(example_prompts, sampling_params)
|
||||||
|
|
||||||
|
|
||||||
@@ -190,17 +187,16 @@ def test_deepseek_v2_lite_fc1_tp2() -> None:
|
|||||||
example_prompts = [
|
example_prompts = [
|
||||||
"test" * 1001,
|
"test" * 1001,
|
||||||
]
|
]
|
||||||
sampling_params = SamplingParams(max_tokens=5,
|
sampling_params = SamplingParams(max_tokens=5, temperature=0.0, top_k=50, top_p=0.9)
|
||||||
temperature=0.0,
|
with VllmRunner(
|
||||||
top_k=50,
|
"vllm-ascend/DeepSeek-V2-Lite-W8A8",
|
||||||
top_p=0.9)
|
dtype="auto",
|
||||||
with VllmRunner("vllm-ascend/DeepSeek-V2-Lite-W8A8",
|
tensor_parallel_size=2,
|
||||||
dtype="auto",
|
distributed_executor_backend="mp",
|
||||||
tensor_parallel_size=2,
|
enable_expert_parallel=True,
|
||||||
distributed_executor_backend="mp",
|
enforce_eager=True,
|
||||||
enable_expert_parallel=True,
|
quantization="ascend",
|
||||||
enforce_eager=True,
|
) as vllm_model:
|
||||||
quantization="ascend") as vllm_model:
|
|
||||||
vllm_model.generate(example_prompts, sampling_params)
|
vllm_model.generate(example_prompts, sampling_params)
|
||||||
|
|
||||||
|
|
||||||
@@ -213,12 +209,12 @@ def test_qwen3_dense_fc1_tp2(model):
|
|||||||
max_tokens = 5
|
max_tokens = 5
|
||||||
|
|
||||||
with VllmRunner(
|
with VllmRunner(
|
||||||
model,
|
model,
|
||||||
max_model_len=8192,
|
max_model_len=8192,
|
||||||
dtype="auto",
|
dtype="auto",
|
||||||
tensor_parallel_size=2,
|
tensor_parallel_size=2,
|
||||||
cudagraph_capture_sizes=[1, 2, 4, 8],
|
cudagraph_capture_sizes=[1, 2, 4, 8],
|
||||||
quantization="ascend",
|
quantization="ascend",
|
||||||
) as vllm_model:
|
) as vllm_model:
|
||||||
vllm_model.generate_greedy(example_prompts, max_tokens)
|
vllm_model.generate_greedy(example_prompts, max_tokens)
|
||||||
|
|
||||||
@@ -232,13 +228,13 @@ def test_qwen3_dense_prefetch_mlp_weight_tp2(model):
|
|||||||
max_tokens = 5
|
max_tokens = 5
|
||||||
|
|
||||||
with VllmRunner(
|
with VllmRunner(
|
||||||
model,
|
model,
|
||||||
max_model_len=8192,
|
max_model_len=8192,
|
||||||
dtype="auto",
|
dtype="auto",
|
||||||
tensor_parallel_size=2,
|
tensor_parallel_size=2,
|
||||||
cudagraph_capture_sizes=[1, 2, 4, 8],
|
cudagraph_capture_sizes=[1, 2, 4, 8],
|
||||||
quantization="ascend",
|
quantization="ascend",
|
||||||
additional_config={"weight_prefetch_config": {"enabled": True}},
|
additional_config={"weight_prefetch_config": {"enabled": True}},
|
||||||
) as vllm_model:
|
) as vllm_model:
|
||||||
vllm_model.generate_greedy(example_prompts, max_tokens)
|
vllm_model.generate_greedy(example_prompts, max_tokens)
|
||||||
|
|
||||||
@@ -252,28 +248,20 @@ def test_deepseek3_2_w8a8_pruning_mtp_tp2_ep():
|
|||||||
"Hello ",
|
"Hello ",
|
||||||
]
|
]
|
||||||
# "max_position_embeddings": 163840,
|
# "max_position_embeddings": 163840,
|
||||||
long_example_prompts = [
|
long_example_prompts = ["Hello " * (163839 - 500) + "Hello"]
|
||||||
"Hello " * (163839 - 500) + "Hello"
|
|
||||||
]
|
|
||||||
max_tokens = 500
|
max_tokens = 500
|
||||||
with VllmRunner("vllm-ascend/DeepSeek-V3.2-W8A8-Pruning",
|
with VllmRunner(
|
||||||
tensor_parallel_size=2,
|
"vllm-ascend/DeepSeek-V3.2-W8A8-Pruning",
|
||||||
quantization="ascend",
|
tensor_parallel_size=2,
|
||||||
enable_expert_parallel=True,
|
quantization="ascend",
|
||||||
max_model_len=163840,
|
enable_expert_parallel=True,
|
||||||
compilation_config={
|
max_model_len=163840,
|
||||||
"cudagraph_capture_sizes": [2, 4, 6, 8, 10, 12],
|
compilation_config={"cudagraph_capture_sizes": [2, 4, 6, 8, 10, 12], "cudagraph_mode": "FULL_DECODE_ONLY"},
|
||||||
"cudagraph_mode": "FULL_DECODE_ONLY"
|
speculative_config={"num_speculative_tokens": 1, "method": "deepseek_mtp"},
|
||||||
},
|
additional_config={"layer_sharding": ["q_b_proj", "o_proj"]},
|
||||||
speculative_config={
|
reasoning_parser="deepseek_v3",
|
||||||
"num_speculative_tokens": 1,
|
tokenizer_mode="deepseek_v32",
|
||||||
"method": "deepseek_mtp"
|
) as vllm_model:
|
||||||
},
|
|
||||||
additional_config={
|
|
||||||
"layer_sharding":["q_b_proj", "o_proj"]
|
|
||||||
},
|
|
||||||
reasoning_parser="deepseek_v3",
|
|
||||||
tokenizer_mode="deepseek_v32") as vllm_model:
|
|
||||||
vllm_model.generate_greedy(short_example_prompts, max_tokens)
|
vllm_model.generate_greedy(short_example_prompts, max_tokens)
|
||||||
vllm_model.generate_greedy(long_example_prompts, max_tokens)
|
vllm_model.generate_greedy(long_example_prompts, max_tokens)
|
||||||
|
|
||||||
@@ -285,10 +273,10 @@ def test_qwen3_w4a4_distributed_tp2(model):
|
|||||||
]
|
]
|
||||||
max_tokens = 5
|
max_tokens = 5
|
||||||
with VllmRunner(
|
with VllmRunner(
|
||||||
model,
|
model,
|
||||||
tensor_parallel_size=2,
|
tensor_parallel_size=2,
|
||||||
cudagraph_capture_sizes=[1, 2, 4, 8],
|
cudagraph_capture_sizes=[1, 2, 4, 8],
|
||||||
quantization="ascend",
|
quantization="ascend",
|
||||||
) as vllm_model:
|
) as vllm_model:
|
||||||
vllm_model.generate_greedy(example_prompts, max_tokens)
|
vllm_model.generate_greedy(example_prompts, max_tokens)
|
||||||
|
|
||||||
@@ -300,8 +288,8 @@ def test_gpt_oss_distributed_tp2(model):
|
|||||||
]
|
]
|
||||||
max_tokens = 5
|
max_tokens = 5
|
||||||
with VllmRunner(
|
with VllmRunner(
|
||||||
model,
|
model,
|
||||||
tensor_parallel_size=2,
|
tensor_parallel_size=2,
|
||||||
enforce_eager=True,
|
enforce_eager=True,
|
||||||
) as vllm_model:
|
) as vllm_model:
|
||||||
vllm_model.generate_greedy(example_prompts, max_tokens)
|
vllm_model.generate_greedy(example_prompts, max_tokens)
|
||||||
|
|||||||
@@ -32,9 +32,7 @@ MODELS = ["Qwen/Qwen3-30B-A3B"]
|
|||||||
@pytest.mark.parametrize("model", MODELS)
|
@pytest.mark.parametrize("model", MODELS)
|
||||||
@patch.dict(os.environ, {"VLLM_ASCEND_ENABLE_NZ": "0"})
|
@patch.dict(os.environ, {"VLLM_ASCEND_ENABLE_NZ": "0"})
|
||||||
def test_qwen3_offline_load_and_sleepmode_tp2(model):
|
def test_qwen3_offline_load_and_sleepmode_tp2(model):
|
||||||
script = Path(
|
script = Path(__file__).parent.parent.parent.parent.parent / "examples" / "offline_external_launcher.py"
|
||||||
__file__
|
|
||||||
).parent.parent.parent.parent.parent / "examples" / "offline_external_launcher.py"
|
|
||||||
env = os.environ.copy()
|
env = os.environ.copy()
|
||||||
cmd = [
|
cmd = [
|
||||||
sys.executable,
|
sys.executable,
|
||||||
@@ -65,7 +63,7 @@ def test_qwen3_offline_load_and_sleepmode_tp2(model):
|
|||||||
stderr=subprocess.STDOUT,
|
stderr=subprocess.STDOUT,
|
||||||
timeout=600,
|
timeout=600,
|
||||||
)
|
)
|
||||||
output = proc.stdout.decode(errors='ignore')
|
output = proc.stdout.decode(errors="ignore")
|
||||||
|
|
||||||
print(output)
|
print(output)
|
||||||
|
|
||||||
|
|||||||
@@ -37,12 +37,13 @@ prompts = [
 @pytest.mark.parametrize("tp_size", TENSOR_PARALLELS)
 @pytest.mark.parametrize("pp_size", PIPELINE_PARALLELS)
 @pytest.mark.parametrize("distributed_executor_backend", DIST_EXECUTOR_BACKEND)
-def test_models_pp2(model: str, tp_size: int, pp_size: int,
-                    distributed_executor_backend: str) -> None:
-    with VllmRunner(model,
-                    tensor_parallel_size=tp_size,
-                    pipeline_parallel_size=pp_size,
-                    cudagraph_capture_sizes=[1, 2, 4, 8],
-                    distributed_executor_backend=distributed_executor_backend,
-                    gpu_memory_utilization=0.7) as vllm_model:
+def test_models_pp2(model: str, tp_size: int, pp_size: int, distributed_executor_backend: str) -> None:
+    with VllmRunner(
+        model,
+        tensor_parallel_size=tp_size,
+        pipeline_parallel_size=pp_size,
+        cudagraph_capture_sizes=[1, 2, 4, 8],
+        distributed_executor_backend=distributed_executor_backend,
+        gpu_memory_utilization=0.7,
+    ) as vllm_model:
         vllm_model.generate_greedy(prompts, 64)
@@ -11,11 +11,14 @@ MODELS = [
 # for MHA
 "Qwen/Qwen3-8B",
 # for MLA
-"deepseek-ai/DeepSeek-V2-Lite-Chat"
+"deepseek-ai/DeepSeek-V2-Lite-Chat",
 ]

 # A prompt containing a large markdown table. The table is randomly generated by GPT-4.
-LONG_PROMPT = "You are a helpful assistant in recognizes the content of tables in markdown format. Here is a table as follows.\n# Table\n" + """
+# ruff: noqa: E501
+LONG_PROMPT = (
+"You are a helpful assistant in recognizes the content of tables in markdown format. Here is a table as follows.\n# Table\n"
++ """
 | ID | Name | Age | Occupation | Country | Email | Phone Number | Address |
 |-----|---------------|-----|---------------|---------------|------------------------|----------------|------------------------------|
 | 1 | John Doe | 29 | Engineer | USA | john.doe@example.com | 555-1234 | 123 Elm St, Springfield, IL |
@@ -49,32 +52,34 @@ LONG_PROMPT = "You are a helpful assistant in recognizes the content of tables i
 | 29 | Amy White | 33 | Musician | New Zealand | amy.w@example.com | 555-5658 | 159 Maple St, Wellington, NZ |
 | 30 | Ben Black | 38 | Chef | Ireland | ben.b@example.com | 555-7870 | 246 Fir St, Waterford, IE |
 """
+)

 INPUT_PROMPTS = [
-LONG_PROMPT +
-"Question: what is the age of John Doe? Your answer: The age of John Doe is ",
-LONG_PROMPT +
-"Question: what is the age of Zack Blue? Your answer: The age of Zack Blue is "
+LONG_PROMPT + "Question: what is the age of John Doe? Your answer: The age of John Doe is ",
+LONG_PROMPT + "Question: what is the age of Zack Blue? Your answer: The age of Zack Blue is ",
 ]


 @pytest.mark.parametrize("model", MODELS)
 @pytest.mark.parametrize("max_tokens", [50])
 def test_models_prefix_cache_tp2(model: str, max_tokens: int) -> None:
-with VllmRunner(model,
-max_model_len=2048,
-tensor_parallel_size=2,
-cudagraph_capture_sizes=[1, 2, 4, 8],
-gpu_memory_utilization=0.7) as vllm_model:
-prefix_cache_output = vllm_model.generate_greedy(
-INPUT_PROMPTS, max_tokens)
+with VllmRunner(
+model,
+max_model_len=2048,
+tensor_parallel_size=2,
+cudagraph_capture_sizes=[1, 2, 4, 8],
+gpu_memory_utilization=0.7,
+) as vllm_model:
+prefix_cache_output = vllm_model.generate_greedy(INPUT_PROMPTS, max_tokens)

-with VllmRunner(model,
-enable_prefix_caching=False,
-max_model_len=2048,
-tensor_parallel_size=2,
-cudagraph_capture_sizes=[1, 2, 4, 8],
-gpu_memory_utilization=0.7) as vllm_model:
+with VllmRunner(
+model,
+enable_prefix_caching=False,
+max_model_len=2048,
+tensor_parallel_size=2,
+cudagraph_capture_sizes=[1, 2, 4, 8],
+gpu_memory_utilization=0.7,
+) as vllm_model:
 vllm_output = vllm_model.generate_greedy(INPUT_PROMPTS, max_tokens)

 check_outputs_equal(
@@ -16,7 +16,6 @@
 # This file is a part of the vllm-ascend project.
 # Adapted from vllm/tests/basic_correctness/test_basic_correctness.py
 #
-import pytest

 from tests.e2e.conftest import VllmRunner

@@ -27,16 +26,16 @@ def test_qwen2_5_w8a8_external_quantized_tp2():
 ]
 max_tokens = 5
 with VllmRunner(
 "neuralmagic/Qwen2.5-3B-quantized.w8a8",
 tensor_parallel_size=2,
 cudagraph_capture_sizes=[1, 2, 4, 8],
 max_model_len=4096,
 gpu_memory_utilization=0.8,
 ) as vllm_model:
 vllm_output = vllm_model.generate_greedy(example_prompts, max_tokens)

 golden_results = [
-'The president of the United States is the head of state and',
+"The president of the United States is the head of state and",
 ]

 for i in range(len(vllm_output)):
@@ -50,36 +49,37 @@ def test_qwen3_moe_w8a8_dynamic_llm_compressor():
 ]
 max_tokens = 5
 with VllmRunner(
 "vllm-ascend/Qwen3-30B-A3B-Instruct-2507-quantized.w8a8",
 tensor_parallel_size=2,
 max_model_len=4096,
 gpu_memory_utilization=0.8,
 ) as vllm_model:
 vllm_output = vllm_model.generate_greedy(example_prompts, max_tokens)

 golden_results = [
-'The president of the United States is the head of state and',
+"The president of the United States is the head of state and",
 ]

 for i in range(len(vllm_output)):
 assert golden_results[i] == vllm_output[i][1]
 print(f"Generated text: {vllm_output[i][1]!r}")


 def test_qwen3_moe_w4a8_dynamic_llm_compressor():
 example_prompts = [
 "The president of the United States is",
 ]
 max_tokens = 5
 with VllmRunner(
 "vllm-ascend/Qwen3-30B-A3B-Instruct-2507-quantized.w4a8",
 tensor_parallel_size=2,
 max_model_len=4096,
 gpu_memory_utilization=0.8,
 ) as vllm_model:
 vllm_output = vllm_model.generate_greedy(example_prompts, max_tokens)

 golden_results = [
-'The president of the United States is the head of state and',
+"The president of the United States is the head of state and",
 ]

 for i in range(len(vllm_output)):
@@ -34,11 +34,11 @@ def test_qwen3_moe_distributed_mp_tp2_ep():
 ]
 max_tokens = 5
 with VllmRunner(
 "Qwen/Qwen3-30B-A3B",
 tensor_parallel_size=2,
 enable_expert_parallel=True,
 cudagraph_capture_sizes=[1, 2, 4, 8],
 distributed_executor_backend="mp",
 ) as vllm_model:
 vllm_model.generate_greedy(example_prompts, max_tokens)

@@ -49,27 +49,27 @@ def test_qwen3_moe_w8a8_distributed_tp2():
 ]
 max_tokens = 5
 with VllmRunner(
 "vllm-ascend/Qwen3-30B-A3B-W8A8",
 max_model_len=8192,
 tensor_parallel_size=2,
 cudagraph_capture_sizes=[1, 2, 4, 8],
 quantization="ascend",
 ) as vllm_model:
 vllm_model.generate_greedy(example_prompts, max_tokens)


 def test_qwen3_moe_distributed_aiv_tp2():
-os.environ['HCCL_OP_EXPANSION_MODE'] = 'AIV'
+os.environ["HCCL_OP_EXPANSION_MODE"] = "AIV"
 example_prompts = [
 "Hello, my name is",
 ]
 dtype = "auto"
 max_tokens = 5
 with VllmRunner(
 "Qwen/Qwen3-30B-A3B",
 dtype=dtype,
 tensor_parallel_size=2,
 cudagraph_capture_sizes=[1, 2, 4, 8],
 ) as vllm_model:
 vllm_model.generate_greedy(example_prompts, max_tokens)

@@ -80,23 +80,24 @@ async def test_qwen3_moe_w8a8_distributed_tp2_ep_dynamic_eplb():
 port = get_open_port()
 compilation_config = json.dumps({"cudagraph_capture_sizes": [8]})
 server_args = [
-"--max_model_len", "8192", "--tensor_parallel_size", "2",
-"--enable_expert_parallel", "--quantization", "ascend", "--port",
-str(port), "--compilation-config", compilation_config
+"--max_model_len",
+"8192",
+"--tensor_parallel_size",
+"2",
+"--enable_expert_parallel",
+"--quantization",
+"ascend",
+"--port",
+str(port),
+"--compilation-config",
+compilation_config,
 ]
 env_dict = {"HCCL_BUFFSIZE": "1024"}
-with RemoteOpenAIServer(model,
-server_args,
-server_port=port,
-auto_port=False,
-env_dict=env_dict) as server:
+with RemoteOpenAIServer(model, server_args, server_port=port, auto_port=False, env_dict=env_dict) as server:
 client = server.get_async_client()
-batch = await client.completions.create(model=model,
-prompt="What is deeplearning?",
-max_tokens=400,
-temperature=0,
-top_p=1.0,
-n=1)
+batch = await client.completions.create(
+model=model, prompt="What is deeplearning?", max_tokens=400, temperature=0, top_p=1.0, n=1
+)
 gt_choices: list[openai.types.CompletionChoice] = batch.choices

 # dynamic eplb test
@@ -108,22 +109,14 @@ async def test_qwen3_moe_w8a8_distributed_tp2_ep_dynamic_eplb():
 "dynamic_eplb": True,
 "expert_heat_collection_interval": 100,
 "algorithm_execution_interval": 20,
-"num_redundant_experts": 2
+"num_redundant_experts": 2,
 }
 }
 server_args.extend(["--additional-config", json.dumps(additional_config)])
-with RemoteOpenAIServer(model,
-server_args,
-server_port=port,
-auto_port=False,
-env_dict=env_dict) as server:
+with RemoteOpenAIServer(model, server_args, server_port=port, auto_port=False, env_dict=env_dict) as server:
 client = server.get_async_client()
-batch = await client.completions.create(model=model,
-prompt="What is deeplearning?",
-max_tokens=400,
-temperature=0,
-top_p=1.0,
-n=1)
+batch = await client.completions.create(
+model=model, prompt="What is deeplearning?", max_tokens=400, temperature=0, top_p=1.0, n=1
+)
 eplb_choices: list[openai.types.CompletionChoice] = batch.choices
-assert gt_choices[0].text == eplb_choices[
-0].text, f"{gt_choices[0].text=} \n {eplb_choices[0].text=}"
+assert gt_choices[0].text == eplb_choices[0].text, f"{gt_choices[0].text=} \n {eplb_choices[0].text=}"
@@ -1,10 +1,11 @@
 import os
 from unittest.mock import patch

-from tests.e2e.conftest import VllmRunner
 from vllm import SamplingParams
 from vllm.sampling_params import RequestOutputKind

+from tests.e2e.conftest import VllmRunner


 @patch.dict(os.environ, {"OMP_NUM_THREADS": "1"})
 def test_qwen3_moe_routing_replay():
@@ -12,18 +13,15 @@ def test_qwen3_moe_routing_replay():
 "Hello, please introduce yourself.",
 ]
 with VllmRunner(
 "Qwen/Qwen3-30B-A3B",
 tensor_parallel_size=2,
 enable_expert_parallel=True,
 cudagraph_capture_sizes=[1, 2, 4, 8],
 distributed_executor_backend="mp",
 enable_return_routed_experts=True,
 ) as vllm_model:
 sampling_params = SamplingParams(
-max_tokens=5,
-temperature=0.8,
-top_p=0.95,
-output_kind=RequestOutputKind.FINAL_ONLY
+max_tokens=5, temperature=0.8, top_p=0.95, output_kind=RequestOutputKind.FINAL_ONLY
 )
 inputs = vllm_model.get_inputs(prompts=prompts)
 outputs = vllm_model.model.generate(prompts=inputs, sampling_params=sampling_params)
@@ -84,11 +84,7 @@ async def test_models(model: str) -> None:
 request_keyword_args: dict[str, Any] = {
 **api_keyword_args,
 }
-with RemoteOpenAIServer(model,
-server_args,
-server_port=port,
-env_dict=env_dict,
-auto_port=False) as server:
+with RemoteOpenAIServer(model, server_args, server_port=port, env_dict=env_dict, auto_port=False) as server:
 client = server.get_async_client()
 batch = await client.completions.create(
 model=model,
@@ -13,69 +13,65 @@ MODELS = [

 @pytest.mark.parametrize("model", MODELS)
 def test_deepseek_v2_lite_enable_shared_expert_dp_tp2(model: str) -> None:
-
-if 'HCCL_OP_EXPANSION_MODE' in os.environ:
-del os.environ['HCCL_OP_EXPANSION_MODE']
+if "HCCL_OP_EXPANSION_MODE" in os.environ:
+del os.environ["HCCL_OP_EXPANSION_MODE"]

 prompts = [
-"Hello, my name is", "The capital of the United States is",
-"The capital of France is", "The future of AI is"
+"Hello, my name is",
+"The capital of the United States is",
+"The capital of France is",
+"The future of AI is",
 ]
 sampling_params = SamplingParams(max_tokens=32, temperature=0.0)

 with VllmRunner(
 model,
 max_model_len=1024,
 enforce_eager=True,
 tensor_parallel_size=2,
 enable_expert_parallel=True,
 ) as runner:
 vllm_eager_outputs = runner.model.generate(prompts, sampling_params)

 os.environ["VLLM_ASCEND_ENABLE_FLASHCOMM1"] = "1"
 with VllmRunner(
 model,
 max_model_len=1024,
 enforce_eager=True,
 tensor_parallel_size=2,
 enable_expert_parallel=True,
 additional_config={
 "enable_shared_expert_dp": True,
 },
 ) as runner:
-shared_expert_dp_eager_outputs = runner.model.generate(
-prompts, sampling_params)
+shared_expert_dp_eager_outputs = runner.model.generate(prompts, sampling_params)

 with VllmRunner(
 model,
 max_model_len=1024,
 tensor_parallel_size=2,
 enable_expert_parallel=True,
 compilation_config={
 "cudagraph_capture_sizes": [1, 4, 8, 16],
 "cudagraph_mode": "FULL_DECODE_ONLY",
 },
 additional_config={
 "enable_shared_expert_dp": True,
 },
 ) as runner:
-shared_expert_dp_aclgraph_outputs = runner.model.generate(
-prompts, sampling_params)
+shared_expert_dp_aclgraph_outputs = runner.model.generate(prompts, sampling_params)

 vllm_eager_outputs_list = []
 for output in vllm_eager_outputs:
-vllm_eager_outputs_list.append(
-(output.outputs[0].index, output.outputs[0].text))
+vllm_eager_outputs_list.append((output.outputs[0].index, output.outputs[0].text))

 shared_expert_dp_eager_outputs_list = []
 for output in shared_expert_dp_eager_outputs:
-shared_expert_dp_eager_outputs_list.append(
-(output.outputs[0].index, output.outputs[0].text))
+shared_expert_dp_eager_outputs_list.append((output.outputs[0].index, output.outputs[0].text))

 shared_expert_dp_aclgraph_outputs_list = []
 for output in shared_expert_dp_aclgraph_outputs:
-shared_expert_dp_aclgraph_outputs_list.append(
-(output.outputs[0].index, output.outputs[0].text))
+shared_expert_dp_aclgraph_outputs_list.append((output.outputs[0].index, output.outputs[0].text))

 check_outputs_equal(
 outputs_0_lst=vllm_eager_outputs_list,
@@ -39,8 +39,7 @@ api_keyword_args = {
 @pytest.mark.asyncio
 @pytest.mark.parametrize("model", MODELS)
 @pytest.mark.parametrize("dp_size", DATA_PARALLELS)
-async def test_models_single_request_aclgraph_dp2(model: str,
-dp_size: int) -> None:
+async def test_models_single_request_aclgraph_dp2(model: str, dp_size: int) -> None:
 port = get_open_port()
 env_dict = {
 "TASK_QUEUE_ENABLE": "1",
@@ -48,36 +47,51 @@ async def test_models_single_request_aclgraph_dp2(model: str,
 }
 if model == "vllm-ascend/DeepSeek-V2-Lite-W8A8":
 server_args = [
-"--no-enable-prefix-caching", "--tensor-parallel-size", "1",
+"--no-enable-prefix-caching",
+"--tensor-parallel-size",
+"1",
 "--data-parallel-size",
-str(dp_size), "--quantization", "ascend", "--max-model-len",
-"1024", "--port",
-str(port), "--trust-remote-code", "--gpu-memory-utilization", "0.9"
+str(dp_size),
+"--quantization",
+"ascend",
+"--max-model-len",
+"1024",
+"--port",
+str(port),
+"--trust-remote-code",
+"--gpu-memory-utilization",
+"0.9",
 ]
 else:
 server_args = [
-"--no-enable-prefix-caching", "--tensor-parallel-size", "1",
+"--no-enable-prefix-caching",
+"--tensor-parallel-size",
+"1",
 "--data-parallel-size",
-str(dp_size), "--port",
-str(port), "--trust-remote-code", "--gpu-memory-utilization", "0.9"
+str(dp_size),
+"--port",
+str(port),
+"--trust-remote-code",
+"--gpu-memory-utilization",
+"0.9",
 ]
 request_keyword_args: dict[str, Any] = {
 **api_keyword_args,
 }
-with RemoteOpenAIServer(model,
-vllm_serve_args=server_args,
-server_port=port,
-env_dict=env_dict,
-auto_port=False) as server:
+with RemoteOpenAIServer(
+model, vllm_serve_args=server_args, server_port=port, env_dict=env_dict, auto_port=False
+) as server:
 client = server.get_async_client()

 try:
-batch = await asyncio.wait_for(client.completions.create(
-model=model,
-prompt=prompts,
-**request_keyword_args,
-),
-timeout=10.0)
+batch = await asyncio.wait_for(
+client.completions.create(
+model=model,
+prompt=prompts,
+**request_keyword_args,
+),
+timeout=10.0,
+)
 except asyncio.TimeoutError:
 pytest.fail("Model did not return response within 10 seconds")

@@ -1,5 +1,3 @@
-import os
-
 import pytest
 from vllm import SamplingParams

@@ -14,47 +12,46 @@ MODELS = [
 @pytest.mark.parametrize("model", MODELS)
 def test_qwen3_vl_sp_tp2(model: str) -> None:
 prompts = [
-"Hello, my name is", "The capital of the United States is",
-"The capital of France is", "The future of AI is"
+"Hello, my name is",
+"The capital of the United States is",
+"The capital of France is",
+"The future of AI is",
 ]
 sampling_params = SamplingParams(max_tokens=10, temperature=0.0)

 with VllmRunner(
 model,
 max_model_len=1024,
 tensor_parallel_size=2,
 compilation_config={
 "cudagraph_capture_sizes": [2, 4],
 "cudagraph_mode": "FULL_DECODE_ONLY",
-"pass_config": {"enable_sp": False}
+"pass_config": {"enable_sp": False},
 },
-additional_config={"ascend_compilation_config": {"enable_npugraph_ex": False}}
+additional_config={"ascend_compilation_config": {"enable_npugraph_ex": False}},
 ) as runner:
 no_sp_outputs = runner.model.generate(prompts, sampling_params)

 with VllmRunner(
 model,
 max_model_len=1024,
 tensor_parallel_size=2,
 compilation_config={
 "cudagraph_capture_sizes": [2, 4],
 "cudagraph_mode": "FULL_DECODE_ONLY",
-"pass_config": {"enable_sp": True}
+"pass_config": {"enable_sp": True},
 },
-additional_config={"sp_threshold": 10, "ascend_compilation_config": {"enable_npugraph_ex": False}}
+additional_config={"sp_threshold": 10, "ascend_compilation_config": {"enable_npugraph_ex": False}},
 ) as runner:
-sp_outputs = runner.model.generate(
-prompts, sampling_params)
+sp_outputs = runner.model.generate(prompts, sampling_params)

 no_sp_outputs_list = []
 for output in no_sp_outputs:
-no_sp_outputs_list.append(
-(output.outputs[0].index, output.outputs[0].text))
+no_sp_outputs_list.append((output.outputs[0].index, output.outputs[0].text))

 sp_outputs_list = []
 for output in sp_outputs:
-sp_outputs_list.append(
-(output.outputs[0].index, output.outputs[0].text))
+sp_outputs_list.append((output.outputs[0].index, output.outputs[0].text))

 check_outputs_equal(
 outputs_0_lst=no_sp_outputs_list,