### What this PR does / why we need it?
Reformat the single-card e2e tests: migrate type hints to PEP 604 unions and built-in generics (`Optional[X]`/`List[X]` → `X | None`/`list[X]`), normalize string quoting, and reflow multi-line call sites into the new formatter style. The following test files are touched:
| File Path |
| :--- |
| `tests/e2e/singlecard/compile/backend.py` |
| `tests/e2e/singlecard/compile/test_graphex_norm_quant_fusion.py` |
| `tests/e2e/singlecard/compile/test_graphex_qknorm_rope_fusion.py` |
| `tests/e2e/singlecard/compile/test_norm_quant_fusion.py` |
| `tests/e2e/singlecard/model_runner_v2/test_basic.py` |
| `tests/e2e/singlecard/test_aclgraph_accuracy.py` |
| `tests/e2e/singlecard/test_aclgraph_batch_invariant.py` |
| `tests/e2e/singlecard/test_aclgraph_mem.py` |
| `tests/e2e/singlecard/test_async_scheduling.py` |
| `tests/e2e/singlecard/test_auto_fit_max_mode_len.py` |
| `tests/e2e/singlecard/test_batch_invariant.py` |
| `tests/e2e/singlecard/test_camem.py` |
| `tests/e2e/singlecard/test_completion_with_prompt_embeds.py` |
| `tests/e2e/singlecard/test_cpu_offloading.py` |
| `tests/e2e/singlecard/test_guided_decoding.py` |
| `tests/e2e/singlecard/test_ilama_lora.py` |
| `tests/e2e/singlecard/test_llama32_lora.py` |
| `tests/e2e/singlecard/test_models.py` |
| `tests/e2e/singlecard/test_multistream_overlap_shared_expert.py` |
| `tests/e2e/singlecard/test_quantization.py` |
| `tests/e2e/singlecard/test_qwen3_multi_loras.py` |
| `tests/e2e/singlecard/test_sampler.py` |
| `tests/e2e/singlecard/test_vlm.py` |
| `tests/e2e/singlecard/test_xlite.py` |
| `tests/e2e/singlecard/utils.py` |
### Does this PR introduce _any_ user-facing change?
No. Only test files under `tests/e2e/singlecard/` are modified.
### How was this patch tested?
- vLLM version: v0.15.0
- vLLM main: 9562912cea
---------
Signed-off-by: MrZ20 <2609716663@qq.com>
```diff
@@ -14,8 +14,9 @@
# See the License for the specific language governing permissions and
# limitations under the License.
#
from collections.abc import Callable, Sequence
from copy import deepcopy
from typing import Any, Callable, List, Optional, Sequence
from typing import Any

import torch.fx as fx
from torch._inductor.decomposition import select_decomp_table
@@ -37,7 +38,7 @@ class TestBackend:
    records the FX graph before and after the transformation.
    """

    def __init__(self, custom_passes: Optional[List[Any]] = None):
    def __init__(self, custom_passes: list[Any] | None = None):
        vllm_config = get_current_vllm_config()
        compile_config = vllm_config.compilation_config
        self.inductor_config = compile_config.inductor_compile_config
@@ -48,9 +49,7 @@ class TestBackend:
        self.graph_pre_pass = None
        self.graph_post_pass = None

    def post_pass(self,
                  graph: fx.Graph,
                  runtime_shape: int | None = None) -> fx.Graph:
    def post_pass(self, graph: fx.Graph, runtime_shape: int | None = None) -> fx.Graph:
        """
        Apply custom graph transformation passes.
        """
@@ -62,13 +61,13 @@ class TestBackend:
        return graph

    def compile(
            self,
            graph: fx.GraphModule,
            example_inputs: list[Any],
            compiler_config: dict[str, Any],
            runtime_shape: Optional[int] = None,
            key: Optional[str] = None
    ) -> tuple[Optional[Callable], Optional[Any]]:
        self,
        graph: fx.GraphModule,
        example_inputs: list[Any],
        compiler_config: dict[str, Any],
        runtime_shape: int | None = None,
        key: str | None = None,
    ) -> tuple[Callable | None, Any | None]:
        """
        Compile the FX graph using vLLM's Ascend compiler interface.
        Wraps the post-pass logic into the inner_compile callback.
@@ -87,8 +86,7 @@ class TestBackend:
        )
        return compiled_fn, None

    def __call__(self, gm: fx.GraphModule,
                 example_inputs: Optional[List[Any]]):
    def __call__(self, gm: fx.GraphModule, example_inputs: list[Any] | None):
        """
        Make the backend callable by torch.compile().
        Returns a compiled executable function.
@@ -103,17 +101,11 @@ class TestBackend:
        )
        return compiled_fn

    def find_nodes_by_target(self, graph: fx.GraphModule,
                             target: OpOverload) -> List[fx.Node]:
    def find_nodes_by_target(self, graph: fx.GraphModule, target: OpOverload) -> list[fx.Node]:
        """Helper to find all FX nodes that call a specific operator."""
        return [
            node for node in graph.graph.nodes
            if hasattr(node, 'target') and node.target == target
        ]
        return [node for node in graph.graph.nodes if hasattr(node, "target") and node.target == target]

    def check_before_ops(self,
                         ops: Sequence[OpOverload],
                         fully_replaced: bool = True):
    def check_before_ops(self, ops: Sequence[OpOverload], fully_replaced: bool = True):
        """
        Verify that the original (unfused) operators exist before the pass
        and are fully removed afterward (if fully_replaced=True).
```
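For context (not part of the diff): the fusion tests drive this backend roughly as in the sketch below, pieced together from the hunks in this PR. The helper name `run_fusion_check` and the exact pass constructor arguments are assumptions.

```python
# Illustrative only: how TestBackend is exercised by the fusion tests.
import torch

from tests.e2e.singlecard.compile.backend import TestBackend
from vllm_ascend.compilation.passes.norm_quant_fusion_pass import AddRMSNormQuantFusionPass


def run_fusion_check(model: torch.nn.Module, x: torch.Tensor, vllm_config) -> None:
    # Wrap the fusion pass in the test backend so it records the FX graph
    # before and after the transformation.
    backend = TestBackend(custom_passes=[AddRMSNormQuantFusionPass(vllm_config=vllm_config)])

    # torch.compile() calls TestBackend.__call__, which compiles the graph
    # through vLLM-Ascend's compiler interface with post_pass() applied.
    compiled = torch.compile(model, backend=backend)
    compiled(x)

    # The unfused ops must appear in the pre-pass graph (and be fully removed
    # afterwards); the fused op must appear in the post-pass graph.
    backend.check_before_ops(model.ops_in_model_before(), fully_replaced=True)
    backend.check_after_ops(model.ops_in_model_after())
```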
```diff
@@ -215,6 +215,7 @@ def register_pattern_safe(pattern_class, vllm_config, eps, pattern_key):
    try:
        # Import the required pass class
        from torch._inductor.pattern_matcher import PatternMatcherPass

        pm_pass = PatternMatcherPass()
        pattern.register(pm_pass)
        _registered_patterns.add(pattern_key)
```
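For context (not part of the diff): `register_pattern_safe` guards against registering the same inductor pattern twice across parametrized runs. A minimal sketch of that idea follows; everything outside the visible hunk (the module-level set, the pattern constructor signature, the error handling) is an assumption.

```python
# Illustrative sketch of duplicate-safe pattern registration.
_registered_patterns: set[str] = set()


def register_pattern_safe(pattern_class, vllm_config, eps, pattern_key: str) -> None:
    if pattern_key in _registered_patterns:
        return  # already registered by an earlier parametrized test run
    try:
        from torch._inductor.pattern_matcher import PatternMatcherPass

        pattern = pattern_class(vllm_config=vllm_config, eps=eps)  # constructor args assumed
        pm_pass = PatternMatcherPass()
        pattern.register(pm_pass)
        _registered_patterns.add(pattern_key)
    except Exception as exc:  # registration is treated as best-effort here
        print(f"Skipping pattern {pattern_key}: {exc}")
```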
```diff
@@ -243,7 +244,7 @@ def test_rmsnorm_quant_fusion(
    sp_enable: bool,
):
    # Check if fusion operator is available
    if not hasattr(torch.ops.npu, 'npu_add_rms_norm_quant'):
    if not hasattr(torch.ops.npu, "npu_add_rms_norm_quant"):
        pytest.skip("Fusion operator npu_add_rms_norm_quant not available, skipping test")

    vllm_config = VllmConfig(model_config=ModelConfig(dtype=dtype))
@@ -266,7 +267,7 @@ def test_rmsnorm_quant_fusion(
        if not enable_custom_op():
            pytest.skip("Custom ops not available, skipping bias test")
        # Check if the bias operator exists
        if not hasattr(torch.ops._C_ascend, 'npu_add_rms_norm_bias'):
        if not hasattr(torch.ops._C_ascend, "npu_add_rms_norm_bias"):
            pytest.skip("Operator npu_add_rms_norm_bias not available, skipping bias test")
        if sp_enable:
            model = ModelSPWithBias(hidden_size, dtype, eps, device="npu")
@@ -281,13 +282,11 @@ def test_rmsnorm_quant_fusion(
    else:
        # The non-bias patterns currently use npu_add_rms_norm_bias in their pattern matching
        # so we need to skip if it's not available
        if not hasattr(torch.ops._C_ascend, 'npu_add_rms_norm_bias'):
        if not hasattr(torch.ops._C_ascend, "npu_add_rms_norm_bias"):
            pytest.skip("Operator npu_add_rms_norm_bias not available, skipping test")
        if sp_enable:
            model = ModelSPWithoutBias(hidden_size, dtype, eps, device="npu")
            register_pattern_safe(
                AddRMSNormQuantSPPattern, vllm_config, eps, "GraphEXAddRMSNormQuantSPPattern"
            )
            register_pattern_safe(AddRMSNormQuantSPPattern, vllm_config, eps, "GraphEXAddRMSNormQuantSPPattern")
        else:
            model = ModelWithoutBias(hidden_size, dtype, eps, device="npu")
            register_pattern_safe(AddRMSNormQuantPattern, vllm_config, eps, "GraphEXAddRMSNormQuantPattern")
@@ -302,5 +301,9 @@ def test_rmsnorm_quant_fusion(
    compiled_out, compiled_res = compiled_model(x)

    # Verify output shapes are correct
    assert compiled_out.shape == (num_tokens, hidden_size), f"Expected shape {(num_tokens, hidden_size)}, got {compiled_out.shape}"
    assert compiled_res.shape == (num_tokens, hidden_size), f"Expected shape {(num_tokens, hidden_size)}, got {compiled_res.shape}"
    assert compiled_out.shape == (num_tokens, hidden_size), (
        f"Expected shape {(num_tokens, hidden_size)}, got {compiled_out.shape}"
    )
    assert compiled_res.shape == (num_tokens, hidden_size), (
        f"Expected shape {(num_tokens, hidden_size)}, got {compiled_res.shape}"
    )
```
```diff
@@ -201,6 +201,7 @@ def test_rmsnorm_quant_fusion(
        vllm_config=vllm_config, head_dim=head_dim, num_heads=num_heads, num_kv_heads=num_kv_heads, eps=eps
    )
    from torch._inductor.pattern_matcher import PatternMatcherPass

    pm_pass = PatternMatcherPass()
    fusion_pattern.register(pm_pass)
    model = model.to("npu")
```
```diff
@@ -14,25 +14,20 @@
# See the License for the specific language governing permissions and
# limitations under the License.
#
from typing import List

import pytest
import torch
import torch.nn as nn
import torch_npu
import vllm.config
from vllm.config import ModelConfig, VllmConfig
from vllm.distributed import (ensure_model_parallel_initialized,
                              init_distributed_environment)
from vllm.distributed import ensure_model_parallel_initialized, init_distributed_environment
from vllm.utils.system_utils import update_environment_variables

import vllm_ascend.ops.register_custom_ops  # noqa
from tests.e2e.singlecard.compile.backend import TestBackend
from vllm_ascend.ascend_forward_context import set_ascend_forward_context
from vllm_ascend.compilation.passes.norm_quant_fusion_pass import \
    AddRMSNormQuantFusionPass
from vllm_ascend.utils import enable_custom_op
from vllm_ascend.utils import vllm_version_is
from vllm_ascend.compilation.passes.norm_quant_fusion_pass import AddRMSNormQuantFusionPass
from vllm_ascend.utils import enable_custom_op, vllm_version_is

if vllm_version_is("0.15.0"):
    from vllm.compilation.fx_utils import OpOverload  # type: ignore
```
@@ -48,34 +43,24 @@ def get_or_create_backend(vllm_config):
|
||||
"""Get or create backend with fusion passes (cached to avoid duplicate pattern registration)."""
|
||||
global _backend_cache
|
||||
if _backend_cache is None:
|
||||
_backend_cache = TestBackend(custom_passes=[
|
||||
AddRMSNormQuantFusionPass(vllm_config=vllm_config)
|
||||
])
|
||||
_backend_cache = TestBackend(custom_passes=[AddRMSNormQuantFusionPass(vllm_config=vllm_config)])
|
||||
return _backend_cache
|
||||
|
||||
|
||||
class TestModelWithoutBias(nn.Module):
|
||||
"""
|
||||
A minimal test model that simulates the pattern:
|
||||
AddRMSNorm → Quantization (without bias)
|
||||
"""
|
||||
|
||||
def __init__(self,
|
||||
hidden_size: int,
|
||||
dtype: torch.dtype,
|
||||
eps: float = 1e-6,
|
||||
device="npu"):
|
||||
def __init__(self, hidden_size: int, dtype: torch.dtype, eps: float = 1e-6, device="npu"):
|
||||
super().__init__()
|
||||
self.hidden_size = hidden_size
|
||||
self.eps = eps
|
||||
self.rms_norm_weight = nn.Parameter(
|
||||
torch.randn(hidden_size, device=device))
|
||||
self.rms_norm_weight = nn.Parameter(torch.randn(hidden_size, device=device))
|
||||
self.quant_scale = torch.ones(hidden_size, dtype=dtype, device=device)
|
||||
self.quant_scale_reciprocal = torch.ones(hidden_size,
|
||||
dtype=dtype,
|
||||
device=device)
|
||||
self.quant_offset = torch.zeros(hidden_size,
|
||||
dtype=dtype,
|
||||
device=device)
|
||||
self.quant_scale_reciprocal = torch.ones(hidden_size, dtype=dtype, device=device)
|
||||
self.quant_offset = torch.zeros(hidden_size, dtype=dtype, device=device)
|
||||
|
||||
def forward(self, x):
|
||||
"""
|
||||
@@ -87,23 +72,20 @@ class TestModelWithoutBias(nn.Module):
|
||||
residual = torch.zeros_like(x)
|
||||
|
||||
norm_output, _, new_residual = torch.ops._C_ascend.npu_add_rms_norm_bias(
|
||||
x, residual, self.rms_norm_weight, None, self.eps)
|
||||
x, residual, self.rms_norm_weight, None, self.eps
|
||||
)
|
||||
|
||||
quantized_output = torch.ops.vllm.quantize(norm_output,
|
||||
self.quant_scale,
|
||||
self.quant_scale_reciprocal,
|
||||
self.quant_offset)
|
||||
quantized_output = torch.ops.vllm.quantize(
|
||||
norm_output, self.quant_scale, self.quant_scale_reciprocal, self.quant_offset
|
||||
)
|
||||
|
||||
return quantized_output, new_residual
|
||||
|
||||
def ops_in_model_before(self) -> List[OpOverload]:
|
||||
def ops_in_model_before(self) -> list[OpOverload]:
|
||||
"""Return the list of expected operators BEFORE fusion."""
|
||||
return [
|
||||
torch.ops._C_ascend.npu_add_rms_norm_bias.default,
|
||||
torch.ops.vllm.quantize.default
|
||||
]
|
||||
return [torch.ops._C_ascend.npu_add_rms_norm_bias.default, torch.ops.vllm.quantize.default]
|
||||
|
||||
def ops_in_model_after(self) -> List[OpOverload]:
|
||||
def ops_in_model_after(self) -> list[OpOverload]:
|
||||
"""Return the list of expected operators AFTER successful fusion."""
|
||||
return [torch.ops.npu.npu_add_rms_norm_quant.default]
|
||||
|
||||
@@ -114,24 +96,15 @@ class TestModelWithBias(nn.Module):
|
||||
AddRMSNorm → Add Bias → Quantization (with bias)
|
||||
"""
|
||||
|
||||
def __init__(self,
|
||||
hidden_size: int,
|
||||
dtype: torch.dtype,
|
||||
eps: float = 1e-6,
|
||||
device="npu"):
|
||||
def __init__(self, hidden_size: int, dtype: torch.dtype, eps: float = 1e-6, device="npu"):
|
||||
super().__init__()
|
||||
self.hidden_size = hidden_size
|
||||
self.eps = eps
|
||||
self.rms_norm_weight = nn.Parameter(
|
||||
torch.randn(hidden_size, device=device))
|
||||
self.rms_norm_weight = nn.Parameter(torch.randn(hidden_size, device=device))
|
||||
self.bias = nn.Parameter(torch.randn(hidden_size, device=device))
|
||||
self.quant_scale = torch.ones(hidden_size, dtype=dtype, device=device)
|
||||
self.quant_scale_reciprocal = torch.ones(hidden_size,
|
||||
dtype=dtype,
|
||||
device=device)
|
||||
self.quant_offset = torch.zeros(hidden_size,
|
||||
dtype=dtype,
|
||||
device=device)
|
||||
self.quant_scale_reciprocal = torch.ones(hidden_size, dtype=dtype, device=device)
|
||||
self.quant_offset = torch.zeros(hidden_size, dtype=dtype, device=device)
|
||||
|
||||
def forward(self, x):
|
||||
"""
|
||||
@@ -144,23 +117,20 @@ class TestModelWithBias(nn.Module):
|
||||
residual = torch.zeros_like(x)
|
||||
|
||||
norm_output_with_bias, _, new_residual = torch.ops._C_ascend.npu_add_rms_norm_bias(
|
||||
x, residual, self.rms_norm_weight, self.bias, self.eps)
|
||||
x, residual, self.rms_norm_weight, self.bias, self.eps
|
||||
)
|
||||
|
||||
quantized_output = torch.ops.vllm.quantize(norm_output_with_bias,
|
||||
self.quant_scale,
|
||||
self.quant_scale_reciprocal,
|
||||
self.quant_offset)
|
||||
quantized_output = torch.ops.vllm.quantize(
|
||||
norm_output_with_bias, self.quant_scale, self.quant_scale_reciprocal, self.quant_offset
|
||||
)
|
||||
|
||||
return quantized_output, new_residual
|
||||
|
||||
def ops_in_model_before(self) -> List[OpOverload]:
|
||||
def ops_in_model_before(self) -> list[OpOverload]:
|
||||
"""Return the list of expected operators BEFORE fusion."""
|
||||
return [
|
||||
torch.ops._C_ascend.npu_add_rms_norm_bias.default,
|
||||
torch.ops.vllm.quantize.default
|
||||
]
|
||||
return [torch.ops._C_ascend.npu_add_rms_norm_bias.default, torch.ops.vllm.quantize.default]
|
||||
|
||||
def ops_in_model_after(self) -> List[OpOverload]:
|
||||
def ops_in_model_after(self) -> list[OpOverload]:
|
||||
"""Return the list of expected operators AFTER successful fusion."""
|
||||
return [torch.ops.npu.npu_add_rms_norm_quant.default]
|
||||
|
||||
@@ -171,23 +141,14 @@ class TestModelSPWithoutBias(nn.Module):
|
||||
AddRMSNorm → maybe_allgather → Quantization (without bias)
|
||||
"""
|
||||
|
||||
def __init__(self,
|
||||
hidden_size: int,
|
||||
dtype: torch.dtype,
|
||||
eps: float = 1e-6,
|
||||
device="npu"):
|
||||
def __init__(self, hidden_size: int, dtype: torch.dtype, eps: float = 1e-6, device="npu"):
|
||||
super().__init__()
|
||||
self.hidden_size = hidden_size
|
||||
self.eps = eps
|
||||
self.rms_norm_weight = nn.Parameter(
|
||||
torch.randn(hidden_size, device=device))
|
||||
self.rms_norm_weight = nn.Parameter(torch.randn(hidden_size, device=device))
|
||||
self.quant_scale = torch.ones(hidden_size, dtype=dtype, device=device)
|
||||
self.quant_scale_reciprocal = torch.ones(hidden_size,
|
||||
dtype=dtype,
|
||||
device=device)
|
||||
self.quant_offset = torch.zeros(hidden_size,
|
||||
dtype=dtype,
|
||||
device=device)
|
||||
self.quant_scale_reciprocal = torch.ones(hidden_size, dtype=dtype, device=device)
|
||||
self.quant_offset = torch.zeros(hidden_size, dtype=dtype, device=device)
|
||||
|
||||
def forward(self, x):
|
||||
"""
|
||||
@@ -200,32 +161,28 @@ class TestModelSPWithoutBias(nn.Module):
|
||||
residual = torch.zeros_like(x)
|
||||
|
||||
norm_output, _, new_residual = torch.ops._C_ascend.npu_add_rms_norm_bias(
|
||||
x, residual, self.rms_norm_weight, None, self.eps)
|
||||
x, residual, self.rms_norm_weight, None, self.eps
|
||||
)
|
||||
|
||||
norm_output = torch.ops.vllm.maybe_all_gather_and_maybe_unpad(
|
||||
norm_output, True)
|
||||
norm_output = torch.ops.vllm.maybe_all_gather_and_maybe_unpad(norm_output, True)
|
||||
|
||||
quantized_output = torch.ops.vllm.quantize(norm_output,
|
||||
self.quant_scale,
|
||||
self.quant_scale_reciprocal,
|
||||
self.quant_offset)
|
||||
quantized_output = torch.ops.vllm.quantize(
|
||||
norm_output, self.quant_scale, self.quant_scale_reciprocal, self.quant_offset
|
||||
)
|
||||
|
||||
return quantized_output, new_residual
|
||||
|
||||
def ops_in_model_before(self) -> List[OpOverload]:
|
||||
def ops_in_model_before(self) -> list[OpOverload]:
|
||||
"""Return the list of expected operators BEFORE fusion."""
|
||||
return [
|
||||
torch.ops._C_ascend.npu_add_rms_norm_bias.default,
|
||||
torch.ops.vllm.maybe_all_gather_and_maybe_unpad.default,
|
||||
torch.ops.vllm.quantize.default
|
||||
torch.ops.vllm.quantize.default,
|
||||
]
|
||||
|
||||
def ops_in_model_after(self) -> List[OpOverload]:
|
||||
def ops_in_model_after(self) -> list[OpOverload]:
|
||||
"""Return the list of expected operators AFTER successful fusion."""
|
||||
return [
|
||||
torch.ops.npu.npu_add_rms_norm_quant.default,
|
||||
torch.ops.vllm.maybe_all_gather_and_maybe_unpad.default
|
||||
]
|
||||
return [torch.ops.npu.npu_add_rms_norm_quant.default, torch.ops.vllm.maybe_all_gather_and_maybe_unpad.default]
|
||||
|
||||
|
||||
class TestModelSPWithBias(nn.Module):
|
||||
@@ -234,24 +191,15 @@ class TestModelSPWithBias(nn.Module):
|
||||
AddRMSNorm → Add bias → maybe_allgather → Quantization (without bias)
|
||||
"""
|
||||
|
||||
def __init__(self,
|
||||
hidden_size: int,
|
||||
dtype: torch.dtype,
|
||||
eps: float = 1e-6,
|
||||
device="npu"):
|
||||
def __init__(self, hidden_size: int, dtype: torch.dtype, eps: float = 1e-6, device="npu"):
|
||||
super().__init__()
|
||||
self.hidden_size = hidden_size
|
||||
self.eps = eps
|
||||
self.rms_norm_weight = nn.Parameter(
|
||||
torch.randn(hidden_size, device=device))
|
||||
self.rms_norm_weight = nn.Parameter(torch.randn(hidden_size, device=device))
|
||||
self.bias = nn.Parameter(torch.randn(hidden_size, device=device))
|
||||
self.quant_scale = torch.ones(hidden_size, dtype=dtype, device=device)
|
||||
self.quant_scale_reciprocal = torch.ones(hidden_size,
|
||||
dtype=dtype,
|
||||
device=device)
|
||||
self.quant_offset = torch.zeros(hidden_size,
|
||||
dtype=dtype,
|
||||
device=device)
|
||||
self.quant_scale_reciprocal = torch.ones(hidden_size, dtype=dtype, device=device)
|
||||
self.quant_offset = torch.zeros(hidden_size, dtype=dtype, device=device)
|
||||
|
||||
def forward(self, x):
|
||||
"""
|
||||
@@ -265,32 +213,28 @@ class TestModelSPWithBias(nn.Module):
|
||||
residual = torch.zeros_like(x)
|
||||
|
||||
norm_output_with_bias, _, new_residual = torch.ops._C_ascend.npu_add_rms_norm_bias(
|
||||
x, residual, self.rms_norm_weight, self.bias, self.eps)
|
||||
x, residual, self.rms_norm_weight, self.bias, self.eps
|
||||
)
|
||||
|
||||
norm_output_with_bias = torch.ops.vllm.maybe_all_gather_and_maybe_unpad(
|
||||
norm_output_with_bias, True)
|
||||
norm_output_with_bias = torch.ops.vllm.maybe_all_gather_and_maybe_unpad(norm_output_with_bias, True)
|
||||
|
||||
quantized_output = torch.ops.vllm.quantize(norm_output_with_bias,
|
||||
self.quant_scale,
|
||||
self.quant_scale_reciprocal,
|
||||
self.quant_offset)
|
||||
quantized_output = torch.ops.vllm.quantize(
|
||||
norm_output_with_bias, self.quant_scale, self.quant_scale_reciprocal, self.quant_offset
|
||||
)
|
||||
|
||||
return quantized_output, new_residual
|
||||
|
||||
def ops_in_model_before(self) -> List[OpOverload]:
|
||||
def ops_in_model_before(self) -> list[OpOverload]:
|
||||
"""Return the list of expected operators BEFORE fusion."""
|
||||
return [
|
||||
torch.ops._C_ascend.npu_add_rms_norm_bias.default,
|
||||
torch.ops.vllm.maybe_all_gather_and_maybe_unpad.default,
|
||||
torch.ops.vllm.quantize.default
|
||||
torch.ops.vllm.quantize.default,
|
||||
]
|
||||
|
||||
def ops_in_model_after(self) -> List[OpOverload]:
|
||||
def ops_in_model_after(self) -> list[OpOverload]:
|
||||
"""Return the list of expected operators AFTER successful fusion."""
|
||||
return [
|
||||
torch.ops.npu.npu_add_rms_norm_quant.default,
|
||||
torch.ops.vllm.maybe_all_gather_and_maybe_unpad.default
|
||||
]
|
||||
return [torch.ops.npu.npu_add_rms_norm_quant.default, torch.ops.vllm.maybe_all_gather_and_maybe_unpad.default]
|
||||
|
||||
|
||||
@pytest.mark.parametrize("dtype", [torch.bfloat16])
|
||||
@@ -317,58 +261,42 @@ def test_rmsnorm_quant_fusion(
|
||||
vllm_config = VllmConfig(model_config=ModelConfig(dtype=dtype))
|
||||
|
||||
with vllm.config.set_current_vllm_config(vllm_config):
|
||||
update_environment_variables({
|
||||
"RANK": "0",
|
||||
"LOCAL_RANK": "0",
|
||||
"WORLD_SIZE": "1",
|
||||
"MASTER_ADDR": "localhost",
|
||||
"MASTER_PORT": "12345",
|
||||
})
|
||||
update_environment_variables(
|
||||
{
|
||||
"RANK": "0",
|
||||
"LOCAL_RANK": "0",
|
||||
"WORLD_SIZE": "1",
|
||||
"MASTER_ADDR": "localhost",
|
||||
"MASTER_PORT": "12345",
|
||||
}
|
||||
)
|
||||
init_distributed_environment()
|
||||
ensure_model_parallel_initialized(1, 1)
|
||||
|
||||
with vllm.config.set_current_vllm_config(vllm_config):
|
||||
with set_ascend_forward_context(None, vllm_config):
|
||||
backend = get_or_create_backend(vllm_config)
|
||||
if use_bias:
|
||||
if not enable_custom_op():
|
||||
return
|
||||
if sp_enable:
|
||||
model = TestModelSPWithBias(hidden_size,
|
||||
dtype,
|
||||
eps,
|
||||
device="npu")
|
||||
else:
|
||||
model = TestModelWithBias(hidden_size,
|
||||
dtype,
|
||||
eps,
|
||||
device="npu")
|
||||
with vllm.config.set_current_vllm_config(vllm_config), set_ascend_forward_context(None, vllm_config):
|
||||
backend = get_or_create_backend(vllm_config)
|
||||
if use_bias:
|
||||
if not enable_custom_op():
|
||||
return
|
||||
if sp_enable:
|
||||
model = TestModelSPWithBias(hidden_size, dtype, eps, device="npu")
|
||||
else:
|
||||
if sp_enable:
|
||||
model = TestModelSPWithoutBias(hidden_size,
|
||||
dtype,
|
||||
eps,
|
||||
device="npu")
|
||||
else:
|
||||
model = TestModelWithoutBias(hidden_size,
|
||||
dtype,
|
||||
eps,
|
||||
device="npu")
|
||||
model = model.to("npu")
|
||||
model = TestModelWithBias(hidden_size, dtype, eps, device="npu")
|
||||
else:
|
||||
if sp_enable:
|
||||
model = TestModelSPWithoutBias(hidden_size, dtype, eps, device="npu")
|
||||
else:
|
||||
model = TestModelWithoutBias(hidden_size, dtype, eps, device="npu")
|
||||
model = model.to("npu")
|
||||
|
||||
x = torch.rand(num_tokens,
|
||||
hidden_size,
|
||||
device="npu",
|
||||
dtype=dtype,
|
||||
requires_grad=False)
|
||||
x = torch.rand(num_tokens, hidden_size, device="npu", dtype=dtype, requires_grad=False)
|
||||
|
||||
result_unfused = model(x)
|
||||
print("Unfused result:", [t.shape for t in result_unfused])
|
||||
model_fused = torch.compile(model, backend=backend)
|
||||
result_fused = model_fused(x)
|
||||
print("Fused result:", [t.shape for t in result_fused])
|
||||
result_unfused = model(x)
|
||||
print("Unfused result:", [t.shape for t in result_unfused])
|
||||
model_fused = torch.compile(model, backend=backend)
|
||||
result_fused = model_fused(x)
|
||||
print("Fused result:", [t.shape for t in result_fused])
|
||||
|
||||
print("=== Checking operator fusion ===")
|
||||
backend.check_before_ops(model.ops_in_model_before(),
|
||||
fully_replaced=not sp_enable)
|
||||
backend.check_after_ops(model.ops_in_model_after())
|
||||
print("=== Checking operator fusion ===")
|
||||
backend.check_before_ops(model.ops_in_model_before(), fully_replaced=not sp_enable)
|
||||
backend.check_after_ops(model.ops_in_model_after())
|
||||
|
||||
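For context (not part of the diff): the single-rank distributed setup these compile tests rely on, as it appears across the hunks above, boils down to the sketch below. The environment values and calls mirror the diff; packaging them into a helper named `init_single_rank` is an assumption.

```python
# Illustrative sketch of the single-process distributed setup used by the fusion tests.
import vllm.config
from vllm.config import ModelConfig, VllmConfig
from vllm.distributed import ensure_model_parallel_initialized, init_distributed_environment
from vllm.utils.system_utils import update_environment_variables


def init_single_rank(dtype) -> VllmConfig:
    vllm_config = VllmConfig(model_config=ModelConfig(dtype=dtype))
    with vllm.config.set_current_vllm_config(vllm_config):
        # Pretend to be rank 0 of a world of size 1 so the parallel groups initialize.
        update_environment_variables(
            {
                "RANK": "0",
                "LOCAL_RANK": "0",
                "WORLD_SIZE": "1",
                "MASTER_ADDR": "localhost",
                "MASTER_PORT": "12345",
            }
        )
        init_distributed_environment()
        ensure_model_parallel_initialized(1, 1)
    return vllm_config
```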
@@ -47,9 +47,9 @@ def test_qwen3_dense_eager_mode(
|
||||
|
||||
sampling_params = SamplingParams(max_tokens=max_tokens, temperature=0.0)
|
||||
with VllmRunner(
|
||||
model,
|
||||
max_model_len=1024,
|
||||
enforce_eager=enforce_eager,
|
||||
model,
|
||||
max_model_len=1024,
|
||||
enforce_eager=enforce_eager,
|
||||
) as runner:
|
||||
runner.model.generate(prompts, sampling_params)
|
||||
|
||||
@@ -74,14 +74,14 @@ def test_egale_spec_decoding(
|
||||
|
||||
sampling_params = SamplingParams(max_tokens=max_tokens, temperature=0.0)
|
||||
with VllmRunner(
|
||||
model,
|
||||
max_model_len=1024,
|
||||
enforce_eager=enforce_eager,
|
||||
async_scheduling=True,
|
||||
speculative_config={
|
||||
"model": eagle_model,
|
||||
"method": "eagle",
|
||||
"num_speculative_tokens": 3,
|
||||
},
|
||||
model,
|
||||
max_model_len=1024,
|
||||
enforce_eager=enforce_eager,
|
||||
async_scheduling=True,
|
||||
speculative_config={
|
||||
"model": eagle_model,
|
||||
"method": "eagle",
|
||||
"num_speculative_tokens": 3,
|
||||
},
|
||||
) as runner:
|
||||
runner.model.generate(prompts, sampling_params)
|
||||
|
||||
@@ -15,20 +15,22 @@
|
||||
# limitations under the License.
|
||||
#
|
||||
|
||||
import pytest
|
||||
# ruff: noqa: E501
|
||||
|
||||
import os
|
||||
|
||||
from tests.e2e.singlecard.utils import (PROMPTS_LONG, PROMPTS_SHORT,
|
||||
LLMTestCase, gen_and_valid)
|
||||
import pytest
|
||||
|
||||
from tests.e2e.singlecard.utils import PROMPTS_LONG, PROMPTS_SHORT, LLMTestCase, gen_and_valid
|
||||
|
||||
CASE_QWEN_ACLGRAPH = LLMTestCase(
|
||||
model="Qwen/Qwen3-0.6B",
|
||||
prompts=PROMPTS_SHORT,
|
||||
golden_answers=[
|
||||
" Lina. I'm a 22-year-old student from China. I'm interested in studying in the US. I'm looking for a job in the",
|
||||
' the same as the president of the United Nations. This is because the president of the United States is the same as the president of the United Nations. The president',
|
||||
' Paris. The capital of France is also the capital of the Republic of France. The capital of France is also the capital of the European Union. The capital of',
|
||||
' not just a technological challenge but a profound transformation of how we live, work, and interact with the world. As we stand at the intersection of artificial intelligence and'
|
||||
" the same as the president of the United Nations. This is because the president of the United States is the same as the president of the United Nations. The president",
|
||||
" Paris. The capital of France is also the capital of the Republic of France. The capital of France is also the capital of the European Union. The capital of",
|
||||
" not just a technological challenge but a profound transformation of how we live, work, and interact with the world. As we stand at the intersection of artificial intelligence and",
|
||||
],
|
||||
)
|
||||
|
||||
@@ -37,10 +39,10 @@ CASE_DS_ACLGRAPH = LLMTestCase(
|
||||
quantization="ascend",
|
||||
prompts=PROMPTS_SHORT,
|
||||
golden_answers=[
|
||||
'\nI am a 20 year old female, and I have been suffering from depression for 3 years now. I have been on medication for 2',
|
||||
' a man who has been in the public eye for decades. He has been a senator, a governor, and a businessman. He has also been married to the',
|
||||
' Paris, which is also the largest city in the country. The city is located on the River Seine and is known for its beautiful architecture, museums, and art',
|
||||
' here, and it’s not what you think.\nThe future of AI is here, and it’s not what you think.\nThe future of'
|
||||
"\nI am a 20 year old female, and I have been suffering from depression for 3 years now. I have been on medication for 2",
|
||||
" a man who has been in the public eye for decades. He has been a senator, a governor, and a businessman. He has also been married to the",
|
||||
" Paris, which is also the largest city in the country. The city is located on the River Seine and is known for its beautiful architecture, museums, and art",
|
||||
" here, and it’s not what you think.\nThe future of AI is here, and it’s not what you think.\nThe future of",
|
||||
],
|
||||
)
|
||||
|
||||
@@ -49,9 +51,9 @@ CASE_QWEN_FULL = LLMTestCase(
|
||||
prompts=PROMPTS_SHORT,
|
||||
golden_answers=[
|
||||
" Lina. I'm a 22-year-old student from China. I'm interested in studying in the US. I'm looking for a job in the",
|
||||
' the same as the president of the United Nations. This is because the president of the United States is the same as the president of the United Nations. The president',
|
||||
' Paris. The capital of France is also the capital of the Republic of France. The capital of France is also the capital of the European Union. The capital of',
|
||||
' not just a technological challenge but a profound transformation of how we live, work, and interact with the world. As we stand at the intersection of artificial intelligence and'
|
||||
" the same as the president of the United Nations. This is because the president of the United States is the same as the president of the United Nations. The president",
|
||||
" Paris. The capital of France is also the capital of the Republic of France. The capital of France is also the capital of the European Union. The capital of",
|
||||
" not just a technological challenge but a profound transformation of how we live, work, and interact with the world. As we stand at the intersection of artificial intelligence and",
|
||||
],
|
||||
)
|
||||
|
||||
@@ -60,10 +62,10 @@ CASE_DS_FULL = LLMTestCase(
|
||||
quantization="ascend",
|
||||
prompts=PROMPTS_SHORT,
|
||||
golden_answers=[
|
||||
'\nI am a 20 year old female, and I have been suffering from depression for 3 years now. I have been on medication for 2',
|
||||
' a man who has been in the public eye for decades. He has been a senator, a governor, and a businessman. He has also been married to the',
|
||||
' Paris, which is also the largest city in the country. The city is located on the River Seine and is known for its beautiful architecture, museums, and art',
|
||||
' here, and it’s not what you think.\nThe future of AI is here, and it’s not what you think.\nThe future of'
|
||||
"\nI am a 20 year old female, and I have been suffering from depression for 3 years now. I have been on medication for 2",
|
||||
" a man who has been in the public eye for decades. He has been a senator, a governor, and a businessman. He has also been married to the",
|
||||
" Paris, which is also the largest city in the country. The city is located on the River Seine and is known for its beautiful architecture, museums, and art",
|
||||
" here, and it’s not what you think.\nThe future of AI is here, and it’s not what you think.\nThe future of",
|
||||
],
|
||||
)
|
||||
|
||||
@@ -71,10 +73,11 @@ CASE_QWEN_FULL_DECODE_ONLY = LLMTestCase(
|
||||
model="Qwen/Qwen3-0.6B",
|
||||
prompts=PROMPTS_LONG,
|
||||
golden_answers=[
|
||||
' \n\nTo solve this problem, we need to use the Law of Sines and Law of Cosines. Let me start by drawing triangle $ABC$ with the',
|
||||
" \n\nTo solve this problem, we need to use the Law of Sines and Law of Cosines. Let me start by drawing triangle $ABC$ with the",
|
||||
" \n\nTo solve this problem, we can use the fact that the expected value of the area of a triangle with vertices on a square can be calculated by integrating over",
|
||||
' \n\nTo solve this problem, we can use the following approach: Let $ \\alpha $ be the common real root of the two equations. Then, we can'
|
||||
])
|
||||
" \n\nTo solve this problem, we can use the following approach: Let $ \\alpha $ be the common real root of the two equations. Then, we can",
|
||||
],
|
||||
)
|
||||
|
||||
CASE_DS_FULL_DECODE_ONLY = LLMTestCase(
|
||||
model="vllm-ascend/DeepSeek-V2-Lite-W8A8",
|
||||
@@ -83,26 +86,31 @@ CASE_DS_FULL_DECODE_ONLY = LLMTestCase(
|
||||
golden_answers=[
|
||||
"\n\nSelect an assignment template",
|
||||
"\n\nI'm not sure how to approach this problem. I'm not sure if I should use the law of total probability or if I should use",
|
||||
"\n\n## Answer\n\n$a + b + c = 0$\n\nSolution\n\nLet $x$ be the common root of the equations"
|
||||
])
|
||||
"\n\n## Answer\n\n$a + b + c = 0$\n\nSolution\n\nLet $x$ be the common root of the equations",
|
||||
],
|
||||
)
|
||||
|
||||
CASE_QWEN_EX = LLMTestCase(
|
||||
model="Qwen/Qwen3-0.6B",
|
||||
prompts=PROMPTS_LONG,
|
||||
golden_answers=[
|
||||
' \n\nTo solve this problem, we need to use the Law of Sines and Law of Cosines. Let me start by drawing triangle $ABC$ with the',
|
||||
" \n\nTo solve this problem, we need to use the Law of Sines and Law of Cosines. Let me start by drawing triangle $ABC$ with the",
|
||||
" \n\nTo solve this problem, we can use the fact that the expected value of the area of a triangle with vertices on a square can be calculated by integrating over",
|
||||
' \n\nTo solve this problem, we can use the following approach: Let $ \\alpha $ be the common real root of the two equations. Then, we can'
|
||||
])
|
||||
" \n\nTo solve this problem, we can use the following approach: Let $ \\alpha $ be the common real root of the two equations. Then, we can",
|
||||
],
|
||||
)
|
||||
|
||||
CASE_DS_EX = LLMTestCase(
|
||||
model="vllm-ascend/DeepSeek-V2-Lite-W8A8",
|
||||
quantization="ascend",
|
||||
prompts=PROMPTS_LONG,
|
||||
golden_answers=[
|
||||
"\n\nSelect an assignment template",
|
||||
"\n\nI'm not sure how to approach this problem. I'm not sure if I should use the law of total probability or if I should use",
|
||||
"\n\n## Answer\n\n$a + b + c = 0$\n\nSolution\n\nLet $x$ be the common root of the equations",
|
||||
],
|
||||
)
|
||||
|
||||
CASE_DS_EX = LLMTestCase(model="vllm-ascend/DeepSeek-V2-Lite-W8A8",
|
||||
quantization="ascend",
|
||||
prompts=PROMPTS_LONG,
|
||||
golden_answers=[
|
||||
"\n\nSelect an assignment template",
|
||||
"\n\nI'm not sure how to approach this problem. I'm not sure if I should use the law of total probability or if I should use",
|
||||
"\n\n## Answer\n\n$a + b + c = 0$\n\nSolution\n\nLet $x$ be the common root of the equations"
|
||||
])
|
||||
|
||||
@pytest.mark.parametrize("cur_case", [CASE_QWEN_ACLGRAPH, CASE_DS_ACLGRAPH])
|
||||
def test_piecewise_res_consistency(cur_case: LLMTestCase):
|
||||
@@ -112,51 +120,48 @@ def test_piecewise_res_consistency(cur_case: LLMTestCase):
|
||||
"cudagraph_capture_sizes": [1, 2, 4, 8],
|
||||
"quantization": cur_case.quantization,
|
||||
}
|
||||
gen_and_valid(runner_kwargs=runner_kwargs,
|
||||
prompts=cur_case.prompts,
|
||||
sampling_params=cur_case.sampling_params,
|
||||
golden_answers=cur_case.golden_answers)
|
||||
gen_and_valid(
|
||||
runner_kwargs=runner_kwargs,
|
||||
prompts=cur_case.prompts,
|
||||
sampling_params=cur_case.sampling_params,
|
||||
golden_answers=cur_case.golden_answers,
|
||||
)
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"cur_case", [CASE_QWEN_FULL, CASE_DS_FULL])
|
||||
|
||||
@pytest.mark.parametrize("cur_case", [CASE_QWEN_FULL, CASE_DS_FULL])
|
||||
def test_full_res_consistency(cur_case: LLMTestCase, monkeypatch):
|
||||
monkeypatch.delenv("HCCL_OP_EXPANSION_MODE", raising=False)
|
||||
runner_kwargs = {
|
||||
"model_name": cur_case.model,
|
||||
"max_model_len": 1024,
|
||||
"compilation_config": {
|
||||
"cudagraph_capture_sizes": [4, 8, 32, 64],
|
||||
"cudagraph_mode": "FULL_DECODE_ONLY"
|
||||
},
|
||||
"compilation_config": {"cudagraph_capture_sizes": [4, 8, 32, 64], "cudagraph_mode": "FULL_DECODE_ONLY"},
|
||||
"quantization": cur_case.quantization,
|
||||
}
|
||||
gen_and_valid(runner_kwargs=runner_kwargs,
|
||||
prompts=cur_case.prompts,
|
||||
sampling_params=cur_case.sampling_params,
|
||||
golden_answers=cur_case.golden_answers)
|
||||
gen_and_valid(
|
||||
runner_kwargs=runner_kwargs,
|
||||
prompts=cur_case.prompts,
|
||||
sampling_params=cur_case.sampling_params,
|
||||
golden_answers=cur_case.golden_answers,
|
||||
)
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"cur_case", [CASE_QWEN_FULL_DECODE_ONLY, CASE_DS_FULL_DECODE_ONLY])
|
||||
|
||||
@pytest.mark.parametrize("cur_case", [CASE_QWEN_FULL_DECODE_ONLY, CASE_DS_FULL_DECODE_ONLY])
|
||||
def test_full_decode_only_res_consistency(cur_case: LLMTestCase, monkeypatch):
|
||||
monkeypatch.delenv("HCCL_OP_EXPANSION_MODE", raising=False)
|
||||
runner_kwargs = {
|
||||
"model_name": cur_case.model,
|
||||
"max_model_len": 1024,
|
||||
"compilation_config": {
|
||||
"cudagraph_capture_sizes": [4, 8, 32, 64],
|
||||
"cudagraph_mode": "FULL_DECODE_ONLY"
|
||||
},
|
||||
"compilation_config": {"cudagraph_capture_sizes": [4, 8, 32, 64], "cudagraph_mode": "FULL_DECODE_ONLY"},
|
||||
"quantization": cur_case.quantization,
|
||||
"additional_config": {
|
||||
"npugraph_ex_config": {
|
||||
"enable": False
|
||||
}
|
||||
},
|
||||
"additional_config": {"npugraph_ex_config": {"enable": False}},
|
||||
}
|
||||
gen_and_valid(runner_kwargs=runner_kwargs,
|
||||
prompts=cur_case.prompts,
|
||||
sampling_params=cur_case.sampling_params,
|
||||
golden_answers=cur_case.golden_answers)
|
||||
gen_and_valid(
|
||||
runner_kwargs=runner_kwargs,
|
||||
prompts=cur_case.prompts,
|
||||
sampling_params=cur_case.sampling_params,
|
||||
golden_answers=cur_case.golden_answers,
|
||||
)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("cur_case", [CASE_QWEN_EX, CASE_DS_EX])
|
||||
def test_npugraph_ex_res_consistency(cur_case: LLMTestCase, monkeypatch):
|
||||
@@ -165,20 +170,16 @@ def test_npugraph_ex_res_consistency(cur_case: LLMTestCase, monkeypatch):
|
||||
"model_name": cur_case.model,
|
||||
"quantization": cur_case.quantization,
|
||||
"max_model_len": 1024,
|
||||
"compilation_config": {
|
||||
"cudagraph_capture_sizes": [4, 8, 32, 64],
|
||||
"cudagraph_mode": "FULL_DECODE_ONLY"
|
||||
},
|
||||
"additional_config": {
|
||||
"npugraph_ex_config": {
|
||||
"enable": True
|
||||
}
|
||||
},
|
||||
"compilation_config": {"cudagraph_capture_sizes": [4, 8, 32, 64], "cudagraph_mode": "FULL_DECODE_ONLY"},
|
||||
"additional_config": {"npugraph_ex_config": {"enable": True}},
|
||||
}
|
||||
gen_and_valid(runner_kwargs=runner_kwargs,
|
||||
prompts=cur_case.prompts,
|
||||
sampling_params=cur_case.sampling_params,
|
||||
golden_answers=cur_case.golden_answers)
|
||||
gen_and_valid(
|
||||
runner_kwargs=runner_kwargs,
|
||||
prompts=cur_case.prompts,
|
||||
sampling_params=cur_case.sampling_params,
|
||||
golden_answers=cur_case.golden_answers,
|
||||
)
|
||||
|
||||
|
||||
# The accuracy has already been verified in the previous test case.
|
||||
# This test case is used to check whether the functionality works properly
|
||||
@@ -190,10 +191,7 @@ def test_npugraph_ex_with_static_kernel(cur_case: LLMTestCase, monkeypatch):
|
||||
"model_name": cur_case.model,
|
||||
"quantization": cur_case.quantization,
|
||||
"max_model_len": 1024,
|
||||
"compilation_config": {
|
||||
"cudagraph_capture_sizes": [4, 8],
|
||||
"cudagraph_mode": "FULL_DECODE_ONLY"
|
||||
},
|
||||
"compilation_config": {"cudagraph_capture_sizes": [4, 8], "cudagraph_mode": "FULL_DECODE_ONLY"},
|
||||
"additional_config": {
|
||||
"npugraph_ex_config": {
|
||||
"enable": True,
|
||||
@@ -201,12 +199,14 @@ def test_npugraph_ex_with_static_kernel(cur_case: LLMTestCase, monkeypatch):
|
||||
}
|
||||
},
|
||||
}
|
||||
gen_and_valid(runner_kwargs=runner_kwargs,
|
||||
prompts=cur_case.prompts,
|
||||
sampling_params=cur_case.sampling_params,
|
||||
golden_answers=cur_case.golden_answers)
|
||||
gen_and_valid(
|
||||
runner_kwargs=runner_kwargs,
|
||||
prompts=cur_case.prompts,
|
||||
sampling_params=cur_case.sampling_params,
|
||||
golden_answers=cur_case.golden_answers,
|
||||
)
|
||||
|
||||
# Check whether the static kernel is properly uninstalled
|
||||
ascend_home_path = os.environ["ASCEND_HOME_PATH"]
|
||||
static_kernel_install_path = os.path.join(ascend_home_path, 'opp/static_kernel/ai_core')
|
||||
static_kernel_install_path = os.path.join(ascend_home_path, "opp/static_kernel/ai_core")
|
||||
assert not os.path.exists(static_kernel_install_path)
|
||||
|
||||
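For context (not part of the diff): the consistency tests above all share the same call shape, condensed in the sketch below. The helper name `run_consistency_case` is hypothetical; the keyword arguments come straight from the hunks.

```python
# Illustrative only: the runner_kwargs / gen_and_valid pattern used by the
# graph-mode consistency tests.
from tests.e2e.singlecard.utils import LLMTestCase, gen_and_valid


def run_consistency_case(cur_case: LLMTestCase, cudagraph_mode: str) -> None:
    runner_kwargs = {
        "model_name": cur_case.model,
        "max_model_len": 1024,
        "compilation_config": {"cudagraph_capture_sizes": [4, 8, 32, 64], "cudagraph_mode": cudagraph_mode},
        "quantization": cur_case.quantization,
    }
    # Generate with the given graph mode and compare against the golden answers.
    gen_and_valid(
        runner_kwargs=runner_kwargs,
        prompts=cur_case.prompts,
        sampling_params=cur_case.sampling_params,
        golden_answers=cur_case.golden_answers,
    )
```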
@@ -22,6 +22,7 @@ import random
|
||||
import pytest
|
||||
import torch
|
||||
from vllm import SamplingParams
|
||||
|
||||
from tests.e2e.conftest import VllmRunner
|
||||
|
||||
DEFAULT_MODEL = "Qwen/Qwen3-0.6B"
|
||||
@@ -69,9 +70,7 @@ def _random_prompt(min_words: int = 1024, max_words: int = 1024 * 2) -> str:
|
||||
|
||||
if target_words > 50:
|
||||
# For longer prompts, repeat context
|
||||
padding_text = (
|
||||
" This is an interesting topic that deserves more explanation. " *
|
||||
(target_words // 50))
|
||||
padding_text = " This is an interesting topic that deserves more explanation. " * (target_words // 50)
|
||||
base_prompt = base_prompt + padding_text
|
||||
|
||||
return base_prompt
|
||||
@@ -107,8 +106,7 @@ def _extract_step_logprobs(generate_output):
|
||||
|
||||
|
||||
@pytest.mark.timeout(1000)
|
||||
def test_aclgraph_v1_generation_is_deterministic_across_batch_sizes_with_needle(
|
||||
monkeypatch: pytest.MonkeyPatch):
|
||||
def test_aclgraph_v1_generation_is_deterministic_across_batch_sizes_with_needle(monkeypatch: pytest.MonkeyPatch):
|
||||
"""
|
||||
Ensures that the same request (the 'needle' prompt) yields identical output
|
||||
whether run alone (bs=1) or mixed into a larger batch (e.g., bs=64),
|
||||
@@ -162,20 +160,16 @@ def test_aclgraph_v1_generation_is_deterministic_across_batch_sizes_with_needle(
|
||||
needle_prompt = "There once was a "
|
||||
|
||||
with VllmRunner(
|
||||
model_name=model,
|
||||
max_num_seqs=max_batch_size,
|
||||
gpu_memory_utilization=gpu_mem_util,
|
||||
max_model_len=max_model_len,
|
||||
dtype="bfloat16",
|
||||
tensor_parallel_size=int(os.getenv("VLLM_TP_SIZE", "1")),
|
||||
enable_prefix_caching=False,
|
||||
distributed_executor_backend="mp",
|
||||
compilation_config={
|
||||
"cudagraph_mode": "FULL_DECODE_ONLY",
|
||||
"cudagraph_capture_sizes": [1, 32, 64]
|
||||
}
|
||||
model_name=model,
|
||||
max_num_seqs=max_batch_size,
|
||||
gpu_memory_utilization=gpu_mem_util,
|
||||
max_model_len=max_model_len,
|
||||
dtype="bfloat16",
|
||||
tensor_parallel_size=int(os.getenv("VLLM_TP_SIZE", "1")),
|
||||
enable_prefix_caching=False,
|
||||
distributed_executor_backend="mp",
|
||||
compilation_config={"cudagraph_mode": "FULL_DECODE_ONLY", "cudagraph_capture_sizes": [1, 32, 64]},
|
||||
) as vllm_model:
|
||||
|
||||
# Baseline generation for the needle prompt alone.
|
||||
baseline_out = vllm_model.generate([needle_prompt], sampling)
|
||||
assert len(baseline_out) == 1
|
||||
@@ -194,8 +188,7 @@ def test_aclgraph_v1_generation_is_deterministic_across_batch_sizes_with_needle(
|
||||
if i == needle_pos:
|
||||
prompts.append(needle_prompt)
|
||||
else:
|
||||
prompts.append(
|
||||
_random_prompt(min_random_prompt, max_random_prompt))
|
||||
prompts.append(_random_prompt(min_random_prompt, max_random_prompt))
|
||||
|
||||
# Generate with the larger-batch engine
|
||||
outputs = vllm_model.generate(prompts, sampling)
|
||||
@@ -204,24 +197,23 @@ def test_aclgraph_v1_generation_is_deterministic_across_batch_sizes_with_needle(
|
||||
text = needle_output[0]
|
||||
|
||||
if text != baseline_text:
|
||||
print(
|
||||
f"{text}\n\n== Not the same as ==\n\n{baseline_text}\n\n")
|
||||
print(f"{text}\n\n== Not the same as ==\n\n{baseline_text}\n\n")
|
||||
mismatches += 1
|
||||
|
||||
passes = num_trials - mismatches
|
||||
# Dump how many passed vs failed
|
||||
print(f"[determinism] total={num_trials}, passed={passes}, "
|
||||
f"failed={mismatches}, max_batch_size={max_batch_size}")
|
||||
print(
|
||||
f"[determinism] total={num_trials}, passed={passes}, failed={mismatches}, max_batch_size={max_batch_size}"
|
||||
)
|
||||
|
||||
if mismatches > 0:
|
||||
pytest.fail(
|
||||
f"Nondeterministic outputs detected: {mismatches} failed out "
|
||||
f"of {num_trials} trials (max_batch_size={max_batch_size}).")
|
||||
f"of {num_trials} trials (max_batch_size={max_batch_size})."
|
||||
)
|
||||
|
||||
|
||||
|
||||
def test_aclgraph_logprobs_bitwise_batch_invariance_bs1_vs_bsN(
|
||||
monkeypatch: pytest.MonkeyPatch):
|
||||
def test_aclgraph_logprobs_bitwise_batch_invariance_bs1_vs_bsN(monkeypatch: pytest.MonkeyPatch):
|
||||
seed = int(os.getenv("VLLM_TEST_SEED", "12345"))
|
||||
random.seed(seed)
|
||||
model_name = DEFAULT_MODEL
|
||||
@@ -235,24 +227,19 @@ def test_aclgraph_logprobs_bitwise_batch_invariance_bs1_vs_bsN(
|
||||
|
||||
if disable_custom_ar:
|
||||
print(f"\n{'=' * 80}")
|
||||
print(
|
||||
f"BATCH INVARIANCE MODE: Disabling custom all-reduce (TP={tp_size})"
|
||||
)
|
||||
print(f"BATCH INVARIANCE MODE: Disabling custom all-reduce (TP={tp_size})")
|
||||
print(f"{'=' * 80}\n")
|
||||
|
||||
with VllmRunner(
|
||||
model_name=model_name,
|
||||
tensor_parallel_size=tp_size,
|
||||
enable_prefix_caching=False,
|
||||
max_num_seqs=32,
|
||||
max_model_len=8192,
|
||||
dtype="bfloat16",
|
||||
gpu_memory_utilization=0.9,
|
||||
distributed_executor_backend="mp",
|
||||
compilation_config={
|
||||
"cudagraph_mode": "FULL_DECODE_ONLY",
|
||||
"cudagraph_capture_sizes": [1, 32, 64]
|
||||
}
|
||||
model_name=model_name,
|
||||
tensor_parallel_size=tp_size,
|
||||
enable_prefix_caching=False,
|
||||
max_num_seqs=32,
|
||||
max_model_len=8192,
|
||||
dtype="bfloat16",
|
||||
gpu_memory_utilization=0.9,
|
||||
distributed_executor_backend="mp",
|
||||
compilation_config={"cudagraph_mode": "FULL_DECODE_ONLY", "cudagraph_capture_sizes": [1, 32, 64]},
|
||||
) as vllm_model:
|
||||
# Use more realistic prompts for better token generation
|
||||
prompts = [_random_prompt(10, 50) for i in range(32)]
|
||||
@@ -273,16 +260,13 @@ def test_aclgraph_logprobs_bitwise_batch_invariance_bs1_vs_bsN(
|
||||
bs1_logprobs_per_prompt = []
|
||||
bs1_tokens_per_prompt = []
|
||||
for idx, p in enumerate(prompts):
|
||||
print(
|
||||
f"\n[BS=1] Running prompt {idx}/{len(prompts)} - Preview: {p[:80]}..."
|
||||
)
|
||||
print(f"\n[BS=1] Running prompt {idx}/{len(prompts)} - Preview: {p[:80]}...")
|
||||
outs = vllm_model.generate_w_logprobs([p], sp, use_tqdm=False)
|
||||
assert len(outs) == 1
|
||||
# print(outs)
|
||||
step_logprobs, token_ids = _extract_step_logprobs(outs[0])
|
||||
if step_logprobs is None:
|
||||
pytest.skip("Logits are not available on RequestOutput; "
|
||||
"enable logprobs return to run this test.")
|
||||
pytest.skip("Logits are not available on RequestOutput; enable logprobs return to run this test.")
|
||||
bs1_logprobs_per_prompt.append(step_logprobs)
|
||||
bs1_tokens_per_prompt.append(token_ids)
|
||||
print(f"[BS=1] Prompt {idx} generated tokens: {token_ids}")
|
||||
@@ -304,108 +288,91 @@ def test_aclgraph_logprobs_bitwise_batch_invariance_bs1_vs_bsN(
|
||||
print(f"[BS={len(prompts)}] Prompt {idx} generated tokens: {tokens}")
|
||||
step_logprobs, token_ids = _extract_step_logprobs(o)
|
||||
if step_logprobs is None:
|
||||
pytest.skip("Logits are not available on RequestOutput; "
|
||||
"enable logprobs return to run this test.")
|
||||
pytest.skip("Logits are not available on RequestOutput; enable logprobs return to run this test.")
|
||||
bsN_logprobs_per_prompt.append(step_logprobs)
|
||||
bsN_tokens_per_prompt.append(token_ids)
|
||||
|
||||
# Compare step-by-step logprobs for each prompt between BS=1 and BS=N runs.
|
||||
failed_prompts = []
|
||||
for i, (logprobs_bs1, logprobs_bsN, tokens_bs1, tokens_bsN) in enumerate(
|
||||
zip(
|
||||
bs1_logprobs_per_prompt,
|
||||
bsN_logprobs_per_prompt,
|
||||
bs1_tokens_per_prompt,
|
||||
bsN_tokens_per_prompt,
|
||||
)):
|
||||
zip(
|
||||
bs1_logprobs_per_prompt,
|
||||
bsN_logprobs_per_prompt,
|
||||
bs1_tokens_per_prompt,
|
||||
bsN_tokens_per_prompt,
|
||||
)
|
||||
):
|
||||
if len(logprobs_bs1) != len(logprobs_bsN):
|
||||
reason = (f"Different number of steps: {len(logprobs_bs1)} (BS=1) "
|
||||
f"vs {len(logprobs_bsN)} (BS=N)")
|
||||
failed_prompts.append({
|
||||
"prompt_idx": i,
|
||||
"step": "all",
|
||||
"reason": reason,
|
||||
"prompt_preview": prompts[i][:100],
|
||||
"bs1_tokens": tokens_bs1,
|
||||
"bsN_tokens": tokens_bsN,
|
||||
})
|
||||
reason = f"Different number of steps: {len(logprobs_bs1)} (BS=1) vs {len(logprobs_bsN)} (BS=N)"
|
||||
failed_prompts.append(
|
||||
{
|
||||
"prompt_idx": i,
|
||||
"step": "all",
|
||||
"reason": reason,
|
||||
"prompt_preview": prompts[i][:100],
|
||||
"bs1_tokens": tokens_bs1,
|
||||
"bsN_tokens": tokens_bsN,
|
||||
}
|
||||
)
|
||||
continue
|
||||
|
||||
# Check if tokens match first
|
||||
if tokens_bs1 != tokens_bsN:
|
||||
failed_prompts.append({
|
||||
"prompt_idx":
|
||||
i,
|
||||
"step":
|
||||
"sampling",
|
||||
"reason":
|
||||
"Different tokens sampled",
|
||||
"prompt_preview":
|
||||
prompts[i][:100],
|
||||
"bs1_tokens":
|
||||
tokens_bs1,
|
||||
"bsN_tokens":
|
||||
tokens_bsN,
|
||||
"bs1_all_logprobs":
|
||||
[logprobs_bs1[s].tolist() for s in range(len(logprobs_bs1))],
|
||||
"bsN_all_logprobs":
|
||||
[logprobs_bsN[s].tolist() for s in range(len(logprobs_bsN))],
|
||||
})
|
||||
failed_prompts.append(
|
||||
{
|
||||
"prompt_idx": i,
|
||||
"step": "sampling",
|
||||
"reason": "Different tokens sampled",
|
||||
"prompt_preview": prompts[i][:100],
|
||||
"bs1_tokens": tokens_bs1,
|
||||
"bsN_tokens": tokens_bsN,
|
||||
"bs1_all_logprobs": [logprobs_bs1[s].tolist() for s in range(len(logprobs_bs1))],
|
||||
"bsN_all_logprobs": [logprobs_bsN[s].tolist() for s in range(len(logprobs_bsN))],
|
||||
}
|
||||
)
|
||||
continue
|
||||
|
||||
for t, (a, b) in enumerate(zip(logprobs_bs1, logprobs_bsN)):
|
||||
if a.shape != b.shape:
|
||||
failed_prompts.append({
|
||||
"prompt_idx": i,
|
||||
"step": t,
|
||||
"reason": f"Shape mismatch: {a.shape} vs {b.shape}",
|
||||
"prompt_preview": prompts[i][:100],
|
||||
"bs1_tokens": tokens_bs1,
|
||||
"bsN_tokens": tokens_bsN,
|
||||
})
|
||||
failed_prompts.append(
|
||||
{
|
||||
"prompt_idx": i,
|
||||
"step": t,
|
||||
"reason": f"Shape mismatch: {a.shape} vs {b.shape}",
|
||||
"prompt_preview": prompts[i][:100],
|
||||
"bs1_tokens": tokens_bs1,
|
||||
"bsN_tokens": tokens_bsN,
|
||||
}
|
||||
)
|
||||
break
|
||||
|
||||
if not torch.equal(a, b):
|
||||
max_diff = torch.abs(a - b).max().item()
|
||||
# Print which token failed
|
||||
print(
|
||||
f"\n[DIVERGENCE] Prompt {i}, Token {t}: max_diff={max_diff:.6e}"
|
||||
)
|
||||
print(f"\n[DIVERGENCE] Prompt {i}, Token {t}: max_diff={max_diff:.6e}")
|
||||
bs1_tok = tokens_bs1[t] if t < len(tokens_bs1) else "N/A"
|
||||
bsN_tok = tokens_bsN[t] if t < len(tokens_bsN) else "N/A"
|
||||
print(f" Token IDs: bs1={bs1_tok}, bsN={bsN_tok}")
|
||||
print(f" BS=1 logprob: {a.tolist()}")
|
||||
print(f" BS=N logprob: {b.tolist()}")
|
||||
failed_prompts.append({
|
||||
"prompt_idx":
|
||||
i,
|
||||
"step":
|
||||
t,
|
||||
"reason":
|
||||
f"Bitwise mismatch (max_diff={max_diff:.6e})",
|
||||
"prompt_preview":
|
||||
prompts[i][:100],
|
||||
"bs1_tokens":
|
||||
tokens_bs1,
|
||||
"bsN_tokens":
|
||||
tokens_bsN,
|
||||
"bs1_all_logprobs": [
|
||||
logprobs_bs1[s].tolist()
|
||||
for s in range(len(logprobs_bs1))
|
||||
],
|
||||
"bsN_all_logprobs": [
|
||||
logprobs_bsN[s].tolist()
|
||||
for s in range(len(logprobs_bsN))
|
||||
],
|
||||
})
|
||||
failed_prompts.append(
|
||||
{
|
||||
"prompt_idx": i,
|
||||
"step": t,
|
||||
"reason": f"Bitwise mismatch (max_diff={max_diff:.6e})",
|
||||
"prompt_preview": prompts[i][:100],
|
||||
"bs1_tokens": tokens_bs1,
|
||||
"bsN_tokens": tokens_bsN,
|
||||
"bs1_all_logprobs": [logprobs_bs1[s].tolist() for s in range(len(logprobs_bs1))],
|
||||
"bsN_all_logprobs": [logprobs_bsN[s].tolist() for s in range(len(logprobs_bsN))],
|
||||
}
|
||||
)
|
||||
break
|
||||
|
||||
|
||||
# Print summary of all failures
|
||||
if failed_prompts:
|
||||
print(f"\n{'=' * 80}")
|
||||
fail_msg = (f"BATCH INVARIANCE FAILURES: {len(failed_prompts)}/"
|
||||
f"{len(prompts)} prompts failed")
|
||||
fail_msg = f"BATCH INVARIANCE FAILURES: {len(failed_prompts)}/{len(prompts)} prompts failed"
|
||||
print(fail_msg)
|
||||
print(f"{'=' * 80}")
|
||||
for fail in failed_prompts:
|
||||
@@ -420,21 +387,18 @@ def test_aclgraph_logprobs_bitwise_batch_invariance_bs1_vs_bsN(
|
||||
print(f" BS=N tokens: {fail['bsN_tokens']}")
|
||||
|
||||
if "bs1_all_logprobs" in fail:
|
||||
print(
|
||||
f" BS=1 logprobs for all {len(fail['bs1_all_logprobs'])} steps:"
|
||||
)
|
||||
print(f" BS=1 logprobs for all {len(fail['bs1_all_logprobs'])} steps:")
|
||||
for step_idx, logprobs in enumerate(fail["bs1_all_logprobs"]):
|
||||
print(f" Step {step_idx}: {logprobs}")
|
||||
print(
|
||||
f" BS=N logprobs for all {len(fail['bsN_all_logprobs'])} steps:"
|
||||
)
|
||||
print(f" BS=N logprobs for all {len(fail['bsN_all_logprobs'])} steps:")
|
||||
for step_idx, logprobs in enumerate(fail["bsN_all_logprobs"]):
|
||||
print(f" Step {step_idx}: {logprobs}")
|
||||
print(f"{'=' * 80}\n")
|
||||
|
||||
# Fail the test with summary
|
||||
msg = (f"Batch invariance violated in {len(failed_prompts)}/"
|
||||
f"{len(prompts)} prompts. See output above for details.")
|
||||
msg = (
|
||||
f"Batch invariance violated in {len(failed_prompts)}/{len(prompts)} prompts. See output above for details."
|
||||
)
|
||||
pytest.fail(msg)
|
||||
|
||||
|
||||
@@ -446,18 +410,15 @@ def test_aclgraph_simple_generation(monkeypatch: pytest.MonkeyPatch):
|
||||
model = DEFAULT_MODEL
|
||||
|
||||
with VllmRunner(
|
||||
model_name=model,
|
||||
max_num_seqs=1,
|
||||
tensor_parallel_size=int(os.getenv("VLLM_TP_SIZE", "1")),
|
||||
gpu_memory_utilization=0.9,
|
||||
max_model_len=2048,
|
||||
dtype="float16",
|
||||
enable_prefix_caching=False,
|
||||
compilation_config={
|
||||
"cudagraph_mode": "FULL_DECODE_ONLY",
|
||||
"cudagraph_capture_sizes": [1, 32, 64]
|
||||
},
|
||||
distributed_executor_backend="mp",
|
||||
model_name=model,
|
||||
max_num_seqs=1,
|
||||
tensor_parallel_size=int(os.getenv("VLLM_TP_SIZE", "1")),
|
||||
gpu_memory_utilization=0.9,
|
||||
max_model_len=2048,
|
||||
dtype="float16",
|
||||
enable_prefix_caching=False,
|
||||
compilation_config={"cudagraph_mode": "FULL_DECODE_ONLY", "cudagraph_capture_sizes": [1, 32, 64]},
|
||||
distributed_executor_backend="mp",
|
||||
) as vllm_model:
|
||||
prompt = "The capital of France is"
|
||||
sampling_params = SamplingParams(
|
||||
@@ -479,11 +440,7 @@ def test_aclgraph_simple_generation(monkeypatch: pytest.MonkeyPatch):
|
||||
print(f"{'=' * 80}\n")
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
def test_aclgraph_logprobs_without_batch_invariance_should_fail(
|
||||
monkeypatch: pytest.MonkeyPatch):
|
||||
def test_aclgraph_logprobs_without_batch_invariance_should_fail(monkeypatch: pytest.MonkeyPatch):
|
||||
"""
|
||||
This test is the inverse of test_logprobs_bitwise_batch_invariance_bs1_vs_bsN.
|
||||
It DISABLES batch invariance mode and expects to see non-deterministic behavior
|
||||
@@ -505,19 +462,15 @@ def test_aclgraph_logprobs_without_batch_invariance_should_fail(
print(f"{'=' * 80}\n")

with VllmRunner(
model_name=model_name,
tensor_parallel_size=tp_size,
enable_prefix_caching=False,
max_num_seqs=32,
max_model_len=8192,
dtype="bfloat16",
compilation_config={
"cudagraph_mode": "FULL_DECODE_ONLY",
"cudagraph_capture_sizes": [1, 32, 64]
},
distributed_executor_backend="mp",
model_name=model_name,
tensor_parallel_size=tp_size,
enable_prefix_caching=False,
max_num_seqs=32,
max_model_len=8192,
dtype="bfloat16",
compilation_config={"cudagraph_mode": "FULL_DECODE_ONLY", "cudagraph_capture_sizes": [1, 32, 64]},
distributed_executor_backend="mp",
) as vllm_model:

# build ragged prompts to change shapes significantly across BS=1 vs BS=N
long_min = int(os.getenv("VLLM_MIN_PROMPT", "768"))
long_max = int(os.getenv("VLLM_MAX_PROMPT", "2048"))
@@ -549,16 +502,13 @@ def test_aclgraph_logprobs_without_batch_invariance_should_fail(
bs1_logprobs_per_prompt = []
bs1_tokens_per_prompt = []
for idx, p in enumerate(prompts):
print(
f"\n[BS=1] Running prompt {idx}/{len(prompts)} - Preview: {p[:80]}..."
)
print(f"\n[BS=1] Running prompt {idx}/{len(prompts)} - Preview: {p[:80]}...")
outs = vllm_model.generate_w_logprobs([p], sp, use_tqdm=False)

assert len(outs) == 1
step_logprobs, token_ids = _extract_step_logprobs(outs[0])
if step_logprobs is None:
pytest.skip("Logits are not available on RequestOutput; "
"enable logprobs return to run this test.")
pytest.skip("Logits are not available on RequestOutput; enable logprobs return to run this test.")
bs1_logprobs_per_prompt.append(step_logprobs)
bs1_tokens_per_prompt.append(token_ids)
print(f"[BS=1] Prompt {idx} generated tokens: {token_ids}")
@@ -579,84 +529,90 @@ def test_aclgraph_logprobs_without_batch_invariance_should_fail(
print(f"[BS={len(prompts)}] Prompt {idx} generated tokens: {tokens}")
step_logprobs, token_ids = _extract_step_logprobs(o)
if step_logprobs is None:
pytest.skip("Logits are not available on RequestOutput; "
"enable logprobs return to run this test.")
pytest.skip("Logits are not available on RequestOutput; enable logprobs return to run this test.")
bsN_logprobs_per_prompt.append(step_logprobs)
bsN_tokens_per_prompt.append(token_ids)

# Compare step-by-step logprobs for each prompt between BS=1 and BS=N runs.
differences_found = []
for i, (logprobs_bs1, logprobs_bsN, tokens_bs1, tokens_bsN) in enumerate(
zip(
bs1_logprobs_per_prompt,
bsN_logprobs_per_prompt,
bs1_tokens_per_prompt,
bsN_tokens_per_prompt,
)):
zip(
bs1_logprobs_per_prompt,
bsN_logprobs_per_prompt,
bs1_tokens_per_prompt,
bsN_tokens_per_prompt,
)
):
if len(logprobs_bs1) != len(logprobs_bsN):
reason = (f"Different number of steps: {len(logprobs_bs1)} (BS=1) "
f"vs {len(logprobs_bsN)} (BS=N)")
differences_found.append({
"prompt_idx": i,
"step": "all",
"reason": reason,
"prompt_preview": prompts[i][:100],
"bs1_tokens": tokens_bs1,
"bsN_tokens": tokens_bsN,
})
reason = f"Different number of steps: {len(logprobs_bs1)} (BS=1) vs {len(logprobs_bsN)} (BS=N)"
differences_found.append(
{
"prompt_idx": i,
"step": "all",
"reason": reason,
"prompt_preview": prompts[i][:100],
"bs1_tokens": tokens_bs1,
"bsN_tokens": tokens_bsN,
}
)
continue

# Check if tokens match first
if tokens_bs1 != tokens_bsN:
differences_found.append({
"prompt_idx": i,
"step": "sampling",
"reason": "Different tokens sampled",
"prompt_preview": prompts[i][:100],
"bs1_tokens": tokens_bs1,
"bsN_tokens": tokens_bsN,
})
differences_found.append(
{
"prompt_idx": i,
"step": "sampling",
"reason": "Different tokens sampled",
"prompt_preview": prompts[i][:100],
"bs1_tokens": tokens_bs1,
"bsN_tokens": tokens_bsN,
}
)
continue

for t, (a, b) in enumerate(zip(logprobs_bs1, logprobs_bsN)):
if a.shape != b.shape:
differences_found.append({
"prompt_idx": i,
"step": t,
"reason": f"Shape mismatch: {a.shape} vs {b.shape}",
"prompt_preview": prompts[i][:100],
"bs1_tokens": tokens_bs1,
"bsN_tokens": tokens_bsN,
})
differences_found.append(
{
"prompt_idx": i,
"step": t,
"reason": f"Shape mismatch: {a.shape} vs {b.shape}",
"prompt_preview": prompts[i][:100],
"bs1_tokens": tokens_bs1,
"bsN_tokens": tokens_bsN,
}
)
break

if not torch.equal(a, b):
max_diff = torch.abs(a - b).max().item()
print(f"\n[EXPECTED DIVERGENCE FOUND] Prompt {i}, "
f"Token {t}: max_diff={max_diff:.6e}")
print(f"\n[EXPECTED DIVERGENCE FOUND] Prompt {i}, Token {t}: max_diff={max_diff:.6e}")
bs1_tok = tokens_bs1[t] if t < len(tokens_bs1) else "N/A"
bsN_tok = tokens_bsN[t] if t < len(tokens_bsN) else "N/A"
print(f" Token IDs: bs1={bs1_tok}, bsN={bsN_tok}")
print(f" BS=1 logprob: {a.tolist()}")
print(f" BS=N logprob: {b.tolist()}")
differences_found.append({
"prompt_idx": i,
"step": t,
"reason": f"Bitwise mismatch (max_diff={max_diff:.6e})",
"prompt_preview": prompts[i][:100],
"bs1_tokens": tokens_bs1,
"bsN_tokens": tokens_bsN,
})
differences_found.append(
{
"prompt_idx": i,
"step": t,
"reason": f"Bitwise mismatch (max_diff={max_diff:.6e})",
"prompt_preview": prompts[i][:100],
"bs1_tokens": tokens_bs1,
"bsN_tokens": tokens_bsN,
}
)
break

# Print summary
print(f"\n{'=' * 80}")
if differences_found:
success_msg = (
f"✓ SUCCESS: Batch invariance is doing something! "
f"Found {len(differences_found)}/{len(prompts)} prompts "
f"with differences when batch invariance was DISABLED.")
f"with differences when batch invariance was DISABLED."
)
print(success_msg)
print(f"{'=' * 80}")
for diff in differences_found:
@@ -676,7 +632,8 @@ def test_aclgraph_logprobs_without_batch_invariance_should_fail(
f"✗ UNEXPECTED: All {len(prompts)} prompts matched "
f"between BS=1 and BS=N even with batch invariance DISABLED. "
f"This suggests batch invariance might not be necessary, "
f"or the test needs more sensitive prompts.")
f"or the test needs more sensitive prompts."
)
print(fail_msg)
print(f"{'=' * 80}\n")
pytest.fail(fail_msg)
@@ -40,7 +40,6 @@ def test_aclgraph_mem_use(model: str, max_tokens: int) -> None:
capture_mem_after = multiprocessing.Value("q", -1) # long long

def capture_model_wrapper(original_method):

def wrapped(self):
mem_before = torch.npu.mem_get_info()[0] # free memory
result = original_method(self)
@@ -55,19 +54,16 @@ def test_aclgraph_mem_use(model: str, max_tokens: int) -> None:

original_capture = NPUModelRunner.capture_model

with patch.object(NPUModelRunner,
'capture_model',
new=capture_model_wrapper(original_capture)):
with patch.object(NPUModelRunner, "capture_model", new=capture_model_wrapper(original_capture)):
prompts = [
"Hello, my name is", "The president of the United States is",
"The capital of France is", "The future of AI is"
"Hello, my name is",
"The president of the United States is",
"The capital of France is",
"The future of AI is",
]
sampling_params = SamplingParams(max_tokens=max_tokens,
temperature=0.0)
sampling_params = SamplingParams(max_tokens=max_tokens, temperature=0.0)
if model == "vllm-ascend/DeepSeek-V2-Lite-W8A8":
vllm_model = VllmRunner(model,
max_model_len=1024,
quantization="ascend")
vllm_model = VllmRunner(model, max_model_len=1024, quantization="ascend")
else:
vllm_model = VllmRunner(model)
_ = vllm_model.generate(prompts, sampling_params)
@@ -94,5 +90,6 @@ def test_aclgraph_mem_use(model: str, max_tokens: int) -> None:
assert mem_used_by_capture < max_mem_expected, (
f"capture_model used more memory than expected. "
f"Used: {mem_used_by_capture / (1024**3):.2f} GiB, "
f"Expected: < {max_capture_mem_gib:.2f} GiB")
os.environ["VLLM_WORKER_MULTIPROC_METHOD"] = 'spawn'
f"Expected: < {max_capture_mem_gib:.2f} GiB"
)
os.environ["VLLM_WORKER_MULTIPROC_METHOD"] = "spawn"
@@ -15,8 +15,7 @@ from tests.e2e.model_utils import check_outputs_equal
MODEL = "Qwen/Qwen3-0.6B"
MTP_MODEL = "wemaster/deepseek_mtp_main_random_bf16"

first_prompt = ("The following numbers of the sequence " +
", ".join(str(i) for i in range(10)) + " are:")
first_prompt = "The following numbers of the sequence " + ", ".join(str(i) for i in range(10)) + " are:"
example_prompts = [
"Hello, my name is",
"The president of the United States is",
@@ -31,7 +30,9 @@ default_params = dict(
)


def test_without_spec_decoding(monkeypatch: pytest.MonkeyPatch, ):
def test_without_spec_decoding(
monkeypatch: pytest.MonkeyPatch,
):
"""Test consistency of combos of async scheduling, preemption,
uni/multiproc executor, prefill chunking."""
test_sampling_params: list[dict[str, Any]] = [
@@ -85,11 +86,11 @@ def run_tests(
# avoid precision errors
outputs: list[tuple[str, list, list]] = []
for n, (
test_preemption,
executor,
async_scheduling,
spec_config,
test_prefill_chunking,
test_preemption,
executor,
async_scheduling,
spec_config,
test_prefill_chunking,
) in enumerate(test_configs, 1):
test_str = f"{n}/{len(test_configs)}"
test_results = run_test(
@@ -105,21 +106,18 @@ def run_tests(
outputs.append(test_results)

baseline_config, baseline_tests, _ = outputs[0]
_, _, baseline_acceptances = next((o for o in outputs if o[2] is not None),
(None, None, None))
_, _, baseline_acceptances = next((o for o in outputs if o[2] is not None), (None, None, None))

print(
f"BASELINE: config=[{baseline_config}], accept_rates={baseline_acceptances}"
)
print(f"BASELINE: config=[{baseline_config}], accept_rates={baseline_acceptances}")

failure = None
for test_config, test_outputs, test_acceptance_rates in outputs[1:]:
for base_outs, base_acceptance_rate, test_outs, test_acceptance_rate, params in zip(
baseline_tests,
baseline_acceptances or repeat(None),
test_outputs,
test_acceptance_rates or repeat(None),
test_sampling_params,
baseline_tests,
baseline_acceptances or repeat(None),
test_outputs,
test_acceptance_rates or repeat(None),
test_sampling_params,
):
try:
check_outputs_equal(
@@ -129,21 +127,18 @@ def run_tests(
name_1=f"config=[{test_config}], params={params}",
)

if (base_acceptance_rate is not None
and test_acceptance_rate is not None):
if base_acceptance_rate is not None and test_acceptance_rate is not None:
if "spec_mml=None" in test_config:
assert (test_acceptance_rate > base_acceptance_rate
or test_acceptance_rate == pytest.approx(
base_acceptance_rate, rel=5e-2))
assert test_acceptance_rate > base_acceptance_rate or test_acceptance_rate == pytest.approx(
base_acceptance_rate, rel=5e-2
)
else:
# Currently the reported acceptance rate is expected to be
# lower when we sometimes skip drafting altogether.
assert test_acceptance_rate > 0.1
print(f"PASSED: config=[{test_config}], params={params}"
f" accept_rate={test_acceptance_rate}")
print(f"PASSED: config=[{test_config}], params={params} accept_rate={test_acceptance_rate}")
except AssertionError as e:
print(f"FAILED: config=[{test_config}], params={params}"
f" accept_rate={test_acceptance_rate}")
print(f"FAILED: config=[{test_config}], params={params} accept_rate={test_acceptance_rate}")
if failure is None:
failure = e
@@ -161,33 +156,35 @@ def run_test(
spec_config: dict[str, Any] | None,
test_prefill_chunking: bool,
):
os.environ['VLLM_WORKER_MULTIPROC_METHOD'] = 'spawn'
os.environ["VLLM_WORKER_MULTIPROC_METHOD"] = "spawn"
spec_decoding = spec_config is not None
cache_arg: dict[str, Any] = (
# Force preemptions
dict(num_gpu_blocks_override=2) if test_preemption else dict(
gpu_memory_utilization=0.9))
dict(num_gpu_blocks_override=2) if test_preemption else dict(gpu_memory_utilization=0.9)
)
spec_mml = (spec_config or {}).get("max_model_len")
test_config = (f"executor={executor}, preemption={test_preemption}, "
f"async_sched={async_scheduling}, "
f"chunk_prefill={test_prefill_chunking}, "
f"spec_decoding={spec_decoding}, spec_mml={spec_mml}")
test_config = (
f"executor={executor}, preemption={test_preemption}, "
f"async_sched={async_scheduling}, "
f"chunk_prefill={test_prefill_chunking}, "
f"spec_decoding={spec_decoding}, spec_mml={spec_mml}"
)
print("-" * 80)
print(f"---- TESTING {test_str}: {test_config}")
print("-" * 80)
with VllmRunner(
model,
max_model_len=512,
enable_chunked_prefill=test_prefill_chunking,
# Force prefill chunking
max_num_batched_tokens=48 if test_prefill_chunking else None,
enforce_eager=True,
async_scheduling=async_scheduling,
distributed_executor_backend=executor,
dtype="float16", # avoid precision errors
speculative_config=spec_config,
disable_log_stats=False,
**cache_arg,
model,
max_model_len=512,
enable_chunked_prefill=test_prefill_chunking,
# Force prefill chunking
max_num_batched_tokens=48 if test_prefill_chunking else None,
enforce_eager=True,
async_scheduling=async_scheduling,
distributed_executor_backend=executor,
dtype="float16", # avoid precision errors
speculative_config=spec_config,
disable_log_stats=False,
**cache_arg,
) as vllm_model:
results = []
acceptance_rates: list[float] | None = [] if spec_decoding else None
@@ -197,26 +194,23 @@ def run_test(
results.append(
vllm_model.generate(
example_prompts,
sampling_params=SamplingParams(**default_params,
**override_params),
))
sampling_params=SamplingParams(**default_params, **override_params),
)
)
metrics_after = vllm_model.model.get_metrics()
if acceptance_rates is not None:
acceptance_rate = _get_acceptance_rate(metrics_before,
metrics_after)
acceptance_rate = _get_acceptance_rate(metrics_before, metrics_after)
acceptance_rates.append(acceptance_rate)
print(f"ACCEPTANCE RATE {acceptance_rate}")

if test_preemption:
preemptions = _get_count(metrics_before, metrics_after,
"vllm:num_preemptions")
preemptions = _get_count(metrics_before, metrics_after, "vllm:num_preemptions")
assert preemptions > 0, "preemption test had no preemptions"

if len(results) > 1:
# First check that the different parameter configs
# actually result in different output.
for other_test_outs, params in zip(results[1:],
sampling_param_tests[1:]):
for other_test_outs, params in zip(results[1:], sampling_param_tests[1:]):
with pytest.raises(AssertionError):
check_outputs_equal(
outputs_0_lst=results[0][0],
@@ -42,6 +42,7 @@ def new_kv_cache_spec(
attention_chunk_size=attention_chunk_size,
)


def test_auto_fit_max_model_len():
"""Test that max_model_len=-1 auto-fits to available NPU memory."""
# Create config with original_max_model_len=-1 to trigger auto-fit
@@ -59,9 +60,7 @@ def test_auto_fit_max_model_len():

# With enough memory, max_model_len stays at the derived max
large_available_memory = mem_per_block_per_layer * 2 * 1024 # plenty of memory
_kv_cache_configs = get_kv_cache_configs(
vllm_config, [kv_cache_specs], [large_available_memory]
)
_kv_cache_configs = get_kv_cache_configs(vllm_config, [kv_cache_specs], [large_available_memory])
assert vllm_config.model_config.max_model_len == 1024

# Reset for next test
@@ -73,9 +72,7 @@ def test_auto_fit_max_model_len():
# Need memory for at least max_model_len tokens
# 32 blocks worth of memory for 2 layers = can fit 32*16=512 tokens
limited_memory = mem_per_block_per_layer * 2 * 32
_kv_cache_configs = get_kv_cache_configs(
vllm_config, [kv_cache_specs], [limited_memory]
)
_kv_cache_configs = get_kv_cache_configs(vllm_config, [kv_cache_specs], [limited_memory])
# Should be reduced to fit in memory
assert vllm_config.model_config.max_model_len < 1024
assert vllm_config.model_config.max_model_len > 0
@@ -94,7 +91,5 @@ def test_auto_fit_max_model_len_not_triggered():
}

# This should work normally without auto-fit
_kv_cache_configs = get_kv_cache_configs(
vllm_config, [kv_cache_specs], [mem_per_block_per_layer * 2 * 32]
)
_kv_cache_configs = get_kv_cache_configs(vllm_config, [kv_cache_specs], [mem_per_block_per_layer * 2 * 32])
assert vllm_config.model_config.max_model_len == 16
@@ -70,9 +70,7 @@ def _random_prompt(min_words: int = 1024, max_words: int = 1024 * 2) -> str:

if target_words > 50:
# For longer prompts, repeat context
padding_text = (
" This is an interesting topic that deserves more explanation. " *
(target_words // 50))
padding_text = " This is an interesting topic that deserves more explanation. " * (target_words // 50)
base_prompt = base_prompt + padding_text

return base_prompt
@@ -83,10 +81,7 @@ def _extract_step_logprobs(request_output):
inner = request_output.outputs[0]
if hasattr(inner, "logprobs") and inner.logprobs is not None:
t = torch.tensor(
[
inner.logprobs[i][tid].logprob
for i, tid in enumerate(inner.token_ids)
],
[inner.logprobs[i][tid].logprob for i, tid in enumerate(inner.token_ids)],
dtype=torch.float32,
)
return t, inner.token_ids
@@ -95,8 +90,7 @@ def _extract_step_logprobs(request_output):


@pytest.mark.timeout(1000)
def test_v1_generation_is_deterministic_across_batch_sizes_with_needle(
monkeypatch: pytest.MonkeyPatch):
def test_v1_generation_is_deterministic_across_batch_sizes_with_needle(monkeypatch: pytest.MonkeyPatch):
"""
Ensures that the same request (the 'needle' prompt) yields identical output
whether run alone (bs=1) or mixed into a larger batch (e.g., bs=64),
@@ -184,8 +178,7 @@ def test_v1_generation_is_deterministic_across_batch_sizes_with_needle(
if i == needle_pos:
prompts.append(needle_prompt)
else:
prompts.append(
_random_prompt(min_random_prompt, max_random_prompt))
prompts.append(_random_prompt(min_random_prompt, max_random_prompt))

# Generate with the larger-batch engine
outputs = llm.generate(prompts, sampling)
@@ -196,27 +189,27 @@ def test_v1_generation_is_deterministic_across_batch_sizes_with_needle(
text = needle_output.outputs[0].text

if text != baseline_text:
print(
f"{text}\n\n== Not the same as ==\n\n{baseline_text}\n\n")
print(f"{text}\n\n== Not the same as ==\n\n{baseline_text}\n\n")
mismatches += 1

passes = num_trials - mismatches
# Dump how many passed vs failed
print(f"[determinism] total={num_trials}, passed={passes}, "
f"failed={mismatches}, max_batch_size={max_batch_size}")
print(
f"[determinism] total={num_trials}, passed={passes}, failed={mismatches}, max_batch_size={max_batch_size}"
)

if mismatches > 0:
pytest.fail(
f"Nondeterministic outputs detected: {mismatches} failed out "
f"of {num_trials} trials (max_batch_size={max_batch_size}).")
f"of {num_trials} trials (max_batch_size={max_batch_size})."
)

finally:
del llm
cleanup_dist_env_and_memory()


def test_logprobs_bitwise_batch_invariance_bs1_vs_bsN(
monkeypatch: pytest.MonkeyPatch):
def test_logprobs_bitwise_batch_invariance_bs1_vs_bsN(monkeypatch: pytest.MonkeyPatch):
seed = int(os.getenv("VLLM_TEST_SEED", "12345"))
random.seed(seed)
model_name = DEFAULT_MODEL
@@ -230,9 +223,7 @@ def test_logprobs_bitwise_batch_invariance_bs1_vs_bsN(

if disable_custom_ar:
print(f"\n{'=' * 80}")
print(
f"BATCH INVARIANCE MODE: Disabling custom all-reduce (TP={tp_size})"
)
print(f"BATCH INVARIANCE MODE: Disabling custom all-reduce (TP={tp_size})")
print(f"{'=' * 80}\n")

llm = LLM(
@@ -266,15 +257,12 @@ def test_logprobs_bitwise_batch_invariance_bs1_vs_bsN(
bs1_logprobs_per_prompt = []
bs1_tokens_per_prompt = []
for idx, p in enumerate(prompts):
print(
f"\n[BS=1] Running prompt {idx}/{len(prompts)} - Preview: {p[:80]}..."
)
print(f"\n[BS=1] Running prompt {idx}/{len(prompts)} - Preview: {p[:80]}...")
outs = llm.generate([p], sp, use_tqdm=False)
assert len(outs) == 1
step_logprobs, token_ids = _extract_step_logprobs(outs[0])
if step_logprobs is None:
pytest.skip("Logits are not available on RequestOutput; "
"enable logprobs return to run this test.")
pytest.skip("Logits are not available on RequestOutput; enable logprobs return to run this test.")
bs1_logprobs_per_prompt.append(step_logprobs)
bs1_tokens_per_prompt.append(token_ids)
print(f"[BS=1] Prompt {idx} generated tokens: {token_ids}")
@@ -296,108 +284,92 @@ def test_logprobs_bitwise_batch_invariance_bs1_vs_bsN(
print(f"[BS={len(prompts)}] Prompt {idx} generated tokens: {tokens}")
step_logprobs, token_ids = _extract_step_logprobs(o)
if step_logprobs is None:
pytest.skip("Logits are not available on RequestOutput; "
"enable logprobs return to run this test.")
pytest.skip("Logits are not available on RequestOutput; enable logprobs return to run this test.")
bsN_logprobs_per_prompt.append(step_logprobs)
bsN_tokens_per_prompt.append(token_ids)

# Compare step-by-step logprobs for each prompt between BS=1 and BS=N runs.
failed_prompts = []
for i, (logprobs_bs1, logprobs_bsN, tokens_bs1, tokens_bsN) in enumerate(
zip(
bs1_logprobs_per_prompt,
bsN_logprobs_per_prompt,
bs1_tokens_per_prompt,
bsN_tokens_per_prompt,
)):
zip(
bs1_logprobs_per_prompt,
bsN_logprobs_per_prompt,
bs1_tokens_per_prompt,
bsN_tokens_per_prompt,
)
):
if len(logprobs_bs1) != len(logprobs_bsN):
reason = (f"Different number of steps: {len(logprobs_bs1)} (BS=1) "
f"vs {len(logprobs_bsN)} (BS=N)")
failed_prompts.append({
"prompt_idx": i,
"step": "all",
"reason": reason,
"prompt_preview": prompts[i][:100],
"bs1_tokens": tokens_bs1,
"bsN_tokens": tokens_bsN,
})
reason = f"Different number of steps: {len(logprobs_bs1)} (BS=1) vs {len(logprobs_bsN)} (BS=N)"
failed_prompts.append(
{
"prompt_idx": i,
"step": "all",
"reason": reason,
"prompt_preview": prompts[i][:100],
"bs1_tokens": tokens_bs1,
"bsN_tokens": tokens_bsN,
}
)
continue

# Check if tokens match first
if tokens_bs1 != tokens_bsN:
failed_prompts.append({
"prompt_idx":
i,
"step":
"sampling",
"reason":
"Different tokens sampled",
"prompt_preview":
prompts[i][:100],
"bs1_tokens":
tokens_bs1,
"bsN_tokens":
tokens_bsN,
"bs1_all_logprobs":
[logprobs_bs1[s].tolist() for s in range(len(logprobs_bs1))],
"bsN_all_logprobs":
[logprobs_bsN[s].tolist() for s in range(len(logprobs_bsN))],
})
failed_prompts.append(
{
"prompt_idx": i,
"step": "sampling",
"reason": "Different tokens sampled",
"prompt_preview": prompts[i][:100],
"bs1_tokens": tokens_bs1,
"bsN_tokens": tokens_bsN,
"bs1_all_logprobs": [logprobs_bs1[s].tolist() for s in range(len(logprobs_bs1))],
"bsN_all_logprobs": [logprobs_bsN[s].tolist() for s in range(len(logprobs_bsN))],
}
)
continue

for t, (a, b) in enumerate(zip(logprobs_bs1, logprobs_bsN)):
if a.shape != b.shape:
failed_prompts.append({
"prompt_idx": i,
"step": t,
"reason": f"Shape mismatch: {a.shape} vs {b.shape}",
"prompt_preview": prompts[i][:100],
"bs1_tokens": tokens_bs1,
"bsN_tokens": tokens_bsN,
})
failed_prompts.append(
{
"prompt_idx": i,
"step": t,
"reason": f"Shape mismatch: {a.shape} vs {b.shape}",
"prompt_preview": prompts[i][:100],
"bs1_tokens": tokens_bs1,
"bsN_tokens": tokens_bsN,
}
)
break

if not torch.equal(a, b):
max_diff = torch.abs(a - b).max().item()
# Print which token failed
print(
f"\n[DIVERGENCE] Prompt {i}, Token {t}: max_diff={max_diff:.6e}"
)
print(f"\n[DIVERGENCE] Prompt {i}, Token {t}: max_diff={max_diff:.6e}")
bs1_tok = tokens_bs1[t] if t < len(tokens_bs1) else "N/A"
bsN_tok = tokens_bsN[t] if t < len(tokens_bsN) else "N/A"
print(f" Token IDs: bs1={bs1_tok}, bsN={bsN_tok}")
print(f" BS=1 logprob: {a.tolist()}")
print(f" BS=N logprob: {b.tolist()}")
failed_prompts.append({
"prompt_idx":
i,
"step":
t,
"reason":
f"Bitwise mismatch (max_diff={max_diff:.6e})",
"prompt_preview":
prompts[i][:100],
"bs1_tokens":
tokens_bs1,
"bsN_tokens":
tokens_bsN,
"bs1_all_logprobs": [
logprobs_bs1[s].tolist()
for s in range(len(logprobs_bs1))
],
"bsN_all_logprobs": [
logprobs_bsN[s].tolist()
for s in range(len(logprobs_bsN))
],
})
failed_prompts.append(
{
"prompt_idx": i,
"step": t,
"reason": f"Bitwise mismatch (max_diff={max_diff:.6e})",
"prompt_preview": prompts[i][:100],
"bs1_tokens": tokens_bs1,
"bsN_tokens": tokens_bsN,
"bs1_all_logprobs": [logprobs_bs1[s].tolist() for s in range(len(logprobs_bs1))],
"bsN_all_logprobs": [logprobs_bsN[s].tolist() for s in range(len(logprobs_bsN))],
}
)
break
del llm
cleanup_dist_env_and_memory()
# Print summary of all failures
if failed_prompts:
print(f"\n{'=' * 80}")
fail_msg = (f"BATCH INVARIANCE FAILURES: {len(failed_prompts)}/"
f"{len(prompts)} prompts failed")
fail_msg = f"BATCH INVARIANCE FAILURES: {len(failed_prompts)}/{len(prompts)} prompts failed"
print(fail_msg)
print(f"{'=' * 80}")
for fail in failed_prompts:
@@ -412,21 +384,18 @@ def test_logprobs_bitwise_batch_invariance_bs1_vs_bsN(
print(f" BS=N tokens: {fail['bsN_tokens']}")

if "bs1_all_logprobs" in fail:
print(
f" BS=1 logprobs for all {len(fail['bs1_all_logprobs'])} steps:"
)
print(f" BS=1 logprobs for all {len(fail['bs1_all_logprobs'])} steps:")
for step_idx, logprobs in enumerate(fail["bs1_all_logprobs"]):
print(f" Step {step_idx}: {logprobs}")
print(
f" BS=N logprobs for all {len(fail['bsN_all_logprobs'])} steps:"
)
print(f" BS=N logprobs for all {len(fail['bsN_all_logprobs'])} steps:")
for step_idx, logprobs in enumerate(fail["bsN_all_logprobs"]):
print(f" Step {step_idx}: {logprobs}")
print(f"{'=' * 80}\n")

# Fail the test with summary
msg = (f"Batch invariance violated in {len(failed_prompts)}/"
f"{len(prompts)} prompts. See output above for details.")
msg = (
f"Batch invariance violated in {len(failed_prompts)}/{len(prompts)} prompts. See output above for details."
)
pytest.fail(msg)


@@ -476,8 +445,7 @@ def test_simple_generation(monkeypatch: pytest.MonkeyPatch):
cleanup_dist_env_and_memory()


def test_logprobs_without_batch_invariance_should_fail(
monkeypatch: pytest.MonkeyPatch):
def test_logprobs_without_batch_invariance_should_fail(monkeypatch: pytest.MonkeyPatch):
"""
This test is the inverse of test_logprobs_bitwise_batch_invariance_bs1_vs_bsN.
It DISABLES batch invariance mode and expects to see non-deterministic behavior
@@ -540,15 +508,12 @@ def test_logprobs_without_batch_invariance_should_fail(
bs1_logprobs_per_prompt = []
bs1_tokens_per_prompt = []
for idx, p in enumerate(prompts):
print(
f"\n[BS=1] Running prompt {idx}/{len(prompts)} - Preview: {p[:80]}..."
)
print(f"\n[BS=1] Running prompt {idx}/{len(prompts)} - Preview: {p[:80]}...")
outs = llm.generate([p], sp, use_tqdm=False)
assert len(outs) == 1
step_logprobs, token_ids = _extract_step_logprobs(outs[0])
if step_logprobs is None:
pytest.skip("Logits are not available on RequestOutput; "
"enable logprobs return to run this test.")
pytest.skip("Logits are not available on RequestOutput; enable logprobs return to run this test.")
bs1_logprobs_per_prompt.append(step_logprobs)
bs1_tokens_per_prompt.append(token_ids)
print(f"[BS=1] Prompt {idx} generated tokens: {token_ids}")
@@ -569,74 +534,80 @@ def test_logprobs_without_batch_invariance_should_fail(
print(f"[BS={len(prompts)}] Prompt {idx} generated tokens: {tokens}")
step_logprobs, token_ids = _extract_step_logprobs(o)
if step_logprobs is None:
pytest.skip("Logits are not available on RequestOutput; "
"enable logprobs return to run this test.")
pytest.skip("Logits are not available on RequestOutput; enable logprobs return to run this test.")
bsN_logprobs_per_prompt.append(step_logprobs)
bsN_tokens_per_prompt.append(token_ids)

# Compare step-by-step logprobs for each prompt between BS=1 and BS=N runs.
differences_found = []
for i, (logprobs_bs1, logprobs_bsN, tokens_bs1, tokens_bsN) in enumerate(
zip(
bs1_logprobs_per_prompt,
bsN_logprobs_per_prompt,
bs1_tokens_per_prompt,
bsN_tokens_per_prompt,
)):
zip(
bs1_logprobs_per_prompt,
bsN_logprobs_per_prompt,
bs1_tokens_per_prompt,
bsN_tokens_per_prompt,
)
):
if len(logprobs_bs1) != len(logprobs_bsN):
reason = (f"Different number of steps: {len(logprobs_bs1)} (BS=1) "
f"vs {len(logprobs_bsN)} (BS=N)")
differences_found.append({
"prompt_idx": i,
"step": "all",
"reason": reason,
"prompt_preview": prompts[i][:100],
"bs1_tokens": tokens_bs1,
"bsN_tokens": tokens_bsN,
})
reason = f"Different number of steps: {len(logprobs_bs1)} (BS=1) vs {len(logprobs_bsN)} (BS=N)"
differences_found.append(
{
"prompt_idx": i,
"step": "all",
"reason": reason,
"prompt_preview": prompts[i][:100],
"bs1_tokens": tokens_bs1,
"bsN_tokens": tokens_bsN,
}
)
continue

# Check if tokens match first
if tokens_bs1 != tokens_bsN:
differences_found.append({
"prompt_idx": i,
"step": "sampling",
"reason": "Different tokens sampled",
"prompt_preview": prompts[i][:100],
"bs1_tokens": tokens_bs1,
"bsN_tokens": tokens_bsN,
})
differences_found.append(
{
"prompt_idx": i,
"step": "sampling",
"reason": "Different tokens sampled",
"prompt_preview": prompts[i][:100],
"bs1_tokens": tokens_bs1,
"bsN_tokens": tokens_bsN,
}
)
continue

for t, (a, b) in enumerate(zip(logprobs_bs1, logprobs_bsN)):
if a.shape != b.shape:
differences_found.append({
"prompt_idx": i,
"step": t,
"reason": f"Shape mismatch: {a.shape} vs {b.shape}",
"prompt_preview": prompts[i][:100],
"bs1_tokens": tokens_bs1,
"bsN_tokens": tokens_bsN,
})
differences_found.append(
{
"prompt_idx": i,
"step": t,
"reason": f"Shape mismatch: {a.shape} vs {b.shape}",
"prompt_preview": prompts[i][:100],
"bs1_tokens": tokens_bs1,
"bsN_tokens": tokens_bsN,
}
)
break

if not torch.equal(a, b):
max_diff = torch.abs(a - b).max().item()
print(f"\n[EXPECTED DIVERGENCE FOUND] Prompt {i}, "
f"Token {t}: max_diff={max_diff:.6e}")
print(f"\n[EXPECTED DIVERGENCE FOUND] Prompt {i}, Token {t}: max_diff={max_diff:.6e}")
bs1_tok = tokens_bs1[t] if t < len(tokens_bs1) else "N/A"
bsN_tok = tokens_bsN[t] if t < len(tokens_bsN) else "N/A"
print(f" Token IDs: bs1={bs1_tok}, bsN={bsN_tok}")
print(f" BS=1 logprob: {a.tolist()}")
print(f" BS=N logprob: {b.tolist()}")
differences_found.append({
"prompt_idx": i,
"step": t,
"reason": f"Bitwise mismatch (max_diff={max_diff:.6e})",
"prompt_preview": prompts[i][:100],
"bs1_tokens": tokens_bs1,
"bsN_tokens": tokens_bsN,
})
differences_found.append(
{
"prompt_idx": i,
"step": t,
"reason": f"Bitwise mismatch (max_diff={max_diff:.6e})",
"prompt_preview": prompts[i][:100],
"bs1_tokens": tokens_bs1,
"bsN_tokens": tokens_bsN,
}
)
break
del llm
cleanup_dist_env_and_memory()
@@ -646,7 +617,8 @@ def test_logprobs_without_batch_invariance_should_fail(
success_msg = (
f"✓ SUCCESS: Batch invariance is doing something! "
f"Found {len(differences_found)}/{len(prompts)} prompts "
f"with differences when batch invariance was DISABLED.")
f"with differences when batch invariance was DISABLED."
)
print(success_msg)
print(f"{'=' * 80}")
for diff in differences_found:
@@ -666,7 +638,8 @@ def test_logprobs_without_batch_invariance_should_fail(
f"✗ UNEXPECTED: All {len(prompts)} prompts matched "
f"between BS=1 and BS=N even with batch invariance DISABLED. "
f"This suggests batch invariance might not be necessary, "
f"or the test needs more sensitive prompts.")
f"or the test needs more sensitive prompts."
)
print(fail_msg)
print(f"{'=' * 80}\n")
pytest.fail(fail_msg)
@@ -37,10 +37,7 @@ def test_end_to_end():
prompt = "How are you?"
sampling_params = SamplingParams(temperature=0, max_tokens=10)

with VllmRunner("Qwen/Qwen3-0.6B",
enable_sleep_mode=True,
cudagraph_capture_sizes=[1, 2, 4, 8]) as runner:

with VllmRunner("Qwen/Qwen3-0.6B", enable_sleep_mode=True, cudagraph_capture_sizes=[1, 2, 4, 8]) as runner:
output = runner.model.generate(prompt, sampling_params)
# the benefit of `llm.sleep(level=2)` is mainly CPU memory usage,
# which is difficult to measure in the test. therefore, we only

@@ -30,9 +30,7 @@ MODELS = ["Qwen/Qwen3-0.6B"]

def get_prompt_embeds(chat, tokenizer, embedding_layer):
"""Convert chat messages to prompt embeddings."""
token_ids = tokenizer.apply_chat_template(chat,
add_generation_prompt=True,
return_tensors='pt')
token_ids = tokenizer.apply_chat_template(chat, add_generation_prompt=True, return_tensors="pt")
prompt_embeds = embedding_layer(token_ids).squeeze(0)
return prompt_embeds

@@ -53,15 +51,16 @@ def test_mixed_prompt_embeds_and_text(model_name):

# Run inference with mixed inputs
with VllmRunner(
model_name,
enable_prompt_embeds=True,
cudagraph_capture_sizes=[1, 2, 4, 8],
model_name,
enable_prompt_embeds=True,
cudagraph_capture_sizes=[1, 2, 4, 8],
) as vllm_runner:
# Test prompt embeddings
embeds_output = vllm_runner.model.generate({
"prompt_embeds":
prompt_embeds,
})
embeds_output = vllm_runner.model.generate(
{
"prompt_embeds": prompt_embeds,
}
)

# Test text prompt
text_output = vllm_runner.model.generate(text_prompt)
@@ -107,15 +107,13 @@ def _latency_test(llm: LLM, subscriber: MockSubscriber):

def _accuracy_test(llm: LLM, subscriber: MockSubscriber):
sampling_params = SamplingParams(max_tokens=1)
cpu_block_size = (llm.llm_engine.vllm_config.kv_transfer_config.
kv_connector_extra_config["block_size"])
cpu_block_size = llm.llm_engine.vllm_config.kv_transfer_config.kv_connector_extra_config["block_size"]

subscriber.get_new_cpu_stored_events()

# prepend prompt to be cpu block aligned
prompt = "Let's count to 10. One, two, three, four,"
while (len(llm.generate(prompt, use_tqdm=False)[0].prompt_token_ids) %
cpu_block_size != 0):
while len(llm.generate(prompt, use_tqdm=False)[0].prompt_token_ids) % cpu_block_size != 0:
prompt = ". " + prompt

assert subscriber.get_new_cpu_stored_events()
@@ -123,8 +121,7 @@ def _accuracy_test(llm: LLM, subscriber: MockSubscriber):
test_count = 100
success_count = 0
for i in range(test_count):
if (llm.generate(prompt, sampling_params,
use_tqdm=False)[0].outputs[0].text == " five"):
if llm.generate(prompt, sampling_params, use_tqdm=False)[0].outputs[0].text == " five":
success_count += 1

assert success_count >= 0.5 * test_count
@@ -143,7 +140,7 @@ def test_cpu_offloading() -> None:
"num_cpu_blocks": 1000,
"block_size": 128,
"spec_name": "NPUOffloadingSpec",
"spec_module_path": "vllm_ascend.kv_offload.npu"
"spec_module_path": "vllm_ascend.kv_offload.npu",
},
)


@@ -17,7 +17,7 @@
# limitations under the License.
#
import json
from typing import Any, Dict
from typing import Any

import jsonschema
import pytest
@@ -34,8 +34,10 @@ GuidedDecodingBackend = ["xgrammar", "guidance", "outlines"]

@pytest.fixture(scope="module")
def sample_regex():
return (r"((25[0-5]|(2[0-4]|1\d|[1-9]|)\d)\.){3}"
r"(25[0-5]|(2[0-4]|1\d|[1-9]|)\d)")
return (
r"((25[0-5]|(2[0-4]|1\d|[1-9]|)\d)\.){3}"
r"(25[0-5]|(2[0-4]|1\d|[1-9]|)\d)"
)


@pytest.fixture(scope="module")
@@ -43,66 +45,41 @@ def sample_json_schema():
return {
"type": "object",
"properties": {
"name": {
"type": "string"
},
"age": {
"type": "integer"
},
"skills": {
"type": "array",
"items": {
"type": "string",
"maxLength": 10
},
"minItems": 3
},
"name": {"type": "string"},
"age": {"type": "integer"},
"skills": {"type": "array", "items": {"type": "string", "maxLength": 10}, "minItems": 3},
"work_history": {
"type": "array",
"items": {
"type": "object",
"properties": {
"company": {
"type": "string"
},
"duration": {
"type": "number"
},
"position": {
"type": "string"
}
"company": {"type": "string"},
"duration": {"type": "number"},
"position": {"type": "string"},
},
"required": ["company", "position"]
}
}
"required": ["company", "position"],
},
},
},
"required": ["name", "age", "skills", "work_history"]
"required": ["name", "age", "skills", "work_history"],
}


@pytest.mark.parametrize("guided_decoding_backend", GuidedDecodingBackend)
def test_guided_json_completion(guided_decoding_backend: str,
sample_json_schema):
runner_kwargs: Dict[str, Any] = {}
def test_guided_json_completion(guided_decoding_backend: str, sample_json_schema):
runner_kwargs: dict[str, Any] = {}
sampling_params = SamplingParams(
temperature=1.0,
max_tokens=500,
structured_outputs=StructuredOutputsParams(json=sample_json_schema))
temperature=1.0, max_tokens=500, structured_outputs=StructuredOutputsParams(json=sample_json_schema)
)
runner_kwargs = {
"cudagraph_capture_sizes": [1, 2, 4, 8],
"seed": 0,
"structured_outputs_config": {
"backend": guided_decoding_backend
},
"structured_outputs_config": {"backend": guided_decoding_backend},
}
with VllmRunner(MODEL_NAME, **runner_kwargs) as vllm_model:
prompts = [
f"Give an example JSON for an employee profile "
f"that fits this schema: {sample_json_schema}"
] * 2
prompts = [f"Give an example JSON for an employee profile that fits this schema: {sample_json_schema}"] * 2
inputs = vllm_model.get_inputs(prompts)
outputs = vllm_model.model.generate(inputs,
sampling_params=sampling_params)
outputs = vllm_model.model.generate(inputs, sampling_params=sampling_params)

assert outputs is not None

@@ -115,34 +92,27 @@ def test_guided_json_completion(guided_decoding_backend: str,
assert generated_text is not None
print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
output_json = json.loads(generated_text)
jsonschema.validate(instance=output_json,
schema=sample_json_schema)
jsonschema.validate(instance=output_json, schema=sample_json_schema)


@pytest.mark.parametrize("guided_decoding_backend", GuidedDecodingBackend)
def test_guided_regex(guided_decoding_backend: str, sample_regex):
if guided_decoding_backend == "outlines":
pytest.skip("Outlines doesn't support regex-based guided decoding.")
runner_kwargs: Dict[str, Any] = {}
runner_kwargs: dict[str, Any] = {}
sampling_params = SamplingParams(
temperature=0.8,
top_p=0.95,
structured_outputs=StructuredOutputsParams(regex=sample_regex))
temperature=0.8, top_p=0.95, structured_outputs=StructuredOutputsParams(regex=sample_regex)
)
runner_kwargs = {
"cudagraph_capture_sizes": [1, 2, 4, 8],
"seed": 0,
"structured_outputs_config": {
"backend": guided_decoding_backend
},
"structured_outputs_config": {"backend": guided_decoding_backend},
}

with VllmRunner(MODEL_NAME, **runner_kwargs) as vllm_model:
prompts = [
f"Give an example IPv4 address with this regex: {sample_regex}"
] * 2
prompts = [f"Give an example IPv4 address with this regex: {sample_regex}"] * 2
inputs = vllm_model.get_inputs(prompts)
outputs = vllm_model.model.generate(inputs,
sampling_params=sampling_params)
outputs = vllm_model.model.generate(inputs, sampling_params=sampling_params)
assert outputs is not None
for output in outputs:
assert output is not None
@@ -19,20 +19,16 @@ def do_sample(llm: vllm.LLM, lora_path: str, lora_id: int) -> list[str]:
prompts = [
PROMPT_TEMPLATE.format(query="How many singers do we have?"),
PROMPT_TEMPLATE.format(
query=
"What is the average, minimum, and maximum age of all singers from France?" # noqa: E501
query="What is the average, minimum, and maximum age of all singers from France?" # noqa: E501
),
PROMPT_TEMPLATE.format(
query=
"What are all distinct countries where singers above age 20 are from?" # noqa: E501
query="What are all distinct countries where singers above age 20 are from?" # noqa: E501
),
]
sampling_params = vllm.SamplingParams(temperature=0, max_tokens=32)
outputs = llm.generate(
prompts,
sampling_params,
lora_request=LoRARequest(str(lora_id), lora_id, lora_path)
if lora_id else None)
prompts, sampling_params, lora_request=LoRARequest(str(lora_id), lora_id, lora_path) if lora_id else None
)
# Print the outputs.
generated_texts: list[str] = []
for output in outputs:
@@ -45,16 +41,15 @@ def do_sample(llm: vllm.LLM, lora_path: str, lora_id: int) -> list[str]:

def test_ilama_lora(ilama_lora_files):
with VllmRunner(
MODEL_PATH,
enable_lora=True,
dtype="half",
max_loras=4,
max_model_len=1024,
cudagraph_capture_sizes=[1, 2, 4, 8],
max_num_seqs=16,
enforce_eager=True,
MODEL_PATH,
enable_lora=True,
dtype="half",
max_loras=4,
max_model_len=1024,
cudagraph_capture_sizes=[1, 2, 4, 8],
max_num_seqs=16,
enforce_eager=True,
) as vllm_model:

output1 = do_sample(vllm_model.model, ilama_lora_files, lora_id=1)
for i in range(len(EXPECTED_LORA_OUTPUT)):
assert output1[i] == EXPECTED_LORA_OUTPUT[i]
@@ -1,12 +1,12 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project

import pytest
from unittest.mock import patch

import pytest
import vllm
import vllm.config
from vllm.lora.request import LoRARequest
from unittest.mock import patch

from tests.e2e.conftest import VllmRunner
from vllm_ascend.utils import enable_custom_op
@@ -53,17 +53,12 @@ def do_sample(
PROMPT_TEMPLATE.format(context="How many candidates are there?"),
PROMPT_TEMPLATE.format(context="Count the number of candidates."),
PROMPT_TEMPLATE.format(
context=
"Which poll resource provided the most number of candidate information?" # noqa: E501
context="Which poll resource provided the most number of candidate information?" # noqa: E501
),
PROMPT_TEMPLATE.format(
context=
"Return the poll resource associated with the most candidates."),
PROMPT_TEMPLATE.format(context="Return the poll resource associated with the most candidates."),
]

sampling_params = vllm.SamplingParams(temperature=0,
max_tokens=64,
stop=["<|im_end|>"])
sampling_params = vllm.SamplingParams(temperature=0, max_tokens=64, stop=["<|im_end|>"])
if tensorizer_config_dict is not None:
outputs = llm.generate(
prompts,
@@ -73,14 +68,15 @@ def do_sample(
lora_id,
lora_path,
tensorizer_config_dict=tensorizer_config_dict,
) if lora_id else None,
)
if lora_id
else None,
)
else:
outputs = llm.generate(
prompts,
sampling_params,
lora_request=LoRARequest(str(lora_id), lora_id, lora_path)
if lora_id else None,
lora_request=LoRARequest(str(lora_id), lora_id, lora_path) if lora_id else None,
)

generated_texts: list[str] = []
@@ -92,33 +88,40 @@ def do_sample(
return generated_texts


def generate_and_test(llm,
llama32_lora_files,
tensorizer_config_dict: dict | None = None):
def generate_and_test(llm, llama32_lora_files, tensorizer_config_dict: dict | None = None):
print("lora adapter created")
print("lora 1")
assert (do_sample(
llm,
llama32_lora_files,
tensorizer_config_dict=tensorizer_config_dict,
lora_id=1,
) == EXPECTED_LORA_OUTPUT)
assert (
do_sample(
llm,
llama32_lora_files,
tensorizer_config_dict=tensorizer_config_dict,
lora_id=1,
)
== EXPECTED_LORA_OUTPUT
)

print("lora 2")
assert (do_sample(
llm,
llama32_lora_files,
tensorizer_config_dict=tensorizer_config_dict,
lora_id=2,
) == EXPECTED_LORA_OUTPUT)
assert (
do_sample(
llm,
llama32_lora_files,
tensorizer_config_dict=tensorizer_config_dict,
lora_id=2,
)
== EXPECTED_LORA_OUTPUT
)

print("base model")
assert (do_sample(
llm,
llama32_lora_files,
tensorizer_config_dict=tensorizer_config_dict,
lora_id=0,
) == EXPECTED_BASE_MODEL_OUTPUT)
assert (
do_sample(
llm,
llama32_lora_files,
tensorizer_config_dict=tensorizer_config_dict,
lora_id=0,
)
== EXPECTED_BASE_MODEL_OUTPUT
)

print("removing lora")
@@ -45,9 +45,7 @@ def test_minicpm(model) -> None:
]
max_tokens = 5

with VllmRunner(model,
max_model_len=512,
gpu_memory_utilization=0.7) as runner:
with VllmRunner(model, max_model_len=512, gpu_memory_utilization=0.7) as runner:
runner.generate_greedy(example_prompts, max_tokens)


@@ -56,19 +54,12 @@ def test_whisper(model) -> None:
prompts = ["<|startoftranscript|><|en|><|transcribe|><|notimestamps|>"]
audios = [AudioAsset("mary_had_lamb").audio_and_sample_rate]

sampling_params = SamplingParams(temperature=0.2,
max_tokens=10,
stop_token_ids=None)
sampling_params = SamplingParams(temperature=0.2, max_tokens=10, stop_token_ids=None)

with VllmRunner(model,
max_model_len=448,
max_num_seqs=5,
dtype="bfloat16",
block_size=128,
gpu_memory_utilization=0.9) as runner:
outputs = runner.generate(prompts=prompts,
audios=audios,
sampling_params=sampling_params)
with VllmRunner(
model, max_model_len=448, max_num_seqs=5, dtype="bfloat16", block_size=128, gpu_memory_utilization=0.9
) as runner:
outputs = runner.generate(prompts=prompts, audios=audios, sampling_params=sampling_params)

assert outputs is not None, "Generated outputs should not be None."
assert len(outputs) > 0, "Generated outputs should not be empty."
@@ -39,59 +39,56 @@ def test_models_with_multistream_overlap_shared_expert(
max_tokens: int,
) -> None:
prompts = [
"Hello, my name is", "The president of the United States is",
"The capital of France is", "The future of AI is"
"Hello, my name is",
"The president of the United States is",
"The capital of France is",
"The future of AI is",
]

sampling_params = SamplingParams(max_tokens=max_tokens, temperature=0.0)
with VllmRunner(
model,
max_model_len=1024,
enforce_eager=True,
cudagraph_capture_sizes=[4, 8, 16, 32],
additional_config={
"multistream_overlap_shared_expert": True,
},
quantization="ascend",
model,
max_model_len=1024,
enforce_eager=True,
cudagraph_capture_sizes=[4, 8, 16, 32],
additional_config={
"multistream_overlap_shared_expert": True,
},
quantization="ascend",
) as runner:
vllm_moe_ms_eager_outputs = runner.model.generate(
prompts, sampling_params)
vllm_moe_ms_eager_outputs = runner.model.generate(prompts, sampling_params)

with VllmRunner(
model,
max_model_len=1024,
cudagraph_capture_sizes=[4, 8, 16, 32],
additional_config={
"multistream_overlap_shared_expert": True,
},
quantization="ascend",
model,
max_model_len=1024,
cudagraph_capture_sizes=[4, 8, 16, 32],
additional_config={
"multistream_overlap_shared_expert": True,
},
quantization="ascend",
) as runner:
vllm_moe_ms_aclgraph_outputs = runner.model.generate(
prompts, sampling_params)
vllm_moe_ms_aclgraph_outputs = runner.model.generate(prompts, sampling_params)

with VllmRunner(
model,
max_model_len=1024,
enforce_eager=True,
cudagraph_capture_sizes=[4, 8, 16, 32],
quantization="ascend",
model,
max_model_len=1024,
enforce_eager=True,
cudagraph_capture_sizes=[4, 8, 16, 32],
quantization="ascend",
) as runner:
vllm_eager_outputs = runner.model.generate(prompts, sampling_params)

vllm_moe_ms_eager_outputs_list = []
for output in vllm_moe_ms_eager_outputs:
vllm_moe_ms_eager_outputs_list.append(
(output.outputs[0].index, output.outputs[0].text))
vllm_moe_ms_eager_outputs_list.append((output.outputs[0].index, output.outputs[0].text))

vllm_moe_ms_aclgraph_outputs_list = []
for output in vllm_moe_ms_aclgraph_outputs:
vllm_moe_ms_aclgraph_outputs_list.append(
(output.outputs[0].index, output.outputs[0].text))
vllm_moe_ms_aclgraph_outputs_list.append((output.outputs[0].index, output.outputs[0].text))

vllm_eager_outputs_list = []
for output in vllm_eager_outputs:
vllm_eager_outputs_list.append(
(output.outputs[0].index, output.outputs[0].text))
vllm_eager_outputs_list.append((output.outputs[0].index, output.outputs[0].text))

check_outputs_equal(
outputs_0_lst=vllm_eager_outputs_list,
@@ -19,6 +19,7 @@ from tests.e2e.conftest import VllmRunner
from tests.e2e.model_utils import check_outputs_equal


# fmt: off
def test_qwen3_w8a8_quant():
max_tokens = 5
example_prompts = [
@@ -29,6 +30,7 @@ def test_qwen3_w8a8_quant():
13480, 4712, 369, 444, 10994, 82, 13, 1084, 374, 6188, 311, 387
], 'vLLM is a high-throughput and memory-efficient inference and serving engine for LLMs. It is designed to be'
)]
# fmt: on

with VllmRunner(
"vllm-ascend/Qwen3-0.6B-W8A8",
@@ -47,7 +49,7 @@ def test_qwen3_w8a8_quant():
name_1="vllm_quant_w8a8_outputs",
)


# fmt: off
def test_qwen3_dense_w8a16():
max_tokens = 5
example_prompts = [
@@ -58,6 +60,7 @@ def test_qwen3_dense_w8a16():
13480, 4712, 369, 444, 10994, 82, 13, 1084, 374, 6188, 311, 387
], 'vLLM is a high-throughput and memory-efficient inference and serving engine for LLMs. It is designed to be'
)]
# fmt: on

with VllmRunner(
"vllm-ascend/Qwen3-0.6B-W8A16",
@@ -1,8 +1,9 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from unittest.mock import patch

from vllm import SamplingParams
from vllm.lora.request import LoRARequest
from unittest.mock import patch

from tests.e2e.conftest import VllmRunner
from vllm_ascend.utils import enable_custom_op
@@ -27,16 +28,11 @@ LORA_TEST_EXPECTED = [

def format_chatml_messages(prompt: str):
return [
{
"role": "system",
"content": "You are a helpful assistant."
},
{
"role": "user",
"content": prompt
},
{"role": "system", "content": "You are a helpful assistant."},
{"role": "user", "content": prompt},
]


@patch.dict("os.environ", {"VLLM_USE_MODELSCOPE": "False"})
def test_multi_loras_with_tp_sync():
lora_name_id_map = {}
@@ -102,9 +98,7 @@ def test_multi_loras_with_tp_sync():
outputs = llm.chat(
[messages],
sampling_params,
chat_template_kwargs={
"enable_thinking": False
}, # for those loras, ensure enable_thinking=False
chat_template_kwargs={"enable_thinking": False}, # for those loras, ensure enable_thinking=False
lora_request=lora_request,
use_tqdm=False,
)
@@ -113,15 +107,13 @@ def test_multi_loras_with_tp_sync():

def reload_lora(name: str):
"""
reload a lora to simulate the case:
setting `VLLM_ALLOW_RUNTIME_LORA_UPDATING=true`
reload a lora to simulate the case:
setting `VLLM_ALLOW_RUNTIME_LORA_UPDATING=true`
for dynamic lora loading and unloading
"""
remove_lora_response = llm.llm_engine.remove_lora(
lora_id=lora_name_id_map[name])
remove_lora_response = llm.llm_engine.remove_lora(lora_id=lora_name_id_map[name])

add_lora_response = llm.llm_engine.add_lora(
make_add_lora_request(name, LORA_NAME_PATH_MAP[name]))
add_lora_response = llm.llm_engine.add_lora(make_add_lora_request(name, LORA_NAME_PATH_MAP[name]))

print(f"{remove_lora_response=}, {add_lora_response=}")

@@ -131,7 +123,6 @@ def test_multi_loras_with_tp_sync():
assert outputs == expected

for prompt, expected_output in zip(LORA_TEST_PROMPTS, LORA_TEST_EXPECTED):

output_text = call_llm_get_outputs(prompt, "Alice")
check_outputs(output_text, expected_output, prompt)
@@ -25,15 +25,11 @@ def test_qwen3_topk() -> None:
example_prompts = [
"Hello, my name is",
]
sampling_params = SamplingParams(max_tokens=5,
temperature=0.0,
top_k=50,
top_p=0.9)
sampling_params = SamplingParams(max_tokens=5, temperature=0.0, top_k=50, top_p=0.9)

with VllmRunner("Qwen/Qwen3-0.6B",
max_model_len=8192,
cudagraph_capture_sizes=[1, 2, 4, 8],
gpu_memory_utilization=0.7) as runner:
with VllmRunner(
"Qwen/Qwen3-0.6B", max_model_len=8192, cudagraph_capture_sizes=[1, 2, 4, 8], gpu_memory_utilization=0.7
) as runner:
runner.generate(example_prompts, sampling_params)


@@ -42,29 +38,25 @@ def test_qwen3_prompt_logprobs() -> None:
"Hello, my name is",
]

with VllmRunner("Qwen/Qwen3-0.6B",
max_model_len=8192,
cudagraph_capture_sizes=[1, 2, 4, 8],
gpu_memory_utilization=0.7) as runner:
runner.generate_greedy_logprobs(example_prompts,
max_tokens=5,
num_logprobs=1)
with VllmRunner(
"Qwen/Qwen3-0.6B", max_model_len=8192, cudagraph_capture_sizes=[1, 2, 4, 8], gpu_memory_utilization=0.7
) as runner:
runner.generate_greedy_logprobs(example_prompts, max_tokens=5, num_logprobs=1)


def test_qwen3_exponential_overlap() -> None:
example_prompts = [
"Hello, my name is",
]
sampling_params = SamplingParams(max_tokens=5,
temperature=1.0,
top_k=50,
top_p=0.9)
sampling_params = SamplingParams(max_tokens=5, temperature=1.0, top_k=50, top_p=0.9)

with VllmRunner("Qwen/Qwen3-0.6B",
max_model_len=8192,
cudagraph_capture_sizes=[1, 2, 4, 8],
gpu_memory_utilization=0.7,
additional_config={
"enable_async_exponential": True,
}) as runner:
with VllmRunner(
"Qwen/Qwen3-0.6B",
max_model_len=8192,
cudagraph_capture_sizes=[1, 2, 4, 8],
gpu_memory_utilization=0.7,
additional_config={
"enable_async_exponential": True,
},
) as runner:
runner.generate(example_prompts, sampling_params)
@@ -20,6 +20,7 @@

Run `pytest tests/test_offline_inference.py`.
"""

import os
from unittest.mock import patch

@@ -44,11 +45,13 @@ def test_multimodal_vl(vl_config):
images = [image] * len(img_questions)
prompts = vl_config["prompt_fn"](img_questions)

with VllmRunner(vl_config["model"],
mm_processor_kwargs=vl_config["mm_processor_kwargs"],
max_model_len=8192,
cudagraph_capture_sizes=[1, 2, 4, 8],
limit_mm_per_prompt={"image": 1}) as vllm_model:
with VllmRunner(
vl_config["model"],
mm_processor_kwargs=vl_config["mm_processor_kwargs"],
max_model_len=8192,
cudagraph_capture_sizes=[1, 2, 4, 8],
limit_mm_per_prompt={"image": 1},
) as vllm_model:
outputs = vllm_model.generate_greedy(
prompts=prompts,
images=images,
@@ -63,35 +66,30 @@ def test_multimodal_vl(vl_config):

@patch.dict(os.environ, {"VLLM_WORKER_MULTIPROC_METHOD": "spawn"})
def test_multimodal_audio():
audio_prompt = "".join([
f"Audio {idx+1}: <|audio_bos|><|AUDIO|><|audio_eos|>\n"
for idx in range(2)
])
audio_prompt = "".join([f"Audio {idx + 1}: <|audio_bos|><|AUDIO|><|audio_eos|>\n" for idx in range(2)])
question = "What sport and what nursery rhyme are referenced?"
prompt = ("<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n"
"<|im_start|>user\n"
f"{audio_prompt}{question}<|im_end|>\n"
"<|im_start|>assistant\n")
prompt = (
"<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n"
"<|im_start|>user\n"
f"{audio_prompt}{question}<|im_end|>\n"
"<|im_start|>assistant\n"
)
mm_data = {
"audio": [
asset.audio_and_sample_rate for asset in
[AudioAsset("mary_had_lamb"),
AudioAsset("winning_call")]
]
"audio": [asset.audio_and_sample_rate for asset in [AudioAsset("mary_had_lamb"), AudioAsset("winning_call")]]
}
inputs = {"prompt": prompt, "multi_modal_data": mm_data}

sampling_params = SamplingParams(temperature=0.2,
max_tokens=10,
stop_token_ids=None)
sampling_params = SamplingParams(temperature=0.2, max_tokens=10, stop_token_ids=None)

with VllmRunner("Qwen/Qwen2-Audio-7B-Instruct",
max_model_len=4096,
max_num_seqs=5,
dtype="bfloat16",
limit_mm_per_prompt={"audio": 2},
cudagraph_capture_sizes=[1, 2, 4, 8],
gpu_memory_utilization=0.9) as runner:
with VllmRunner(
"Qwen/Qwen2-Audio-7B-Instruct",
max_model_len=4096,
max_num_seqs=5,
dtype="bfloat16",
limit_mm_per_prompt={"audio": 2},
cudagraph_capture_sizes=[1, 2, 4, 8],
gpu_memory_utilization=0.9,
) as runner:
outputs = runner.generate(inputs, sampling_params=sampling_params)

assert outputs is not None, "Generated outputs should not be None."
@@ -20,13 +20,14 @@ Compare the outputs of vLLM with and without xlite.
Run `pytest tests/e2e/singlecard/test_xlite.py`.
"""

# ruff: noqa: E501

import os

import pytest
from vllm import SamplingParams

from tests.e2e.singlecard.utils import (PROMPTS_SHORT, LLMTestCase,
gen_and_valid)
from tests.e2e.singlecard.utils import PROMPTS_SHORT, LLMTestCase, gen_and_valid

os.environ["VLLM_ASCEND_ENABLE_NZ"] = "2"

@@ -35,9 +36,9 @@ CASE_DECODE_ONLY = LLMTestCase(
prompts=PROMPTS_SHORT,
golden_answers=[
"Hello, my name is Lina. I'm a 22-year-old student from China.",
'The president of the United States is the same as the president of the United Nations. This is because the president',
'The capital of France is Paris. The capital of France is also the capital of the French Republic.',
'The future of AI is not just a technological challenge but a profound transformation of how we live, work'
"The president of the United States is the same as the president of the United Nations. This is because the president",
"The capital of France is Paris. The capital of France is also the capital of the French Republic.",
"The future of AI is not just a technological challenge but a profound transformation of how we live, work",
],
sampling_params=SamplingParams(
max_tokens=15,
@@ -45,19 +46,22 @@ CASE_DECODE_ONLY = LLMTestCase(
top_p=1.0,
top_k=0,
n=1,
))
),
)

CASE_FULL = LLMTestCase(
model="Qwen/Qwen3-0.6B",
prompts=[
"Hello, my name is", "The president of the United States is",
"The capital of France is", "The future of AI is"
"Hello, my name is",
"The president of the United States is",
"The capital of France is",
"The future of AI is",
],
golden_answers=[
" Lina. I'm a 22-year-old student from China. I'm interested in studying in the US. I'm looking for a job in the",
' the same as the president of the United Nations. This is because the president of the United States is the same as the president of the United Nations. The president',
' Paris. The capital of Italy is Rome. The capital of Spain is Madrid. The capital of China is Beijing. The capital of Japan is Tokyo. The capital',
" not just a technological challenge but a profound transformation of how we live, work, and interact with the world. As we stand at the intersection of artificial intelligence and"
" the same as the president of the United Nations. This is because the president of the United States is the same as the president of the United Nations. The president",
" Paris. The capital of Italy is Rome. The capital of Spain is Madrid. The capital of China is Beijing. The capital of Japan is Tokyo. The capital",
" not just a technological challenge but a profound transformation of how we live, work, and interact with the world. As we stand at the intersection of artificial intelligence and",
],
sampling_params=SamplingParams(
max_tokens=32,
@@ -65,27 +69,25 @@ CASE_FULL = LLMTestCase(
top_p=1.0,
top_k=0,
n=1,
))
),
)


@pytest.mark.skip(
reason="TODO: Re-enable xlite_decode_only e2e test when stable.")
@pytest.mark.skip(reason="TODO: Re-enable xlite_decode_only e2e test when stable.")
@pytest.mark.parametrize("cur_case", [CASE_DECODE_ONLY])
def test_models_with_xlite_decode_only(cur_case: LLMTestCase):
runner_kwargs = {
"model_name": cur_case.model,
"max_model_len": 1024,
"block_size": 128,
"additional_config": {
"xlite_graph_config": {
"enabled": True
}
},
"additional_config": {"xlite_graph_config": {"enabled": True}},
}
gen_and_valid(runner_kwargs=runner_kwargs,
prompts=cur_case.prompts,
sampling_params=cur_case.sampling_params,
golden_answers=cur_case.golden_answers)
gen_and_valid(
runner_kwargs=runner_kwargs,
prompts=cur_case.prompts,
sampling_params=cur_case.sampling_params,
golden_answers=cur_case.golden_answers,
)


@pytest.mark.parametrize("cur_case", [CASE_FULL])
@@ -94,14 +96,11 @@ def test_models_with_xlite_full_mode(cur_case: LLMTestCase):
"model_name": cur_case.model,
"max_model_len": 1024,
"block_size": 128,
"additional_config": {
"xlite_graph_config": {
"enabled": True,
"full_mode": True
}
},
"additional_config": {"xlite_graph_config": {"enabled": True, "full_mode": True}},
}
gen_and_valid(runner_kwargs=runner_kwargs,
prompts=cur_case.prompts,
sampling_params=cur_case.sampling_params,
golden_answers=cur_case.golden_answers)
gen_and_valid(
runner_kwargs=runner_kwargs,
prompts=cur_case.prompts,
sampling_params=cur_case.sampling_params,
golden_answers=cur_case.golden_answers,
)
@@ -1,5 +1,4 @@
from dataclasses import dataclass, field
from typing import Optional

from vllm import SamplingParams

@@ -7,37 +6,44 @@ from tests.e2e.conftest import VllmRunner
from tests.e2e.model_utils import check_outputs_equal

PROMPTS_SHORT = [
"Hello, my name is", "The president of the United States is",
"The capital of France is", "The future of AI is"
"Hello, my name is",
"The president of the United States is",
"The capital of France is",
"The future of AI is",
]

# NOTE: Randomly fill the prompt with the requested amount for
# the specified capture shape to prevent accuracy issues caused by padding
PROMPTS_LONG = [
('Solve the following math problem step by step.'
'The last line of your response should be of the form Answer: '
'$Answer (without quotes) where $Answer is the answer to the problem.\n\n'
'In triangle $ABC$, $\\sin \\angle A = \\frac{4}{5}$ and $\\angle A < 90^\\circ$. Let $D$'
'be a point outside triangle $ABC$ such that $\\angle BAD = \\angle DAC$,'
'$\\angle BDC = 90^\\circ$. Suppose $AD = 1$ and $\\frac{BD}{CD} = \\frac{3}{2}$.'
'If $AB + AC$ can be expressed in the form $\\frac{a\\sqrt{b}}{c}$,'
'where $a, b, c$ are pairwise relatively prime integers, find $a + b + c$.'
),
('Solve the following math problem step by step.'
'The last line of your response should be of the form Answer: '
'$Answer (without quotes) where $Answer is the answer to the problem.\n\n'
'Let $ABCD$ be a unit square in the plane. Points $X$ and $Y$ are chosen'
'independently and uniformly at random on the perimeter of $ABCD$.'
'If the expected value of the area of triangle $\\triangle AXY$'
'can be expressed as $\\frac{m}{n}$, for relatively prime positive'
'integers $m$ and $n$, compute $m+n$.'),
('Solve the following math problem step by step.'
'The last line of your response should be of the form Answer: '
'$Answer (without quotes) where $Answer is the answer to the problem.\n\n'
'Let $a, b, c$ be distinct numbers such that the equations $x^2 + ax + 1 = 0$'
'and $x^2 + bx + c = 0$ have a common real root, and the equations $x^2 + x + a = 0$'
'and $x^2 + cx + b = 0$ also have a common real root.'
'Compute the sum $a + b + c$.')
(
"Solve the following math problem step by step."
"The last line of your response should be of the form Answer: "
"$Answer (without quotes) where $Answer is the answer to the problem.\n\n"
"In triangle $ABC$, $\\sin \\angle A = \\frac{4}{5}$ and $\\angle A < 90^\\circ$. Let $D$"
"be a point outside triangle $ABC$ such that $\\angle BAD = \\angle DAC$,"
"$\\angle BDC = 90^\\circ$. Suppose $AD = 1$ and $\\frac{BD}{CD} = \\frac{3}{2}$."
"If $AB + AC$ can be expressed in the form $\\frac{a\\sqrt{b}}{c}$,"
"where $a, b, c$ are pairwise relatively prime integers, find $a + b + c$."
),
(
"Solve the following math problem step by step."
"The last line of your response should be of the form Answer: "
"$Answer (without quotes) where $Answer is the answer to the problem.\n\n"
"Let $ABCD$ be a unit square in the plane. Points $X$ and $Y$ are chosen"
"independently and uniformly at random on the perimeter of $ABCD$."
"If the expected value of the area of triangle $\\triangle AXY$"
"can be expressed as $\\frac{m}{n}$, for relatively prime positive"
"integers $m$ and $n$, compute $m+n$."
),
(
"Solve the following math problem step by step."
"The last line of your response should be of the form Answer: "
"$Answer (without quotes) where $Answer is the answer to the problem.\n\n"
"Let $a, b, c$ be distinct numbers such that the equations $x^2 + ax + 1 = 0$"
"and $x^2 + bx + c = 0$ have a common real root, and the equations $x^2 + x + a = 0$"
"and $x^2 + cx + b = 0$ also have a common real root."
"Compute the sum $a + b + c$."
),
]


@@ -46,7 +52,7 @@ class LLMTestCase:
model: str
prompts: list[str]
golden_answers: list[str]
quantization: Optional[str] = None
quantization: str | None = None
sampling_params: SamplingParams = field(
default_factory=lambda: SamplingParams(
max_tokens=32,
@@ -54,14 +60,13 @@ class LLMTestCase:
top_p=1.0,
top_k=0,
n=1,
))
)
)


def gen_and_valid(runner_kwargs: dict, prompts: list[str],
sampling_params: SamplingParams, golden_answers: list[str]):
def gen_and_valid(runner_kwargs: dict, prompts: list[str], sampling_params: SamplingParams, golden_answers: list[str]):
with VllmRunner(**runner_kwargs) as runner:
vllm_aclgraph_outputs = runner.model.generate(
prompts=prompts, sampling_params=sampling_params)
vllm_aclgraph_outputs = runner.model.generate(prompts=prompts, sampling_params=sampling_params)
outputs_gen = []
for output in vllm_aclgraph_outputs:
outputs_gen.append(([output.outputs[0].index], output.outputs[0].text))