[Lint]Style: Convert test/ to ruff format(Batch #5) (#6747)

### What this PR does / why we need it?
| File Path |
| :--- |
| `tests/e2e/singlecard/compile/backend.py` |
| `tests/e2e/singlecard/compile/test_graphex_norm_quant_fusion.py` |
| `tests/e2e/singlecard/compile/test_graphex_qknorm_rope_fusion.py` |
| `tests/e2e/singlecard/compile/test_norm_quant_fusion.py` |
| `tests/e2e/singlecard/model_runner_v2/test_basic.py` |
| `tests/e2e/singlecard/test_aclgraph_accuracy.py` |
| `tests/e2e/singlecard/test_aclgraph_batch_invariant.py` |
| `tests/e2e/singlecard/test_aclgraph_mem.py` |
| `tests/e2e/singlecard/test_async_scheduling.py` |
| `tests/e2e/singlecard/test_auto_fit_max_mode_len.py` |
| `tests/e2e/singlecard/test_batch_invariant.py` |
| `tests/e2e/singlecard/test_camem.py` |
| `tests/e2e/singlecard/test_completion_with_prompt_embeds.py` |
| `tests/e2e/singlecard/test_cpu_offloading.py` |
| `tests/e2e/singlecard/test_guided_decoding.py` |
| `tests/e2e/singlecard/test_ilama_lora.py` |
| `tests/e2e/singlecard/test_llama32_lora.py` |
| `tests/e2e/singlecard/test_models.py` |
| `tests/e2e/singlecard/test_multistream_overlap_shared_expert.py` |
| `tests/e2e/singlecard/test_quantization.py` |
| `tests/e2e/singlecard/test_qwen3_multi_loras.py` |
| `tests/e2e/singlecard/test_sampler.py` |
| `tests/e2e/singlecard/test_vlm.py` |
| `tests/e2e/singlecard/test_xlite.py` |
| `tests/e2e/singlecard/utils.py` |

### Does this PR introduce _any_ user-facing change?

### How was this patch tested?

- vLLM version: v0.15.0
- vLLM main:
9562912cea

---------

Signed-off-by: MrZ20 <2609716663@qq.com>
This commit is contained in:
SILONG ZENG
2026-02-24 15:50:00 +08:00
committed by GitHub
parent 747484cb64
commit 62ea664aa7
26 changed files with 859 additions and 1052 deletions

View File

@@ -14,25 +14,20 @@
# See the License for the specific language governing permissions and
# limitations under the License.
#
from typing import List
import pytest
import torch
import torch.nn as nn
import torch_npu
import vllm.config
from vllm.config import ModelConfig, VllmConfig
from vllm.distributed import (ensure_model_parallel_initialized,
init_distributed_environment)
from vllm.distributed import ensure_model_parallel_initialized, init_distributed_environment
from vllm.utils.system_utils import update_environment_variables
import vllm_ascend.ops.register_custom_ops # noqa
from tests.e2e.singlecard.compile.backend import TestBackend
from vllm_ascend.ascend_forward_context import set_ascend_forward_context
from vllm_ascend.compilation.passes.norm_quant_fusion_pass import \
AddRMSNormQuantFusionPass
from vllm_ascend.utils import enable_custom_op
from vllm_ascend.utils import vllm_version_is
from vllm_ascend.compilation.passes.norm_quant_fusion_pass import AddRMSNormQuantFusionPass
from vllm_ascend.utils import enable_custom_op, vllm_version_is
if vllm_version_is("0.15.0"):
from vllm.compilation.fx_utils import OpOverload # type: ignore
def get_or_create_backend(vllm_config):
    """Get or create backend with fusion passes (cached to avoid duplicate pattern registration)."""
    # Module-level singleton: the fusion pass registers rewrite patterns when
    # constructed, so creating a second backend would register duplicates.
    global _backend_cache
    if _backend_cache is None:
        _backend_cache = TestBackend(
            custom_passes=[AddRMSNormQuantFusionPass(vllm_config=vllm_config)]
        )
    return _backend_cache
class TestModelWithoutBias(nn.Module):
    """
    A minimal test model that simulates the pattern:
    AddRMSNorm → Quantization (without bias)
    """

    def __init__(self, hidden_size: int, dtype: torch.dtype, eps: float = 1e-6, device="npu"):
        super().__init__()
        self.hidden_size = hidden_size
        self.eps = eps
        # Norm weight is a Parameter so it shows up in the traced graph;
        # quant scale/offset are plain tensors (constants for the pattern).
        self.rms_norm_weight = nn.Parameter(torch.randn(hidden_size, device=device))
        self.quant_scale = torch.ones(hidden_size, dtype=dtype, device=device)
        self.quant_scale_reciprocal = torch.ones(hidden_size, dtype=dtype, device=device)
        self.quant_offset = torch.zeros(hidden_size, dtype=dtype, device=device)

    def forward(self, x):
        """
        Apply add-RMSNorm (no bias) followed by quantization.

        NOTE(review): the original docstring body was elided by the diff view
        this block was reconstructed from.
        """
        residual = torch.zeros_like(x)
        norm_output, _, new_residual = torch.ops._C_ascend.npu_add_rms_norm_bias(
            x, residual, self.rms_norm_weight, None, self.eps
        )
        quantized_output = torch.ops.vllm.quantize(
            norm_output, self.quant_scale, self.quant_scale_reciprocal, self.quant_offset
        )
        return quantized_output, new_residual

    def ops_in_model_before(self) -> list[OpOverload]:
        """Return the list of expected operators BEFORE fusion."""
        return [torch.ops._C_ascend.npu_add_rms_norm_bias.default, torch.ops.vllm.quantize.default]

    def ops_in_model_after(self) -> list[OpOverload]:
        """Return the list of expected operators AFTER successful fusion."""
        return [torch.ops.npu.npu_add_rms_norm_quant.default]
class TestModelWithBias(nn.Module):
    """
    A minimal test model that simulates the pattern:
    AddRMSNorm → Add Bias → Quantization (with bias)
    """

    def __init__(self, hidden_size: int, dtype: torch.dtype, eps: float = 1e-6, device="npu"):
        super().__init__()
        self.hidden_size = hidden_size
        self.eps = eps
        self.rms_norm_weight = nn.Parameter(torch.randn(hidden_size, device=device))
        # Bias participates in the fused pattern (passed to the norm op below).
        self.bias = nn.Parameter(torch.randn(hidden_size, device=device))
        self.quant_scale = torch.ones(hidden_size, dtype=dtype, device=device)
        self.quant_scale_reciprocal = torch.ones(hidden_size, dtype=dtype, device=device)
        self.quant_offset = torch.zeros(hidden_size, dtype=dtype, device=device)

    def forward(self, x):
        """
        Apply add-RMSNorm with bias followed by quantization.

        NOTE(review): the original docstring body was elided by the diff view
        this block was reconstructed from.
        """
        residual = torch.zeros_like(x)
        norm_output_with_bias, _, new_residual = torch.ops._C_ascend.npu_add_rms_norm_bias(
            x, residual, self.rms_norm_weight, self.bias, self.eps
        )
        quantized_output = torch.ops.vllm.quantize(
            norm_output_with_bias, self.quant_scale, self.quant_scale_reciprocal, self.quant_offset
        )
        return quantized_output, new_residual

    def ops_in_model_before(self) -> list[OpOverload]:
        """Return the list of expected operators BEFORE fusion."""
        return [torch.ops._C_ascend.npu_add_rms_norm_bias.default, torch.ops.vllm.quantize.default]

    def ops_in_model_after(self) -> list[OpOverload]:
        """Return the list of expected operators AFTER successful fusion."""
        return [torch.ops.npu.npu_add_rms_norm_quant.default]
class TestModelSPWithoutBias(nn.Module):
    """
    A minimal test model that simulates the pattern:
    AddRMSNorm → maybe_allgather → Quantization (without bias)
    """

    def __init__(self, hidden_size: int, dtype: torch.dtype, eps: float = 1e-6, device="npu"):
        super().__init__()
        self.hidden_size = hidden_size
        self.eps = eps
        self.rms_norm_weight = nn.Parameter(torch.randn(hidden_size, device=device))
        self.quant_scale = torch.ones(hidden_size, dtype=dtype, device=device)
        self.quant_scale_reciprocal = torch.ones(hidden_size, dtype=dtype, device=device)
        self.quant_offset = torch.zeros(hidden_size, dtype=dtype, device=device)

    def forward(self, x):
        """
        Apply add-RMSNorm, then a (sequence-parallel) all-gather, then
        quantization.

        NOTE(review): the original docstring body was elided by the diff view
        this block was reconstructed from.
        """
        residual = torch.zeros_like(x)
        norm_output, _, new_residual = torch.ops._C_ascend.npu_add_rms_norm_bias(
            x, residual, self.rms_norm_weight, None, self.eps
        )
        # The all-gather sits between norm and quantize, so fusion must keep it.
        norm_output = torch.ops.vllm.maybe_all_gather_and_maybe_unpad(norm_output, True)
        quantized_output = torch.ops.vllm.quantize(
            norm_output, self.quant_scale, self.quant_scale_reciprocal, self.quant_offset
        )
        return quantized_output, new_residual

    def ops_in_model_before(self) -> list[OpOverload]:
        """Return the list of expected operators BEFORE fusion."""
        return [
            torch.ops._C_ascend.npu_add_rms_norm_bias.default,
            torch.ops.vllm.maybe_all_gather_and_maybe_unpad.default,
            torch.ops.vllm.quantize.default,
        ]

    def ops_in_model_after(self) -> list[OpOverload]:
        """Return the list of expected operators AFTER successful fusion."""
        return [torch.ops.npu.npu_add_rms_norm_quant.default, torch.ops.vllm.maybe_all_gather_and_maybe_unpad.default]
class TestModelSPWithBias(nn.Module):
    """
    A minimal test model that simulates the pattern:
    AddRMSNorm → Add bias → maybe_allgather → Quantization (with bias)
    """

    def __init__(self, hidden_size: int, dtype: torch.dtype, eps: float = 1e-6, device="npu"):
        super().__init__()
        self.hidden_size = hidden_size
        self.eps = eps
        self.rms_norm_weight = nn.Parameter(torch.randn(hidden_size, device=device))
        # Bias participates in the fused pattern (passed to the norm op below).
        self.bias = nn.Parameter(torch.randn(hidden_size, device=device))
        self.quant_scale = torch.ones(hidden_size, dtype=dtype, device=device)
        self.quant_scale_reciprocal = torch.ones(hidden_size, dtype=dtype, device=device)
        self.quant_offset = torch.zeros(hidden_size, dtype=dtype, device=device)

    def forward(self, x):
        """
        Apply add-RMSNorm with bias, then a (sequence-parallel) all-gather,
        then quantization.

        NOTE(review): the original docstring body was elided by the diff view
        this block was reconstructed from.
        """
        residual = torch.zeros_like(x)
        norm_output_with_bias, _, new_residual = torch.ops._C_ascend.npu_add_rms_norm_bias(
            x, residual, self.rms_norm_weight, self.bias, self.eps
        )
        # The all-gather sits between norm and quantize, so fusion must keep it.
        norm_output_with_bias = torch.ops.vllm.maybe_all_gather_and_maybe_unpad(norm_output_with_bias, True)
        quantized_output = torch.ops.vllm.quantize(
            norm_output_with_bias, self.quant_scale, self.quant_scale_reciprocal, self.quant_offset
        )
        return quantized_output, new_residual

    def ops_in_model_before(self) -> list[OpOverload]:
        """Return the list of expected operators BEFORE fusion."""
        return [
            torch.ops._C_ascend.npu_add_rms_norm_bias.default,
            torch.ops.vllm.maybe_all_gather_and_maybe_unpad.default,
            torch.ops.vllm.quantize.default,
        ]

    def ops_in_model_after(self) -> list[OpOverload]:
        """Return the list of expected operators AFTER successful fusion."""
        return [torch.ops.npu.npu_add_rms_norm_quant.default, torch.ops.vllm.maybe_all_gather_and_maybe_unpad.default]
@pytest.mark.parametrize("dtype", [torch.bfloat16])
@@ -317,58 +261,42 @@ def test_rmsnorm_quant_fusion(
vllm_config = VllmConfig(model_config=ModelConfig(dtype=dtype))
with vllm.config.set_current_vllm_config(vllm_config):
update_environment_variables({
"RANK": "0",
"LOCAL_RANK": "0",
"WORLD_SIZE": "1",
"MASTER_ADDR": "localhost",
"MASTER_PORT": "12345",
})
update_environment_variables(
{
"RANK": "0",
"LOCAL_RANK": "0",
"WORLD_SIZE": "1",
"MASTER_ADDR": "localhost",
"MASTER_PORT": "12345",
}
)
init_distributed_environment()
ensure_model_parallel_initialized(1, 1)
with vllm.config.set_current_vllm_config(vllm_config):
with set_ascend_forward_context(None, vllm_config):
backend = get_or_create_backend(vllm_config)
if use_bias:
if not enable_custom_op():
return
if sp_enable:
model = TestModelSPWithBias(hidden_size,
dtype,
eps,
device="npu")
else:
model = TestModelWithBias(hidden_size,
dtype,
eps,
device="npu")
with vllm.config.set_current_vllm_config(vllm_config), set_ascend_forward_context(None, vllm_config):
backend = get_or_create_backend(vllm_config)
if use_bias:
if not enable_custom_op():
return
if sp_enable:
model = TestModelSPWithBias(hidden_size, dtype, eps, device="npu")
else:
if sp_enable:
model = TestModelSPWithoutBias(hidden_size,
dtype,
eps,
device="npu")
else:
model = TestModelWithoutBias(hidden_size,
dtype,
eps,
device="npu")
model = model.to("npu")
model = TestModelWithBias(hidden_size, dtype, eps, device="npu")
else:
if sp_enable:
model = TestModelSPWithoutBias(hidden_size, dtype, eps, device="npu")
else:
model = TestModelWithoutBias(hidden_size, dtype, eps, device="npu")
model = model.to("npu")
x = torch.rand(num_tokens,
hidden_size,
device="npu",
dtype=dtype,
requires_grad=False)
x = torch.rand(num_tokens, hidden_size, device="npu", dtype=dtype, requires_grad=False)
result_unfused = model(x)
print("Unfused result:", [t.shape for t in result_unfused])
model_fused = torch.compile(model, backend=backend)
result_fused = model_fused(x)
print("Fused result:", [t.shape for t in result_fused])
result_unfused = model(x)
print("Unfused result:", [t.shape for t in result_unfused])
model_fused = torch.compile(model, backend=backend)
result_fused = model_fused(x)
print("Fused result:", [t.shape for t in result_fused])
print("=== Checking operator fusion ===")
backend.check_before_ops(model.ops_in_model_before(),
fully_replaced=not sp_enable)
backend.check_after_ops(model.ops_in_model_after())
print("=== Checking operator fusion ===")
backend.check_before_ops(model.ops_in_model_before(), fully_replaced=not sp_enable)
backend.check_after_ops(model.ops_in_model_after())