[Lint]Style: Convert test/ to ruff format(Batch #1) (#6738)

### What this PR does / why we need it?
**Scope of Changes**:
| File Path |
| :--- |
| `tests/e2e/310p/multicard/test_vl_model_multicard.py` |
| `tests/e2e/310p/singlecard/test_vl_model_singlecard.py` |
| `tests/e2e/310p/test_utils.py` |
| `tests/e2e/conftest.py` |
| `tests/e2e/model_utils.py` |
| `tests/e2e/models/conftest.py` |
| `tests/e2e/models/test_lm_eval_correctness.py` |
| `tests/e2e/multicard/2-cards/spec_decode/test_spec_decode.py` |
| `tests/e2e/multicard/2-cards/test_aclgraph_capture_replay.py` |
| `tests/e2e/multicard/2-cards/test_data_parallel.py` |
| `tests/e2e/multicard/2-cards/test_disaggregated_encoder.py` |
| `tests/e2e/multicard/2-cards/test_expert_parallel.py` |
| `tests/e2e/multicard/2-cards/test_external_launcher.py` |
| `tests/e2e/multicard/2-cards/test_full_graph_mode.py` |
| `tests/e2e/multicard/2-cards/test_ilama_lora_tp2.py` |
| `tests/e2e/multicard/2-cards/test_offline_inference_distributed.py` |
| `tests/e2e/multicard/2-cards/test_offline_weight_load.py` |
| `tests/e2e/multicard/2-cards/test_pipeline_parallel.py` |
| `tests/e2e/multicard/2-cards/test_prefix_caching.py` |
| `tests/e2e/multicard/2-cards/test_quantization.py` |
| `tests/e2e/multicard/2-cards/test_qwen3_moe.py` |
| `tests/e2e/multicard/2-cards/test_qwen3_moe_routing_replay.py` |
| `tests/e2e/multicard/2-cards/test_qwen3_performance.py` |
| `tests/e2e/multicard/2-cards/test_shared_expert_dp.py` |
| `tests/e2e/multicard/2-cards/test_single_request_aclgraph.py` |
| `tests/e2e/multicard/2-cards/test_sp_pass.py` |

### Does this PR introduce _any_ user-facing change?

### How was this patch tested?

- vLLM version: v0.15.0
- vLLM main:
9562912cea

Signed-off-by: MrZ20 <2609716663@qq.com>
Co-authored-by: wangxiyuan <wangxiyuan1007@gmail.com>
This commit is contained in:
SILONG ZENG
2026-03-10 09:52:50 +08:00
committed by GitHub
parent 9216e1b050
commit 43df2cb2fc
27 changed files with 753 additions and 859 deletions

View File

@@ -26,41 +26,39 @@ from tests.e2e.model_utils import check_outputs_equal
def test_qwen3_moe_full_decode_only_tp2():
if 'HCCL_OP_EXPANSION_MODE' in os.environ:
del os.environ['HCCL_OP_EXPANSION_MODE']
if "HCCL_OP_EXPANSION_MODE" in os.environ:
del os.environ["HCCL_OP_EXPANSION_MODE"]
prompts = [
"Hello, my name is", "The president of the United States is",
"The capital of France is", "The future of AI is"
"Hello, my name is",
"The president of the United States is",
"The capital of France is",
"The future of AI is",
]
model = "Qwen/Qwen3-30B-A3B"
sampling_params = SamplingParams(max_tokens=32, temperature=0.0)
with VllmRunner(model,
max_model_len=1024,
tensor_parallel_size=2,
compilation_config={
"cudagraph_mode": "FULL_DECODE_ONLY",
"cudagraph_capture_sizes": [4, 8, 24, 48, 60]
}) as runner:
vllm_fullgraph_outputs = runner.model.generate(prompts,
sampling_params)
with VllmRunner(
model,
max_model_len=1024,
tensor_parallel_size=2,
compilation_config={"cudagraph_mode": "FULL_DECODE_ONLY", "cudagraph_capture_sizes": [4, 8, 24, 48, 60]},
) as runner:
vllm_fullgraph_outputs = runner.model.generate(prompts, sampling_params)
with VllmRunner(
model,
max_model_len=1024,
cudagraph_capture_sizes=[4, 8, 24, 48, 60],
tensor_parallel_size=2,
model,
max_model_len=1024,
cudagraph_capture_sizes=[4, 8, 24, 48, 60],
tensor_parallel_size=2,
) as runner:
vllm_eager_outputs = runner.model.generate(prompts, sampling_params)
vllm_fullgraph_outputs_list = []
for output in vllm_fullgraph_outputs:
vllm_fullgraph_outputs_list.append(
(output.outputs[0].index, output.outputs[0].text))
vllm_fullgraph_outputs_list.append((output.outputs[0].index, output.outputs[0].text))
vllm_eager_outputs_list = []
for output in vllm_eager_outputs:
vllm_eager_outputs_list.append(
(output.outputs[0].index, output.outputs[0].text))
vllm_eager_outputs_list.append((output.outputs[0].index, output.outputs[0].text))
check_outputs_equal(
outputs_0_lst=vllm_eager_outputs_list,
@@ -72,41 +70,39 @@ def test_qwen3_moe_full_decode_only_tp2():
@pytest.mark.skip(reason="CANN8.5 failed with this test, fix me")
def test_qwen3_moe_full_graph_tp2():
if 'HCCL_OP_EXPANSION_MODE' in os.environ:
del os.environ['HCCL_OP_EXPANSION_MODE']
if "HCCL_OP_EXPANSION_MODE" in os.environ:
del os.environ["HCCL_OP_EXPANSION_MODE"]
prompts = [
"Hello, my name is", "The president of the United States is",
"The capital of France is", "The future of AI is"
"Hello, my name is",
"The president of the United States is",
"The capital of France is",
"The future of AI is",
]
model = "Qwen/Qwen3-30B-A3B"
sampling_params = SamplingParams(max_tokens=32, temperature=0.0)
with VllmRunner(model,
max_model_len=1024,
tensor_parallel_size=2,
compilation_config={
"cudagraph_mode": "FULL",
"cudagraph_capture_sizes": [4, 8, 24, 48, 60]
}) as runner:
vllm_fullgraph_outputs = runner.model.generate(prompts,
sampling_params)
with VllmRunner(
model,
max_model_len=1024,
tensor_parallel_size=2,
compilation_config={"cudagraph_mode": "FULL", "cudagraph_capture_sizes": [4, 8, 24, 48, 60]},
) as runner:
vllm_fullgraph_outputs = runner.model.generate(prompts, sampling_params)
with VllmRunner(
model,
max_model_len=1024,
cudagraph_capture_sizes=[4, 8, 24, 48, 60],
tensor_parallel_size=2,
model,
max_model_len=1024,
cudagraph_capture_sizes=[4, 8, 24, 48, 60],
tensor_parallel_size=2,
) as runner:
vllm_eager_outputs = runner.model.generate(prompts, sampling_params)
vllm_fullgraph_outputs_list = []
for output in vllm_fullgraph_outputs:
vllm_fullgraph_outputs_list.append(
(output.outputs[0].index, output.outputs[0].text))
vllm_fullgraph_outputs_list.append((output.outputs[0].index, output.outputs[0].text))
vllm_eager_outputs_list = []
for output in vllm_eager_outputs:
vllm_eager_outputs_list.append(
(output.outputs[0].index, output.outputs[0].text))
vllm_eager_outputs_list.append((output.outputs[0].index, output.outputs[0].text))
check_outputs_equal(
outputs_0_lst=vllm_eager_outputs_list,