[Lint] Style: Convert test/ to ruff format (Batch #1) (#6738)

### What this PR does / why we need it?
Converts the e2e test files listed below to `ruff format` style (batch #1 of the `test/` conversion); a sketch for reproducing the formatting locally follows the table.

**Scope of Changes**:
| File Path |
| :--- |
| `tests/e2e/310p/multicard/test_vl_model_multicard.py` |
| `tests/e2e/310p/singlecard/test_vl_model_singlecard.py` |
| `tests/e2e/310p/test_utils.py` |
| `tests/e2e/conftest.py` |
| `tests/e2e/model_utils.py` |
| `tests/e2e/models/conftest.py` |
| `tests/e2e/models/test_lm_eval_correctness.py` |
| `tests/e2e/multicard/2-cards/spec_decode/test_spec_decode.py` |
| `tests/e2e/multicard/2-cards/test_aclgraph_capture_replay.py` |
| `tests/e2e/multicard/2-cards/test_data_parallel.py` |
| `tests/e2e/multicard/2-cards/test_disaggregated_encoder.py` |
| `tests/e2e/multicard/2-cards/test_expert_parallel.py` |
| `tests/e2e/multicard/2-cards/test_external_launcher.py` |
| `tests/e2e/multicard/2-cards/test_full_graph_mode.py` |
| `tests/e2e/multicard/2-cards/test_ilama_lora_tp2.py` |
| `tests/e2e/multicard/2-cards/test_offline_inference_distributed.py` |
| `tests/e2e/multicard/2-cards/test_offline_weight_load.py` |
| `tests/e2e/multicard/2-cards/test_pipeline_parallel.py` |
| `tests/e2e/multicard/2-cards/test_prefix_caching.py` |
| `tests/e2e/multicard/2-cards/test_quantization.py` |
| `tests/e2e/multicard/2-cards/test_qwen3_moe.py` |
| `tests/e2e/multicard/2-cards/test_qwen3_moe_routing_replay.py` |
| `tests/e2e/multicard/2-cards/test_qwen3_performance.py` |
| `tests/e2e/multicard/2-cards/test_shared_expert_dp.py` |
| `tests/e2e/multicard/2-cards/test_single_request_aclgraph.py` |
| `tests/e2e/multicard/2-cards/test_sp_pass.py` |
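
A minimal sketch for reproducing or verifying this batch locally is below. The paths and flags are illustrative assumptions; the repository's actual lint configuration (`pyproject.toml` / pre-commit hooks) is not part of this PR excerpt and is authoritative.

```python
# Reproduction sketch (assumed paths/flags; the repo's pre-commit config governs the real invocation).
import subprocess

# Batch #1 scope, per the table above.
PATHS = ["tests/e2e"]

# Rewrite the files in place with ruff's formatter.
subprocess.run(["ruff", "format", *PATHS], check=True)

# Verify the result: "ruff format --check" exits non-zero if any file would still be reformatted.
subprocess.run(["ruff", "format", "--check", *PATHS], check=True)
```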

### Does this PR introduce _any_ user-facing change?

No. This is a formatting-only change; runtime behavior of the tests is not affected.

### How was this patch tested?

- vLLM version: v0.15.0
- vLLM main: 9562912cea

Signed-off-by: MrZ20 <2609716663@qq.com>
Co-authored-by: wangxiyuan <wangxiyuan1007@gmail.com>
- Author: SILONG ZENG
- Date: 2026-03-10 09:52:50 +08:00 (committed by GitHub)
- Parent: 9216e1b050
- Commit: 43df2cb2fc
- 27 changed files with 753 additions and 859 deletions

Diff excerpt (`tests/e2e/multicard/2-cards/test_prefix_caching.py`); the remaining files receive analogous formatting-only changes:

@@ -11,11 +11,14 @@ MODELS = [
     # for MHA
     "Qwen/Qwen3-8B",
     # for MLA
-    "deepseek-ai/DeepSeek-V2-Lite-Chat"
+    "deepseek-ai/DeepSeek-V2-Lite-Chat",
 ]
 # A prompt containing a large markdown table. The table is randomly generated by GPT-4.
-LONG_PROMPT = "You are a helpful assistant in recognizes the content of tables in markdown format. Here is a table as follows.\n# Table\n" + """
+# ruff: noqa: E501
+LONG_PROMPT = (
+    "You are a helpful assistant in recognizes the content of tables in markdown format. Here is a table as follows.\n# Table\n"
+    + """
 | ID | Name | Age | Occupation | Country | Email | Phone Number | Address |
 |-----|---------------|-----|---------------|---------------|------------------------|----------------|------------------------------|
 | 1 | John Doe | 29 | Engineer | USA | john.doe@example.com | 555-1234 | 123 Elm St, Springfield, IL |
@@ -49,32 +52,34 @@ LONG_PROMPT = "You are a helpful assistant in recognizes the content of tables i
 | 29 | Amy White | 33 | Musician | New Zealand | amy.w@example.com | 555-5658 | 159 Maple St, Wellington, NZ |
 | 30 | Ben Black | 38 | Chef | Ireland | ben.b@example.com | 555-7870 | 246 Fir St, Waterford, IE |
 """
+)
 INPUT_PROMPTS = [
-    LONG_PROMPT +
-    "Question: what is the age of John Doe? Your answer: The age of John Doe is ",
-    LONG_PROMPT +
-    "Question: what is the age of Zack Blue? Your answer: The age of Zack Blue is "
+    LONG_PROMPT + "Question: what is the age of John Doe? Your answer: The age of John Doe is ",
+    LONG_PROMPT + "Question: what is the age of Zack Blue? Your answer: The age of Zack Blue is ",
 ]
 
 
 @pytest.mark.parametrize("model", MODELS)
 @pytest.mark.parametrize("max_tokens", [50])
 def test_models_prefix_cache_tp2(model: str, max_tokens: int) -> None:
-    with VllmRunner(model,
-                    max_model_len=2048,
-                    tensor_parallel_size=2,
-                    cudagraph_capture_sizes=[1, 2, 4, 8],
-                    gpu_memory_utilization=0.7) as vllm_model:
-        prefix_cache_output = vllm_model.generate_greedy(
-            INPUT_PROMPTS, max_tokens)
+    with VllmRunner(
+        model,
+        max_model_len=2048,
+        tensor_parallel_size=2,
+        cudagraph_capture_sizes=[1, 2, 4, 8],
+        gpu_memory_utilization=0.7,
+    ) as vllm_model:
+        prefix_cache_output = vllm_model.generate_greedy(INPUT_PROMPTS, max_tokens)
-    with VllmRunner(model,
-                    enable_prefix_caching=False,
-                    max_model_len=2048,
-                    tensor_parallel_size=2,
-                    cudagraph_capture_sizes=[1, 2, 4, 8],
-                    gpu_memory_utilization=0.7) as vllm_model:
+    with VllmRunner(
+        model,
+        enable_prefix_caching=False,
+        max_model_len=2048,
+        tensor_parallel_size=2,
+        cudagraph_capture_sizes=[1, 2, 4, 8],
+        gpu_memory_utilization=0.7,
+    ) as vllm_model:
         vllm_output = vllm_model.generate_greedy(INPUT_PROMPTS, max_tokens)
     check_outputs_equal(