[Bug] Fix bug in test_chunked.py (#1992)

### What this PR does / why we need it?

1. Remove the stray `return` statement; it always skipped the following logic, so the test body never actually ran.

2. Switch the test model from `deepseek-ai/DeepSeek-V2-Lite` to `Qwen/Qwen2.5-0.5B-Instruct` to avoid OOM in the GitHub e2e test environment.

3. Fix the output comparison logic (a minimal sketch follows this list).
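
A minimal sketch of the fixed comparison, assuming the two runs have already been reduced to plain token-ID lists; `compare_token_ids` is a hypothetical helper name, but the tensor math mirrors the diff below:

```python
import torch


def compare_token_ids(token_ids1: list[int], token_ids2: list[int],
                      threshold: float = 0.95) -> float:
    # Truncate to the shorter sequence so both tensors have the same length.
    min_len = min(len(token_ids1), len(token_ids2))
    tensor1 = torch.tensor(token_ids1[:min_len], dtype=torch.float32)
    tensor2 = torch.tensor(token_ids2[:min_len], dtype=torch.float32)
    # Cosine similarity of the two 1-D token-ID vectors; identical outputs give 1.0.
    similarity = torch.cosine_similarity(tensor1, tensor2, dim=0).item()
    assert similarity > threshold, f"outputs diverged: similarity={similarity:.3f}"
    return similarity


# Illustrative token IDs only: identical sequences yield a similarity of exactly 1.0.
compare_token_ids([10, 20, 30, 40], [10, 20, 30, 40])
```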

### Does this PR introduce _any_ user-facing change?
NO.

### How was this patch tested?
Tested locally.


- vLLM version: v0.10.0
- vLLM main: 0933f9d518

Signed-off-by: xleoken <xleoken@163.com>
xleoken committed on 2025-08-19 10:23:47 +08:00 (via GitHub)
Parent: 27d038dc66
Commit: 2a763b8326


@@ -19,12 +19,13 @@ Compare the outputs of vLLM with and without aclgraph.
 Run `pytest tests/compile/test_aclgraph.py`.
 """
 import pytest
 import torch
-from vllm import LLM, SamplingParams
-MODELS = ["deepseek-ai/DeepSeek-V2-Lite"]
+from vllm import SamplingParams
+from tests.e2e.conftest import VllmRunner
+MODELS = ["Qwen/Qwen2.5-0.5B-Instruct"]
 
 @pytest.mark.parametrize("model", MODELS)
@@ -32,36 +33,43 @@ MODELS = ["deepseek-ai/DeepSeek-V2-Lite"]
 def test_models(
     model: str,
     max_tokens: int,
-    monkeypatch: pytest.MonkeyPatch,
 ) -> None:
-    return
-    prompts = "The president of the United States is"
+    prompts = ["The president of the United States is"]
     sampling_params = SamplingParams(
         max_tokens=max_tokens,
         temperature=0.0,
     )
-    vllm_model = LLM(model, long_prefill_token_threshold=4, enforce_eager=True)
-    output_chunked = vllm_model.generate(prompts, sampling_params)
-    logprobs_chunked = output_chunked.outputs[0].logprobs
-    del vllm_model
-    torch.npu.empty_cache()
-
-    vllm_model = LLM(model,
-                     enforce_eager=True,
-                     additional_config={
-                         'ascend_scheduler_config': {
-                             'enabled': True
-                         },
-                     })
-    output = vllm_model.generate(prompts, sampling_params)
-    logprobs = output.outputs[0].logprobs
-    del vllm_model
-    torch.npu.empty_cache()
-
-    logprobs_similarity = torch.cosine_similarity(logprobs_chunked.flatten(),
-                                                  logprobs.flatten(),
-                                                  dim=0)
-    assert logprobs_similarity > 0.95
+    with VllmRunner(model, long_prefill_token_threshold=20,
+                    enforce_eager=True) as vllm_model:
+        output1 = vllm_model.generate(prompts, sampling_params)
+
+    with VllmRunner(model,
+                    enforce_eager=True,
+                    additional_config={
+                        'ascend_scheduler_config': {
+                            'enabled': True
+                        },
+                    }) as vllm_model:
+        output2 = vllm_model.generate(prompts, sampling_params)
+
+    # Extract the generated token IDs for comparison
+    token_ids1 = output1[0][0][0]
+    token_ids2 = output2[0][0][0]
+
+    print(f"Token IDs 1: {token_ids1}")
+    print(f"Token IDs 2: {token_ids2}")
+
+    # Convert token IDs to tensors and calculate cosine similarity
+    # Take the length of a shorter sequence to ensure consistent dimensions
+    min_len = min(len(token_ids1), len(token_ids2))
+    tensor1 = torch.tensor(token_ids1[:min_len], dtype=torch.float32)
+    tensor2 = torch.tensor(token_ids2[:min_len], dtype=torch.float32)
+
+    # Calculate similarity using torch.cosine_similarity
+    similarity = torch.cosine_similarity(tensor1, tensor2, dim=0)
+    print(f"Token IDs cosine similarity: {similarity.item()}")
+    assert similarity > 0.95
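
Because both runs decode greedily (`temperature=0.0`), identical outputs produce a cosine similarity of exactly 1.0; the `0.95` threshold leaves a little headroom for minor divergence between the chunked-prefill run and the ascend-scheduler run.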