[E2E] Refactor the e2e testcases. (#4789)

### What this PR does / why we need it?

Refactor the e2e test cases:

- tests/e2e/multicard/test_weight_loader.py: Remove the unused code.
- tests/e2e/singlecard/multi-modal/test_internvl.py: Move to accuracy test.
- tests/e2e/singlecard/test_aclgraph.py: Rename the file.
- tests/e2e/singlecard/test_embedding_aclgraph.py: Combine with tests/e2e/singlecard/test_bge_model.py.
- tests/e2e/singlecard/test_completion_with_prompt_embeds.py: Delete eager mode and change the model to Qwen3-0.6B.
- tests/e2e/singlecard/test_quantization.py: Change the model to Qwen3-0.6B-W8A8.
- tests/e2e/singlecard/test_vlm.py: Change the model to Qwen3-VL-8B.

- vLLM version: v0.12.0
- vLLM main: ad32e3e19c

---------

Signed-off-by: menogrey <1299267905@qq.com>
2025-12-11 10:15:00 +08:00
parent 11bebb518c
commit 66b0781840
13 changed files with 90 additions and 335 deletions
--- a/.github/workflows/_e2e_test.yaml
+++ b/.github/workflows/_e2e_test.yaml
@@ -75,7 +75,7 @@ jobs:
          PYTORCH_NPU_ALLOC_CONF: max_split_size_mb:256
        if: ${{ inputs.type == 'light' }}
        run: |
-          # pytest -sv tests/e2e/singlecard/test_aclgraph.py
+          # pytest -sv tests/e2e/singlecard/test_aclgraph_accuracy.py
          # pytest -sv tests/e2e/singlecard/test_quantization.py
          pytest -sv tests/e2e/singlecard/test_vlm.py::test_multimodal_vl
          pytest -sv tests/e2e/singlecard/pooling/test_classification.py::test_classify_correctness
@@ -91,10 +91,9 @@ jobs:
          # the test separately.
          pytest -sv tests/e2e/singlecard/test_completion_with_prompt_embeds.py
-          pytest -sv tests/e2e/singlecard/test_aclgraph.py
+          pytest -sv tests/e2e/singlecard/test_aclgraph_accuracy.py
          pytest -sv tests/e2e/singlecard/test_aclgraph_mem.py
          pytest -sv tests/e2e/singlecard/test_camem.py
          # pytest -sv tests/e2e/singlecard/test_embedding_aclgraph.py
          pytest -sv tests/e2e/singlecard/test_guided_decoding.py
          # torch 2.8 doesn't work with lora, fix me
          #pytest -sv tests/e2e/singlecard/test_ilama_lora.py
@@ -102,7 +101,6 @@ jobs:
          pytest -sv tests/e2e/singlecard/test_quantization.py
          pytest -sv tests/e2e/singlecard/test_sampler.py
          pytest -sv tests/e2e/singlecard/test_vlm.py
          pytest -sv tests/e2e/singlecard/multi-modal/test_internvl.py
          pytest -sv tests/e2e/singlecard/test_xlite.py
          pytest -sv tests/e2e/singlecard/pooling/
--- a/docs/source/developer_guide/contribution/testing.md
+++ b/docs/source/developer_guide/contribution/testing.md
@@ -252,7 +252,7 @@ Run nightly multi-node test cases locally refer to section of `Running Locally`
 - Offline test example: [`tests/e2e/singlecard/test_offline_inference.py`](https://github.com/vllm-project/vllm-ascend/blob/main/tests/e2e/singlecard/test_offline_inference.py)
 - Online test examples: [`tests/e2e/singlecard/test_prompt_embedding.py`](https://github.com/vllm-project/vllm-ascend/blob/main/tests/e2e/singlecard/test_prompt_embedding.py)
- Correctness test example: [`tests/e2e/singlecard/test_aclgraph.py`](https://github.com/vllm-project/vllm-ascend/blob/main/tests/e2e/singlecard/test_aclgraph.py)
+- Correctness test example: [`tests/e2e/singlecard/test_aclgraph_accuracy.py`](https://github.com/vllm-project/vllm-ascend/blob/main/tests/e2e/singlecard/test_aclgraph_accuracy.py)
    The CI resource is limited, and you might need to reduce the number of layers of a model. Below is an example of how to generate a reduced layer model:
    1. Fork the original model repo in modelscope. All the files in the repo except for weights are required.
--- a/docs/source/locale/zh_CN/LC_MESSAGES/developer_guide/contribution/testing.po
+++ b/docs/source/locale/zh_CN/LC_MESSAGES/developer_guide/contribution/testing.po
@@ -169,11 +169,11 @@ msgstr ""
 #: ../../developer_guide/contribution/testing.md:246
 msgid ""
 "Correctness test example: "
-"[`tests/e2e/singlecard/test_aclgraph.py`](https://github.com/vllm-"
+"[`tests/e2e/singlecard/test_aclgraph_accuracy.py`](https://github.com/vllm-"
-"project/vllm-ascend/blob/main/tests/e2e/singlecard/test_aclgraph.py)"
+"project/vllm-ascend/blob/main/tests/e2e/singlecard/test_aclgraph_accuracy.py)"
 msgstr ""
-"正确性测试示例：[`tests/e2e/singlecard/test_aclgraph.py`](https://github.com/vllm-"
+"正确性测试示例：[`tests/e2e/singlecard/test_aclgraph_accuracy.py`](https://github.com/vllm-"
-"project/vllm-ascend/blob/main/tests/e2e/singlecard/test_aclgraph.py)"
+"project/vllm-ascend/blob/main/tests/e2e/singlecard/test_aclgraph_accuracy.py)"
 #: ../../developer_guide/contribution/testing.md:247
 msgid ""
--- a/tests/e2e/models/configs/InternVL2-8B.yaml
+++ b/tests/e2e/models/configs/InternVL2-8B.yaml
@@ -0,0 +1,11 @@
 model_name: "OpenGVLab/InternVL2-8B"
 runner: "linux-aarch64-a2-1"
 hardware: "Atlas A2 Series"
 model: "vllm-vlm"
 tasks:
  - name: "mmmu_val"
    metrics:
    - name: "acc,none"
      value: 0.58
 max_model_len: 32768
 trust_remote_code: True
--- a/tests/e2e/models/configs/InternVL2_5-8B.yaml
+++ b/tests/e2e/models/configs/InternVL2_5-8B.yaml
@@ -0,0 +1,11 @@
 model_name: "OpenGVLab/InternVL2_5-8B"
 runner: "linux-aarch64-a2-1"
 hardware: "Atlas A2 Series"
 model: "vllm-vlm"
 tasks:
  - name: "mmmu_val"
    metrics:
    - name: "acc,none"
      value: 0.58
 max_model_len: 32768
 trust_remote_code: True
--- a/tests/e2e/models/configs/InternVL3-8B.yaml
+++ b/tests/e2e/models/configs/InternVL3-8B.yaml
@@ -0,0 +1,11 @@
 model_name: "OpenGVLab/InternVL3-8B"
 runner: "linux-aarch64-a2-1"
 hardware: "Atlas A2 Series"
 model: "vllm-vlm"
 tasks:
  - name: "mmmu_val"
    metrics:
    - name: "acc,none"
      value: 0.58
 max_model_len: 32768
 trust_remote_code: True
--- a/tests/e2e/models/configs/accuracy.txt
+++ b/tests/e2e/models/configs/accuracy.txt
@@ -6,6 +6,9 @@ Qwen3-VL-30B-A3B-Instruct.yaml
 Qwen3-VL-8B-Instruct.yaml
 Qwen2.5-Omni-7B.yaml
 Meta-Llama-3.1-8B-Instruct.yaml
 InternVL2-8B.yaml
 InternVL2_5-8B.yaml
 InternVL3-8B.yaml
 InternVL3_5-8B.yaml
 ERNIE-4.5-21B-A3B-PT.yaml
 gemma-3-4b-it.yaml
--- a/tests/e2e/multicard/test_weight_loader.py
+++ b/tests/e2e/multicard/test_weight_loader.py
@@ -1,109 +0,0 @@
 #
 # Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved.
 # Copyright 2023 The vLLM team.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
 #
 #     http://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
 #
 """
 Compare the outputs of vLLM with and without aclgraph.
 Run `pytest tests/multicard/test_external_launcher.py`.
 """
 import os
 import subprocess
 import sys
 import pytest
 import torch_npu
 MOE_MODELS = ["Qwen/Qwen3-30B-A3B"]
 MODELS = ["Qwen/Qwen3-8B"]
 DEVICE_NAME = torch_npu.npu.get_device_name(0)[:10]
@pytest.mark.parametrize("model", MOE_MODELS)
 def test_external_launcher(model):
    """Run the weight-loader example script in a subprocess for each MoE model.

    The script is started with the current interpreter and the flags
    ``--tp-size 2``, ``--proc-per-node 2``, ``--trust-remote-code``,
    ``--enable-expert-parallel``, ``--enable-sleep-mode`` and
    ``--model-weight-gib 20``. The test passes when the captured output
    contains the expected rank/generation markers and the process exits
    with status 0.

    Args:
        model: Model identifier supplied by the ``MOE_MODELS`` parametrization.
    """
    # NOTE(review): this path looks like an interpreter path fused with a CI
    # workspace path -- confirm it actually resolves on the runner.
    script = "/usr/local/python3.11.13/bin/python3.11/__w/vllm-ascend/tests/examples/test_weight_loader.py"
    # Pass a copy of the parent environment through to the child process.
    env = os.environ.copy()
    # TODO: Change to 2 when ci machine has 4 cards
    cmd = [
        sys.executable,
        str(script),
        "--model",
        model,
        "--tp-size",
        "2",
        "--proc-per-node",
        "2",
        "--trust-remote-code",
        "--enable-expert-parallel",
        "--enable-sleep-mode",
        "--model-weight-gib",
        "20",
    ]
    print(f"Running subprocess: {' '.join(cmd)}")
    # stderr is merged into stdout so the assertions below see both streams;
    # subprocess.run raises TimeoutExpired if the script runs past 600 s.
    proc = subprocess.run(
        cmd,
        env=env,
        stdout=subprocess.PIPE,
        stderr=subprocess.STDOUT,
        timeout=600,
    )
    # Undecodable bytes are dropped rather than failing the test.
    output = proc.stdout.decode(errors='ignore')
    print(output)
    # Both rank markers and a generation marker must appear in the output.
    assert "TP RANKS: [0]" in output
    assert "TP RANKS: [1]" in output
    assert "Generated text:" in output
    assert proc.returncode == 0
@pytest.mark.parametrize("model", MODELS)
 def test_external_launcher_dense(model):
    """Run the weight-loader example script in a subprocess for each dense model.

    Uses the same command line as ``test_external_launcher`` except that
    ``--enable-expert-parallel`` is not passed. The test passes when the
    captured output contains the expected rank/generation markers and the
    process exits with status 0.

    Args:
        model: Model identifier supplied by the ``MODELS`` parametrization.
    """
    # NOTE(review): this path looks like an interpreter path fused with a CI
    # workspace path -- confirm it actually resolves on the runner.
    script = "/usr/local/python3.11.13/bin/python3.11/__w/vllm-ascend/tests/examples/test_weight_loader.py"
    # Pass a copy of the parent environment through to the child process.
    env = os.environ.copy()
    # TODO: Change to 2 when ci machine has 4 cards
    cmd = [
        sys.executable,
        str(script),
        "--model",
        model,
        "--tp-size",
        "2",
        "--proc-per-node",
        "2",
        "--trust-remote-code",
        "--enable-sleep-mode",
        "--model-weight-gib",
        "20",
    ]
    print(f"Running subprocess: {' '.join(cmd)}")
    # stderr is merged into stdout so the assertions below see both streams;
    # subprocess.run raises TimeoutExpired if the script runs past 600 s.
    proc = subprocess.run(
        cmd,
        env=env,
        stdout=subprocess.PIPE,
        stderr=subprocess.STDOUT,
        timeout=600,
    )
    # Undecodable bytes are dropped rather than failing the test.
    output = proc.stdout.decode(errors='ignore')
    print(output)
    # Both rank markers and a generation marker must appear in the output.
    assert "TP RANKS: [0]" in output
    assert "TP RANKS: [1]" in output
    assert "Generated text:" in output
    assert proc.returncode == 0
--- a/tests/e2e/singlecard/multi-modal/test_internvl.py
+++ b/tests/e2e/singlecard/multi-modal/test_internvl.py
@@ -1,89 +0,0 @@
 #
 # Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
 #
 #     http://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # This file is a part of the vllm-ascend project.
 #
 import os
 # Set spawn method before any torch/NPU imports to avoid fork issues
 os.environ.setdefault('VLLM_WORKER_MULTIPROC_METHOD', 'spawn')
 import pytest
 from vllm.assets.image import ImageAsset
 from tests.e2e.conftest import VllmRunner
 from tests.e2e.model_utils import check_outputs_equal
 MODELS = [
    "OpenGVLab/InternVL2-8B",
    "OpenGVLab/InternVL2_5-8B",
    "OpenGVLab/InternVL3-8B",
    "OpenGVLab/InternVL3_5-8B",
 ]
@pytest.mark.parametrize("model", MODELS)
 def test_internvl_basic(model: str):
    """Check single-image InternVL generation and compare two execution modes.

    The same two image questions are generated greedily twice, once with
    ``enforce_eager=False`` and once with ``enforce_eager=True``. Each run
    must return one non-empty output per prompt, and the two runs must
    produce equal outputs according to ``check_outputs_equal``.

    Args:
        model: Model identifier supplied by the ``MODELS`` parametrization.
    """
    # Load test image
    image = ImageAsset("cherry_blossom").pil_image.convert("RGB")
    # InternVL uses chat template format
    # Format: <|im_start|>user\n<image>\nQUESTION<|im_end|>\n<|im_start|>assistant\n
    questions = [
        "What is the content of this image?",
        "Describe this image in detail.",
    ]
    # Build prompts with InternVL2 chat template
    prompts = [
        f"<|im_start|>user\n<image>\n{q}<|im_end|>\n<|im_start|>assistant\n"
        for q in questions
    ]
    # The same image is attached to every prompt.
    images = [image] * len(prompts)
    # Generated outputs keyed by the mode label of each run.
    outputs = {}
    # NOTE(review): the labels look swapped -- enforce_eager=False is paired
    # with "eager" and enforce_eager=True with "graph". The final comparison
    # uses both runs, so only the labels in messages would be affected; confirm.
    for enforce_eager, mode in [(False, "eager"), (True, "graph")]:
        with VllmRunner(
                model,
                max_model_len=8192,
                limit_mm_per_prompt={"image": 4},
                enforce_eager=enforce_eager,
                dtype="bfloat16",
        ) as vllm_model:
            generated_outputs = vllm_model.generate_greedy(
                prompts=prompts,
                images=images,
                max_tokens=128,
            )
            # One output per prompt is required.
            assert len(generated_outputs) == len(prompts), \
                f"Expected {len(prompts)} outputs, got {len(generated_outputs)} in {mode} mode"
            # Each output is unpacked as a pair; only the second item (the
            # generated string) is checked here.
            for i, (_, output_str) in enumerate(generated_outputs):
                assert output_str, \
                    f"{mode.capitalize()} mode output {i} should not be empty. Prompt: {prompts[i]}"
                assert len(output_str.strip()) > 0, \
                    f"{mode.capitalize()} mode Output {i} should have meaningful content"
            outputs[mode] = generated_outputs
    eager_outputs = outputs["eager"]
    graph_outputs = outputs["graph"]
    # The two runs must agree with each other.
    check_outputs_equal(outputs_0_lst=eager_outputs,
                        outputs_1_lst=graph_outputs,
                        name_0="eager mode",
                        name_1="graph mode")
--- a/tests/e2e/singlecard/pooling/test_embedding.py
+++ b/tests/e2e/singlecard/pooling/test_embedding.py
@@ -24,7 +24,6 @@ from tests.e2e.utils import check_embeddings_close
 MODELS = [
    "Qwen/Qwen3-Embedding-0.6B",  # lasttoken
    "BAAI/bge-small-en-v1.5",  # cls_token
    "intfloat/multilingual-e5-small"  # mean_tokens
 ]
@@ -57,3 +56,45 @@ def test_embed_models_correctness(model: str):
        name_1="vllm",
        tol=1e-2,
    )
 def test_bge_model_correctness():
    """Compare BAAI/bge-m3 embeddings across HF, eager vLLM and aclgraph vLLM.

    Two queries are embedded three times: by a pooling ``VllmRunner`` with
    ``enforce_eager=False``, by a pooling ``VllmRunner`` with
    ``enforce_eager=True``, and by an ``HfRunner`` configured as a float32
    sentence transformer. The HF result is compared with the eager result,
    and the eager result with the aclgraph result, both with ``tol=1e-2``.
    """
    queries = ['What is the capital of China?', 'Explain gravity']
    # Resolve the model through snapshot_download; the returned value is
    # what every runner below is constructed with.
    model_name = snapshot_download("BAAI/bge-m3")
    # Run with enforce_eager=False (named the aclgraph run in this test).
    with VllmRunner(
            model_name,
            runner="pooling",
            enforce_eager=False,
    ) as vllm_aclgraph_runner:
        vllm_aclgraph_outputs = vllm_aclgraph_runner.embed(queries)
    # Run with enforce_eager=True (the eager run).
    with VllmRunner(
            model_name,
            runner="pooling",
            enforce_eager=True,
    ) as vllm_runner:
        vllm_eager_outputs = vllm_runner.embed(queries)
    # Reference embeddings from the HF runner.
    with HfRunner(
            model_name,
            dtype="float32",
            is_sentence_transformer=True,
    ) as hf_runner:
        hf_outputs = hf_runner.encode(queries)
    # HF reference vs. eager vLLM.
    check_embeddings_close(
        embeddings_0_lst=hf_outputs,
        embeddings_1_lst=vllm_eager_outputs,
        name_0="hf",
        name_1="vllm",
        tol=1e-2,
    )
    # Eager vLLM vs. aclgraph vLLM.
    check_embeddings_close(
        embeddings_0_lst=vllm_eager_outputs,
        embeddings_1_lst=vllm_aclgraph_outputs,
        name_0="eager",
        name_1="aclgraph",
        tol=1e-2,
    )
--- a/tests/e2e/singlecard/test_aclgraph_accuracy.py
+++ b/tests/e2e/singlecard/test_aclgraph_accuracy.py
@@ -17,7 +17,7 @@
 """
 Compare the outputs of vLLM with and without aclgraph.
-Run `pytest tests/compile/test_aclgraph.py`.
+Run `pytest tests/e2e/singlecard/test_aclgraph_accuracy.py`.
 """
 import os
@@ -36,7 +36,7 @@ MODELS = [
@pytest.mark.parametrize("model", MODELS)
@pytest.mark.parametrize("max_tokens", [32])
-def test_models_with_aclgraph(
+def test_output_between_eager_and_aclgraph(
    model: str,
    max_tokens: int,
 ) -> None:
@@ -100,7 +100,7 @@ def test_models_with_aclgraph(
@pytest.mark.parametrize("model", MODELS)
@pytest.mark.parametrize("max_tokens", [32])
-def test_models_with_aclgraph_full_decode_only(
+def test_output_between_eager_and_full_decode_only(
    model: str,
    max_tokens: int,
 ) -> None:
--- a/tests/e2e/singlecard/test_completion_with_prompt_embeds.py
+++ b/tests/e2e/singlecard/test_completion_with_prompt_embeds.py
@@ -25,7 +25,7 @@ from tests.e2e.conftest import VllmRunner
 os.environ["VLLM_USE_MODELSCOPE"] = "True"
 os.environ["VLLM_WORKER_MULTIPROC_METHOD"] = "spawn"
-MODELS = ["Qwen/Qwen2.5-0.5B-Instruct"]
+MODELS = ["Qwen/Qwen3-0.6B"]
 def get_prompt_embeds(chat, tokenizer, embedding_layer):
@@ -37,127 +37,6 @@ def get_prompt_embeds(chat, tokenizer, embedding_layer):
    return prompt_embeds
@pytest.mark.parametrize("model_name", MODELS)
 def test_single_prompt_embeds_inference(model_name):
    """Test single prompt inference with prompt embeddings.

    Builds prompt embeddings for one chat message from the Transformers
    model's input embedding layer, generates from them with an eager
    ``VllmRunner``, and checks that exactly one request output with
    non-empty text comes back.

    Args:
        model_name: Model identifier supplied by the ``MODELS`` parametrization.
    """
    # Prepare prompt embeddings
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    transformers_model = AutoModelForCausalLM.from_pretrained(model_name)
    embedding_layer = transformers_model.get_input_embeddings()
    chat = [{
        "role": "user",
        "content": "Please tell me about the capital of France."
    }]
    prompt_embeds = get_prompt_embeds(chat, tokenizer, embedding_layer)
    # Run inference with prompt embeddings
    with VllmRunner(
            model_name,
            enable_prompt_embeds=True,
            enforce_eager=True,
    ) as vllm_runner:
        outputs = vllm_runner.model.generate({
            "prompt_embeds": prompt_embeds,
        })
    # Verify output
    assert len(outputs) == 1
    assert len(outputs[0].outputs) > 0
    assert len(outputs[0].outputs[0].text) > 0
    print(f"\n[Single Inference Output]: {outputs[0].outputs[0].text}")
@pytest.mark.parametrize("model_name", MODELS)
 def test_batch_prompt_embeds_inference(model_name):
    """Test batch prompt inference with prompt embeddings.

    Builds prompt embeddings for three single-message chats, submits them
    as one batch to an eager ``VllmRunner``, and checks that one request
    output with non-empty text is returned per chat.

    Args:
        model_name: Model identifier supplied by the ``MODELS`` parametrization.
    """
    # Prepare prompt embeddings
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    transformers_model = AutoModelForCausalLM.from_pretrained(model_name)
    embedding_layer = transformers_model.get_input_embeddings()
    chats = [[{
        "role": "user",
        "content": "Please tell me about the capital of France."
    }],
             [{
                 "role": "user",
                 "content": "When is the day longest during the year?"
             }],
             [{
                 "role": "user",
                 "content": "Where is bigger, the moon or the sun?"
             }]]
    # One embeddings object per chat, in the same order as ``chats``.
    prompt_embeds_list = [
        get_prompt_embeds(chat, tokenizer, embedding_layer) for chat in chats
    ]
    # Run batch inference with prompt embeddings
    with VllmRunner(
            model_name,
            enable_prompt_embeds=True,
            enforce_eager=True,
    ) as vllm_runner:
        outputs = vllm_runner.model.generate([{
            "prompt_embeds": embeds
        } for embeds in prompt_embeds_list])
    # Verify outputs
    assert len(outputs) == len(chats)
    for i, output in enumerate(outputs):
        assert len(output.outputs) > 0
        assert len(output.outputs[0].text) > 0
        # Print each question next to the output at the same index.
        print(f"\nQ{i+1}: {chats[i][0]['content']}")
        print(f"A{i+1}: {output.outputs[0].text}")
@pytest.mark.parametrize("model_name", MODELS)
 def test_prompt_embeds_with_aclgraph(model_name):
    """Test prompt embeddings with ACL graph enabled vs disabled.

    Generates from the same prompt embeddings twice, once with
    ``enforce_eager=False`` and once with ``enforce_eager=True``, and checks
    that each run returns exactly one request output with non-empty text.
    The two texts are printed but not compared with each other.

    Args:
        model_name: Model identifier supplied by the ``MODELS`` parametrization.
    """
    # Prepare prompt embeddings
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    transformers_model = AutoModelForCausalLM.from_pretrained(model_name)
    embedding_layer = transformers_model.get_input_embeddings()
    chat = [{"role": "user", "content": "What is the capital of China?"}]
    prompt_embeds = get_prompt_embeds(chat, tokenizer, embedding_layer)
    # Run with ACL graph enabled (enforce_eager=False)
    with VllmRunner(
            model_name,
            enable_prompt_embeds=True,
            enforce_eager=False,
    ) as vllm_aclgraph_runner:
        aclgraph_outputs = vllm_aclgraph_runner.model.generate({
            "prompt_embeds":
            prompt_embeds,
        })
    # Run with ACL graph disabled (enforce_eager=True)
    with VllmRunner(
            model_name,
            enable_prompt_embeds=True,
            enforce_eager=True,
    ) as vllm_eager_runner:
        eager_outputs = vllm_eager_runner.model.generate({
            "prompt_embeds":
            prompt_embeds,
        })
    # Verify both produce valid outputs
    assert len(aclgraph_outputs) == 1
    assert len(eager_outputs) == 1
    assert len(aclgraph_outputs[0].outputs[0].text) > 0
    assert len(eager_outputs[0].outputs[0].text) > 0
    print("\n[ACL Graph Output]:", aclgraph_outputs[0].outputs[0].text)
    print("[Eager Output]:", eager_outputs[0].outputs[0].text)
    # Note: Outputs may differ slightly due to different execution paths,
    # but both should be valid responses
@pytest.mark.parametrize("model_name", MODELS)
 def test_mixed_prompt_embeds_and_text(model_name):
    """Test mixed inputs with both prompt embeddings and text prompts."""
@@ -176,7 +55,6 @@ def test_mixed_prompt_embeds_and_text(model_name):
    with VllmRunner(
            model_name,
            enable_prompt_embeds=True,
            enforce_eager=True,
    ) as vllm_runner:
        # Test prompt embeddings
        embeds_output = vllm_runner.model.generate({
--- a/tests/e2e/singlecard/test_vlm.py
+++ b/tests/e2e/singlecard/test_vlm.py
@@ -38,7 +38,7 @@ def test_multimodal_vl(prompt_template):
    ]
    images = [image] * len(img_questions)
    prompts = prompt_template(img_questions)
-    with VllmRunner("Qwen/Qwen2.5-VL-3B-Instruct",
+    with VllmRunner("Qwen/Qwen3-VL-8B-Instruct",
                    max_model_len=4096,
                    mm_processor_kwargs={
                        "min_pixels": 28 * 28,