diff --git a/.github/workflows/_e2e_test.yaml b/.github/workflows/_e2e_test.yaml
index 0fd298fd..002bc484 100644
--- a/.github/workflows/_e2e_test.yaml
+++ b/.github/workflows/_e2e_test.yaml
@@ -75,7 +75,7 @@ jobs:
           PYTORCH_NPU_ALLOC_CONF: max_split_size_mb:256
         if: ${{ inputs.type == 'light' }}
         run: |
-          # pytest -sv tests/e2e/singlecard/test_aclgraph.py
+          # pytest -sv tests/e2e/singlecard/test_aclgraph_accuracy.py
           # pytest -sv tests/e2e/singlecard/test_quantization.py
           pytest -sv tests/e2e/singlecard/test_vlm.py::test_multimodal_vl
           pytest -sv tests/e2e/singlecard/pooling/test_classification.py::test_classify_correctness
@@ -91,10 +91,9 @@ jobs:
           # the test separately.
           pytest -sv tests/e2e/singlecard/test_completion_with_prompt_embeds.py
 
-          pytest -sv tests/e2e/singlecard/test_aclgraph.py
+          pytest -sv tests/e2e/singlecard/test_aclgraph_accuracy.py
           pytest -sv tests/e2e/singlecard/test_aclgraph_mem.py
           pytest -sv tests/e2e/singlecard/test_camem.py
-          # pytest -sv tests/e2e/singlecard/test_embedding_aclgraph.py
           pytest -sv tests/e2e/singlecard/test_guided_decoding.py
           # torch 2.8 doesn't work with lora, fix me
           #pytest -sv tests/e2e/singlecard/test_ilama_lora.py
@@ -102,7 +101,6 @@ jobs:
           pytest -sv tests/e2e/singlecard/test_quantization.py
           pytest -sv tests/e2e/singlecard/test_sampler.py
           pytest -sv tests/e2e/singlecard/test_vlm.py
-          pytest -sv tests/e2e/singlecard/multi-modal/test_internvl.py
           pytest -sv tests/e2e/singlecard/test_xlite.py
           pytest -sv tests/e2e/singlecard/pooling/
 
diff --git a/docs/source/developer_guide/contribution/testing.md b/docs/source/developer_guide/contribution/testing.md
index df710af3..b4dea166 100644
--- a/docs/source/developer_guide/contribution/testing.md
+++ b/docs/source/developer_guide/contribution/testing.md
@@ -252,7 +252,7 @@ Run nightly multi-node test cases locally refer to section of `Running Locally`
 
 - Offline test example: [`tests/e2e/singlecard/test_offline_inference.py`](https://github.com/vllm-project/vllm-ascend/blob/main/tests/e2e/singlecard/test_offline_inference.py)
 - Online test examples: [`tests/e2e/singlecard/test_prompt_embedding.py`](https://github.com/vllm-project/vllm-ascend/blob/main/tests/e2e/singlecard/test_prompt_embedding.py)
-- Correctness test example: [`tests/e2e/singlecard/test_aclgraph.py`](https://github.com/vllm-project/vllm-ascend/blob/main/tests/e2e/singlecard/test_aclgraph.py)
+- Correctness test example: [`tests/e2e/singlecard/test_aclgraph_accuracy.py`](https://github.com/vllm-project/vllm-ascend/blob/main/tests/e2e/singlecard/test_aclgraph_accuracy.py)
 
 The CI resource is limited, and you might need to reduce the number of layers of a model. Below is an example of how to generate a reduced layer model:
 1. Fork the original model repo in modelscope. All the files in the repo except for weights are required.
diff --git a/docs/source/locale/zh_CN/LC_MESSAGES/developer_guide/contribution/testing.po b/docs/source/locale/zh_CN/LC_MESSAGES/developer_guide/contribution/testing.po
index 8a9ca91a..7f581029 100644
--- a/docs/source/locale/zh_CN/LC_MESSAGES/developer_guide/contribution/testing.po
+++ b/docs/source/locale/zh_CN/LC_MESSAGES/developer_guide/contribution/testing.po
@@ -169,11 +169,11 @@ msgstr ""
 #: ../../developer_guide/contribution/testing.md:246
 msgid ""
 "Correctness test example: "
-"[`tests/e2e/singlecard/test_aclgraph.py`](https://github.com/vllm-"
-"project/vllm-ascend/blob/main/tests/e2e/singlecard/test_aclgraph.py)"
+"[`tests/e2e/singlecard/test_aclgraph_accuracy.py`](https://github.com/vllm-"
+"project/vllm-ascend/blob/main/tests/e2e/singlecard/test_aclgraph_accuracy.py)"
 msgstr ""
-"正确性测试示例:[`tests/e2e/singlecard/test_aclgraph.py`](https://github.com/vllm-"
-"project/vllm-ascend/blob/main/tests/e2e/singlecard/test_aclgraph.py)"
+"正确性测试示例:[`tests/e2e/singlecard/test_aclgraph_accuracy.py`](https://github.com/vllm-"
+"project/vllm-ascend/blob/main/tests/e2e/singlecard/test_aclgraph_accuracy.py)"
 
 #: ../../developer_guide/contribution/testing.md:247
 msgid ""
diff --git a/tests/e2e/models/configs/InternVL2-8B.yaml b/tests/e2e/models/configs/InternVL2-8B.yaml
new file mode 100644
index 00000000..bf705365
--- /dev/null
+++ b/tests/e2e/models/configs/InternVL2-8B.yaml
@@ -0,0 +1,11 @@
+model_name: "OpenGVLab/InternVL2-8B"
+runner: "linux-aarch64-a2-1"
+hardware: "Atlas A2 Series"
+model: "vllm-vlm"
+tasks:
+- name: "mmmu_val"
+  metrics:
+  - name: "acc,none"
+    value: 0.58
+max_model_len: 32768
+trust_remote_code: True
diff --git a/tests/e2e/models/configs/InternVL2_5-8B.yaml b/tests/e2e/models/configs/InternVL2_5-8B.yaml
new file mode 100644
index 00000000..d8c1fafe
--- /dev/null
+++ b/tests/e2e/models/configs/InternVL2_5-8B.yaml
@@ -0,0 +1,11 @@
+model_name: "OpenGVLab/InternVL2_5-8B"
+runner: "linux-aarch64-a2-1"
+hardware: "Atlas A2 Series"
+model: "vllm-vlm"
+tasks:
+- name: "mmmu_val"
+  metrics:
+  - name: "acc,none"
+    value: 0.58
+max_model_len: 32768
+trust_remote_code: True
diff --git a/tests/e2e/models/configs/InternVL3-8B.yaml b/tests/e2e/models/configs/InternVL3-8B.yaml
new file mode 100644
index 00000000..d07dc6f9
--- /dev/null
+++ b/tests/e2e/models/configs/InternVL3-8B.yaml
@@ -0,0 +1,11 @@
+model_name: "OpenGVLab/InternVL3-8B"
+runner: "linux-aarch64-a2-1"
+hardware: "Atlas A2 Series"
+model: "vllm-vlm"
+tasks:
+- name: "mmmu_val"
+  metrics:
+  - name: "acc,none"
+    value: 0.58
+max_model_len: 32768
+trust_remote_code: True
diff --git a/tests/e2e/models/configs/accuracy.txt b/tests/e2e/models/configs/accuracy.txt
index b4ab5419..b5f7aeed 100644
--- a/tests/e2e/models/configs/accuracy.txt
+++ b/tests/e2e/models/configs/accuracy.txt
@@ -6,6 +6,9 @@ Qwen3-VL-30B-A3B-Instruct.yaml
 Qwen3-VL-8B-Instruct.yaml
 Qwen2.5-Omni-7B.yaml
 Meta-Llama-3.1-8B-Instruct.yaml
+InternVL2-8B.yaml
+InternVL2_5-8B.yaml
+InternVL3-8B.yaml
 InternVL3_5-8B.yaml
 ERNIE-4.5-21B-A3B-PT.yaml
 gemma-3-4b-it.yaml
diff --git a/tests/e2e/multicard/test_weight_loader.py b/tests/e2e/multicard/test_weight_loader.py
deleted file mode 100644
index 6bb616df..00000000
--- a/tests/e2e/multicard/test_weight_loader.py
+++ /dev/null
@@ -1,109 +0,0 @@
-#
-# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved.
-# Copyright 2023 The vLLM team.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-#
-"""
-Compare the outputs of vLLM with and without aclgraph.
-
-Run `pytest tests/multicard/test_external_launcher.py`.
-"""
-
-import os
-import subprocess
-import sys
-
-import pytest
-import torch_npu
-
-MOE_MODELS = ["Qwen/Qwen3-30B-A3B"]
-MODELS = ["Qwen/Qwen3-8B"]
-DEVICE_NAME = torch_npu.npu.get_device_name(0)[:10]
-
-
-@pytest.mark.parametrize("model", MOE_MODELS)
-def test_external_launcher(model):
-    script = "/usr/local/python3.11.13/bin/python3.11/__w/vllm-ascend/tests/examples/test_weight_loader.py"
-    env = os.environ.copy()
-    # TODO: Change to 2 when ci machine has 4 cards
-    cmd = [
-        sys.executable,
-        str(script),
-        "--model",
-        model,
-        "--tp-size",
-        "2",
-        "--proc-per-node",
-        "2",
-        "--trust-remote-code",
-        "--enable-expert-parallel",
-        "--enable-sleep-mode",
-        "--model-weight-gib",
-        "20",
-    ]
-
-    print(f"Running subprocess: {' '.join(cmd)}")
-    proc = subprocess.run(
-        cmd,
-        env=env,
-        stdout=subprocess.PIPE,
-        stderr=subprocess.STDOUT,
-        timeout=600,
-    )
-    output = proc.stdout.decode(errors='ignore')
-
-    print(output)
-
-    assert "TP RANKS: [0]" in output
-    assert "TP RANKS: [1]" in output
-    assert "Generated text:" in output
-    assert proc.returncode == 0
-
-
-@pytest.mark.parametrize("model", MODELS)
-def test_external_launcher_dense(model):
-    script = "/usr/local/python3.11.13/bin/python3.11/__w/vllm-ascend/tests/examples/test_weight_loader.py"
-    env = os.environ.copy()
-    # TODO: Change to 2 when ci machine has 4 cards
-    cmd = [
-        sys.executable,
-        str(script),
-        "--model",
-        model,
-        "--tp-size",
-        "2",
-        "--proc-per-node",
-        "2",
-        "--trust-remote-code",
-        "--enable-sleep-mode",
-        "--model-weight-gib",
-        "20",
-    ]
-
-    print(f"Running subprocess: {' '.join(cmd)}")
-    proc = subprocess.run(
-        cmd,
-        env=env,
-        stdout=subprocess.PIPE,
-        stderr=subprocess.STDOUT,
-        timeout=600,
-    )
-    output = proc.stdout.decode(errors='ignore')
-
-    print(output)
-
-    assert "TP RANKS: [0]" in output
-    assert "TP RANKS: [1]" in output
-    assert "Generated text:" in output
-    assert proc.returncode == 0
diff --git a/tests/e2e/singlecard/multi-modal/test_internvl.py b/tests/e2e/singlecard/multi-modal/test_internvl.py
deleted file mode 100644
index ac60a75c..00000000
--- a/tests/e2e/singlecard/multi-modal/test_internvl.py
+++ /dev/null
@@ -1,89 +0,0 @@
-#
-# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# This file is a part of the vllm-ascend project.
-#
-
-import os
-
-# Set spawn method before any torch/NPU imports to avoid fork issues
-os.environ.setdefault('VLLM_WORKER_MULTIPROC_METHOD', 'spawn')
-
-import pytest
-from vllm.assets.image import ImageAsset
-
-from tests.e2e.conftest import VllmRunner
-from tests.e2e.model_utils import check_outputs_equal
-
-MODELS = [
-    "OpenGVLab/InternVL2-8B",
-    "OpenGVLab/InternVL2_5-8B",
-    "OpenGVLab/InternVL3-8B",
-    "OpenGVLab/InternVL3_5-8B",
-]
-
-
-@pytest.mark.parametrize("model", MODELS)
-def test_internvl_basic(model: str):
-    """Test basic InternVL2 inference with single image."""
-    # Load test image
-    image = ImageAsset("cherry_blossom").pil_image.convert("RGB")
-
-    # InternVL uses chat template format
-    # Format: <|im_start|>user\n\nQUESTION<|im_end|>\n<|im_start|>assistant\n
-    questions = [
-        "What is the content of this image?",
-        "Describe this image in detail.",
-    ]
-
-    # Build prompts with InternVL2 chat template
-    prompts = [
-        f"<|im_start|>user\n\n{q}<|im_end|>\n<|im_start|>assistant\n"
-        for q in questions
-    ]
-    images = [image] * len(prompts)
-
-    outputs = {}
-    for enforce_eager, mode in [(False, "eager"), (True, "graph")]:
-        with VllmRunner(
-                model,
-                max_model_len=8192,
-                limit_mm_per_prompt={"image": 4},
-                enforce_eager=enforce_eager,
-                dtype="bfloat16",
-        ) as vllm_model:
-            generated_outputs = vllm_model.generate_greedy(
-                prompts=prompts,
-                images=images,
-                max_tokens=128,
-            )
-
-        assert len(generated_outputs) == len(prompts), \
-            f"Expected {len(prompts)} outputs, got {len(generated_outputs)} in {mode} mode"
-
-        for i, (_, output_str) in enumerate(generated_outputs):
-            assert output_str, \
-                f"{mode.capitalize()} mode output {i} should not be empty. Prompt: {prompts[i]}"
-            assert len(output_str.strip()) > 0, \
-                f"{mode.capitalize()} mode Output {i} should have meaningful content"
-
-        outputs[mode] = generated_outputs
-
-    eager_outputs = outputs["eager"]
-    graph_outputs = outputs["graph"]
-
-    check_outputs_equal(outputs_0_lst=eager_outputs,
-                        outputs_1_lst=graph_outputs,
-                        name_0="eager mode",
-                        name_1="graph mode")
diff --git a/tests/e2e/singlecard/pooling/test_embedding.py b/tests/e2e/singlecard/pooling/test_embedding.py
index 7666dbcd..a564dfbb 100644
--- a/tests/e2e/singlecard/pooling/test_embedding.py
+++ b/tests/e2e/singlecard/pooling/test_embedding.py
@@ -24,7 +24,6 @@ from tests.e2e.utils import check_embeddings_close
 
 MODELS = [
     "Qwen/Qwen3-Embedding-0.6B",  # lasttoken
-    "BAAI/bge-small-en-v1.5",  # cls_token
     "intfloat/multilingual-e5-small"  # mean_tokens
 ]
 
@@ -57,3 +56,45 @@ def test_embed_models_correctness(model: str):
         name_1="vllm",
         tol=1e-2,
     )
+
+
+def test_bge_model_correctness():
+    queries = ['What is the capital of China?', 'Explain gravity']
+
+    model_name = snapshot_download("BAAI/bge-m3")
+    with VllmRunner(
+            model_name,
+            runner="pooling",
+            enforce_eager=False,
+    ) as vllm_aclgraph_runner:
+        vllm_aclgraph_outputs = vllm_aclgraph_runner.embed(queries)
+
+    with VllmRunner(
+            model_name,
+            runner="pooling",
+            enforce_eager=True,
+    ) as vllm_runner:
+        vllm_eager_outputs = vllm_runner.embed(queries)
+
+    with HfRunner(
+            model_name,
+            dtype="float32",
+            is_sentence_transformer=True,
+    ) as hf_runner:
+        hf_outputs = hf_runner.encode(queries)
+
+    check_embeddings_close(
+        embeddings_0_lst=hf_outputs,
+        embeddings_1_lst=vllm_eager_outputs,
+        name_0="hf",
+        name_1="vllm",
+        tol=1e-2,
+    )
+
+    check_embeddings_close(
+        embeddings_0_lst=vllm_eager_outputs,
+        embeddings_1_lst=vllm_aclgraph_outputs,
+        name_0="eager",
+        name_1="aclgraph",
+        tol=1e-2,
+    )
diff --git a/tests/e2e/singlecard/test_aclgraph.py b/tests/e2e/singlecard/test_aclgraph_accuracy.py
similarity index 98%
rename from tests/e2e/singlecard/test_aclgraph.py
rename to tests/e2e/singlecard/test_aclgraph_accuracy.py
index 60cb3c16..5b03c0c4 100644
--- a/tests/e2e/singlecard/test_aclgraph.py
+++ b/tests/e2e/singlecard/test_aclgraph_accuracy.py
@@ -17,7 +17,7 @@
 """
 Compare the outputs of vLLM with and without aclgraph.
 
-Run `pytest tests/compile/test_aclgraph.py`.
+Run `pytest tests/compile/test_aclgraph_accuracy.py`.
 """
 
 import os
@@ -36,7 +36,7 @@ MODELS = [
 
 @pytest.mark.parametrize("model", MODELS)
 @pytest.mark.parametrize("max_tokens", [32])
-def test_models_with_aclgraph(
+def test_output_between_eager_and_aclgraph(
     model: str,
     max_tokens: int,
 ) -> None:
@@ -100,7 +100,7 @@
 
 @pytest.mark.parametrize("model", MODELS)
 @pytest.mark.parametrize("max_tokens", [32])
-def test_models_with_aclgraph_full_decode_only(
+def test_output_between_eager_and_full_decode_only(
     model: str,
     max_tokens: int,
 ) -> None:
diff --git a/tests/e2e/singlecard/test_completion_with_prompt_embeds.py b/tests/e2e/singlecard/test_completion_with_prompt_embeds.py
index b72dc0d0..d5fff2f2 100644
--- a/tests/e2e/singlecard/test_completion_with_prompt_embeds.py
+++ b/tests/e2e/singlecard/test_completion_with_prompt_embeds.py
@@ -25,7 +25,7 @@ from tests.e2e.conftest import VllmRunner
 os.environ["VLLM_USE_MODELSCOPE"] = "True"
 os.environ["VLLM_WORKER_MULTIPROC_METHOD"] = "spawn"
 
-MODELS = ["Qwen/Qwen2.5-0.5B-Instruct"]
+MODELS = ["Qwen/Qwen3-0.6B"]
 
 
 def get_prompt_embeds(chat, tokenizer, embedding_layer):
@@ -37,127 +37,6 @@ def get_prompt_embeds(chat, tokenizer, embedding_layer):
     return prompt_embeds
 
 
-@pytest.mark.parametrize("model_name", MODELS)
-def test_single_prompt_embeds_inference(model_name):
-    """Test single prompt inference with prompt embeddings."""
-    # Prepare prompt embeddings
-    tokenizer = AutoTokenizer.from_pretrained(model_name)
-    transformers_model = AutoModelForCausalLM.from_pretrained(model_name)
-    embedding_layer = transformers_model.get_input_embeddings()
-
-    chat = [{
-        "role": "user",
-        "content": "Please tell me about the capital of France."
-    }]
-    prompt_embeds = get_prompt_embeds(chat, tokenizer, embedding_layer)
-
-    # Run inference with prompt embeddings
-    with VllmRunner(
-            model_name,
-            enable_prompt_embeds=True,
-            enforce_eager=True,
-    ) as vllm_runner:
-        outputs = vllm_runner.model.generate({
-            "prompt_embeds": prompt_embeds,
-        })
-
-    # Verify output
-    assert len(outputs) == 1
-    assert len(outputs[0].outputs) > 0
-    assert len(outputs[0].outputs[0].text) > 0
-    print(f"\n[Single Inference Output]: {outputs[0].outputs[0].text}")
-
-
-@pytest.mark.parametrize("model_name", MODELS)
-def test_batch_prompt_embeds_inference(model_name):
-    """Test batch prompt inference with prompt embeddings."""
-    # Prepare prompt embeddings
-    tokenizer = AutoTokenizer.from_pretrained(model_name)
-    transformers_model = AutoModelForCausalLM.from_pretrained(model_name)
-    embedding_layer = transformers_model.get_input_embeddings()
-
-    chats = [[{
-        "role": "user",
-        "content": "Please tell me about the capital of France."
-    }],
-             [{
-                 "role": "user",
-                 "content": "When is the day longest during the year?"
-             }],
-             [{
-                 "role": "user",
-                 "content": "Where is bigger, the moon or the sun?"
-             }]]
-
-    prompt_embeds_list = [
-        get_prompt_embeds(chat, tokenizer, embedding_layer) for chat in chats
-    ]
-
-    # Run batch inference with prompt embeddings
-    with VllmRunner(
-            model_name,
-            enable_prompt_embeds=True,
-            enforce_eager=True,
-    ) as vllm_runner:
-        outputs = vllm_runner.model.generate([{
-            "prompt_embeds": embeds
-        } for embeds in prompt_embeds_list])
-
-    # Verify outputs
-    assert len(outputs) == len(chats)
-    for i, output in enumerate(outputs):
-        assert len(output.outputs) > 0
-        assert len(output.outputs[0].text) > 0
-        print(f"\nQ{i+1}: {chats[i][0]['content']}")
-        print(f"A{i+1}: {output.outputs[0].text}")
-
-
-@pytest.mark.parametrize("model_name", MODELS)
-def test_prompt_embeds_with_aclgraph(model_name):
-    """Test prompt embeddings with ACL graph enabled vs disabled."""
-    # Prepare prompt embeddings
-    tokenizer = AutoTokenizer.from_pretrained(model_name)
-    transformers_model = AutoModelForCausalLM.from_pretrained(model_name)
-    embedding_layer = transformers_model.get_input_embeddings()
-
-    chat = [{"role": "user", "content": "What is the capital of China?"}]
-    prompt_embeds = get_prompt_embeds(chat, tokenizer, embedding_layer)
-
-    # Run with ACL graph enabled (enforce_eager=False)
-    with VllmRunner(
-            model_name,
-            enable_prompt_embeds=True,
-            enforce_eager=False,
-    ) as vllm_aclgraph_runner:
-        aclgraph_outputs = vllm_aclgraph_runner.model.generate({
-            "prompt_embeds":
-            prompt_embeds,
-        })
-
-    # Run with ACL graph disabled (enforce_eager=True)
-    with VllmRunner(
-            model_name,
-            enable_prompt_embeds=True,
-            enforce_eager=True,
-    ) as vllm_eager_runner:
-        eager_outputs = vllm_eager_runner.model.generate({
-            "prompt_embeds":
-            prompt_embeds,
-        })
-
-    # Verify both produce valid outputs
-    assert len(aclgraph_outputs) == 1
-    assert len(eager_outputs) == 1
-    assert len(aclgraph_outputs[0].outputs[0].text) > 0
-    assert len(eager_outputs[0].outputs[0].text) > 0
-
-    print("\n[ACL Graph Output]:", aclgraph_outputs[0].outputs[0].text)
-    print("[Eager Output]:", eager_outputs[0].outputs[0].text)
-
-    # Note: Outputs may differ slightly due to different execution paths,
-    # but both should be valid responses
-
-
 @pytest.mark.parametrize("model_name", MODELS)
 def test_mixed_prompt_embeds_and_text(model_name):
     """Test mixed inputs with both prompt embeddings and text prompts."""
@@ -176,7 +55,6 @@ def test_mixed_prompt_embeds_and_text(model_name):
     with VllmRunner(
             model_name,
             enable_prompt_embeds=True,
-            enforce_eager=True,
     ) as vllm_runner:
         # Test prompt embeddings
         embeds_output = vllm_runner.model.generate({
diff --git a/tests/e2e/singlecard/test_vlm.py b/tests/e2e/singlecard/test_vlm.py
index 95456679..c120ef2d 100644
--- a/tests/e2e/singlecard/test_vlm.py
+++ b/tests/e2e/singlecard/test_vlm.py
@@ -38,7 +38,7 @@ def test_multimodal_vl(prompt_template):
     ]
     images = [image] * len(img_questions)
     prompts = prompt_template(img_questions)
-    with VllmRunner("Qwen/Qwen2.5-VL-3B-Instruct",
+    with VllmRunner("Qwen/Qwen3-VL-8B-Instruct",
                     max_model_len=4096,
                     mm_processor_kwargs={
                         "min_pixels": 28 * 28,