diff --git a/.github/workflows/_e2e_test.yaml b/.github/workflows/_e2e_test.yaml
index 0fd298fd..002bc484 100644
--- a/.github/workflows/_e2e_test.yaml
+++ b/.github/workflows/_e2e_test.yaml
@@ -75,7 +75,7 @@ jobs:
           PYTORCH_NPU_ALLOC_CONF: max_split_size_mb:256
         if: ${{ inputs.type == 'light' }}
         run: |
-          # pytest -sv tests/e2e/singlecard/test_aclgraph.py
+          # pytest -sv tests/e2e/singlecard/test_aclgraph_accuracy.py
           # pytest -sv tests/e2e/singlecard/test_quantization.py
           pytest -sv tests/e2e/singlecard/test_vlm.py::test_multimodal_vl
           pytest -sv tests/e2e/singlecard/pooling/test_classification.py::test_classify_correctness
@@ -91,10 +91,9 @@ jobs:
           # the test separately.
           pytest -sv tests/e2e/singlecard/test_completion_with_prompt_embeds.py
 
-          pytest -sv tests/e2e/singlecard/test_aclgraph.py
+          pytest -sv tests/e2e/singlecard/test_aclgraph_accuracy.py
           pytest -sv tests/e2e/singlecard/test_aclgraph_mem.py
           pytest -sv tests/e2e/singlecard/test_camem.py
-          # pytest -sv tests/e2e/singlecard/test_embedding_aclgraph.py
           pytest -sv tests/e2e/singlecard/test_guided_decoding.py
           # torch 2.8 doesn't work with lora, fix me
           #pytest -sv tests/e2e/singlecard/test_ilama_lora.py
@@ -102,7 +101,6 @@ jobs:
           pytest -sv tests/e2e/singlecard/test_quantization.py
           pytest -sv tests/e2e/singlecard/test_sampler.py
           pytest -sv tests/e2e/singlecard/test_vlm.py
-          pytest -sv tests/e2e/singlecard/multi-modal/test_internvl.py
           pytest -sv tests/e2e/singlecard/test_xlite.py
           pytest -sv tests/e2e/singlecard/pooling/
 
diff --git a/docs/source/developer_guide/contribution/testing.md b/docs/source/developer_guide/contribution/testing.md
index df710af3..b4dea166 100644
--- a/docs/source/developer_guide/contribution/testing.md
+++ b/docs/source/developer_guide/contribution/testing.md
@@ -252,7 +252,7 @@ Run nightly multi-node test cases locally refer to section of `Running Locally`
 
 - Offline test example: [`tests/e2e/singlecard/test_offline_inference.py`](https://github.com/vllm-project/vllm-ascend/blob/main/tests/e2e/singlecard/test_offline_inference.py)
 - Online test examples: [`tests/e2e/singlecard/test_prompt_embedding.py`](https://github.com/vllm-project/vllm-ascend/blob/main/tests/e2e/singlecard/test_prompt_embedding.py)
-- Correctness test example: [`tests/e2e/singlecard/test_aclgraph.py`](https://github.com/vllm-project/vllm-ascend/blob/main/tests/e2e/singlecard/test_aclgraph.py)
+- Correctness test example: [`tests/e2e/singlecard/test_aclgraph_accuracy.py`](https://github.com/vllm-project/vllm-ascend/blob/main/tests/e2e/singlecard/test_aclgraph_accuracy.py)
 
 The CI resource is limited, and you might need to reduce the number of layers of a model. Below is an example of how to generate a reduced layer model:
 1. Fork the original model repo in modelscope. All the files in the repo except for weights are required.
diff --git a/docs/source/locale/zh_CN/LC_MESSAGES/developer_guide/contribution/testing.po b/docs/source/locale/zh_CN/LC_MESSAGES/developer_guide/contribution/testing.po
index 8a9ca91a..7f581029 100644
--- a/docs/source/locale/zh_CN/LC_MESSAGES/developer_guide/contribution/testing.po
+++ b/docs/source/locale/zh_CN/LC_MESSAGES/developer_guide/contribution/testing.po
@@ -169,11 +169,11 @@ msgstr ""
 #: ../../developer_guide/contribution/testing.md:246
 msgid ""
 "Correctness test example: "
-"[`tests/e2e/singlecard/test_aclgraph.py`](https://github.com/vllm-"
-"project/vllm-ascend/blob/main/tests/e2e/singlecard/test_aclgraph.py)"
+"[`tests/e2e/singlecard/test_aclgraph_accuracy.py`](https://github.com/vllm-"
+"project/vllm-ascend/blob/main/tests/e2e/singlecard/test_aclgraph_accuracy.py)"
 msgstr ""
-"正确性测试示例:[`tests/e2e/singlecard/test_aclgraph.py`](https://github.com/vllm-"
-"project/vllm-ascend/blob/main/tests/e2e/singlecard/test_aclgraph.py)"
+"正确性测试示例:[`tests/e2e/singlecard/test_aclgraph_accuracy.py`](https://github.com/vllm-"
+"project/vllm-ascend/blob/main/tests/e2e/singlecard/test_aclgraph_accuracy.py)"
 
 #: ../../developer_guide/contribution/testing.md:247
 msgid ""
diff --git a/tests/e2e/models/configs/InternVL2-8B.yaml b/tests/e2e/models/configs/InternVL2-8B.yaml
new file mode 100644
index 00000000..bf705365
--- /dev/null
+++ b/tests/e2e/models/configs/InternVL2-8B.yaml
@@ -0,0 +1,11 @@
+model_name: "OpenGVLab/InternVL2-8B"
+runner: "linux-aarch64-a2-1"
+hardware: "Atlas A2 Series"
+model: "vllm-vlm"
+tasks:
+- name: "mmmu_val"
+  metrics:
+  - name: "acc,none"
+    value: 0.58
+max_model_len: 32768
+trust_remote_code: True
diff --git a/tests/e2e/models/configs/InternVL2_5-8B.yaml b/tests/e2e/models/configs/InternVL2_5-8B.yaml
new file mode 100644
index 00000000..d8c1fafe
--- /dev/null
+++ b/tests/e2e/models/configs/InternVL2_5-8B.yaml
@@ -0,0 +1,11 @@
+model_name: "OpenGVLab/InternVL2_5-8B"
+runner: "linux-aarch64-a2-1"
+hardware: "Atlas A2 Series"
+model: "vllm-vlm"
+tasks:
+- name: "mmmu_val"
+  metrics:
+  - name: "acc,none"
+    value: 0.58
+max_model_len: 32768
+trust_remote_code: True
diff --git a/tests/e2e/models/configs/InternVL3-8B.yaml b/tests/e2e/models/configs/InternVL3-8B.yaml
new file mode 100644
index 00000000..d07dc6f9
--- /dev/null
+++ b/tests/e2e/models/configs/InternVL3-8B.yaml
@@ -0,0 +1,11 @@
+model_name: "OpenGVLab/InternVL3-8B"
+runner: "linux-aarch64-a2-1"
+hardware: "Atlas A2 Series"
+model: "vllm-vlm"
+tasks:
+- name: "mmmu_val"
+  metrics:
+  - name: "acc,none"
+    value: 0.58
+max_model_len: 32768
+trust_remote_code: True
diff --git a/tests/e2e/models/configs/accuracy.txt b/tests/e2e/models/configs/accuracy.txt
index b4ab5419..b5f7aeed 100644
--- a/tests/e2e/models/configs/accuracy.txt
+++ b/tests/e2e/models/configs/accuracy.txt
@@ -6,6 +6,9 @@ Qwen3-VL-30B-A3B-Instruct.yaml
 Qwen3-VL-8B-Instruct.yaml
 Qwen2.5-Omni-7B.yaml
 Meta-Llama-3.1-8B-Instruct.yaml
+InternVL2-8B.yaml
+InternVL2_5-8B.yaml
+InternVL3-8B.yaml
 InternVL3_5-8B.yaml
 ERNIE-4.5-21B-A3B-PT.yaml
 gemma-3-4b-it.yaml
diff --git a/tests/e2e/multicard/test_weight_loader.py b/tests/e2e/multicard/test_weight_loader.py
deleted file mode 100644
index 6bb616df..00000000
--- a/tests/e2e/multicard/test_weight_loader.py
+++ /dev/null
@@ -1,109 +0,0 @@
-#
-# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved.
-# Copyright 2023 The vLLM team.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-#
-"""
-Compare the outputs of vLLM with and without aclgraph.
-
-Run `pytest tests/multicard/test_external_launcher.py`.
-"""
-
-import os
-import subprocess
-import sys
-
-import pytest
-import torch_npu
-
-MOE_MODELS = ["Qwen/Qwen3-30B-A3B"]
-MODELS = ["Qwen/Qwen3-8B"]
-DEVICE_NAME = torch_npu.npu.get_device_name(0)[:10]
-
-
-@pytest.mark.parametrize("model", MOE_MODELS)
-def test_external_launcher(model):
-    script = "/usr/local/python3.11.13/bin/python3.11/__w/vllm-ascend/tests/examples/test_weight_loader.py"
-    env = os.environ.copy()
-    # TODO: Change to 2 when ci machine has 4 cards
-    cmd = [
-        sys.executable,
-        str(script),
-        "--model",
-        model,
-        "--tp-size",
-        "2",
-        "--proc-per-node",
-        "2",
-        "--trust-remote-code",
-        "--enable-expert-parallel",
-        "--enable-sleep-mode",
-        "--model-weight-gib",
-        "20",
-    ]
-
-    print(f"Running subprocess: {' '.join(cmd)}")
-    proc = subprocess.run(
-        cmd,
-        env=env,
-        stdout=subprocess.PIPE,
-        stderr=subprocess.STDOUT,
-        timeout=600,
-    )
-    output = proc.stdout.decode(errors='ignore')
-
-    print(output)
-
-    assert "TP RANKS: [0]" in output
-    assert "TP RANKS: [1]" in output
-    assert "Generated text:" in output
-    assert proc.returncode == 0
-
-
-@pytest.mark.parametrize("model", MODELS)
-def test_external_launcher_dense(model):
-    script = "/usr/local/python3.11.13/bin/python3.11/__w/vllm-ascend/tests/examples/test_weight_loader.py"
-    env = os.environ.copy()
-    # TODO: Change to 2 when ci machine has 4 cards
-    cmd = [
-        sys.executable,
-        str(script),
-        "--model",
-        model,
-        "--tp-size",
-        "2",
-        "--proc-per-node",
-        "2",
-        "--trust-remote-code",
-        "--enable-sleep-mode",
-        "--model-weight-gib",
-        "20",
-    ]
-
-    print(f"Running subprocess: {' '.join(cmd)}")
-    proc = subprocess.run(
-        cmd,
-        env=env,
-        stdout=subprocess.PIPE,
-        stderr=subprocess.STDOUT,
-        timeout=600,
-    )
-    output = proc.stdout.decode(errors='ignore')
-
-    print(output)
-
-    assert "TP RANKS: [0]" in output
-    assert "TP RANKS: [1]" in output
-    assert "Generated text:" in output
-    assert proc.returncode == 0
diff --git a/tests/e2e/singlecard/multi-modal/test_internvl.py b/tests/e2e/singlecard/multi-modal/test_internvl.py
deleted file mode 100644
index ac60a75c..00000000
--- a/tests/e2e/singlecard/multi-modal/test_internvl.py
+++ /dev/null
@@ -1,89 +0,0 @@
-#
-# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# This file is a part of the vllm-ascend project.
-#
-
-import os
-
-# Set spawn method before any torch/NPU imports to avoid fork issues
-os.environ.setdefault('VLLM_WORKER_MULTIPROC_METHOD', 'spawn')
-
-import pytest
-from vllm.assets.image import ImageAsset
-
-from tests.e2e.conftest import VllmRunner
-from tests.e2e.model_utils import check_outputs_equal
-
-MODELS = [
-    "OpenGVLab/InternVL2-8B",
-    "OpenGVLab/InternVL2_5-8B",
-    "OpenGVLab/InternVL3-8B",
-    "OpenGVLab/InternVL3_5-8B",
-]
-
-
-@pytest.mark.parametrize("model", MODELS)
-def test_internvl_basic(model: str):
-    """Test basic InternVL2 inference with single image."""
-    # Load test image
-    image = ImageAsset("cherry_blossom").pil_image.convert("RGB")
-
-    # InternVL uses chat template format
-    # Format: <|im_start|>user\n\nQUESTION<|im_end|>\n<|im_start|>assistant\n
-    questions = [
-        "What is the content of this image?",
-        "Describe this image in detail.",
-    ]
-
-    # Build prompts with InternVL2 chat template
-    prompts = [
-        f"<|im_start|>user\n\n{q}<|im_end|>\n<|im_start|>assistant\n"
-        for q in questions
-    ]
-    images = [image] * len(prompts)
-
-    outputs = {}
-    for enforce_eager, mode in [(False, "eager"), (True, "graph")]:
-        with VllmRunner(
-                model,
-                max_model_len=8192,
-                limit_mm_per_prompt={"image": 4},
-                enforce_eager=enforce_eager,
-                dtype="bfloat16",
-        ) as vllm_model:
-            generated_outputs = vllm_model.generate_greedy(
-                prompts=prompts,
-                images=images,
-                max_tokens=128,
-            )
-
-        assert len(generated_outputs) == len(prompts), \
-            f"Expected {len(prompts)} outputs, got {len(generated_outputs)} in {mode} mode"
-
-        for i, (_, output_str) in enumerate(generated_outputs):
-            assert output_str, \
-                f"{mode.capitalize()} mode output {i} should not be empty. Prompt: {prompts[i]}"
-            assert len(output_str.strip()) > 0, \
-                f"{mode.capitalize()} mode Output {i} should have meaningful content"
-
-        outputs[mode] = generated_outputs
-
-    eager_outputs = outputs["eager"]
-    graph_outputs = outputs["graph"]
-
-    check_outputs_equal(outputs_0_lst=eager_outputs,
-                        outputs_1_lst=graph_outputs,
-                        name_0="eager mode",
-                        name_1="graph mode")
diff --git a/tests/e2e/singlecard/pooling/test_embedding.py b/tests/e2e/singlecard/pooling/test_embedding.py
index 7666dbcd..a564dfbb 100644
--- a/tests/e2e/singlecard/pooling/test_embedding.py
+++ b/tests/e2e/singlecard/pooling/test_embedding.py
@@ -24,7 +24,6 @@ from tests.e2e.utils import check_embeddings_close
 
 MODELS = [
     "Qwen/Qwen3-Embedding-0.6B",  # lasttoken
-    "BAAI/bge-small-en-v1.5",  # cls_token
     "intfloat/multilingual-e5-small"  # mean_tokens
 ]
 
@@ -57,3 +56,45 @@ def test_embed_models_correctness(model: str):
         name_1="vllm",
         tol=1e-2,
     )
+
+
+def test_bge_model_correctness():
+    queries = ['What is the capital of China?', 'Explain gravity']
+
+    model_name = snapshot_download("BAAI/bge-m3")
+    with VllmRunner(
+            model_name,
+            runner="pooling",
+            enforce_eager=False,
+    ) as vllm_aclgraph_runner:
+        vllm_aclgraph_outputs = vllm_aclgraph_runner.embed(queries)
+
+    with VllmRunner(
+            model_name,
+            runner="pooling",
+            enforce_eager=True,
+    ) as vllm_runner:
+        vllm_eager_outputs = vllm_runner.embed(queries)
+
+    with HfRunner(
+            model_name,
+            dtype="float32",
+            is_sentence_transformer=True,
+    ) as hf_runner:
+        hf_outputs = hf_runner.encode(queries)
+
+    check_embeddings_close(
+        embeddings_0_lst=hf_outputs,
+        embeddings_1_lst=vllm_eager_outputs,
+        name_0="hf",
+        name_1="vllm",
+        tol=1e-2,
+    )
+
+    check_embeddings_close(
+        embeddings_0_lst=vllm_eager_outputs,
+        embeddings_1_lst=vllm_aclgraph_outputs,
+        name_0="eager",
+        name_1="aclgraph",
+        tol=1e-2,
+    )
diff --git a/tests/e2e/singlecard/test_aclgraph.py b/tests/e2e/singlecard/test_aclgraph_accuracy.py
similarity index 98%
rename from tests/e2e/singlecard/test_aclgraph.py
rename to tests/e2e/singlecard/test_aclgraph_accuracy.py
index 60cb3c16..5b03c0c4 100644
--- a/tests/e2e/singlecard/test_aclgraph.py
+++ b/tests/e2e/singlecard/test_aclgraph_accuracy.py
@@ -17,7 +17,7 @@
 """
 Compare the outputs of vLLM with and without aclgraph.
 
-Run `pytest tests/compile/test_aclgraph.py`.
+Run `pytest tests/compile/test_aclgraph_accuracy.py`.
 """
 
 import os
@@ -36,7 +36,7 @@ MODELS = [
 
 @pytest.mark.parametrize("model", MODELS)
 @pytest.mark.parametrize("max_tokens", [32])
-def test_models_with_aclgraph(
+def test_output_between_eager_and_aclgraph(
     model: str,
     max_tokens: int,
 ) -> None:
@@ -100,7 +100,7 @@
 
 @pytest.mark.parametrize("model", MODELS)
 @pytest.mark.parametrize("max_tokens", [32])
-def test_models_with_aclgraph_full_decode_only(
+def test_output_between_eager_and_full_decode_only(
     model: str,
     max_tokens: int,
 ) -> None:
diff --git a/tests/e2e/singlecard/test_completion_with_prompt_embeds.py b/tests/e2e/singlecard/test_completion_with_prompt_embeds.py
index b72dc0d0..d5fff2f2 100644
--- a/tests/e2e/singlecard/test_completion_with_prompt_embeds.py
+++ b/tests/e2e/singlecard/test_completion_with_prompt_embeds.py
@@ -25,7 +25,7 @@ from tests.e2e.conftest import VllmRunner
 os.environ["VLLM_USE_MODELSCOPE"] = "True"
 os.environ["VLLM_WORKER_MULTIPROC_METHOD"] = "spawn"
 
-MODELS = ["Qwen/Qwen2.5-0.5B-Instruct"]
+MODELS = ["Qwen/Qwen3-0.6B"]
 
 
 def get_prompt_embeds(chat, tokenizer, embedding_layer):
@@ -37,127 +37,6 @@ def get_prompt_embeds(chat, tokenizer, embedding_layer):
     return prompt_embeds
 
 
-@pytest.mark.parametrize("model_name", MODELS)
-def test_single_prompt_embeds_inference(model_name):
-    """Test single prompt inference with prompt embeddings."""
-    # Prepare prompt embeddings
-    tokenizer = AutoTokenizer.from_pretrained(model_name)
-    transformers_model = AutoModelForCausalLM.from_pretrained(model_name)
-    embedding_layer = transformers_model.get_input_embeddings()
-
-    chat = [{
-        "role": "user",
-        "content": "Please tell me about the capital of France."
-    }]
-    prompt_embeds = get_prompt_embeds(chat, tokenizer, embedding_layer)
-
-    # Run inference with prompt embeddings
-    with VllmRunner(
-            model_name,
-            enable_prompt_embeds=True,
-            enforce_eager=True,
-    ) as vllm_runner:
-        outputs = vllm_runner.model.generate({
-            "prompt_embeds": prompt_embeds,
-        })
-
-    # Verify output
-    assert len(outputs) == 1
-    assert len(outputs[0].outputs) > 0
-    assert len(outputs[0].outputs[0].text) > 0
-    print(f"\n[Single Inference Output]: {outputs[0].outputs[0].text}")
-
-
-@pytest.mark.parametrize("model_name", MODELS)
-def test_batch_prompt_embeds_inference(model_name):
-    """Test batch prompt inference with prompt embeddings."""
-    # Prepare prompt embeddings
-    tokenizer = AutoTokenizer.from_pretrained(model_name)
-    transformers_model = AutoModelForCausalLM.from_pretrained(model_name)
-    embedding_layer = transformers_model.get_input_embeddings()
-
-    chats = [[{
-        "role": "user",
-        "content": "Please tell me about the capital of France."
-    }],
-             [{
-                 "role": "user",
-                 "content": "When is the day longest during the year?"
-             }],
-             [{
-                 "role": "user",
-                 "content": "Where is bigger, the moon or the sun?"
-             }]]
-
-    prompt_embeds_list = [
-        get_prompt_embeds(chat, tokenizer, embedding_layer) for chat in chats
-    ]
-
-    # Run batch inference with prompt embeddings
-    with VllmRunner(
-            model_name,
-            enable_prompt_embeds=True,
-            enforce_eager=True,
-    ) as vllm_runner:
-        outputs = vllm_runner.model.generate([{
-            "prompt_embeds": embeds
-        } for embeds in prompt_embeds_list])
-
-    # Verify outputs
-    assert len(outputs) == len(chats)
-    for i, output in enumerate(outputs):
-        assert len(output.outputs) > 0
-        assert len(output.outputs[0].text) > 0
-        print(f"\nQ{i+1}: {chats[i][0]['content']}")
-        print(f"A{i+1}: {output.outputs[0].text}")
-
-
-@pytest.mark.parametrize("model_name", MODELS)
-def test_prompt_embeds_with_aclgraph(model_name):
-    """Test prompt embeddings with ACL graph enabled vs disabled."""
-    # Prepare prompt embeddings
-    tokenizer = AutoTokenizer.from_pretrained(model_name)
-    transformers_model = AutoModelForCausalLM.from_pretrained(model_name)
-    embedding_layer = transformers_model.get_input_embeddings()
-
-    chat = [{"role": "user", "content": "What is the capital of China?"}]
-    prompt_embeds = get_prompt_embeds(chat, tokenizer, embedding_layer)
-
-    # Run with ACL graph enabled (enforce_eager=False)
-    with VllmRunner(
-            model_name,
-            enable_prompt_embeds=True,
-            enforce_eager=False,
-    ) as vllm_aclgraph_runner:
-        aclgraph_outputs = vllm_aclgraph_runner.model.generate({
-            "prompt_embeds":
-            prompt_embeds,
-        })
-
-    # Run with ACL graph disabled (enforce_eager=True)
-    with VllmRunner(
-            model_name,
-            enable_prompt_embeds=True,
-            enforce_eager=True,
-    ) as vllm_eager_runner:
-        eager_outputs = vllm_eager_runner.model.generate({
-            "prompt_embeds":
-            prompt_embeds,
-        })
-
-    # Verify both produce valid outputs
-    assert len(aclgraph_outputs) == 1
-    assert len(eager_outputs) == 1
-    assert len(aclgraph_outputs[0].outputs[0].text) > 0
-    assert len(eager_outputs[0].outputs[0].text) > 0
-
-    print("\n[ACL Graph Output]:", aclgraph_outputs[0].outputs[0].text)
-    print("[Eager Output]:", eager_outputs[0].outputs[0].text)
-
-    # Note: Outputs may differ slightly due to different execution paths,
-    # but both should be valid responses
-
-
 @pytest.mark.parametrize("model_name", MODELS)
 def test_mixed_prompt_embeds_and_text(model_name):
     """Test mixed inputs with both prompt embeddings and text prompts."""
@@ -176,7 +55,6 @@ def test_mixed_prompt_embeds_and_text(model_name):
     with VllmRunner(
             model_name,
             enable_prompt_embeds=True,
-            enforce_eager=True,
     ) as vllm_runner:
         # Test prompt embeddings
         embeds_output = vllm_runner.model.generate({
diff --git a/tests/e2e/singlecard/test_vlm.py b/tests/e2e/singlecard/test_vlm.py
index 95456679..c120ef2d 100644
--- a/tests/e2e/singlecard/test_vlm.py
+++ b/tests/e2e/singlecard/test_vlm.py
@@ -38,7 +38,7 @@ def test_multimodal_vl(prompt_template):
     ]
     images = [image] * len(img_questions)
     prompts = prompt_template(img_questions)
-    with VllmRunner("Qwen/Qwen2.5-VL-3B-Instruct",
+    with VllmRunner("Qwen/Qwen3-VL-8B-Instruct",
                     max_model_len=4096,
                     mm_processor_kwargs={
                         "min_pixels": 28 * 28,