[ModelRunner] Add hunyuan-vl basic support (#5151)

### What this PR does / why we need it? This patch adds handling of `XDRotaryEmbedding` in the model runner to support `hunyuan-vl` ### Does this PR introduce _any_ user-facing change? ### How was this patch tested? CI passed with added/existing tests Closes: https://github.com/vllm-project/vllm-ascend/issues/4992 - vLLM version: v0.12.0 - vLLM main: ad32e3e19c --------- Signed-off-by: wangli <wangli858794774@gmail.com>
2025-12-23 10:46:54 +08:00
parent c9b5881bcd
commit 9a79cbaecb
3 changed files with 63 additions and 25 deletions
--- a/tests/e2e/conftest.py
+++ b/tests/e2e/conftest.py
@@ -763,11 +763,32 @@ def qwen_prompt(questions: list[str]) -> list[str]:
             f"{q}<|im_end|>\n<|im_start|>assistant\n") for q in questions]
-PROMPT_TEMPLATES = {
+def hunyuan_prompt(questions: list[str]) -> list[str]:
-    "qwen2.5vl": qwen_prompt,
+    placeholder = "<｜hy_place▁holder▁no▁100｜><｜hy_place▁holder▁no▁102｜><｜hy_place▁holder▁no▁101｜>"  # noqa: E501
    return [
        f"<｜hy_begin▁of▁sentence｜>{placeholder}{question}<｜hy_User｜>"
        for question in questions
    ]
 PROMPT_CONFIGS = {
    "qwen-vl": {
        "model": "Qwen/Qwen3-VL-8B-Instruct",
        "prompt_fn": qwen_prompt,
        "mm_processor_kwargs": {
            "min_pixels": 28 * 28,
            "max_pixels": 1280 * 28 * 28,
            "fps": 1,
        },
    },
    "hunyuan-vl": {
        "model": "Tencent-Hunyuan/HunyuanOCR",
        "prompt_fn": hunyuan_prompt,
        "mm_processor_kwargs": {},
    },
 }
-@pytest.fixture(params=list(PROMPT_TEMPLATES.keys()))
+@pytest.fixture(params=PROMPT_CONFIGS.keys())
-def prompt_template(request):
+def vl_config(request):
-    return PROMPT_TEMPLATES[request.param]
+    return PROMPT_CONFIGS[request.param]
--- a/tests/e2e/singlecard/test_vlm.py
+++ b/tests/e2e/singlecard/test_vlm.py
@@ -27,28 +27,32 @@ from vllm.assets.image import ImageAsset
 from tests.e2e.conftest import VllmRunner
-def test_multimodal_vl(prompt_template):
+def test_multimodal_vl(vl_config):
-    image = ImageAsset("cherry_blossom") \
+    image = ImageAsset("cherry_blossom").pil_image.convert("RGB")
-        .pil_image.convert("RGB")
+
    img_questions = [
        "What is the content of this image?",
        "Describe the content of this image in detail.",
        "What's in the image?",
        "Where is this image taken?",
    ]
    images = [image] * len(img_questions)
-    prompts = prompt_template(img_questions)
+    prompts = vl_config["prompt_fn"](img_questions)
-    with VllmRunner("Qwen/Qwen3-VL-8B-Instruct",
+
-                    mm_processor_kwargs={
+    with VllmRunner(vl_config["model"],
-                        "min_pixels": 28 * 28,
+                    mm_processor_kwargs=vl_config["mm_processor_kwargs"],
-                        "max_pixels": 1280 * 28 * 28,
+                    enforce_eager=False,
-                        "fps": 1,
+                    max_model_len=8192,
-                    },
+                    limit_mm_per_prompt={"image": 1}) as vllm_model:
-                    enforce_eager=False) as vllm_model:
+        outputs = vllm_model.generate_greedy(
-        outputs = vllm_model.generate_greedy(prompts=prompts,
+            prompts=prompts,
            images=images,
-                                             max_tokens=64)
+            max_tokens=64,
        )
        assert len(outputs) == len(prompts)
        for _, output_str in outputs:
            assert output_str, "Generated output should not be empty."
--- a/vllm_ascend/worker/model_runner_v1.py
+++ b/vllm_ascend/worker/model_runner_v1.py
@@ -654,15 +654,23 @@ class NPUModelRunner(GPUModelRunner):
        else:
            self.positions.np[:total_num_scheduled_tokens] = positions_np
        # Calculate M-RoPE positions.
        # Only relevant for models using M-RoPE (e.g, Qwen2-VL)
        if self.uses_mrope:
            self._calc_mrope_positions(scheduler_output)
            # Only relevant for models using M-RoPE (e.g, Qwen2-VL)
            self._calc_mrope_positions(scheduler_output)
            self.mrope_positions.gpu[:, :total_num_scheduled_tokens].copy_(
                self.mrope_positions.cpu[:, :total_num_scheduled_tokens],
-                non_blocking=True)
+                non_blocking=True,
            )
        elif self.uses_xdrope_dim > 0:
            self._calc_xdrope_positions(scheduler_output)
            # Only relevant for models using XD-RoPE (e.g, HunYuan-VL)
            self.xdrope_positions.gpu[:, :total_num_scheduled_tokens].copy_(
                self.xdrope_positions.cpu[:, :total_num_scheduled_tokens],
                non_blocking=True,
            )
        else:
            # Common case (1D positions)
            self.positions.copy_to_gpu(total_num_scheduled_tokens)
        # Get token indices.
        # E.g., [0, 1, 0, 1, 2, 3, 4, 0, 1, 2]
@@ -845,9 +853,12 @@ class NPUModelRunner(GPUModelRunner):
            # then the embedding layer is not included in the ACL graph.
            input_ids = self.input_ids.gpu[:num_input_tokens]
            inputs_embeds = None
        positions = self.positions.gpu[:num_input_tokens]
        if self.uses_mrope:
            positions = self.mrope_positions.gpu[:, :num_input_tokens]
        elif self.uses_xdrope_dim > 0:
            positions = self.xdrope_positions.gpu[:, :num_input_tokens]
        else:
            positions = self.positions.gpu[:num_input_tokens]
        # type: ignore
        if get_pp_group().is_first_rank:
@@ -2070,6 +2081,8 @@ class NPUModelRunner(GPUModelRunner):
            if self.uses_mrope:
                positions = self.mrope_positions.gpu[:, :num_tokens_padded]
            elif self.uses_xdrope_dim > 0:
                positions = self.xdrope_positions.gpu[:, :num_tokens_padded]
            else:
                positions = self.positions.gpu[:num_tokens_padded]