[ModelRunner] Add hunyuan-vl basic support (#5151)

### What this PR does / why we need it? This patch adds handling of `XDRotaryEmbedding` in the model runner to support `hunyuan-vl`. ### Does this PR introduce _any_ user-facing change? ### How was this patch tested? CI passed with the added and existing tests. Closes: https://github.com/vllm-project/vllm-ascend/issues/4992 - vLLM version: v0.12.0 - vLLM main: ad32e3e19c --------- Signed-off-by: wangli <wangli858794774@gmail.com>
2025-12-23 10:46:54 +08:00
parent c9b5881bcd
commit 9a79cbaecb
3 changed files with 63 additions and 25 deletions
--- a/tests/e2e/conftest.py
+++ b/tests/e2e/conftest.py
@@ -763,11 +763,32 @@ def qwen_prompt(questions: list[str]) -> list[str]:
             f"{q}<|im_end|>\n<|im_start|>assistant\n") for q in questions]


-PROMPT_TEMPLATES = {
-    "qwen2.5vl": qwen_prompt,
+def hunyuan_prompt(questions: list[str]) -> list[str]:
+    placeholder = "<｜hy_place▁holder▁no▁100｜><｜hy_place▁holder▁no▁102｜><｜hy_place▁holder▁no▁101｜>"  # noqa: E501
+    return [
+        f"<｜hy_begin▁of▁sentence｜>{placeholder}{question}<｜hy_User｜>"
+        for question in questions
+    ]
+
+
+PROMPT_CONFIGS = {
+    "qwen-vl": {
+        "model": "Qwen/Qwen3-VL-8B-Instruct",
+        "prompt_fn": qwen_prompt,
+        "mm_processor_kwargs": {
+            "min_pixels": 28 * 28,
+            "max_pixels": 1280 * 28 * 28,
+            "fps": 1,
+        },
+    },
+    "hunyuan-vl": {
+        "model": "Tencent-Hunyuan/HunyuanOCR",
+        "prompt_fn": hunyuan_prompt,
+        "mm_processor_kwargs": {},
+    },
 }


-@pytest.fixture(params=list(PROMPT_TEMPLATES.keys()))
-def prompt_template(request):
-    return PROMPT_TEMPLATES[request.param]
+@pytest.fixture(params=PROMPT_CONFIGS.keys())
+def vl_config(request):
+    return PROMPT_CONFIGS[request.param]
--- a/tests/e2e/singlecard/test_vlm.py
+++ b/tests/e2e/singlecard/test_vlm.py
@@ -27,28 +27,32 @@ from vllm.assets.image import ImageAsset
 from tests.e2e.conftest import VllmRunner


-def test_multimodal_vl(prompt_template):
-    image = ImageAsset("cherry_blossom") \
-        .pil_image.convert("RGB")
+def test_multimodal_vl(vl_config):
+    image = ImageAsset("cherry_blossom").pil_image.convert("RGB")
+
    img_questions = [
        "What is the content of this image?",
        "Describe the content of this image in detail.",
        "What's in the image?",
        "Where is this image taken?",
    ]
+
    images = [image] * len(img_questions)
-    prompts = prompt_template(img_questions)
-    with VllmRunner("Qwen/Qwen3-VL-8B-Instruct",
-                    mm_processor_kwargs={
-                        "min_pixels": 28 * 28,
-                        "max_pixels": 1280 * 28 * 28,
-                        "fps": 1,
-                    },
-                    enforce_eager=False) as vllm_model:
-        outputs = vllm_model.generate_greedy(prompts=prompts,
-                                             images=images,
-                                             max_tokens=64)
+    prompts = vl_config["prompt_fn"](img_questions)
+
+    with VllmRunner(vl_config["model"],
+                    mm_processor_kwargs=vl_config["mm_processor_kwargs"],
+                    enforce_eager=False,
+                    max_model_len=8192,
+                    limit_mm_per_prompt={"image": 1}) as vllm_model:
+        outputs = vllm_model.generate_greedy(
+            prompts=prompts,
+            images=images,
+            max_tokens=64,
+        )
+
        assert len(outputs) == len(prompts)
+
        for _, output_str in outputs:
            assert output_str, "Generated output should not be empty."

--- a/vllm_ascend/worker/model_runner_v1.py
+++ b/vllm_ascend/worker/model_runner_v1.py
@@ -654,15 +654,23 @@ class NPUModelRunner(GPUModelRunner):
        else:
            self.positions.np[:total_num_scheduled_tokens] = positions_np

-        # Calculate M-RoPE positions.
-        # Only relevant for models using M-RoPE (e.g, Qwen2-VL)
        if self.uses_mrope:
-            self._calc_mrope_positions(scheduler_output)
-
            # Only relevant for models using M-RoPE (e.g, Qwen2-VL)
+            self._calc_mrope_positions(scheduler_output)
            self.mrope_positions.gpu[:, :total_num_scheduled_tokens].copy_(
                self.mrope_positions.cpu[:, :total_num_scheduled_tokens],
-                non_blocking=True)
+                non_blocking=True,
+            )
+        elif self.uses_xdrope_dim > 0:
+            self._calc_xdrope_positions(scheduler_output)
+            # Only relevant for models using XD-RoPE (e.g, HunYuan-VL)
+            self.xdrope_positions.gpu[:, :total_num_scheduled_tokens].copy_(
+                self.xdrope_positions.cpu[:, :total_num_scheduled_tokens],
+                non_blocking=True,
+            )
+        else:
+            # Common case (1D positions)
+            self.positions.copy_to_gpu(total_num_scheduled_tokens)

        # Get token indices.
        # E.g., [0, 1, 0, 1, 2, 3, 4, 0, 1, 2]
@@ -845,9 +853,12 @@ class NPUModelRunner(GPUModelRunner):
            # then the embedding layer is not included in the ACL graph.
            input_ids = self.input_ids.gpu[:num_input_tokens]
            inputs_embeds = None
-        positions = self.positions.gpu[:num_input_tokens]
        if self.uses_mrope:
            positions = self.mrope_positions.gpu[:, :num_input_tokens]
+        elif self.uses_xdrope_dim > 0:
+            positions = self.xdrope_positions.gpu[:, :num_input_tokens]
+        else:
+            positions = self.positions.gpu[:num_input_tokens]

        # type: ignore
        if get_pp_group().is_first_rank:
@@ -2070,6 +2081,8 @@ class NPUModelRunner(GPUModelRunner):

            if self.uses_mrope:
                positions = self.mrope_positions.gpu[:, :num_tokens_padded]
+            elif self.uses_xdrope_dim > 0:
+                positions = self.xdrope_positions.gpu[:, :num_tokens_padded]
            else:
                positions = self.positions.gpu[:num_tokens_padded]