From 9a79cbaecbe351094dc894398d3852f05efc6dd8 Mon Sep 17 00:00:00 2001
From: Li Wang
Date: Tue, 23 Dec 2025 10:46:54 +0800
Subject: [PATCH] [ModelRunner] Add hunyuan-vl basic support (#5151)

### What this PR does / why we need it?
This patch adds handling of `XDRotaryEmbedding` in the model runner to
support `hunyuan-vl`: when a model reports an XD-RoPE dimension
(`uses_xdrope_dim > 0`), the runner computes XD-RoPE position ids and
copies them to the device in place of the usual 1D or M-RoPE positions.
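For reviewers who want to exercise the new path by hand, here is a minimal
offline-inference sketch. The model name and prompt format are taken from the
test fixture added below; `max_model_len`, `limit_mm_per_prompt`, and the
sampling settings are illustrative assumptions, not part of this patch, and it
assumes a vLLM build where `hunyuan-vl` is supported:

```python
# Minimal sketch: run HunyuanOCR on a single image through vLLM.
# The prompt mirrors the hunyuan_prompt() helper added in
# tests/e2e/conftest.py; other settings here are illustrative only.
from vllm import LLM, SamplingParams
from vllm.assets.image import ImageAsset

placeholder = ("<|hy_place▁holder▁no▁100|>"
               "<|hy_place▁holder▁no▁102|>"
               "<|hy_place▁holder▁no▁101|>")
prompt = (f"<|hy_begin▁of▁sentence|>{placeholder}"
          f"What is the content of this image?<|hy_User|>")
image = ImageAsset("cherry_blossom").pil_image.convert("RGB")

llm = LLM(model="Tencent-Hunyuan/HunyuanOCR",
          max_model_len=8192,
          limit_mm_per_prompt={"image": 1})
outputs = llm.generate(
    {"prompt": prompt, "multi_modal_data": {"image": image}},
    SamplingParams(temperature=0.0, max_tokens=64),
)
print(outputs[0].outputs[0].text)
```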
### Does this PR introduce _any_ user-facing change?

### How was this patch tested?
CI passed with added/existing tests.

Closes: https://github.com/vllm-project/vllm-ascend/issues/4992

- vLLM version: v0.12.0
- vLLM main: https://github.com/vllm-project/vllm/commit/ad32e3e19ccf0526cb6744a5fed09a138a5fb2f9

---------

Signed-off-by: wangli
---
 tests/e2e/conftest.py                 | 31 +++++++++++++++++++++-----
 tests/e2e/singlecard/test_vlm.py      | 32 +++++++++++++++------------
 vllm_ascend/worker/model_runner_v1.py | 25 ++++++++++++++++-----
 3 files changed, 63 insertions(+), 25 deletions(-)

diff --git a/tests/e2e/conftest.py b/tests/e2e/conftest.py
index b59ab7ce..12f5101e 100644
--- a/tests/e2e/conftest.py
+++ b/tests/e2e/conftest.py
@@ -763,11 +763,32 @@ def qwen_prompt(questions: list[str]) -> list[str]:
             f"{q}<|im_end|>\n<|im_start|>assistant\n") for q in questions]
 
 
-PROMPT_TEMPLATES = {
-    "qwen2.5vl": qwen_prompt,
+def hunyuan_prompt(questions: list[str]) -> list[str]:
+    placeholder = "<|hy_place▁holder▁no▁100|><|hy_place▁holder▁no▁102|><|hy_place▁holder▁no▁101|>"  # noqa: E501
+    return [
+        f"<|hy_begin▁of▁sentence|>{placeholder}{question}<|hy_User|>"
+        for question in questions
+    ]
+
+
+PROMPT_CONFIGS = {
+    "qwen-vl": {
+        "model": "Qwen/Qwen3-VL-8B-Instruct",
+        "prompt_fn": qwen_prompt,
+        "mm_processor_kwargs": {
+            "min_pixels": 28 * 28,
+            "max_pixels": 1280 * 28 * 28,
+            "fps": 1,
+        },
+    },
+    "hunyuan-vl": {
+        "model": "Tencent-Hunyuan/HunyuanOCR",
+        "prompt_fn": hunyuan_prompt,
+        "mm_processor_kwargs": {},
+    },
 }
 
 
-@pytest.fixture(params=list(PROMPT_TEMPLATES.keys()))
-def prompt_template(request):
-    return PROMPT_TEMPLATES[request.param]
+@pytest.fixture(params=PROMPT_CONFIGS.keys())
+def vl_config(request):
+    return PROMPT_CONFIGS[request.param]

diff --git a/tests/e2e/singlecard/test_vlm.py b/tests/e2e/singlecard/test_vlm.py
index 4cdfd7c2..88ad378a 100644
--- a/tests/e2e/singlecard/test_vlm.py
+++ b/tests/e2e/singlecard/test_vlm.py
@@ -27,28 +27,32 @@ from vllm.assets.image import ImageAsset
 from tests.e2e.conftest import VllmRunner
 
 
-def test_multimodal_vl(prompt_template):
-    image = ImageAsset("cherry_blossom") \
-        .pil_image.convert("RGB")
+def test_multimodal_vl(vl_config):
+    image = ImageAsset("cherry_blossom").pil_image.convert("RGB")
+
     img_questions = [
         "What is the content of this image?",
         "Describe the content of this image in detail.",
         "What's in the image?",
         "Where is this image taken?",
     ]
+
     images = [image] * len(img_questions)
-    prompts = prompt_template(img_questions)
-    with VllmRunner("Qwen/Qwen3-VL-8B-Instruct",
-                    mm_processor_kwargs={
-                        "min_pixels": 28 * 28,
-                        "max_pixels": 1280 * 28 * 28,
-                        "fps": 1,
-                    },
-                    enforce_eager=False) as vllm_model:
-        outputs = vllm_model.generate_greedy(prompts=prompts,
-                                             images=images,
-                                             max_tokens=64)
+    prompts = vl_config["prompt_fn"](img_questions)
+
+    with VllmRunner(vl_config["model"],
+                    mm_processor_kwargs=vl_config["mm_processor_kwargs"],
+                    enforce_eager=False,
+                    max_model_len=8192,
+                    limit_mm_per_prompt={"image": 1}) as vllm_model:
+        outputs = vllm_model.generate_greedy(
+            prompts=prompts,
+            images=images,
+            max_tokens=64,
+        )
+
+    assert len(outputs) == len(prompts)
+    for _, output_str in outputs:
+        assert output_str, "Generated output should not be empty."

diff --git a/vllm_ascend/worker/model_runner_v1.py b/vllm_ascend/worker/model_runner_v1.py
index d24584e7..a72cdeae 100644
--- a/vllm_ascend/worker/model_runner_v1.py
+++ b/vllm_ascend/worker/model_runner_v1.py
@@ -654,15 +654,23 @@ class NPUModelRunner(GPUModelRunner):
         else:
             self.positions.np[:total_num_scheduled_tokens] = positions_np
 
-        # Calculate M-RoPE positions.
-        # Only relevant for models using M-RoPE (e.g, Qwen2-VL)
         if self.uses_mrope:
-            self._calc_mrope_positions(scheduler_output)
-            # Only relevant for models using M-RoPE (e.g, Qwen2-VL)
+            self._calc_mrope_positions(scheduler_output)
             self.mrope_positions.gpu[:, :total_num_scheduled_tokens].copy_(
                 self.mrope_positions.cpu[:, :total_num_scheduled_tokens],
-                non_blocking=True)
+                non_blocking=True,
+            )
+        elif self.uses_xdrope_dim > 0:
+            self._calc_xdrope_positions(scheduler_output)
+            # Only relevant for models using XD-RoPE (e.g., HunYuan-VL)
+            self.xdrope_positions.gpu[:, :total_num_scheduled_tokens].copy_(
+                self.xdrope_positions.cpu[:, :total_num_scheduled_tokens],
+                non_blocking=True,
+            )
+        else:
+            # Common case (1D positions)
+            self.positions.copy_to_gpu(total_num_scheduled_tokens)
 
         # Get token indices.
         # E.g., [0, 1, 0, 1, 2, 3, 4, 0, 1, 2]
@@ -845,9 +853,12 @@ class NPUModelRunner(GPUModelRunner):
             # then the embedding layer is not included in the ACL graph.
             input_ids = self.input_ids.gpu[:num_input_tokens]
             inputs_embeds = None
-        positions = self.positions.gpu[:num_input_tokens]
         if self.uses_mrope:
             positions = self.mrope_positions.gpu[:, :num_input_tokens]
+        elif self.uses_xdrope_dim > 0:
+            positions = self.xdrope_positions.gpu[:, :num_input_tokens]
+        else:
+            positions = self.positions.gpu[:num_input_tokens]  # type: ignore
 
         if get_pp_group().is_first_rank:
@@ -2070,6 +2081,8 @@
 
         if self.uses_mrope:
             positions = self.mrope_positions.gpu[:, :num_tokens_padded]
+        elif self.uses_xdrope_dim > 0:
+            positions = self.xdrope_positions.gpu[:, :num_tokens_padded]
         else:
             positions = self.positions.gpu[:num_tokens_padded]