From 9a79cbaecbe351094dc894398d3852f05efc6dd8 Mon Sep 17 00:00:00 2001
From: Li Wang
Date: Tue, 23 Dec 2025 10:46:54 +0800
Subject: [PATCH] [ModelRunner] Add hunyuan-vl basic support (#5151)

### What this PR does / why we need it?
This patch adds handling of `XDRotaryEmbedding` in the model runner to
support `hunyuan-vl`: when a model reports an XD-RoPE dimension
(`uses_xdrope_dim > 0`), the runner computes XD-RoPE position ids and
copies them to the device in place of the usual 1D or M-RoPE positions.
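For reviewers who want to exercise the new path by hand, here is a minimal
offline-inference sketch. The model name and prompt format are taken from the
test fixture added below; `max_model_len`, `limit_mm_per_prompt`, and the
sampling settings are illustrative assumptions, not part of this patch, and it
assumes a vLLM build where `hunyuan-vl` is supported:

```python
# Minimal sketch: run HunyuanOCR on a single image through vLLM.
# The prompt mirrors the hunyuan_prompt() helper added in
# tests/e2e/conftest.py; other settings here are illustrative only.
from vllm import LLM, SamplingParams
from vllm.assets.image import ImageAsset

placeholder = ("<|hy_place▁holder▁no▁100|>"
               "<|hy_place▁holder▁no▁102|>"
               "<|hy_place▁holder▁no▁101|>")
prompt = (f"<|hy_begin▁of▁sentence|>{placeholder}"
          f"What is the content of this image?<|hy_User|>")
image = ImageAsset("cherry_blossom").pil_image.convert("RGB")

llm = LLM(model="Tencent-Hunyuan/HunyuanOCR",
          max_model_len=8192,
          limit_mm_per_prompt={"image": 1})
outputs = llm.generate(
    {"prompt": prompt, "multi_modal_data": {"image": image}},
    SamplingParams(temperature=0.0, max_tokens=64),
)
print(outputs[0].outputs[0].text)
```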
### Does this PR introduce _any_ user-facing change?

### How was this patch tested?
CI passed with added/existing tests.

Closes: https://github.com/vllm-project/vllm-ascend/issues/4992

- vLLM version: v0.12.0
- vLLM main: https://github.com/vllm-project/vllm/commit/ad32e3e19ccf0526cb6744a5fed09a138a5fb2f9

---------

Signed-off-by: wangli
---
 tests/e2e/conftest.py                 | 31 +++++++++++++++++++++-----
 tests/e2e/singlecard/test_vlm.py      | 32 +++++++++++++++------------
 vllm_ascend/worker/model_runner_v1.py | 25 ++++++++++++++++-----
 3 files changed, 63 insertions(+), 25 deletions(-)

diff --git a/tests/e2e/conftest.py b/tests/e2e/conftest.py
index b59ab7ce..12f5101e 100644
--- a/tests/e2e/conftest.py
+++ b/tests/e2e/conftest.py
@@ -763,11 +763,32 @@ def qwen_prompt(questions: list[str]) -> list[str]:
             f"{q}<|im_end|>\n<|im_start|>assistant\n") for q in questions]
 
 
-PROMPT_TEMPLATES = {
-    "qwen2.5vl": qwen_prompt,
+def hunyuan_prompt(questions: list[str]) -> list[str]:
+    placeholder = "<|hy_place▁holder▁no▁100|><|hy_place▁holder▁no▁102|><|hy_place▁holder▁no▁101|>"  # noqa: E501
+    return [
+        f"<|hy_begin▁of▁sentence|>{placeholder}{question}<|hy_User|>"
+        for question in questions
+    ]
+
+
+PROMPT_CONFIGS = {
+    "qwen-vl": {
+        "model": "Qwen/Qwen3-VL-8B-Instruct",
+        "prompt_fn": qwen_prompt,
+        "mm_processor_kwargs": {
+            "min_pixels": 28 * 28,
+            "max_pixels": 1280 * 28 * 28,
+            "fps": 1,
+        },
+    },
+    "hunyuan-vl": {
+        "model": "Tencent-Hunyuan/HunyuanOCR",
+        "prompt_fn": hunyuan_prompt,
+        "mm_processor_kwargs": {},
+    },
 }
 
 
-@pytest.fixture(params=list(PROMPT_TEMPLATES.keys()))
-def prompt_template(request):
-    return PROMPT_TEMPLATES[request.param]
+@pytest.fixture(params=PROMPT_CONFIGS.keys())
+def vl_config(request):
+    return PROMPT_CONFIGS[request.param]

diff --git a/tests/e2e/singlecard/test_vlm.py b/tests/e2e/singlecard/test_vlm.py
index 4cdfd7c2..88ad378a 100644
--- a/tests/e2e/singlecard/test_vlm.py
+++ b/tests/e2e/singlecard/test_vlm.py
@@ -27,28 +27,32 @@ from vllm.assets.image import ImageAsset
 from tests.e2e.conftest import VllmRunner
 
 
-def test_multimodal_vl(prompt_template):
-    image = ImageAsset("cherry_blossom") \
-        .pil_image.convert("RGB")
+def test_multimodal_vl(vl_config):
+    image = ImageAsset("cherry_blossom").pil_image.convert("RGB")
+
     img_questions = [
         "What is the content of this image?",
         "Describe the content of this image in detail.",
         "What's in the image?",
         "Where is this image taken?",
     ]
+
     images = [image] * len(img_questions)
-    prompts = prompt_template(img_questions)
-    with VllmRunner("Qwen/Qwen3-VL-8B-Instruct",
-                    mm_processor_kwargs={
-                        "min_pixels": 28 * 28,
-                        "max_pixels": 1280 * 28 * 28,
-                        "fps": 1,
-                    },
-                    enforce_eager=False) as vllm_model:
-        outputs = vllm_model.generate_greedy(prompts=prompts,
-                                             images=images,
-                                             max_tokens=64)
+    prompts = vl_config["prompt_fn"](img_questions)
+
+    with VllmRunner(vl_config["model"],
+                    mm_processor_kwargs=vl_config["mm_processor_kwargs"],
+                    enforce_eager=False,
+                    max_model_len=8192,
+                    limit_mm_per_prompt={"image": 1}) as vllm_model:
+        outputs = vllm_model.generate_greedy(
+            prompts=prompts,
+            images=images,
+            max_tokens=64,
+        )
+
+    assert len(outputs) == len(prompts)
+    for _, output_str in outputs:
+        assert output_str, "Generated output should not be empty."

diff --git a/vllm_ascend/worker/model_runner_v1.py b/vllm_ascend/worker/model_runner_v1.py
index d24584e7..a72cdeae 100644
--- a/vllm_ascend/worker/model_runner_v1.py
+++ b/vllm_ascend/worker/model_runner_v1.py
@@ -654,15 +654,23 @@ class NPUModelRunner(GPUModelRunner):
         else:
             self.positions.np[:total_num_scheduled_tokens] = positions_np
 
-        # Calculate M-RoPE positions.
-        # Only relevant for models using M-RoPE (e.g, Qwen2-VL)
         if self.uses_mrope:
-            self._calc_mrope_positions(scheduler_output)
-            # Only relevant for models using M-RoPE (e.g, Qwen2-VL)
+            self._calc_mrope_positions(scheduler_output)
             self.mrope_positions.gpu[:, :total_num_scheduled_tokens].copy_(
                 self.mrope_positions.cpu[:, :total_num_scheduled_tokens],
-                non_blocking=True)
+                non_blocking=True,
+            )
+        elif self.uses_xdrope_dim > 0:
+            self._calc_xdrope_positions(scheduler_output)
+            # Only relevant for models using XD-RoPE (e.g., HunYuan-VL)
+            self.xdrope_positions.gpu[:, :total_num_scheduled_tokens].copy_(
+                self.xdrope_positions.cpu[:, :total_num_scheduled_tokens],
+                non_blocking=True,
+            )
+        else:
+            # Common case (1D positions)
+            self.positions.copy_to_gpu(total_num_scheduled_tokens)
 
         # Get token indices.
         # E.g., [0, 1, 0, 1, 2, 3, 4, 0, 1, 2]
@@ -845,9 +853,12 @@ class NPUModelRunner(GPUModelRunner):
             # then the embedding layer is not included in the ACL graph.
             input_ids = self.input_ids.gpu[:num_input_tokens]
             inputs_embeds = None
-        positions = self.positions.gpu[:num_input_tokens]
         if self.uses_mrope:
             positions = self.mrope_positions.gpu[:, :num_input_tokens]
+        elif self.uses_xdrope_dim > 0:
+            positions = self.xdrope_positions.gpu[:, :num_input_tokens]
+        else:
+            positions = self.positions.gpu[:num_input_tokens]  # type: ignore
 
         if get_pp_group().is_first_rank:
@@ -2070,6 +2081,8 @@
 
         if self.uses_mrope:
             positions = self.mrope_positions.gpu[:, :num_tokens_padded]
+        elif self.uses_xdrope_dim > 0:
+            positions = self.xdrope_positions.gpu[:, :num_tokens_padded]
         else:
             positions = self.positions.gpu[:num_tokens_padded]