[ModelRunner] Add hunyuan-vl basic support (#5151)

### What this PR does / why we need it?
This patch adds handling of `XDRotaryEmbedding` in the model runner to
support `hunyuan-vl`.
### Does this PR introduce _any_ user-facing change?

### How was this patch tested?
CI passed with the newly added and existing tests.

Closes: https://github.com/vllm-project/vllm-ascend/issues/4992

- vLLM version: v0.12.0
- vLLM main: ad32e3e19c

---------

Signed-off-by: wangli <wangli858794774@gmail.com>
This commit is contained in:
Li Wang
2025-12-23 10:46:54 +08:00
committed by GitHub
parent c9b5881bcd
commit 9a79cbaecb
3 changed files with 63 additions and 25 deletions

View File

@@ -763,11 +763,32 @@ def qwen_prompt(questions: list[str]) -> list[str]:
f"{q}<|im_end|>\n<|im_start|>assistant\n") for q in questions] f"{q}<|im_end|>\n<|im_start|>assistant\n") for q in questions]
PROMPT_TEMPLATES = { def hunyuan_prompt(questions: list[str]) -> list[str]:
"qwen2.5vl": qwen_prompt, placeholder = "<hy_place▁holder▁no▁100><hy_place▁holder▁no▁102><hy_place▁holder▁no▁101>" # noqa: E501
return [
f"<hy_begin▁of▁sentence>{placeholder}{question}<hy_User>"
for question in questions
]
PROMPT_CONFIGS = {
"qwen-vl": {
"model": "Qwen/Qwen3-VL-8B-Instruct",
"prompt_fn": qwen_prompt,
"mm_processor_kwargs": {
"min_pixels": 28 * 28,
"max_pixels": 1280 * 28 * 28,
"fps": 1,
},
},
"hunyuan-vl": {
"model": "Tencent-Hunyuan/HunyuanOCR",
"prompt_fn": hunyuan_prompt,
"mm_processor_kwargs": {},
},
} }
@pytest.fixture(params=list(PROMPT_TEMPLATES.keys())) @pytest.fixture(params=PROMPT_CONFIGS.keys())
def prompt_template(request): def vl_config(request):
return PROMPT_TEMPLATES[request.param] return PROMPT_CONFIGS[request.param]

View File

@@ -27,28 +27,32 @@ from vllm.assets.image import ImageAsset
from tests.e2e.conftest import VllmRunner from tests.e2e.conftest import VllmRunner
def test_multimodal_vl(prompt_template): def test_multimodal_vl(vl_config):
image = ImageAsset("cherry_blossom") \ image = ImageAsset("cherry_blossom").pil_image.convert("RGB")
.pil_image.convert("RGB")
img_questions = [ img_questions = [
"What is the content of this image?", "What is the content of this image?",
"Describe the content of this image in detail.", "Describe the content of this image in detail.",
"What's in the image?", "What's in the image?",
"Where is this image taken?", "Where is this image taken?",
] ]
images = [image] * len(img_questions) images = [image] * len(img_questions)
prompts = prompt_template(img_questions) prompts = vl_config["prompt_fn"](img_questions)
with VllmRunner("Qwen/Qwen3-VL-8B-Instruct",
mm_processor_kwargs={ with VllmRunner(vl_config["model"],
"min_pixels": 28 * 28, mm_processor_kwargs=vl_config["mm_processor_kwargs"],
"max_pixels": 1280 * 28 * 28, enforce_eager=False,
"fps": 1, max_model_len=8192,
}, limit_mm_per_prompt={"image": 1}) as vllm_model:
enforce_eager=False) as vllm_model: outputs = vllm_model.generate_greedy(
outputs = vllm_model.generate_greedy(prompts=prompts, prompts=prompts,
images=images, images=images,
max_tokens=64) max_tokens=64,
)
assert len(outputs) == len(prompts) assert len(outputs) == len(prompts)
for _, output_str in outputs: for _, output_str in outputs:
assert output_str, "Generated output should not be empty." assert output_str, "Generated output should not be empty."

View File

@@ -654,15 +654,23 @@ class NPUModelRunner(GPUModelRunner):
else: else:
self.positions.np[:total_num_scheduled_tokens] = positions_np self.positions.np[:total_num_scheduled_tokens] = positions_np
# Calculate M-RoPE positions.
# Only relevant for models using M-RoPE (e.g, Qwen2-VL)
if self.uses_mrope: if self.uses_mrope:
self._calc_mrope_positions(scheduler_output)
# Only relevant for models using M-RoPE (e.g, Qwen2-VL) # Only relevant for models using M-RoPE (e.g, Qwen2-VL)
self._calc_mrope_positions(scheduler_output)
self.mrope_positions.gpu[:, :total_num_scheduled_tokens].copy_( self.mrope_positions.gpu[:, :total_num_scheduled_tokens].copy_(
self.mrope_positions.cpu[:, :total_num_scheduled_tokens], self.mrope_positions.cpu[:, :total_num_scheduled_tokens],
non_blocking=True) non_blocking=True,
)
elif self.uses_xdrope_dim > 0:
self._calc_xdrope_positions(scheduler_output)
# Only relevant for models using XD-RoPE (e.g, HunYuan-VL)
self.xdrope_positions.gpu[:, :total_num_scheduled_tokens].copy_(
self.xdrope_positions.cpu[:, :total_num_scheduled_tokens],
non_blocking=True,
)
else:
# Common case (1D positions)
self.positions.copy_to_gpu(total_num_scheduled_tokens)
# Get token indices. # Get token indices.
# E.g., [0, 1, 0, 1, 2, 3, 4, 0, 1, 2] # E.g., [0, 1, 0, 1, 2, 3, 4, 0, 1, 2]
@@ -845,9 +853,12 @@ class NPUModelRunner(GPUModelRunner):
# then the embedding layer is not included in the ACL graph. # then the embedding layer is not included in the ACL graph.
input_ids = self.input_ids.gpu[:num_input_tokens] input_ids = self.input_ids.gpu[:num_input_tokens]
inputs_embeds = None inputs_embeds = None
positions = self.positions.gpu[:num_input_tokens]
if self.uses_mrope: if self.uses_mrope:
positions = self.mrope_positions.gpu[:, :num_input_tokens] positions = self.mrope_positions.gpu[:, :num_input_tokens]
elif self.uses_xdrope_dim > 0:
positions = self.xdrope_positions.gpu[:, :num_input_tokens]
else:
positions = self.positions.gpu[:num_input_tokens]
# type: ignore # type: ignore
if get_pp_group().is_first_rank: if get_pp_group().is_first_rank:
@@ -2070,6 +2081,8 @@ class NPUModelRunner(GPUModelRunner):
if self.uses_mrope: if self.uses_mrope:
positions = self.mrope_positions.gpu[:, :num_tokens_padded] positions = self.mrope_positions.gpu[:, :num_tokens_padded]
elif self.uses_xdrope_dim > 0:
positions = self.xdrope_positions.gpu[:, :num_tokens_padded]
else: else:
positions = self.positions.gpu[:num_tokens_padded] positions = self.positions.gpu[:num_tokens_padded]