### What this PR does / why we need it?

Based on [#19746](https://github.com/vllm-project/vllm/issues/19746), this PR adds Prompt Embeddings support for the v1 engine on NPU.

### Does this PR introduce _any_ user-facing change?

### How was this patch tested?

```bash
python examples/prompt_embed_inference.py
```

- vLLM version: v0.11.0
- vLLM main: https://github.com/vllm-project/vllm/commit/releases/v0.11.1

---------

Signed-off-by: jesse <szxfml@gmail.com>
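For context, below is a minimal sketch of what prompt-embeddings inference looks like with the offline `LLM` API; the actual `examples/prompt_embed_inference.py` may differ. The model name, prompt, and sampling parameters are illustrative assumptions, while `enable_prompt_embeds=True` and the `prompt_embeds` input key match the e2e test that follows.

```python
# Sketch only: mirrors the e2e test below; the real example script may differ.
from transformers import AutoModelForCausalLM, AutoTokenizer

from vllm import LLM, SamplingParams

model_name = "Qwen/Qwen2.5-0.5B-Instruct"  # assumption: any small chat model works

# Build prompt embeddings with the HF embedding layer instead of passing token ids.
tokenizer = AutoTokenizer.from_pretrained(model_name)
hf_model = AutoModelForCausalLM.from_pretrained(model_name)
embedding_layer = hf_model.get_input_embeddings()

chat = [{"role": "user", "content": "Please tell me about the capital of France."}]
token_ids = tokenizer.apply_chat_template(chat,
                                          add_generation_prompt=True,
                                          return_tensors="pt")
prompt_embeds = embedding_layer(token_ids).squeeze(0)

# Feed the embeddings directly to vLLM (requires enable_prompt_embeds=True).
llm = LLM(model=model_name, enable_prompt_embeds=True, enforce_eager=True)
outputs = llm.generate({"prompt_embeds": prompt_embeds},
                       SamplingParams(temperature=0.0, max_tokens=64))
print(outputs[0].outputs[0].text)
```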
#
# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved.
# Copyright 2023 The vLLM team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# This file is a part of the vllm-ascend project.
#
import os

import pytest
from transformers import AutoModelForCausalLM, AutoTokenizer

from tests.e2e.conftest import VllmRunner

os.environ["VLLM_USE_MODELSCOPE"] = "True"
os.environ["VLLM_WORKER_MULTIPROC_METHOD"] = "spawn"

MODELS = ["Qwen/Qwen2.5-0.5B-Instruct"]
def get_prompt_embeds(chat, tokenizer, embedding_layer):
    """Convert chat messages to prompt embeddings."""
    token_ids = tokenizer.apply_chat_template(chat,
                                              add_generation_prompt=True,
                                              return_tensors='pt')
    prompt_embeds = embedding_layer(token_ids).squeeze(0)
    return prompt_embeds
@pytest.mark.parametrize("model_name", MODELS)
def test_single_prompt_embeds_inference(model_name):
    """Test single prompt inference with prompt embeddings."""
    # Prepare prompt embeddings
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    transformers_model = AutoModelForCausalLM.from_pretrained(model_name)
    embedding_layer = transformers_model.get_input_embeddings()

    chat = [{
        "role": "user",
        "content": "Please tell me about the capital of France."
    }]
    prompt_embeds = get_prompt_embeds(chat, tokenizer, embedding_layer)

    # Run inference with prompt embeddings
    with VllmRunner(
            model_name,
            enable_prompt_embeds=True,
            enforce_eager=True,
    ) as vllm_runner:
        outputs = vllm_runner.model.generate({
            "prompt_embeds": prompt_embeds,
        })

    # Verify output
    assert len(outputs) == 1
    assert len(outputs[0].outputs) > 0
    assert len(outputs[0].outputs[0].text) > 0
    print(f"\n[Single Inference Output]: {outputs[0].outputs[0].text}")
@pytest.mark.parametrize("model_name", MODELS)
def test_batch_prompt_embeds_inference(model_name):
    """Test batch prompt inference with prompt embeddings."""
    # Prepare prompt embeddings
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    transformers_model = AutoModelForCausalLM.from_pretrained(model_name)
    embedding_layer = transformers_model.get_input_embeddings()

    chats = [
        [{
            "role": "user",
            "content": "Please tell me about the capital of France."
        }],
        [{
            "role": "user",
            "content": "When is the day longest during the year?"
        }],
        [{
            "role": "user",
            "content": "Where is bigger, the moon or the sun?"
        }],
    ]

    prompt_embeds_list = [
        get_prompt_embeds(chat, tokenizer, embedding_layer) for chat in chats
    ]

    # Run batch inference with prompt embeddings
    with VllmRunner(
            model_name,
            enable_prompt_embeds=True,
            enforce_eager=True,
    ) as vllm_runner:
        outputs = vllm_runner.model.generate([{
            "prompt_embeds": embeds
        } for embeds in prompt_embeds_list])

    # Verify outputs
    assert len(outputs) == len(chats)
    for i, output in enumerate(outputs):
        assert len(output.outputs) > 0
        assert len(output.outputs[0].text) > 0
        print(f"\nQ{i+1}: {chats[i][0]['content']}")
        print(f"A{i+1}: {output.outputs[0].text}")
@pytest.mark.parametrize("model_name", MODELS)
def test_prompt_embeds_with_aclgraph(model_name):
    """Test prompt embeddings with ACL graph enabled vs disabled."""
    # Prepare prompt embeddings
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    transformers_model = AutoModelForCausalLM.from_pretrained(model_name)
    embedding_layer = transformers_model.get_input_embeddings()

    chat = [{"role": "user", "content": "What is the capital of China?"}]
    prompt_embeds = get_prompt_embeds(chat, tokenizer, embedding_layer)

    # Run with ACL graph enabled (enforce_eager=False)
    with VllmRunner(
            model_name,
            enable_prompt_embeds=True,
            enforce_eager=False,
    ) as vllm_aclgraph_runner:
        aclgraph_outputs = vllm_aclgraph_runner.model.generate({
            "prompt_embeds": prompt_embeds,
        })

    # Run with ACL graph disabled (enforce_eager=True)
    with VllmRunner(
            model_name,
            enable_prompt_embeds=True,
            enforce_eager=True,
    ) as vllm_eager_runner:
        eager_outputs = vllm_eager_runner.model.generate({
            "prompt_embeds": prompt_embeds,
        })

    # Verify both produce valid outputs
    assert len(aclgraph_outputs) == 1
    assert len(eager_outputs) == 1
    assert len(aclgraph_outputs[0].outputs[0].text) > 0
    assert len(eager_outputs[0].outputs[0].text) > 0

    print("\n[ACL Graph Output]:", aclgraph_outputs[0].outputs[0].text)
    print("[Eager Output]:", eager_outputs[0].outputs[0].text)

    # Note: Outputs may differ slightly due to different execution paths,
    # but both should be valid responses.
@pytest.mark.parametrize("model_name", MODELS)
def test_mixed_prompt_embeds_and_text(model_name):
    """Test mixed inputs with both prompt embeddings and text prompts."""
    # Prepare prompt embeddings for the first request
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    transformers_model = AutoModelForCausalLM.from_pretrained(model_name)
    embedding_layer = transformers_model.get_input_embeddings()

    chat = [{"role": "user", "content": "What is AI?"}]
    prompt_embeds = get_prompt_embeds(chat, tokenizer, embedding_layer)

    # Prepare text prompt for the second request
    text_prompt = "What is machine learning?"

    # Run inference with mixed inputs
    with VllmRunner(
            model_name,
            enable_prompt_embeds=True,
            enforce_eager=True,
    ) as vllm_runner:
        # Test prompt embeddings
        embeds_output = vllm_runner.model.generate({
            "prompt_embeds": prompt_embeds,
        })

        # Test text prompt
        text_output = vllm_runner.model.generate(text_prompt)

    # Verify both types of inputs work
    assert len(embeds_output) == 1
    assert len(text_output) == 1
    assert len(embeds_output[0].outputs[0].text) > 0
    assert len(text_output[0].outputs[0].text) > 0

    print("\n[Prompt Embeds Output]:", embeds_output[0].outputs[0].text)
    print("[Text Prompt Output]:", text_output[0].outputs[0].text)